# Titanic Machine learning

## Loading packages

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV

## Importing Data

In [2]:
train = pd.read_csv("data/train.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Quick EDA

### Finding the survival rate

In [4]:
train.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

### Dealing with missing values

In [5]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Filling in median age

In [6]:
# basing it off class and sex
mean_ages = train.groupby(["Pclass", "Sex"])['Age'].median().reset_index()

mean_ages = mean_ages.rename(columns = {'Age' : 'age_fill'})

train = pd.merge(train, mean_ages)

train['Age'] = np.where(train.Age.isnull(), train.age_fill, train.Age)

train.drop('age_fill', axis = 1, inplace=True)

In [7]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
2,6,0,3,"Moran, Mr. James",male,25.0,0,0,330877,8.4583,,Q
3,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
4,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,,S


Embarked

In [8]:
train.Embarked.value_counts(normalize=True)

S    0.724409
C    0.188976
Q    0.086614
Name: Embarked, dtype: float64

Since there are only two missing values I will just put them in S as it's the most populated

In [9]:
train.Embarked = train.Embarked.fillna('S')

## Feature Engineering

Perhaps travelling by yourself changes changes on surviving

In [10]:
train['alone'] = np.where( (train.SibSp == 0) & (train.Parch == 0), 1, 0)

creating family size feature

In [11]:
train['family_size'] =  1 + train.SibSp + train.Parch

Capping family size

In [12]:
train.family_size = np.where(train.family_size > 8, 8, train.family_size)

Capping number of siblings and parents

In [13]:
train.SibSp = np.where(train.SibSp > 3, 3, train.SibSp)

train.Parch = np.where(train.Parch > 3, 3, train.Parch)

Creating a child gender

In [14]:
train.Sex = np.where(train.Age < 16, 'child', train.Sex)

## Preparing data for modelling

In [15]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,alone,family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,2
1,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1
2,6,0,3,"Moran, Mr. James",male,25.0,0,0,330877,8.4583,,Q,1,1
3,8,0,3,"Palsson, Master. Gosta Leonard",child,2.0,3,1,349909,21.075,,S,0,5
4,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.05,,S,1,1


In [16]:
x = train.drop('Survived', axis = 1)
y = train.Survived

In [17]:
transformations = [('Embarked', LabelBinarizer()),
                     ('Fare', None),
                     ('Parch', None),
                     ('Pclass', LabelBinarizer()),
                    ('Sex', LabelBinarizer()),
                     ('SibSp', None),
                     ('family_size', None),
                     ('Age', None)]

mapper = DataFrameMapper(transformations)

x = mapper.fit_transform(x)

In [18]:
lr = LogisticRegression()

In [19]:
# pipeline = Pipeline([('featurize', mapper),
#                      ('logistic', lr)])

In [20]:
#model = pipeline.fit(X=x, y=y)

In [43]:
param_grid = {'C' : np.linspace(1,10,11),
              'penalty' : ['l1']}

cv_log = GridSearchCV(lr, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1, return_train_score=True)

cv_log.fit(x,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1. ,   1.9,   2.8,   3.7,   4.6,   5.5,   6.4,   7.3,   8.2,
         9.1,  10. ]), 'penalty': ['l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [44]:
cv_log.best_score_

0.78563411896745228

In [45]:
cv_log.best_estimator_

LogisticRegression(C=5.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [39]:
results = pd.DataFrame(cv_log.best_estimator_.coef_[0],
                mapper.transformed_names_)

results

Unnamed: 0,0
Embarked_C,0.391985
Embarked_Q,0.215385
Embarked_S,0.0
Fare,0.002474
Parch,1.055547
Pclass_1,1.440258
Pclass_2,0.394191
Pclass_3,-0.696754
Sex_child,2.380926
Sex_female,2.571039


# Testing

In [111]:
test = pd.read_csv("data/test.csv")

In [112]:
test = pd.merge(test, mean_ages)
test['Age'] = np.where(test.Age.isnull(), test.age_fill, test.Age)


In [113]:
test.drop('age_fill', axis = 1, inplace=True)

In [114]:
test.Embarked = test.Embarked.fillna('S')

In [115]:
test['alone'] = np.where( (test.SibSp == 0) & (test.Parch == 0), 1, 0)

In [116]:
test['family_size'] =  1 + test.SibSp + test.Parch

In [117]:
test.family_size = np.where(test.family_size > 8, 8, test.family_size)

In [118]:
test.SibSp = np.where(test.SibSp > 3, 3, test.SibSp)

test.Parch = np.where(test.Parch > 3, 3, test.Parch)

In [119]:
test.Sex = np.where(test.Age < 16, 'child', test.Sex)

In [120]:
test = test[test.Fare.notnull()]

In [121]:
test = mapper.fit_transform(test)

In [134]:
a= pd.Series(cv_log.predict(test))

In [138]:
a.head()

0    0
1    0
2    1
3    0
4    0
dtype: int64

In [139]:
a.value_counts(normalize=True)


0    0.604317
1    0.395683
dtype: float64

In [141]:
train.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64