In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training and test splits from the local data directory
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

# Preparing data for feature engineering

In [3]:
# Keep the columns that must not become model features:
#   - PassengerId of the test rows is needed for the output dataframe
#   - Survived is the training target
passengerId = test['PassengerId']
survived = train['Survived']

# Stack train and test so every feature transformation is applied to both at once.
# The drops are non-mutating, so `train` and `test` stay intact and this cell can
# be re-run (the in-place drop of 'Survived' made a second run fail with a KeyError).
data = (
    pd.concat([train.drop(columns='Survived'), test])
    .drop(columns='PassengerId')
    .reset_index(drop=True)  # train rows first, then test rows, renumbered from 0
)

# Name

In [4]:
# Extracting titles from full names: the title is the text between the first
# comma and the period that follows it.
# Vectorised .str accessors replace the per-row Python lambda; the
# single-character separators ',' and '.' are matched literally.
data['Title'] = (
    data['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)
data.drop(columns='Name', inplace=True)
data['Title'].head()

0      Mr
1     Mrs
2    Miss
3     Mrs
4      Mr
Name: Title, dtype: object

In [5]:
# Frequency of each raw title, used to decide how the titles are grouped
data['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Ms                2
Mlle              2
Major             2
Don               1
the Countess      1
Sir               1
Mme               1
Capt              1
Jonkheer          1
Lady              1
Dona              1
Name: Title, dtype: int64

In [6]:
# Collapse the raw titles into six broader groups.
# The table is written group-first (group -> raw titles) and then flattened
# into the raw-title -> group lookup used for the mapping.
_titles_by_group = (
    ('Mr',      ['Mr']),
    ('Mrs',     ['Mrs', 'Mme', 'Ms']),
    ('Miss',    ['Miss', 'Mlle']),
    ('Master',  ['Master']),
    ('Royalty', ['Don', 'Lady', 'Sir', 'the Countess', 'Jonkheer', 'Dona']),
    ('Crew',    ['Rev', 'Dr', 'Major', 'Col', 'Capt']),
)
grouped_titles = {
    raw_title: group
    for group, raw_titles in _titles_by_group
    for raw_title in raw_titles
}

In [7]:
# Updating titles: replace each raw title with its group.
# A title that is not a key of `grouped_titles` keeps its current value instead
# of becoming NaN. Unexpected titles therefore stay visible in the counts below
# (value_counts skips NaN), and re-running this cell no longer wipes the
# 'Royalty' and 'Crew' labels, which are values but not keys of the mapping.
data['Title'] = data['Title'].map(grouped_titles).fillna(data['Title'])
data['Title'].value_counts()

Mr         757
Miss       262
Mrs        200
Master      61
Crew        23
Royalty      6
Name: Title, dtype: int64

# Age column

In [8]:
# Assumption: a passenger's age depends on Sex, Pclass and Title,
# so the median age is computed for every combination of the three.
age_group_keys = ['Sex', 'Pclass', 'Title']
grouped = data.groupby(age_group_keys)
print(grouped['Age'].median())

Sex     Pclass  Title  
female  1       Crew       49.0
                Miss       30.0
                Mrs        45.0
                Royalty    39.0
        2       Miss       20.0
                Mrs        30.0
        3       Miss       18.0
                Mrs        31.0
male    1       Crew       52.0
                Master      6.0
                Mr         41.5
                Royalty    40.0
        2       Crew       41.5
                Master      2.0
                Mr         30.0
        3       Master      6.0
                Mr         26.0
Name: Age, dtype: float64


In [9]:
# Median age of each group as a dictionary keyed by the (Sex, Pclass, Title) tuple.
# The former `.apply(lambda x: x)` was an identity pass over the medians and is dropped.
dict_ages_grouped = grouped['Age'].median().to_dict()
dict_ages_grouped

{('female', 1, 'Crew'): 49.0,
 ('female', 1, 'Miss'): 30.0,
 ('female', 1, 'Mrs'): 45.0,
 ('female', 1, 'Royalty'): 39.0,
 ('female', 2, 'Miss'): 20.0,
 ('female', 2, 'Mrs'): 30.0,
 ('female', 3, 'Miss'): 18.0,
 ('female', 3, 'Mrs'): 31.0,
 ('male', 1, 'Crew'): 52.0,
 ('male', 1, 'Master'): 6.0,
 ('male', 1, 'Mr'): 41.5,
 ('male', 1, 'Royalty'): 40.0,
 ('male', 2, 'Crew'): 41.5,
 ('male', 2, 'Master'): 2.0,
 ('male', 2, 'Mr'): 30.0,
 ('male', 3, 'Master'): 6.0,
 ('male', 3, 'Mr'): 26.0}

In [10]:
# Fill each missing Age with the median age of the passenger's
# (Sex, Pclass, Title) group. `transform('median')` returns the group median
# aligned to every row, so a single fillna replaces the per-group loop, which
# rebuilt the same three-way boolean mask twice on every iteration.
group_median_age = data.groupby(['Sex', 'Pclass', 'Title'])['Age'].transform('median')
data['Age'] = data['Age'].fillna(group_median_age)

In [12]:
# Column dtypes and non-null counts, to check the result of the Age imputation
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 102.3+ KB


Missing ages are imputed.

# Cabin column
As shown in the EDA, we will use only the first letter of the cabin. Missing values in the Cabin column will be imputed with "M" (Missing).

In [13]:
# Missing cabins get the placeholder 'M'; Cabin_ holds the first character of each cabin string
cabin_filled = data['Cabin'].fillna('M')
data['Cabin'] = cabin_filled
data['Cabin_'] = cabin_filled.map(lambda cabin: str(cabin)[0])

In [14]:
# Swap the full cabin string for its one-letter code, kept under the original column name
data = data.drop(columns='Cabin').rename(columns={'Cabin_': 'Cabin'})
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Cabin
0,3,male,22.0,1,0,A/5 21171,7.25,S,Mr,M
1,1,female,38.0,1,0,PC 17599,71.2833,C,Mrs,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,M
3,1,female,35.0,1,0,113803,53.1,S,Mrs,C
4,3,male,35.0,0,0,373450,8.05,S,Mr,M


# Ticket column

In [15]:
# Ticket is not used as a feature (as discussed in the EDA file), so it is removed
data = data.drop(columns='Ticket')

# Fare column

In [16]:
# Fare has a single missing value; it is filled with the median,
# which is less sensitive to outliers than the mean.
median_fare = data['Fare'].median()
data['Fare'] = data['Fare'].fillna(median_fare)

# Embarked column
It has just two missing values, so we will impute them with the most common port of embarkation.

In [17]:
# Frequency of each embarkation port (missing values are not counted)
data['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [18]:
# Fill the missing ports with the most frequent one. The value is taken from
# the column itself rather than hard-coded ('S' for the counts shown above),
# so the cell stays correct if the input data changes.
most_common_port = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_port)

In [19]:
# Re-check the non-null counts after imputing Fare and Embarked
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1309 non-null float64
Embarked    1309 non-null object
Title       1309 non-null object
Cabin       1309 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 92.1+ KB


In [20]:
# Same summary as the previous cell: every column should report 1309 non-null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Pclass      1309 non-null int64
Sex         1309 non-null object
Age         1309 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1309 non-null float64
Embarked    1309 non-null object
Title       1309 non-null object
Cabin       1309 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 92.1+ KB


Now our data contain no missing values.

In [21]:
# Preview the first rows of the frame after all missing values were filled
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,3,male,22.0,1,0,7.25,S,Mr,M
1,1,female,38.0,1,0,71.2833,C,Mrs,C
2,3,female,26.0,0,0,7.925,S,Miss,M
3,1,female,35.0,1,0,53.1,S,Mrs,C
4,3,male,35.0,0,0,8.05,S,Mr,M


# "SibSp" and "Parch" columns

In [22]:
# Number of family members on board (see EDA for the details):
# SibSp + Parch + 1 for the passenger. The two source columns are
# dropped once they have been combined.
data = (
    data
    .assign(FamSize=data['SibSp'] + data['Parch'] + 1)
    .drop(columns=['SibSp', 'Parch'])
)

In [23]:
# Preview the frame with the new FamSize column
data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Cabin,FamSize
0,3,male,22.0,7.25,S,Mr,M,2
1,1,female,38.0,71.2833,C,Mrs,C,2
2,3,female,26.0,7.925,S,Miss,M,1
3,1,female,35.0,53.1,S,Mrs,C,2
4,3,male,35.0,8.05,S,Mr,M,1


In [24]:
# Bucket FamSize into three categories (as discussed in the EDA):
# "solo" (1), "small" (2-4) and "big" (5+).
family_size = data['FamSize']
data['FamSize'] = np.select(
    [family_size == 1, family_size <= 4],  # the first matching condition wins
    ['solo', 'small'],
    default='big',
)
data.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Cabin,FamSize
0,3,male,22.0,7.25,S,Mr,M,small
1,1,female,38.0,71.2833,C,Mrs,C,small
2,3,female,26.0,7.925,S,Miss,M,solo
3,1,female,35.0,53.1,S,Mrs,C,small
4,3,male,35.0,8.05,S,Mr,M,solo


# Now we will create dummy variables for categorical predictors. 
(Pclass will be treated as a categorical feature).

In [25]:
# Sex is binary here, so a single 0/1 column is enough and no dummy variables are needed
sex_codes = {'male': 0, 'female': 1}
data['Sex'] = data['Sex'].map(sex_codes)

In [26]:
def _one_hot(column):
    """Return the indicator columns of `data[column]`, prefixed with the column name."""
    return pd.get_dummies(data[column], prefix=column)


dum_Pclass = _one_hot('Pclass')
dum_Embarked = _one_hot('Embarked')
dum_Title = _one_hot('Title')
dum_Cabin = _one_hot('Cabin')
dum_FamSize = _one_hot('FamSize')

In [27]:
# Append the dummy variables to the right of the existing columns
dummy_frames = [dum_Pclass, dum_Embarked, dum_Title, dum_Cabin, dum_FamSize]
data = pd.concat([data, *dummy_frames], axis=1)

In [28]:
# The original categorical columns are now represented by their dummy variables
data = data.drop(columns=['Pclass', 'Embarked', 'Title', 'Cabin', 'FamSize'])

In [29]:
# Preview the frame after the categorical columns were replaced by dummy variables
data.head()

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_Crew,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_M,Cabin_T,FamSize_big,FamSize_small,FamSize_solo
0,0,22.0,7.25,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,1,38.0,71.2833,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,1,26.0,7.925,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
3,1,35.0,53.1,1,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,0,35.0,8.05,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


# Restoring train and test data

In [30]:
# `data` was built as train rows followed by test rows, so the first
# len(survived) rows are the training set and the rest is the test set.
n_train = len(survived)
x_train = data.iloc[:n_train].values
x_test = data.iloc[n_train:].values
y = survived.values

In [31]:
# Row counts of x_train and y must match; x_train and x_test must share the column count
print(f'x_train shape: {x_train.shape}')
print(f'y shape: {y.shape}')
print(f'x_test shape: {x_test.shape}')

x_train shape: (891, 27)
y shape: (891,)
x_test shape: (418, 27)


# Logistic Regression Classifier

In [32]:
# Search grid for logistic regression: 13 values of the inverse regularisation
# strength C, geometrically spaced from 1e-6 to 1e+6, for both penalty types.
lr_params = dict(
    C=np.geomspace(1e-6, 1e+6, num=13),
    penalty=['l1', 'l2'],
)

In [33]:
# 10-fold grid search over `lr_params`, using all CPU cores.
# liblinear is requested explicitly because the grid contains both 'l1' and 'l2'
# penalties: the default solver in scikit-learn >= 0.22 (lbfgs) does not support
# 'l1', so half of the candidate fits would fail with the default.
lr = LogisticRegression(solver='liblinear')
lr_cv = GridSearchCV(estimator=lr, param_grid=lr_params, cv=10, n_jobs=-1)
lr_cv.fit(x_train, y);

In [34]:
# Report the best mean cross-validation score and the parameters that produced it
for caption, value in (
    ('Logistic Regression best score: ', lr_cv.best_score_),
    ('Logistic Regression best parameters: ', lr_cv.best_params_),
):
    print(caption, value)

Logistic Regression best score:  0.8338945005611672
Logistic Regression best parameters:  {'C': 1.0, 'penalty': 'l2'}


# Random Forest Classifier

In [35]:
# Search grid for the random forest.
# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt',
# so it only repeated a third of the grid, and scikit-learn 1.3 removed the value.
rf_params = {'n_estimators' : np.arange(3,x_train.shape[1]+2,3),  # multiples of 3 below n_features + 2
             'max_depth': np.arange(2, 10, 1),
             'min_samples_split' : np.arange(2, 5, 1),
             'min_samples_leaf' : np.arange(1, 5, 1),
             'max_features': ['sqrt', 1/3]  # sqrt(n_features) or a third of the features per split
            }

In [36]:
# 10-fold grid search over `rf_params`, using all CPU cores.
# The forest is seeded so that the selected parameters, the reported score and
# the final predictions are reproducible; an unseeded RandomForestClassifier
# draws different bootstrap samples and feature subsets on every run.
rf = RandomForestClassifier(random_state=42)
rf_cv = GridSearchCV(estimator=rf, param_grid=rf_params, cv=10, n_jobs=-1)
rf_cv.fit(x_train, y);

In [37]:
# Report the best mean cross-validation score and the parameters that produced it
for caption, value in (
    ('Random Forest best score: ', rf_cv.best_score_),
    ('Random Forest best parameters: ', rf_cv.best_params_),
):
    print(caption, value)

Random Forest best score:  0.8496071829405163
Random Forest best parameters:  {'max_depth': 9, 'max_features': 0.3333333333333333, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 12}


# Final Submission

Random Forest showed better performance, hence it will be used for the final submission.

In [38]:
# With the default refit=True, GridSearchCV retrains the best parameter
# combination on all of x_train; that refitted model makes the predictions.
predictions = rf_cv.best_estimator_.predict(x_test)

In [39]:
# Output dataframe: the test PassengerId values next to their predicted Survived labels
submission = pd.DataFrame({'PassengerId': passengerId}).assign(Survived=predictions)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [40]:
# Write the submission next to the input files. The directory is spelled
# 'data' (lower case) exactly as in the read_csv calls: 'Data/' is a different
# path on case-sensitive file systems and would raise if it does not exist.
submission.to_csv('data/titanic_submission.csv', index=False)