In [35]:
import pandas as pd
import numpy as np
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier,GradientBoostingClassifier
from IPython.display import display
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute._base import SimpleImputer as Imputer
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [69]:
titanic_df = pd.read_csv('titanic_train.csv')
titanic_df = titanic_df.drop(['PassengerId', "Name"], axis=1)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [70]:
# Dropping Passenger ID as its not required feature

titanic_df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [71]:
# Split the data into train and test.
X = titanic_df.drop('Survived', axis=1)
y = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [72]:
numeric_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
# name_feature = ['Name']
cabin_feature = ['Cabin']

In [73]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [74]:
cabin_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='U')),
    ('labelEncoder', LabelEncoder()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [75]:
class CabinFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('in the CabinFeatureTransformer init method: ')
        
    def fit(self, x, y=None):
        x.Cabin.fillna('U', inplace=True)
        x['Cabin'] = x['Cabin'].map(lambda c: c[0])
        
        cabin_dummies = pd.get_dummies(x['Cabin'], prefix='Cabin')    
        self.cabin_columns=  cabin_dummies.columns
        return self

    def transform(self, x):
        # replacing missing cabins with U (for Uknown)
        x.Cabin.fillna('U', inplace=True)
    
        # mapping each Cabin value with the cabin letter
        x['Cabin'] = x['Cabin'].map(lambda c: c[0])
        
        cabin_dummies = pd.get_dummies(x['Cabin'], prefix='Cabin') 
        cabin_dummies = cabin_dummies.reindex(columns = self.cabin_columns, fill_value=0)
        
        x = pd.concat([x, cabin_dummies], axis=1)

        x.drop('Cabin', axis=1, inplace=True)
    
        return x

In [76]:
class NameFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print('in the NameFeatureTransformer Init method: ')
        
    def fit(self, x, y=None):
        return self

    def transform(self, x):
        Title_Dictionary = {
                "Capt": "Officer", "Col": "Officer", "Major": "Officer","Jonkheer": "Royalty",
                "Don": "Royalty","Sir" : "Royalty","Dr": "Officer","Rev": "Officer","the Countess":"Royalty",
                "Mme": "Mrs", "Mlle": "Miss", "Ms": "Mrs", "Mr" : "Mr", "Mrs" : "Mrs", "Miss" : "Miss",
                "Master" : "Master", "Lady" : "Royalty"}
        
        x['Title'] = x['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
        x['Title'] = x.Title.map(Title_Dictionary)
        
        x.drop('Name', axis=1, inplace=True)
    
        titles_dummies = pd.get_dummies(x['Title'], prefix='Title')
        titles_dummies = titles_dummies.reindex(columns = titles_dummies.columns, fill_value=0)
        x = pd.concat([x, titles_dummies], axis=1)
    
        x.drop('Title', axis=1, inplace=True)
        return x.values

In [80]:
transformer = ColumnTransformer(
    transformers=[
        ('numeric_data_preprocessing', numeric_transformer, numeric_features),
        ('categorical_data_preprocessing', categorical_transformer, categorical_features),
        ('cabin_data_preprocessing', CabinFeatureTransformer(), cabin_feature)
#         ,
#         ('name_data_preprocessing', NameFeatureTransformer(), name_feature)
    ])

in the CabinFeatureTransformer init method: 


In [81]:
final_pipeline = Pipeline(steps=[('transformer', transformer),
                      ('rf_estimator', RandomForestClassifier())])

In [82]:
final_pipeline.fit(X_train, y_train)

in the CabinFeatureTransformer init method: 


Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                             

In [83]:
y_pred = final_pipeline.predict(X_test)

In [65]:
X_train

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
170,1,"Van der hoef, Mr. Wyckoff",male,61.00,0,0,111240,33.5000,B19,S
626,2,"Kirkland, Rev. Charles Leonard",male,57.00,0,0,219533,12.3500,,Q
661,3,"Badt, Mr. Mohamed",male,40.00,0,0,2623,7.2250,,C
673,2,"Wilhelms, Mr. Charles",male,31.00,0,0,244270,13.0000,,S
219,2,"Harris, Mr. Walter",male,30.00,0,0,W/C 14208,10.5000,,S
...,...,...,...,...,...,...,...,...,...,...
480,3,"Goodwin, Master. Harold Victor",male,9.00,5,2,CA 2144,46.9000,,S
726,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.00,3,0,31027,21.0000,,S
475,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.0000,A14,S
644,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C


In [84]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))

Accuracy Score:  0.7486033519553073
F1 Score:  0.7460610697105482
Precision Score:  0.7447425693866212
Recall Score:  0.7486033519553073


In [85]:
import pickle
with open('Titanic_pipeline.pkl', 'wb') as f:
    pickle.dump(final_pipeline, f)

In [86]:
rf_pickle = pickle.load(open('Titanic_pipeline.pkl','rb'))
y_pred = rf_pickle.predict(X_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))

Accuracy Score:  0.7486033519553073
F1 Score:  0.7460610697105482
Precision Score:  0.7447425693866212
Recall Score:  0.7486033519553073


In [107]:
from sklearn.linear_model import LogisticRegression
param_grid = [
    {
        "rf_estimator__C": [0.1, 1.0, 10.0, 100.0],
        "rf_estimator": [LogisticRegression()]
    },
    {
        "rf_estimator__n_estimators": [10, 100, 1000],
        "rf_estimator": [RandomForestClassifier()]
    }
]

In [108]:

from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
grid_search = GridSearchCV(final_pipeline, param_grid, cv=10, verbose=1,n_jobs=-1)
grid_search.fit(X_train, y_train)

in the CabinFeatureTransformer init method: 
Fitting 10 folds for each of 7 candidates, totalling 70 fits
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init method: 
in the CabinFeatureTransformer init met

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'Fare']),
                                                                        ('categorical_data_preprocessing',
                                                                         Pipeline(steps=[('imputer',
            

In [101]:
final_pipeline

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                             

In [109]:
print(f"Best params:")
print(grid_search.best_params_)
print("Best score in grid search:")
print(grid_search.best_score_)
print("best logistic regression from grid search:")
print(grid_search.score(X_test, y_test))

Best params:
{'rf_estimator': LogisticRegression(), 'rf_estimator__C': 1.0}
Best score in grid search:
0.8187793427230048
best logistic regression from grid search:
0.770949720670391
