# Kaggle Submission Project 2: ML Classification

The notebook explains how to create an ML model to predict death and survival for the Titanic passengers. Passengers' data are available here: https://www.kaggle.com/c/titanic. The notebook was developed as a study project for the Spiced Academy Data Science Bootcamp.

## Imports

In [188]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve

## Titanic dataset

In [189]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv("train.csv", index_col=0)

In [190]:
# Display the loaded training data as the cell output
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Feature Engineering

In [191]:
# Check which columns are available for feature engineering
df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [192]:
# Drop the 'Survived' column: it is the variable the model should predict,
# so it cannot be used for feature engineering.
# `columns=` already identifies what to drop, so no separate `axis` is passed.
df_feature_eng_titanic = df.drop(columns=['Survived'])


In [193]:
# List the remaining feature columns after dropping the target
df_feature_eng_titanic.columns


Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

**Note:** In the logistic regression model I will include the data related to travel class, gender and age, which the exploratory data analysis suggested as key factors for survival, together with fare, cabin and port of embarkation.

In [194]:
# Training features and target. Xtrain is another name for
# df_feature_eng_titanic (no copy is made).
Xtrain, ytrain = df_feature_eng_titanic, df['Survived']

In [195]:
# A bare expression in the middle of a cell is not displayed, so the shapes
# are printed explicitly to make them visible.
print(Xtrain.shape, ytrain.shape)

print(Xtrain)
print(ytrain)

             Pclass                                               Name  \
PassengerId                                                              
1                 3                            Braund, Mr. Owen Harris   
2                 1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
3                 3                             Heikkinen, Miss. Laina   
4                 1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
5                 3                           Allen, Mr. William Henry   
...             ...                                                ...   
887               2                              Montvila, Rev. Juozas   
888               1                       Graham, Miss. Margaret Edith   
889               3           Johnston, Miss. Catherine Helen "Carrie"   
890               1                              Behr, Mr. Karl Howell   
891               3                                Dooley, Mr. Patrick   

                Sex   Age  SibSp  Par

In [196]:
# Count the missing values in each training column of interest
for column in ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'SibSp']:
    print(f"Null values in column '{column}': ", df_feature_eng_titanic[column].isna().sum())

Null values in column 'Pclass':  0
Null values in column 'Sex':  0
Null values in column 'Age':  177
Null values in column 'Fare':  0
Null values in column 'Cabin':  687
Null values in column 'Embarked':  2
Null values in column 'SibSp':  0


In [197]:
# Age: fill missing values with the column mean, then one-hot encode
# five uniform-width bins.
age_imputer = SimpleImputer(strategy='mean')
age_binner = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='uniform')
pipeline_age = make_pipeline(age_imputer, age_binner)


In [198]:
def name_cabin(df):
    """Reduce each cabin code to its first character.

    Missing cabin values are replaced by the placeholder 'X' before the
    first character is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'Cabin' column.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 1) holding one character per row.
    """
    first_letters = df['Cabin'].fillna('X').str[0]
    # Reshape to a single column so the result is 2-D
    return first_letters.values.reshape(-1, 1)


# Cabin: reduce to the first letter via name_cabin, then one-hot encode it.
cabin_letter = FunctionTransformer(name_cabin)
cabin_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
pipeline_cabin = make_pipeline(cabin_letter, cabin_encoder)


In [199]:
# Embarked: fill missing values with the placeholder 'X', then one-hot encode.
embarked_imputer = SimpleImputer(strategy='constant', fill_value='X')
embarked_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
pipeline_embarked = make_pipeline(embarked_imputer, embarked_encoder)

In [200]:
# Fare: fill missing values with the median, then rescale with MinMaxScaler.
fare_imputer = SimpleImputer(strategy='median')
fare_scaler = MinMaxScaler()
pipeline_fare = make_pipeline(fare_imputer, fare_scaler)

In [201]:
# ColumnTransformer automatically drops the columns that are not explicitly
# listed in a step (unless other parameters are given).
column_steps = [
    ('one_hot_enc', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Pclass', 'Sex']),
    ('impute_and_bin', pipeline_age, ['Age']),
    ('scale', pipeline_fare, ['Fare']),
    ('cabin', pipeline_cabin, ['Cabin']),
    ('embarked', pipeline_embarked, ['Embarked']),
]
trans = ColumnTransformer(column_steps)

In [202]:
# Fit the column transformer on the training features, then apply it to them.
# fit() returns the fitted transformer, so the two calls can be chained.
Xtrain_transform = trans.fit(Xtrain, ytrain).transform(Xtrain)  # result is a single numpy array
Xtrain_transform.shape



(891, 20)

## Logistic regression

In [203]:
# Train a logistic regression with default settings on the transformed features
model = LogisticRegression()
model.fit(X=Xtrain_transform, y=ytrain)

LogisticRegression()

In [204]:
# Load the test data with the first CSV column as row index, then preview it
test = pd.read_csv('test.csv', index_col=0)
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [205]:
# List the columns of the test set
test.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [206]:
# Count the missing values in each test column of interest
for column in ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'SibSp']:
    print(f"Null values in column '{column}': ", test[column].isna().sum())

Null values in column 'Pclass':  0
Null values in column 'Sex':  0
Null values in column 'Age':  86
Null values in column 'Fare':  1
Null values in column 'Cabin':  327
Null values in column 'Embarked':  0
Null values in column 'SibSp':  0


In [207]:
# Apply the already-fitted transformer to the test set (transform only, no fit)
Xtest_transform = trans.transform(X=test)
print(Xtest_transform)

[[0. 0. 1. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 0. 1.]
 ...
 [0. 0. 1. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]]


In [208]:
# Shape of the transformed test features
Xtest_transform.shape

(418, 20)

In [209]:
# Predict the target for every row of the transformed test set
ypred = model.predict(X=Xtest_transform)


In [210]:
# Build the submission table: one 'Survived' prediction per entry of the
# test-set index, then write it to disk as CSV.
submission = pd.DataFrame({'Survived': ypred}, index=test.index)
submission.to_csv('submission.csv')