# Kaggle Submission Project 2: ML Classification

## Imports

In [79]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve

## Titanic dataset

In [80]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv("train.csv", index_col=0)

## Feature Engineering

In [81]:
# 'Survived' is the value the model has to predict, so it is removed
# from the frame that feeds the feature-engineering steps.
df_feature_eng_titanic = df.drop(columns='Survived')


In [82]:
# Feature frame (without the target) and the target column itself.
Xtrain, ytrain = df_feature_eng_titanic, df['Survived']

In [83]:
# Display both shapes side by side to confirm the row counts match.
Xtrain.shape, ytrain.shape

((891, 10), (891,))

In [84]:
# Checking missing data
# One line per inspected column, reporting how many of its values are null.
columns_to_check = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'SibSp']
for column in columns_to_check:
    missing = df_feature_eng_titanic[column].isna().sum()
    print(f"Null values in column '{column}': ", missing)

Null values in column 'Pclass':  0
Null values in column 'Sex':  0
Null values in column 'Age':  177
Null values in column 'Fare':  0
Null values in column 'Cabin':  687
Null values in column 'Embarked':  2
Null values in column 'SibSp':  0


In [85]:
# Age: fill missing values with the mean, then one-hot encode
# five equal-width ('uniform') bins.
age_steps = [
    SimpleImputer(strategy='mean'),
    KBinsDiscretizer(n_bins=5, encode='onehot', strategy='uniform'),
]
pipeline_age = make_pipeline(*age_steps)


In [86]:
def name_cabin(df):
    """Return the first character of every 'Cabin' entry as a column vector.

    Missing cabins are replaced by the placeholder 'X' before the first
    character is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Cabin' column of strings; entries may be missing.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 1) with the first character of each entry.
    """
    first_letter = df['Cabin'].fillna('X').str[0]
    # to_numpy() always yields a real ndarray. `.values` can hand back a pandas
    # extension array (e.g. for string dtypes), which need not support reshape.
    return first_letter.to_numpy().reshape(-1, 1)


# Cabin: reduce each entry to its first character (see name_cabin), then
# one-hot encode it; categories unseen during fit are ignored at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
cabin_letter_extractor = FunctionTransformer(name_cabin)
cabin_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
pipeline_cabin = make_pipeline(cabin_letter_extractor, cabin_encoder)


In [87]:
# Embarked: missing values become the placeholder category 'X',
# then the column is one-hot encoded.
embarked_steps = (
    SimpleImputer(strategy='constant', fill_value='X'),
    OneHotEncoder(sparse=False, handle_unknown='ignore'),
)
pipeline_embarked = make_pipeline(*embarked_steps)

In [88]:
# Fare: fill missing values with the median, then rescale with MinMaxScaler.
fare_imputer = SimpleImputer(strategy='median')
fare_scaler = MinMaxScaler()
pipeline_fare = make_pipeline(fare_imputer, fare_scaler)

In [89]:
# ColumnTransformer drops every column that is not listed below
# (unless other parameters are given).
column_steps = [
    ('one_hot_enc', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Pclass', 'Sex']),
    ('impute_and_bin', pipeline_age, ['Age']),
    ('scale', pipeline_fare, ['Fare']),
    ('cabin', pipeline_cabin, ['Cabin']),
    ('embarked', pipeline_embarked, ['Embarked']),
]
trans = ColumnTransformer(column_steps)

In [90]:
# Fit the column transformer on the training features, then encode those
# same features; the result is a single numpy array.
Xtrain_transform = trans.fit(Xtrain, ytrain).transform(Xtrain)
Xtrain_transform.shape



(891, 24)

## Logistic regression

In [91]:
# Logistic regression with default settings, fitted on the encoded training data.
model_lr = LogisticRegression()
model_lr.fit(X=Xtrain_transform, y=ytrain)

LogisticRegression()

In [92]:
# Load the test data; the first CSV column is used as the row index.
test = pd.read_csv('test.csv', index_col=0)

In [93]:
# Same missing-value report as for the training data, now on the test frame.
columns_to_check = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'Embarked', 'SibSp']
for column in columns_to_check:
    missing = test[column].isna().sum()
    print(f"Null values in column '{column}': ", missing)

Null values in column 'Pclass':  0
Null values in column 'Sex':  0
Null values in column 'Age':  86
Null values in column 'Fare':  1
Null values in column 'Cabin':  327
Null values in column 'Embarked':  0
Null values in column 'SibSp':  0


In [94]:
# Encode the test frame with the already-fitted transformer (transform only, no re-fit).
Xtest_transform = trans.transform(X=test)

In [95]:
# Display the shape of the encoded test data.
Xtest_transform.shape

(418, 24)

In [96]:
# Predicted labels from the logistic regression for the encoded test data.
ypred_lr = model_lr.predict(X=Xtest_transform)


In [97]:
# Single-column frame of predictions, indexed like the test frame,
# written out as the logistic-regression submission file.
submission_lr = pd.DataFrame({'Survived': ypred_lr}, index=test.index)
submission_lr.to_csv('submission_lr.csv')

## Decision Tree

In [98]:
# max_depth=5 limits how many successive questions (splits) the tree may ask.
# random_state is pinned so that re-running the notebook rebuilds the same
# tree: scikit-learn permutes the candidate features at each split, so an
# unseeded tree can differ between runs. The value matches the random forest
# configured further down.
model_dt = DecisionTreeClassifier(max_depth=5, random_state=0)
model_dt.fit(Xtrain_transform, ytrain)

DecisionTreeClassifier(max_depth=5)

In [99]:
# Predicted labels from the decision tree for the encoded test data.
ypred_dt = model_dt.predict(X=Xtest_transform)

In [100]:
# Single-column frame of predictions, indexed like the test frame,
# written out as the decision-tree submission file.
submission_dt = pd.DataFrame({'Survived': ypred_dt}, index=test.index)
submission_dt.to_csv('submission_dt.csv')

## Random forest

In [101]:
# Random forest with depth-limited trees and a fixed seed.
model_rf = RandomForestClassifier(
    max_depth=6,
    random_state=0,
)

In [102]:
# Fit the forest on the encoded training data.
model_rf.fit(X=Xtrain_transform, y=ytrain)

RandomForestClassifier(max_depth=6, random_state=0)

In [103]:
# Predicted labels from the random forest for the encoded test data.
ypred_rf = model_rf.predict(X=Xtest_transform)

In [104]:
# Single-column frame of predictions, indexed like the test frame,
# written out as the random-forest submission file.
submission_rf = pd.DataFrame({'Survived': ypred_rf}, index=test.index)
submission_rf.to_csv('submission_rf.csv')