In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from sklearn.impute import SimpleImputer

from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

## Import Data

In [None]:
# Read the training and test CSV files from the working directory
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## Data Exploration

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.describe(include=['O'])

In [None]:
train_df.dtypes[train_df.dtypes != 'object']

In [None]:
train_df.groupby(['Pclass'], as_index = False)['Survived'].mean()

In [None]:
train_df.groupby(['SibSp'], as_index = False)['Survived'].mean()

In [None]:
train_df.groupby(['Parch'], as_index = False)['Survived'].mean()

In [None]:
train_df.dtypes[train_df.dtypes == 'object']

In [None]:
train_df.groupby(['Sex'], as_index = False)['Survived'].mean()

In [None]:
train_df.groupby(['Embarked'], as_index = False)['Survived'].mean()

In [None]:
train_df['Cabin'].unique()

In [None]:
train_df['Deck'] = train_df['Cabin'].str[0]
test_df['Deck'] = test_df['Cabin'].str[0]

In [None]:
train_df.groupby(['Deck', 'Pclass']).size()

In [None]:
train_df.groupby(['Pclass','Deck'], as_index = False)['Survived'].mean()

## Feature Extraction

In [None]:
# Fill missing Deck values from the passenger class:
# class 1 -> 'Semi-Private', classes 2 and 3 -> 'No'.
# Each frame builds the mask from its own Deck column; mixing train_df's mask
# into the test_df assignment leaves first-class test rows unfilled.
for frame in (train_df, test_df):
    deck_missing = frame['Deck'].isnull()
    frame.loc[deck_missing & (frame['Pclass'] == 1), 'Deck'] = 'Semi-Private'
    frame.loc[deck_missing & frame['Pclass'].isin([2, 3]), 'Deck'] = 'No'

In [None]:
train_df['Family_Size'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['Family_Size'] = test_df['SibSp'] + test_df['Parch'] + 1

In [None]:
train_df.groupby(['Family_Size'], as_index = False)['Survived'].mean()

In [None]:
# Ryan Nolan (name kept from the original comment; presumably the source of these buckets)
# Family_Size value -> bucket label, covering sizes 1 through 11
size_buckets = {
    'Alone': range(1, 2),
    'Small': range(2, 5),
    'Medium': range(5, 8),
    'Large': range(8, 12),
}
family_map = {size: label for label, sizes in size_buckets.items() for size in sizes}
# Translate each Family_Size into its bucket label; sizes absent from family_map become NaN
for frame in (train_df, test_df):
    frame['Family_Size_Grouped'] = frame['Family_Size'].map(family_map)

In [None]:
# Andrei paulavets (name kept from the original comment; presumably the source of this approach)
# Title = first run of letters in Name that follows a space and ends with a period
title_pattern = r' ([A-Za-z]+)\.'
for frame in (train_df, test_df):
    frame['Title'] = frame['Name'].str.extract(title_pattern, expand=False)

In [None]:
title_mask_train_df = ~train_df['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master'])
title_mask_test_df = ~test_df['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master'])

train_df.loc[title_mask_train_df, 'Title'] = train_df.loc[title_mask_train_df, 'Sex'].map({'male': 'Mr', 'female': 'Mrs'})
test_df.loc[title_mask_test_df, 'Title'] = test_df.loc[title_mask_test_df, 'Sex'].map({'male': 'Mr', 'female': 'Mrs'})

In [None]:
train_df.groupby(['Title'])['Age'].mean()

In [None]:
# Age is fractional if less than 1. If the age is estimated, it is in the form xx.5.
# NOTE(review): these fill values presumably come from the per-title mean ages,
# moved to an xx.5 value to mark them as estimates — confirm.
title_age_mean = dict(Master=4.5, Miss=22.5, Mr=33.5, Mrs=36.5)

In [None]:
# Fill each missing Age with the fixed value assigned to the passenger's Title
for title, fill_age in title_age_mean.items():
    for frame in (train_df, test_df):
        needs_age = frame['Age'].isnull() & (frame['Title'] == title)
        frame.loc[needs_age, 'Age'] = fill_age

In [None]:
train_df.groupby(['Family_Size_Grouped'], as_index = False)['Survived'].mean()

In [None]:
train_df['Age*Fare'] = train_df['Age'] * train_df['Fare']
test_df['Age*Fare'] = test_df['Age'] * test_df['Fare']

In [None]:
train_df['Pclass*Fare'] = train_df['Pclass'] * train_df['Fare']
test_df['Pclass*Fare'] = test_df['Pclass'] * test_df['Fare']

## Feature Scaling & Feature Encoding

In [None]:
train_df.info()

In [None]:
train_df = train_df.drop(columns = ['PassengerId', 'Name', 'Cabin', 'Ticket'])
test_df = test_df.drop(columns = ['Name', 'Cabin', 'Ticket'])

In [None]:
train_df.dtypes[train_df.dtypes == 'object']

In [None]:
train_df.dtypes[train_df.dtypes != 'object']

In [None]:
# Column routed to the ordinal-encoding pipeline
ode_cols = ['Family_Size_Grouped']

In [None]:
# Columns routed to the one-hot-encoding pipeline
ohe_cols = ['Sex','Embarked','Deck','Title']

In [None]:
# Columns routed to the numeric (impute + scale) pipeline, including the engineered features
num_cols = ['Pclass','Age','SibSp','Parch','Fare','Family_Size','Age*Fare','Pclass*Fare']

In [None]:
# Numeric branch: mean-impute missing values, then standardise
num_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
# Ordinal branch: fill missing buckets with the most frequent one, then encode
# the family-size labels in their size order (Alone < Small < Medium < Large).
# Without an explicit `categories` list OrdinalEncoder sorts the labels
# alphabetically (Alone, Large, Medium, Small), so the integer codes would not
# follow family size. Labels outside the list are encoded as -1.
family_size_order = ['Alone', 'Small', 'Medium', 'Large']

ode_pipeline = Pipeline(steps =[
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('ode', OrdinalEncoder(
        categories = [family_size_order],
        handle_unknown = 'use_encoded_value',
        unknown_value = -1,
    ))
])

In [None]:
# Nominal branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array; categories unseen during fit are ignored
ohe_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
ohe_pipeline = Pipeline(steps=ohe_steps)

In [None]:
col_trans = ColumnTransformer(transformers = [
    ('num_p', num_pipeline, num_cols),
    ('ode_p', ode_pipeline, ode_cols),
    ('ohe_p', ohe_pipeline, ohe_cols),
])

In [None]:
pipeline = Pipeline(steps = [
    ('preprocessing', col_trans)
])

In [None]:
X = train_df.drop(columns = ['Survived'])
y = train_df['Survived']

In [None]:
X_preprocessed = pipeline.fit_transform(X)

## Train and Test split

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the preprocessed matrix, seeded for repeatability
split_parts = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Train and Evaluate Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Seeded, shuffled stratified splitters with 5 and 3 folds.
# NOTE(review): the grid searches visible in this notebook pass cv = 5 rather
# than these objects — confirm whether that is intended.
strat_5fold, strat_3fold = (
    StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for n_folds in (5, 3)
)

In [None]:
LGR = LogisticRegression()

In [None]:
LGR.fit(X_train, y_train)

In [None]:
y_pred_LGR = LGR.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_LGR)

In [None]:
SVC = SVC()

In [None]:
param_grid_SVC = {
    'C' : [0.1 ,1, 10],
    'kernel' : ['linear', 'poly', 'rbf'],
    'degree' : [2, 3, 4, 5],
    'gamma' : [0.001, 0.01, 0.1, 1]
}

In [None]:
SVC_cv = GridSearchCV(SVC, param_grid_SVC, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
SVC_cv.fit(X_train, y_train)

In [None]:
y_pred_SVC = SVC_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_SVC)

In [None]:
KNN = KNeighborsClassifier()

In [None]:
param_grid_KNN = {
    'n_neighbors' : [3, 5, 7, 9],
    'weights' : ['uniform', 'distance'],
    'metric' : ['cityblock', 'cosine', 'euclidean', 'manhattan']
}

In [None]:
KNN_cv = GridSearchCV(KNN, param_grid_KNN, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
KNN_cv.fit(X_train, y_train)

In [None]:
y_pred_KNN = KNN_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_KNN)

In [None]:
DTC = DecisionTreeClassifier()

In [None]:
param_grid_DTC = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [3,5,7],
    'min_samples_split' : [2, 3, 5, 10],
}

In [None]:
DTC_cv = GridSearchCV(DTC, param_grid_DTC, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
DTC_cv.fit(X_train, y_train)

In [None]:
y_pred_DTC = DTC_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_DTC)

In [None]:
GNB = GaussianNB()

In [None]:
GNB.fit(X_train, y_train)

In [None]:
y_pred_GNB = GNB.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_GNB)

In [None]:
RFC = RandomForestClassifier()

In [None]:
param_grid_RFC = {
    'max_depth' : [3, 5, 10, 15],
    'n_estimators' : [100, 250, 500],
    'min_samples_split' : [2, 3, 5 ,10]
}

In [None]:
RFC_cv = GridSearchCV(RFC, param_grid_RFC, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
RFC_cv.fit(X_train, y_train)

In [None]:
y_pred_RFC = RFC_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_RFC)

In [None]:
XGB = XGBClassifier()

In [None]:
param_grid_XGB = {
    'learning_rate': [0.01 ,0.05, 0.1, 0.2],
    'n_estimators': [300],
    'max_depth': [3, 5 ,7],
    'min_child_weight': [1,2,3],
    'gamma': [0.01, 0.1, 0.2],
    'subsample' : [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8,0.9,1.0]
}

In [None]:
XGB_cv = GridSearchCV(XGB, param_grid_XGB, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
XGB_cv.fit(X_train, y_train)

In [None]:
y_pred_XGB = XGB_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_XGB)

In [None]:
GBC = GradientBoostingClassifier()

In [None]:
param_grid_GBC = {
    'learning_rate': [0.01],
    'max_depth' : [13,14,15],
    'n_estimators' : [135,150,175],
    'min_samples_leaf' : [23, 25 ,27],
    'max_features': [0.5]
}

In [None]:
GBC_cv = GridSearchCV(GBC, param_grid_GBC, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
GBC_cv.fit(X_train, y_train)

In [None]:
y_pred_GBC = GBC_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_GBC)

In [None]:
ABC = AdaBoostClassifier()

In [None]:
param_grid_ABC = {
    'learning_rate': [0.001,0.01, 0.1, 1],
    'n_estimators' : [300, 500, 700],
}

In [None]:
ABC_cv = GridSearchCV(ABC, param_grid_ABC, cv = 5, scoring = 'accuracy', n_jobs = -1)

In [None]:
ABC_cv.fit(X_train, y_train)

In [None]:
y_pred_ABC = ABC_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_ABC)

In [None]:
# Hold-out accuracy of every model fitted above in one table, best first.
# Includes AdaBoost (ABC) alongside the other seven models.
holdout_predictions = {
    'LGR': y_pred_LGR,
    'SVC': y_pred_SVC,
    'KNN': y_pred_KNN,
    'DTC': y_pred_DTC,
    'GNB': y_pred_GNB,
    'RFC': y_pred_RFC,
    'XGB': y_pred_XGB,
    'GBC': y_pred_GBC,
    'ABC': y_pred_ABC,
}
pd.Series(
    {model_name: accuracy_score(y_test, y_pred) for model_name, y_pred in holdout_predictions.items()},
    name = 'holdout_accuracy',
).sort_values(ascending = False)

In [None]:
# Run the raw test-set frame through the fitted preprocessing.
# `X_test` is the already-transformed hold-out matrix from the split, so it has
# no named columns for the column transformer to select and cannot be passed here.
df_test_preprocessed = pipeline.transform(test_df)