# Titanic: Machine Learning from Disaster

## Import packages and files

In [None]:
# --- Import packages --- #
import pandas as pd 
import numpy as np #arrays and specific computations
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold #ML
from sklearn.linear_model import LinearRegression, LogisticRegression # ML models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier # ML models
from sklearn import metrics # models parameters
from sklearn.feature_selection import RFE, RFECV # select variables in Regression

import warnings # avoid useless warning messages

warnings.filterwarnings('ignore')

In [None]:
# --- Import datasets  --- #
# Train and test sets use the passenger id as index; the sample submission keeps it as a column.
train_csv = pd.read_csv(
    '/Users/eliotmoll/Documents/Data_Aticles_Pro/titanic/train.csv',
    index_col='PassengerId',
)
test_csv = pd.read_csv(
    '/Users/eliotmoll/Documents/Data_Aticles_Pro/titanic/test.csv',
    index_col='PassengerId',
)
# Sample submission file (baseline in which only females survive).
submission_csv = pd.read_csv(
    '/Users/eliotmoll/Documents/Data_Aticles_Pro/titanic/gender_submission.csv'
)


## Data visualization and exploration are skipped in this notebook

## Functions for feature engineering

#### Impute NaN with median/mode values, create dummies and derive new information from variables

In [None]:
def avgUnderCondition(df, col_fill, col_condition, missing_info = False):
    """
    Fill missing values of one column with a per-group statistic.

    Numeric columns are filled with the group median, every other dtype with
    the most frequent value of the group. Groups are defined by the values of
    the ``col_condition`` column(s).

    Important: col_condition should not contain empty values. Rows whose
    grouping columns are missing, or whose group has no observed value for
    ``col_fill``, are left as NaN.

    input:
        df (df): data df (not modified)
        col_fill (str): column with missing values to fill
        col_condition (list, str): column name, or list of column names, used to build the groups
        missing_info (bool): if True, add a '<col_fill>_was_miss' column holding 1 where
            the original value was missing and 0 elsewhere

    output:
        df (df): new dataframe with missing values imputed. It is the result of a
            merge, so it carries a default integer index, and ``col_fill`` is
            moved to the end of the columns.

    """

    def most_frequent(values):
        # value_counts() ignores NaN, so an all-NaN group yields an empty result.
        counts = values.value_counts()
        return counts.index[0] if len(counts) else np.nan

    # Accept a single column name as well as a list of names.
    if isinstance(col_condition, str):
        group_cols = [col_condition]
    else:
        group_cols = list(col_condition)

    fill_dtype = df[col_fill].dtype
    is_numeric = (pd.api.types.is_numeric_dtype(fill_dtype)
                  and not pd.api.types.is_bool_dtype(fill_dtype))

    # Aggregate only the column being filled: aggregating every column would
    # fail as soon as an unrelated column is entirely missing inside a group.
    grouped = df.groupby(group_cols)[[col_fill]]
    if is_numeric:
        print('Treated as a numeric : median')
        df_for_fill = grouped.median()
    else:
        print('Treated as a string/object : mode')
        df_for_fill = grouped.agg(most_frequent)
    df_for_fill = df_for_fill.reset_index()

    # Bring the group statistic next to the original values
    # ('<col>_x' = original, '<col>_y' = group statistic).
    original_col = col_fill + '_x'
    group_col = col_fill + '_y'
    df_t = pd.merge(df, df_for_fill, how='left', on=group_cols, suffixes=('_x', '_y'))

    was_missing = pd.isnull(df_t[original_col])
    df_t[col_fill] = np.where(was_missing, df_t[group_col], df_t[original_col])

    # Add column to specify which values were missing
    if missing_info:
        df_t[col_fill + '_was_miss'] = np.where(was_missing, 1, 0)

    # Remove temporary columns
    return df_t.drop(columns=[original_col, group_col])


def addInfos(df):
    '''
    Feature engineering for the Titanic data, based on data exploration.

    Adds the columns 'title', 'GroupTitle', 'FamillySize', 'FamillyBinS/M/L',
    'CabinType', 'AgeYoung' and 'isMale', then appends one-hot dummies for
    'Embarked' (prefix 'Emb'), 'Pclass' ('Pc'), 'GroupTitle' ('Tit') and
    'CabinType' ('Cab').

    input:
        df (df): titanic df, must have 'Name', 'SibSp', 'Parch', 'Cabin', 'Age', 'Sex', 'Embarked' and 'Pclass'

    output:
        df (df): copy of the input with the engineered columns appended (the input is not modified)
    '''
    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()

    # Get titles from names: drop everything up to the last ", ", then everything from the first ". ".
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string by default.
    df['title'] = df['Name'].str.replace(r'^.+, ', '', case=False, regex=True)
    df['title'] = df['title'].str.replace(r'\. .+', '', case=False, regex=True)

    # Group titles; any title not listed below stays in 'Default'.
    title_groups = {
        'Noble': ['Sir', 'Lady', 'the Countess', 'Don', 'Jonkheer'],
        'Social': ['Col', 'Major', 'Capt', 'Rev'],
        'YoungM': ['Master'],
        'YoungF': ['Miss', 'Mlle'],
    }
    df['GroupTitle'] = 'Default'
    for group_name, titles in title_groups.items():
        # .loc assignment instead of chained indexing, which may write to a temporary copy
        df.loc[df['title'].isin(titles), 'GroupTitle'] = group_name

    # Get family size (passenger included) and flag small / medium / large families
    df['FamillySize'] = df['SibSp'] + df['Parch'] + 1
    df['FamillyBinS'] = (df['FamillySize'] == 1).astype(int)
    df['FamillyBinM'] = df['FamillySize'].between(2, 4).astype(int)
    df['FamillyBinL'] = (df['FamillySize'] >= 5).astype(int)

    # Get the cabin type (first letter of the string), 'No' when the cabin is unknown
    df['CabinType'] = df['Cabin'].map(lambda s: str(s)[0] if not pd.isnull(s) else 'No')

    # Merge cabin types based on survival probability (reduce number of variables).
    # The 'T' cabin is treated as an 'A' cabin (most similar), hence it joins 'ABC'.
    deck_groups = {
        'A': 'ABC', 'B': 'ABC', 'C': 'ABC', 'T': 'ABC',
        'D': 'DE', 'E': 'DE',
        'F': 'FG', 'G': 'FG',
    }
    df['CabinType'] = df['CabinType'].replace(deck_groups)

    # Create new information about Age and Sex of passengers.
    # A missing Age compares as False, so it yields AgeYoung = 0.
    df['AgeYoung'] = (df['Age'] < 15).astype(int)  # not useful with bins of age
    df['isMale'] = (df['Sex'] == 'male').astype(int)

    # One-hot encode the categorical variables
    for column, prefix in (('Embarked', 'Emb'), ('Pclass', 'Pc'), ('GroupTitle', 'Tit'), ('CabinType', 'Cab')):
        df = pd.concat([df, pd.get_dummies(df[column], prefix=prefix)], axis=1)

    return df

In [None]:
# --- Data Cleaning, Engineering and Completion --- #
# Tag every row with its origin so the combined frame can be split back later.
train = train_csv.copy()
test = test_csv.copy()
train['data'] = 'train'
test['data'] = 'test'

# Stack both datasets so the same transformations are applied to train and test.
df_all = pd.concat([train, test], sort=True)
df_all = df_all.reset_index(drop=True)

# Feature engineering (titles, family size, cabin type, dummies).
df_all = addInfos(df_all)

# Group-wise imputation: (column to fill, grouping columns, add a "was missing" flag).
# NOTE(review): addInfos has already derived 'AgeYoung' and the 'Emb_*' dummies from the
# un-imputed 'Age' / 'Embarked', and both raw columns are dropped below - confirm this is intended.
imputation_plan = (
    ('Age', ['Pclass', 'Sex', 'GroupTitle'], True),
    ('Embarked', ['Pclass', 'Sex'], False),
    ('Fare', ['Pclass'], False),
)
for target_col, group_cols, flag_missing in imputation_plan:
    df_all = avgUnderCondition(df_all, target_col, group_cols, flag_missing)

# Cut Fare into 9 quantile bins, then merge neighbouring bins into coarser levels.
# NOTE(review): the top bin is coded 4 while the others are 0, 1, 2 - confirm 3 was not intended.
fare_bins = pd.qcut(df_all['Fare'], 9, labels=False)
for old_codes, new_code in (([0, 1, 2], 0), ([3, 4], 1), ([5, 6, 7], 2), (8, 4)):
    fare_bins = fare_bins.replace(old_codes, new_code)
df_all['FareBin'] = fare_bins

# Age is cut into 7 quantile bins.
df_all['AgeBin'] = pd.qcut(df_all['Age'], 7, labels=False)

# Number of passengers sharing the same ticket (families, groups, etc.) can impact survival rate.
df_all['Ticket_Frequency'] = df_all.groupby('Ticket')['Ticket'].transform('count')

# Drop the raw columns that have been replaced by engineered features.
obsolete_cols = [
    'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin',
    'title', 'GroupTitle', 'FamillySize', 'CabinType', 'Embarked', 'Fare', 'Age',
]
df_all = df_all.drop(columns=obsolete_cols)


In [None]:
# --- Re-create working dataframes --- #
# Split the combined frame back using the origin tag, which is then dropped.
origin = df_all['data']
train = df_all.loc[origin == 'train'].drop(columns=['data'])
# Test rows have no known outcome, so 'Survived' is dropped as well.
test = df_all.loc[origin == 'test'].drop(columns=['Survived', 'data'])

In [None]:
# --- Predicted-Explanatory variables correlation  --- #
# Absolute pairwise correlations, flattened to one row per (feature, feature) pair.
corr_pairs = train.corr().abs().unstack()
df_t_corr = corr_pairs.sort_values(kind="quicksort", ascending=False).reset_index()
df_t_corr = df_t_corr.rename(
    columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}
)
# Keep only the pairs involving the target (displayed as the cell output).
df_t_corr.loc[df_t_corr['Feature 1'] == 'Survived']

## Train models

#### Create dataframes from train data

In [None]:
# --- train and validation sets --- #
# The target is kept as a one-column dataframe: prediction columns are added to it later on.
target_cols = ['Survived']
y = train[target_cols]
X = train.drop(columns=target_cols)

# Hold out 10% of the rows for validation.
# NOTE(review): no random_state is given, so the split changes between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)
#

In [None]:
# --- Train models --- #
# Model hyper-parameters were optimized following the grid-search method shown in the appendix.
# cross_val_score only reports the 5-fold score of each configuration on the training split:
# it works on clones, so gb_model / rf_model are still unfitted after this cell.
gb_model = GradientBoostingClassifier(n_estimators=800, learning_rate=0.01, max_depth=5, min_samples_leaf=5)
print("Gradient boosting CV scores:", cross_val_score(gb_model, X_train, y_train, cv=5))

# max_features='sqrt' is what the former 'auto' value meant for classifiers;
# 'auto' is rejected by recent scikit-learn releases.
rf_model = RandomForestClassifier(criterion='gini', n_estimators=2000, max_depth=6, min_samples_split=6, min_samples_leaf=5, max_features='sqrt')
print("Random forest CV scores:", cross_val_score(rf_model, X_train, y_train, cv=5))

# Recursive feature elimination with cross-validation picks how many features
# the logistic regression should keep.
rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=5, scoring='accuracy')
rfecv.fit(X_train, y_train)
print("Optimal number of features: " + str(rfecv.n_features_))

# Logistic regression restricted to that number of features.
# n_features_to_select must be passed by keyword in scikit-learn >= 1.0.
rgs_model = LogisticRegression()
rg_model = RFE(rgs_model, n_features_to_select=rfecv.n_features_)
#

In [None]:
# --- Predict on validation set to estimate quality of predictions --- #
# Fit each model on the training split (fit returns the estimator itself),
# then predict the held-out validation rows.
model_gb = list(gb_model.fit(X_train, y_train).predict(X_val))
model_rf = list(rf_model.fit(X_train, y_train).predict(X_val))
model_rg = list(rg_model.fit(X_train, y_train).predict(X_val))

# Store the predictions next to the true labels for side-by-side comparison.
for pred_col, pred_values in (('pred1', model_gb), ('pred2', model_rf), ('pred3', model_rg)):
    y_val[pred_col] = pred_values
#

In [None]:
# --- Look on predictions --- #
def _pct_equal(left, right):
    """Return the percentage (rounded to 0.1) of rows where both series hold the same value.

    A row-wise comparison is used instead of the trace of a crosstab: the trace
    only counts agreements when both series contain exactly the same set of
    labels, which fails as soon as one model predicts a single class.
    """
    return round((left == right).mean() * 100, 1)


# Accuracy and confusion matrix of each model against the true labels.
for pred_col in ('pred1', 'pred2', 'pred3'):
    x = pd.crosstab(y_val.Survived, y_val[pred_col])
    print(_pct_equal(y_val.Survived, y_val[pred_col]), '%')
    print(x)

# Pairwise agreement between models. Nothing is stored in `y`, so the target
# dataframe defined earlier is not overwritten.
for first_col, second_col in (('pred1', 'pred3'), ('pred2', 'pred3'), ('pred2', 'pred1')):
    print(_pct_equal(y_val[first_col], y_val[second_col]), '%')

# Since the models do not always agree on predictions, an ensemble can help to improve them:
# rounding the mean of three 0/1 predictions gives the majority vote.
y_val['mode'] = round(y_val[['pred1', 'pred2', 'pred3']].sum(axis=1)/3,0)

x = pd.crosstab(y_val.Survived , y_val['mode'])
print(_pct_equal(y_val.Survived, y_val['mode']), '%')
print(x)
#

In [None]:
# --- Predict on submission file --- #
submission = submission_csv.copy()

# Fit each model on the training split and predict the Kaggle test set.
gb_model.fit(X_train , y_train)
model_gb = list(gb_model.predict(test))

rf_model.fit(X_train , y_train)
model_rf = list(rf_model.predict(test))

rg_model.fit(X_train , y_train)
model_rg = list(rg_model.predict(test))

pred_cols = ['pred1', 'pred2', 'pred3']
submission['pred1'] = model_gb
submission['pred2'] = model_rf
submission['pred3'] = model_rg

# .copy() so the new column below is written to an independent frame, not to a slice.
submission = submission[['PassengerId'] + pred_cols].copy()

# Most common value predicted by the three models. The vote is restricted to the
# prediction columns so that PassengerId can never take part in it.
submission['mode'] = submission[pred_cols].mode(axis='columns')[0].astype(int)
#

In [None]:
# --- Prepare output for Kaggle submission --- #
# Keep the passenger id and the majority vote, exposed under the 'Survived' header.
out = submission[['PassengerId', 'mode']].rename(columns={'mode': 'Survived'})
out.to_csv(
    '/Users/eliotmoll/Documents/Data_Aticles_Pro/titanic/preds/submission_file.csv',
    index=False,
)
#

In [None]:
# --- GridSearch to search optimal parameters --- #

# Example for random forest
kfold = StratifiedKFold(n_splits=10)

rfc = RandomForestClassifier(n_jobs=-1, oob_score = True) 

# Define all candidates.
# - each max_depth value is listed once: a duplicated value only repeats the same fits
# - 'sqrt' replaces 'auto', which recent scikit-learn releases reject (same meaning for classifiers)
grid = {"max_depth": [4, 6, 8, 10, None],
        "min_samples_split": [3, 5, 7, 10],
        "min_samples_leaf": [3, 4, 5, 6],
        "n_estimators": [1000, 1500, 2000, 2500],
        "max_features": ['log2', 'sqrt', 5, 10],
       }

# Perform GridSearch: every combination is scored by accuracy with 10-fold stratified CV
rfcGS = GridSearchCV(rfc,param_grid = grid , cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)


rfcGS.fit(X_train,y_train)
print(rfcGS.best_params_)
#