## Import Libraries

In [1]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
titanic = pd.read_csv('titanic passenger list.csv')
display(train.head())
display(test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Functions: Handle Missing Values

In [3]:
# Fill Missing Age
def update_age(params, age_df):
    pclass = params[0]
    title = params[1]
    sex = params[2]
    age = params[3]
    if pd.isnull(age):
        age = np.float(age_df[(age_df['title'] == title) & (age_df["Sex"] == sex) & (age_df['Pclass'] == pclass)]["Age"])
    return age

In [4]:
# Fill Remaining
def handle_missing_value(all_data, train):
    # Handle Cabin
    all_data["Cabin"] = train["Cabin"].apply(lambda x: "X" if pd.isnull(x) or x == "T" else x)
    
    # Handle Embarked
    all_data.loc[all_data['Embarked'].isnull(), 'Embarked'] = train["Embarked"].mode()[0]
    
    # Fill Fare
    all_data.loc[all_data['Fare'].isnull(), 'Fare'] = train['Fare'].median()
    
    # Family Ticket Size
    add_family_ticket_size(all_data)
    
    # Fill missing ages using training data
    age_df = all_data[0:891].groupby(['Pclass','title','Sex']).Age.median().reset_index()
    all_data['Age'] = all_data[['Pclass', 'title', 'Sex', 'Age']].apply(lambda x: update_age(x, age_df), axis = 1)
    return all_data

## Functions: Add Feature

### Add Title

In [5]:
def add_new_title(df):
    # Grouping Title
    new_title = {
        'Mr' : 'Mr','Ms' : 'Ms','Mrs' : 'Mrs','Rev' : 'officer','Sir' : 'royalty','theCountess' : 'royalty','Dona' : 'royalty','Capt' : 'officer','Col' : 'officer','Don' : 'royalty','Dr' : 'officer','Jonkheer' : 'royalty','Lady' : 'royalty','Major' : 'officer','Master' : 'kid','Miss' : 'Ms','Mlle' : 'Ms','Mme' : 'Mrs'
    }

    #Add Title
    df['title'] = df['Name'].apply(lambda x: x.split(",")[1])
    df['title'] = df['title'].apply(lambda x: x.split(".")[0])
    df.title = df.title.str.replace(' ', '')
    # Group Title
    df['title'] = df['title'].apply(lambda x: new_title[x])

### Add Cabin Section

In [6]:
def add_cabin_section(df):
    df["Cabin"] = df["Cabin"].apply(lambda x: "X" if pd.isnull(x) or x == "T" else x)
    df["Cabin_Section"] = df['Cabin'].str[0]

### Add Family Ticket Size

In [7]:
def add_family_ticket_size(df):
    df["Family_Size"] = df["SibSp"] + df["Parch"] + 1
    df["Ticket_Group_Size"] = df.groupby('Ticket')['Ticket'].transform('count')
    # Grouping Family Sizes
    family_map = {
        1: 'Alone', 
        2: 'Small', 
        3: 'Small', 
        4: 'Small', 
        5: 'Medium', 
        6: 'Medium', 
        7: 'Large', 
        8: 'Large', 
        11: 'Large'
    }
    all_data['Family_Size_Grouped'] = all_data['Family_Size'].map(family_map)

### Add Survival Rate

In [8]:
# Add Survival Rates (Percentage) using Ticket, Cabin and Family Name
def add_surival_rates(x, survival_rate_df, feature_name, new_feature):
    feature_val = x[feature_name]
    # For Test Data to see if same Ticket is present in Train data
    # For Training Data update survival rate only if this ticket is present in Test Data
    if feature_val in list(survival_rate_df[feature_name]):
        x[new_feature] = np.float(survival_rate_df[survival_rate_df[feature_name] == feature_val]['Survived'])
    return x

def add_survival_rates(all_data):
    # Split Data
    x_train = all_data[0:891]
    x_test = all_data[891:]
    
    # Get Mean Age by Ticket, Cabin and Family Name
    ticket_survival_rate_df = pd.DataFrame(x_train.groupby(['Ticket'])[['Survived']].mean().reset_index())
    cabin_survival_rate_df = pd.DataFrame(x_train.groupby(['Cabin'])['Survived'].mean().reset_index())
    name_survival_rate_df = pd.DataFrame(x_train.groupby(['Family_Name'])['Survived'].mean().reset_index())
    
    # Get List of Ticket, Family name & Cabins in Test Data
    test_ticket_list = list(x_test['Ticket'])
    test_name_list = list(x_test['Family_Name'])
    test_cabin_list = list(x_test['Cabin'])
    
    feature_name = ['Ticket', 'Cabin', 'Family_Name']
    survival_feature_names = ['Ticket_Survival_Rate', 'Cabin_Survival_Rate', 'Name_Survival_Rate']
    test_lists = [test_ticket_list, test_cabin_list, test_name_list]
    grouped_dfs = [ticket_survival_rate_df, cabin_survival_rate_df, name_survival_rate_df]
    
    for i, j, k, l in zip(feature_name, survival_feature_names, test_lists, grouped_dfs):
        x_test[[i, j]] = x_test[[i, j]].apply(lambda x: add_surival_rates(x, l, i, j), axis = 1)
        # Update Survival Rate only if Ticket/Cabin/Family Name is present in test data
        x_train[[i, j]] = x_train[[i,j]].apply(lambda x: add_surival_rates(x, l, i , j) if x[i] in k else x, axis = 1)
    
    all_data = pd.concat([x_train, x_test], sort=False)
    return all_data

## Adding features

In [9]:
# Combine train and test
all_data = pd.concat([train, test], sort=False)

# Add Family Name
all_data["Family_Name"] = all_data['Name'].str.split(',', 1, expand=True)[0]

# Bin Age and Fare
# all_data["Fare_Bins"] = pd.qcut(all_data["Fare"], 15)
# all_data["Age_Bins"] = pd.qcut(all_data["Age"], 11)

add_family_ticket_size(all_data) 
add_new_title(all_data)
add_cabin_section(all_data)

# Add Mean Survival Rates
all_data['Ticket_Survival_Rate'] = np.mean(train['Survived'])
all_data['Cabin_Survival_Rate'] = np.mean(train['Survived'])
all_data['Name_Survival_Rate'] = np.mean(train['Survived'])

# Update Survival Rates
all_data = add_survival_rates(all_data)

In [10]:
display(all_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,Family_Name,Family_Size,Ticket_Group_Size,Family_Size_Grouped,title,Cabin_Section,Ticket_Survival_Rate,Cabin_Survival_Rate,Name_Survival_Rate
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,Braund,2,1,Small,Mr,X,0.383838,0.299419,0.383838
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,Cumings,2,2,Small,Mrs,C,1.0,1.0,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,Heikkinen,1,1,Alone,Ms,X,0.383838,0.299419,0.383838
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,Futrelle,2,2,Small,Mrs,C,0.383838,0.383838,0.383838
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,Allen,1,1,Alone,Mr,X,0.383838,0.299419,0.383838


## Filling Missing Values

In [11]:
all_data = handle_missing_value(all_data, train)

## Drop Columns

In [12]:
def drop_cols(all_data):
    all_data = all_data.drop(
        [
            'PassengerId', 
            'Survived',
            'Name'
        ], axis = 1)
    return all_data

## Create Dummies

In [13]:
def dummify_data(all_data):
    # Columns that need to be dummified
    dummy_cols = ['Sex','Pclass', 'Embarked', 'title', 'Cabin_Section', 
                  'Family_Size_Grouped', 'Cabin', 'Family_Name', 'Ticket']
    
    # Create Dummies
    all_data = pd.get_dummies(all_data, prefix=dummy_cols, columns=dummy_cols)
    return all_data

In [14]:
# Drop Cols
all_data = drop_cols(all_data)

# Dummify Cols
all_data = dummify_data(all_data)

## Split Data

In [15]:
x_train = all_data[0:891]
x_test = all_data[891:]
y_train = train['Survived']
print("Train Shape:",x_train.shape)
print("Test Shape:",x_test.shape)

Train Shape: (891, 1986)
Test Shape: (418, 1986)


## Save Model

In [16]:
def save_model(model, x_train, x_test, y_train):
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    result = pd.concat(
        [pd.DataFrame(test["PassengerId"]),pd.DataFrame(predictions)], 
        ignore_index=True,
        axis=1
    )
    headerList = ['PassengerId', 'Survived']
    # converting data frame to csv
    result.to_csv("submission.csv", header=headerList, index=False)
    return predictions

## Get Cross Val Score

In [17]:
def test_model(model, x, y):
    score=cross_val_score(model, x, y,scoring='accuracy', cv=5)
    print(np.mean(score))

## Fine Tuning Model

#### Fine Tune max depth, max_samples, max_features & criterion used

In [139]:
rf = RandomForestClassifier(
    n_jobs = -1,
    n_estimators=101,
)
params = [{
    'max_depth' : np.arange(1,21,1),
    'max_samples' : np.arange(0.1, 1.1, 0.1),
    'max_features' : ['auto', 'log', 'sqrt'],
    'criterion' : ['gini']
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits
{'criterion': 'gini', 'max_depth': 19, 'max_features': 'auto', 'max_samples': 0.5}
0.8608248069801018


#### Fix: Criterion & max_feature 
#### Fine Tune: Max_depth and max_samples

In [140]:
rf = RandomForestClassifier(
    n_jobs = -1,
    n_estimators=101,
    max_features = 'auto',
    criterion = 'gini',
)
params = [{
    'max_depth' : np.arange(18,25,1),
    'max_samples' : np.arange(0.4, 0.61, 0.01),
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 147 candidates, totalling 735 fits
{'max_depth': 19, 'max_samples': 0.4900000000000001}
0.8619546795555835


#### Fix: Max_Depth & Max_Samples 
#### Modify: min_samples_split & min_samples_leaf

In [168]:
rf = RandomForestClassifier(
    n_jobs = -1,
    n_estimators=101,
    max_features = 'auto',
    criterion = 'gini',
    max_depth = 19,
    max_samples = 0.49
)
params = [{
    'min_samples_split' : np.arange(2,21,1),
    'min_samples_leaf' : np.arange(2,21,1)
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 361 candidates, totalling 1805 fits
{'min_samples_leaf': 2, 'min_samples_split': 4}
0.7856631724311093


#### Fix: min_samples_split & min_samples_leaf
#### Modify: n_estimators

In [441]:
rf = RandomForestClassifier(
    n_jobs = -1,
    max_features = 'auto',
    criterion = 'gini',
    max_depth = 19,
    max_samples = 0.49,
    min_samples_split = 4,
    min_samples_leaf = 2
)
params = [{
    'n_estimators' : np.arange(11,5001,500),
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 4511}
0.7867804908668633


In [442]:
model.cv_results_['mean_test_score']

array([0.76664365, 0.78455841, 0.7834097 , 0.77779173, 0.78003264,
       0.78452075, 0.7845333 , 0.78340343, 0.78565062, 0.78678049])

#### Fine Tune: n_estimators

In [445]:
rf = RandomForestClassifier(
    n_jobs = -1,
    max_features = 'auto',
    criterion = 'gini',
    max_depth = 19,
    max_samples = 0.49,
    min_samples_split = 4,
    min_samples_leaf = 2
)
params = [{
    'n_estimators' : np.arange(4001,5001,100),
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'n_estimators': 4401}
0.7890339589479631


In [446]:
model.cv_results_['mean_test_score']

array([0.78901513, 0.78116879, 0.78227983, 0.7845333 , 0.78903396,
       0.7890214 , 0.78901513, 0.78678677, 0.78227983, 0.78340343])

In [447]:
rf = RandomForestClassifier(
    n_jobs = -1,
    max_features = 'auto',
    criterion = 'gini',
    max_depth = 19,
    max_samples = 0.49,
    min_samples_split = 4,
    min_samples_leaf = 2
)
params = [{
    'n_estimators' : np.arange(4301,4502,10),
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 2,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 21 candidates, totalling 105 fits
{'n_estimators': 4351}
0.7901450003138535


In [448]:
model.cv_results_['mean_test_score']

array([0.7845333 , 0.78003892, 0.78229239, 0.7845333 , 0.78678049,
       0.790145  , 0.7845333 , 0.78116251, 0.78003264, 0.78227983,
       0.78340343, 0.78228611, 0.78564434, 0.78004519, 0.78340343,
       0.7856569 , 0.78116879, 0.7834097 , 0.78227355, 0.78564434,
       0.78339715])

In [449]:
rf = RandomForestClassifier(
    n_jobs = -1,
    max_features = 'auto',
    criterion = 'gini',
    max_depth = 19,
    max_samples = 0.49,
    min_samples_split = 4,
    min_samples_leaf = 2
)
params = [{
    'n_estimators' : np.arange(4341,4361,1),
}]

model = GridSearchCV(
    estimator=rf, 
    param_grid=params, 
    scoring='accuracy', 
    cv=5,
    verbose = 10,
    n_jobs=-1
)
model.fit(x_train, y_train)
print(model.best_params_)
print(model.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'n_estimators': 4343}
0.7890214048082356


In [450]:
model.cv_results_['mean_test_score']

array([0.77666185, 0.78003892, 0.7890214 , 0.77890905, 0.7856569 ,
       0.78115624, 0.7845333 , 0.78003264, 0.78452702, 0.7834097 ,
       0.78227983, 0.78676794, 0.7845333 , 0.78452075, 0.78452702,
       0.78452702, 0.77891532, 0.78677421, 0.78452702, 0.78116879])

## My 2 Best Models

In [472]:
rf = RandomForestClassifier(
    n_jobs = -1,
    n_estimators = 4343,
    max_features = 'auto',
    criterion = 'entropy',
    max_depth = 19,
    max_samples = 0.49,
    min_samples_split = 6,
    min_samples_leaf = 2
)

In [473]:
rf = RandomForestClassifier(
    n_estimators=2179,
    max_depth=19,
    max_samples=0.55,
    criterion='entropy',
    min_samples_split=6,
    min_samples_leaf=2
)