In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [2]:
raw_train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
raw_test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [3]:
## Let's see the volume of missing data in each column
for i in raw_train_data.columns:
    print('Variable: ', '{:<15}'.format(i), 'Missing Values: ', raw_train_data[i].isna().sum())

Variable:  PassengerId     Missing Values:  0
Variable:  Survived        Missing Values:  0
Variable:  Pclass          Missing Values:  0
Variable:  Name            Missing Values:  0
Variable:  Sex             Missing Values:  0
Variable:  Age             Missing Values:  177
Variable:  SibSp           Missing Values:  0
Variable:  Parch           Missing Values:  0
Variable:  Ticket          Missing Values:  0
Variable:  Fare            Missing Values:  0
Variable:  Cabin           Missing Values:  687
Variable:  Embarked        Missing Values:  2


In [4]:
## Notably, the cabin feature is missing for 687 of the total 891 observances
## Let's drop this variable
## Let's also drop PassengerId, Name, and Ticket 
## We create a checkpoint here

train_data = raw_train_data.drop(['Cabin'], axis=1)
train_data = train_data.drop(['PassengerId'], axis=1)
train_data = train_data.drop(['Name'], axis=1)
train_data = train_data.drop(['Ticket'], axis=1)
train_data.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
## For the test data, we need to keep the PassengerId column
test_data = raw_test_data.drop(['Cabin'], axis=1)
test_data = test_data.drop(['Name'], axis=1)
test_data = test_data.drop(['Ticket'], axis=1)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [6]:
## The fare paid is an indication of placement on the ship
## A more expensive fare should correspond to a superior placement on the ship
## This likely effects probability of survival
## Instead of the exact fare, we are more interested in categories of fares
## Let's create 4 categories, based on quartiles

train_data['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [7]:
for df in (train_data, test_data):
    df['Fare'] = df['Fare'].fillna(0)
    df['Fare'] = df['Fare'].astype(int)

    df.loc[train_data['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.4542), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.4542) & (df['Fare'] <= 31), 'Fare'] = 2
    df.loc[df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)

train_data['Fare'].describe()

count    891.000000
mean       1.453423
std        1.124206
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        3.000000
Name: Fare, dtype: float64

In [8]:
## We need to fill in the missing age values
## This can be done many ways, but let us use the average age for a given combination of Sex and Pclass
## This will provide a more accurate estimate for age
train_data['Age'] = train_data.groupby(['Sex', 'Pclass']).transform(lambda x: x.fillna(x.mean()))['Age']
test_data['Age'] = test_data.groupby(['Sex', 'Pclass']).transform(lambda x: x.fillna(x.mean()))['Age']

train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.318643,0.523008,0.381594,1.453423
std,0.486592,0.836071,13.281103,1.102743,0.806057,1.124206
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,21.75,0.0,0.0,0.0
50%,0.0,3.0,26.507589,0.0,0.0,1.0
75%,1.0,3.0,36.0,1.0,0.0,2.0
max,1.0,3.0,80.0,8.0,6.0,3.0


In [9]:
## The exact age of a person doesn't tell us much
## Categorizing passengers based on age into infants, children, young adults, adults, and seniors would be more useful

for df in (train_data, test_data):

    df.loc[df['Age'] <= 2, 'Age'] = 0 # toddlers
    df.loc[(df['Age'] > 5) & (df['Age'] <= 13), 'Age'] = 1 # children
    df.loc[(df['Age'] > 13) & (df['Age'] <= 24), 'Age'] = 2 # young adults
    df.loc[(df['Age'] > 25) & (df['Age'] <= 60), 'Age'] = 3 # adults
    df.loc[df['Age'] > 60, 'Age'] = 4 # seniors
    df['Age'] = df['Age'].astype(int)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,2,1,0,0,S
1,1,1,female,3,1,0,3,C
2,1,3,female,3,0,0,0,S
3,1,1,female,3,1,0,3,S
4,0,3,male,3,0,0,1,S


In [10]:
## Finally, let's convert our two categorical features (Sex and Embarked) into numbers
## Note there are two missing values for Embarked
## Let's fill these with the most common Port of Embarkation



for df in (train_data, test_data):

    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    df['Sex'] = df['Sex'].map({'male':0, 'female':1})
    df['Embarked'] = df['Embarked'].map({'S':0, 'C':1, 'Q':2})



In [11]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,3,0,0,0,2
1,893,3,1,3,1,0,0,0
2,894,2,0,4,0,0,0,2
3,895,3,0,3,0,0,0,0
4,896,3,1,2,1,1,0,0


In [12]:
X = train_data.iloc[:,1:]
test_X = test_data.iloc[:,1:]

y = train_data['Survived']

In [13]:
# Finally, let's set up and run our model
# First, we split our current training data into training and validation
# To determine optimial model parameters, we run a nested loop and predict on our validation set

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

best_OOB = 0
for i in range(20,200,10):
    for j in range(1,10,1):
        for k in range(1,11,1):
            trial_model = RandomForestClassifier(n_estimators=100,
                                               min_samples_split=i,
                                               min_samples_leaf=j,
                                               max_depth=k, 
                                               oob_score=True,
                                               random_state=1)
            trial_model.fit(train_X, train_y)
            val_predict = trial_model.predict(val_X)
            val_mae = mean_absolute_error(val_predict, val_y)

            if trial_model.oob_score_ > best_OOB:
                best_OOB = trial_model.oob_score_
                best_min_samples_split = i
                best_min_samples_leaf = j
                best_max_depth = k


            print("min_samples_split: {} \t min_samples_leaf: {} \t max_depth: {} \t OOB Score: {} \t MAE: {}".format(i, j, k, trial_model.oob_score_,val_mae))
            print()
        
print("The optimal model has the following parameters, for an Out-of-bag score of {}: " 
          "min_samples_split = {}, min_samples_leaf = {}, max_depth = {}".format(best_OOB,best_min_samples_split,best_min_samples_leaf,best_max_depth))


min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 1 	 OOB Score: 0.75 	 MAE: 0.29596412556053814

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 2 	 OOB Score: 0.7949101796407185 	 MAE: 0.21524663677130046

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 3 	 OOB Score: 0.8173652694610778 	 MAE: 0.21524663677130046

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 4 	 OOB Score: 0.8308383233532934 	 MAE: 0.21076233183856502

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 5 	 OOB Score: 0.8353293413173652 	 MAE: 0.21076233183856502

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 6 	 OOB Score: 0.8248502994011976 	 MAE: 0.21524663677130046

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 7 	 OOB Score: 0.8293413173652695 	 MAE: 0.21076233183856502

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 8 	 OOB Score: 0.8173652694610778 	 MAE: 0.21076233183856502

min_samples_split: 20 	 min_samples_leaf: 1 	 max_depth: 9 	 O

In [14]:
## Let's build our final model with the optimal parameters

model = RandomForestClassifier(n_estimators=100,
                               min_samples_split=best_min_samples_split, 
                               min_samples_leaf=best_min_samples_leaf,
                               max_depth=best_max_depth,
                               oob_score=True,
                               random_state=1)


model.fit(X, y)
predictions = model.predict(test_X)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('Titanic_Submission_Oct26F.csv', index=False)