## Titanic Kaggle competition

#### This is an attempt to build a standard tabular model using Random Forest on Kaggle data

In [1]:
# Notebook setup: load the autoreload extension in mode 2 and ask for
# matplotlib figures to be rendered inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display

from sklearn import metrics
from graphviz import *

In [3]:
# Directory holding the Titanic competition CSV files. File names are appended
# directly to this string later on, so the trailing slash is required.
PATH = "data/Kaggle/competitions/titanic/"

In [4]:
# List the files available in the data directory (shell command, PATH interpolated).
!ls {PATH}

gender_submission.csv test.csv              train.csv


### 1. Let's load and explore the data

In [5]:
# Load the Kaggle training set. The file on disk is `train.csv` (lowercase, as shown by
# the directory listing); a capitalised name only resolves on case-insensitive filesystems.
df_raw = pd.read_csv(f'{PATH}train.csv')

In [6]:
# Column names of the raw training frame.
df_raw.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# (rows, columns) of the raw training frame.
df_raw.shape

(891, 12)

In [8]:
# Show the last few rows of the raw training frame.
display(df_raw.tail())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


### What we see in the data
1. No date field. So no date formatting and feature extraction required.
2. A few categorical variables. We will have to convert these into numbers.
3. Some missing values, let's see what we can do with them.

In [9]:
# Distinct values present in the Sex column.
df_raw['Sex'].unique()

array(['male', 'female'], dtype=object)

In [10]:
# Distinct values present in the Embarked column.
df_raw['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

We seem to have some missing values. Let's explore summary stats.

In [11]:
# Summary statistics for the numeric columns.
df_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# Per-column non-null counts and dtypes; useful for spotting columns with missing values.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### 2. Format the data

In [13]:
# Show the first few rows of the raw training frame.
df_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# Current dtype of every column, before any categorical conversion.
df_raw.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

When a dataframe is displayed we see the category labels rather than their integer codes, but the codes can be inspected with `.cat.codes`, as shown further below.

### We need to tell the program which variables should be treated categorically

In [15]:
# Re-list the column names to pick the categorical variables from.
df_raw.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
# Columns to be cast to the pandas 'category' dtype. 'Survived' is the response
# variable; it is removed from this list again before the test set is processed.
cat_vars = ['Survived','Pclass','Sex','SibSp','Parch','Embarked']

In [17]:
# Keep an independent snapshot of the original dataframe. A plain assignment would only
# bind a second name to the same object, so any in-place change to df_raw would also
# change the "backup"; .copy() gives the backup its own data.
df_raw_backup = df_raw.copy()

In [18]:
# dtypes of the raw frame once more, for reference before columns are dropped and cast.
df_raw.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
# Working copy of the training data without the identifier / free-text columns.
df = df_raw.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')

In [20]:
# Cast every column named in cat_vars to the pandas 'category' dtype in one call.
df = df.astype({col: 'category' for col in cat_vars})

In [21]:
# Check that Sex is now treated as a categorical variable by looking at its integer codes.
df.Sex.cat.codes.head()

0    1
1    0
2    0
3    0
4    1
dtype: int8

In [22]:
# Although category codes now exist for the categorical variables, the dataframe
# itself still displays the original labels rather than the codes.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [23]:
# Change any columns of strings in the pandas dataframe to columns of categorical values.
# This applies the changes in place.
train_cats(df)

In [24]:
# Show the category labels of Sex alongside the first few integer codes that map onto them.
df.Sex.cat.categories,df.Sex.cat.codes.head()

(Index(['female', 'male'], dtype='object'), 0    1
 1    0
 2    0
 3    0
 4    1
 dtype: int8)

In [25]:
# The displayed frame still shows labels after train_cats.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [26]:
# proc_df (fastai helper) separates the response column 'Survived' into y and returns
# the processed feature frame df_x together with a third value, nas.
# Original author's note: fix_missing can be added as a parameter; it only applies to
# numeric columns, because pandas already handles missing categorical values by
# giving them the code -1.
# NOTE(review): nas presumably records which columns were imputed and with what
# value — confirm against the installed fastai version.
df_x, y, nas = proc_df(df,'Survived')

In [27]:
# Inspect the processed feature frame returned by proc_df.
df_x.head() 

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_na
0,3,2,22.0,2,1,7.25,3,False
1,1,1,38.0,2,1,71.2833,1,False
2,3,1,26.0,1,1,7.925,3,False
3,1,1,35.0,2,1,53.1,3,False
4,3,2,35.0,1,1,8.05,3,False


In [28]:
# First ten values of the response array.
y[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], dtype=int8)

In [29]:
# Feature frame once more, right before it is handed to the model.
df_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_na
0,3,2,22.0,2,1,7.25,3,False
1,1,1,38.0,2,1,71.2833,1,False
2,3,1,26.0,1,1,7.925,3,False
3,1,1,35.0,2,1,53.1,3,False
4,3,2,35.0,1,1,8.05,3,False


### 3. Build the model

In [30]:
# n_jobs=-1 lets scikit-learn train the trees in parallel on all available CPU cores.
# random_state pins the forest's randomness so the fitted model and the score below
# are reproducible after a kernel restart.
m = RandomForestClassifier(n_jobs=-1, random_state=42)
# Build a forest of trees from the training set (X, y).
m.fit(df_x, y)
# score() returns the mean accuracy on the data it is given. Here that is the training
# data itself, so the value is an optimistic estimate of real performance.
m.score(df_x, y)

0.9640852974186308

This score is computed on the same data the model was trained on, so the high value is not necessarily a good sign.

### Let's load the test data and process it.

In [31]:
# Load the Kaggle test set. The file on disk is `test.csv` (lowercase, as shown by the
# directory listing); a capitalised name only resolves on case-insensitive filesystems.
df_raw_test = pd.read_csv(f'{PATH}test.csv')

In [32]:
# Show the first few rows of the raw test frame.
df_raw_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [33]:
# Remove the same identifier / free-text columns that were removed from the training frame.
# NOTE(review): the original author remarks that RandomForest can work fine even without
# dropping them — not verified in this notebook.
df_test = df_raw_test.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')

In [34]:
# Test frame after dropping the unused columns.
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [35]:
# Remove the response variable from cat_vars.
# The list is rebuilt instead of calling .remove(), so the cell can be re-run safely:
# .remove('Survived') raises ValueError once 'Survived' is no longer in the list.
cat_vars = [v for v in cat_vars if v != 'Survived']
cat_vars

['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

In [36]:
# Cast the test columns named in cat_vars to the pandas 'category' dtype in one call.
df_test = df_test.astype({col: 'category' for col in cat_vars})

In [37]:
# Confirm which test columns are now of 'category' dtype.
df_test.dtypes

Pclass      category
Sex         category
Age          float64
SibSp       category
Parch       category
Fare         float64
Embarked    category
dtype: object

In [38]:
# Make the test frame's categorical columns follow the training frame's categories.
# apply_cats changes any columns of strings in df into categorical variables, using trn
# as a template for the category codes. The frame still displays labels afterwards.
apply_cats(df=df_test,trn=df)

In [39]:
# Test frame after apply_cats.
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [40]:
# Run the fastai preprocessing on the test features (no response column is given here);
# the whole return value is kept, and element 0 is used later as the feature frame.
# NOTE(review): the nas value from the training proc_df call is not passed in, so
# missing-value handling is presumably derived from the test data alone, which would
# explain the extra Fare_na column seen below. Consider proc_df(df_test, na_dict=nas)
# after confirming the signature of the installed fastai version.
df_test_x = proc_df(df_test)

In [41]:
# Element 0 of proc_df's return value is the processed test feature frame.
df_test_x[0].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_na,Fare_na
0,3,2,34.5,1,1,7.8292,2,False,False
1,3,1,47.0,2,1,7.0,3,False,False
2,2,2,62.0,1,1,9.6875,2,False,False
3,3,2,27.0,1,1,8.6625,3,False,False
4,3,1,22.0,2,2,12.2875,3,False,False


In [42]:
# Confirm the type of element 0 of the proc_df result.
type(df_test_x[0])

pandas.core.frame.DataFrame

Now let's try the model on this test data

In [43]:
# (rows, columns) of the processed test features.
df_test_x[0].shape

(418, 9)

In [44]:
# Feature columns the model was trained on.
df_x.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age_na'], dtype='object')

In [45]:
# Feature columns of the processed test frame, to compare with the training columns.
df_test_x[0].columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age_na',
       'Fare_na'],
      dtype='object')

We have an extra column in the test data - Fare_na, let's remove that.

In [46]:
# Keep exactly the feature columns the model was trained on, in the same order.
# Selecting by df_x.columns (instead of dropping 'Fare_na' by name) removes any
# test-only indicator column and raises a KeyError if a training column is missing
# from the test frame, so a mismatch cannot slip through to predict().
df_test = df_test_x[0][df_x.columns]

In [47]:
# Verify that the test columns now match the training feature columns.
df_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age_na'], dtype='object')

In [48]:
# Predict the class label for every row of the prepared test frame.
results = m.predict(df_test)

In [49]:
# Display the predicted labels.
results

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 

In [50]:
# (rows, columns) of the frame that was passed to predict().
df_test.shape

(418, 8)

We now have a prediction for each example in the test set.