Most of the following code comes from this article, lets remember to quote this.https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [54]:
# get titanic & test csv files as a DataFrame
train = pd.read_csv("./titanic/train.csv")
print(train.shape)

(891, 12)


In [55]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
NAs = pd.concat([train.isnull().sum()], axis=1, keys=['Missing in Train'])
NAs[NAs.sum(axis=1) > 0] #prints only the one that have missin values >0

Unnamed: 0,Missing in Train
Age,177
Cabin,687
Embarked,2


In [57]:
#We will remove ‘Cabin’, ‘Name’ and ‘Ticket’ columns as they require some processing to extract useful features
train.pop('Cabin')
train.pop('Name')
train.pop('Ticket')

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [58]:
# Filling missing Age values with mean
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [59]:
# Filling missing Embarked values with most common value
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [60]:
#‘Pclass’ is a categorical feature so we convert its values to strings
train['Pclass'] = train['Pclass'].apply(str)


In [61]:
# Getting Dummies from all other categorical vars
for col in train.dtypes[train.dtypes == 'object'].index:
     for_dummy = train.pop(col)
     train = pd.concat([train, pd.get_dummies(for_dummy, prefix=col)], axis=1)
train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,5,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [62]:
train.corr(method="pearson")

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,0.033207,-0.057527,-0.001652,0.012658,0.034303,-8.6e-05,-0.029486,-0.042939,0.042939,-0.001205,-0.033606,0.022204
Survived,-0.005007,1.0,-0.069809,-0.035322,0.081629,0.257307,0.285904,0.093349,-0.322308,0.543351,-0.543351,0.16824,0.00365,-0.149683
Age,0.033207,-0.069809,1.0,-0.232625,-0.179191,0.091566,0.319916,0.006589,-0.281004,-0.084153,0.084153,0.032024,-0.013855,-0.019336
SibSp,-0.057527,-0.035322,-0.232625,1.0,0.414838,0.159651,-0.054582,-0.055932,0.092548,0.114631,-0.114631,-0.059528,-0.026354,0.068734
Parch,-0.001652,0.081629,-0.179191,0.414838,1.0,0.216225,-0.017633,-0.000734,0.01579,0.245489,-0.245489,-0.011069,-0.081228,0.060814
Fare,0.012658,0.257307,0.091566,0.159651,0.216225,1.0,0.591711,-0.118557,-0.413333,0.182333,-0.182333,0.269335,-0.117216,-0.162184
Pclass_1,0.034303,0.285904,0.319916,-0.054582,-0.017633,0.591711,1.0,-0.288585,-0.626738,0.098013,-0.098013,0.296423,-0.155342,-0.161921
Pclass_2,-8.6e-05,0.093349,0.006589,-0.055932,-0.000734,-0.118557,-0.288585,1.0,-0.56521,0.064746,-0.064746,-0.125416,-0.127301,0.18998
Pclass_3,-0.029486,-0.322308,-0.281004,0.092548,0.01579,-0.413333,-0.626738,-0.56521,1.0,-0.137143,0.137143,-0.153329,0.237449,-0.015104
Sex_female,-0.042939,0.543351,-0.084153,0.114631,0.245489,0.182333,0.098013,0.064746,-0.137143,1.0,-1.0,0.082853,0.074115,-0.119224


OK NOW WE START THE REAL DEAL

In [63]:
from sklearn.model_selection import train_test_split

labels = train.pop('Survived') #WE isolate SURVIVED

#For testing, we choose to split our data to 75% train and 25% for test

x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.25)

In [64]:
from sklearn.ensemble import RandomForestClassifier
#The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree.
rf = RandomForestClassifier()

rf.fit(x_train, y_train)


y_pred = rf.predict(x_test)

In [69]:
y_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0])

We will use AUC (Area Under Curve) as the evaluation metric. Our target value is binary so it’s a binary classification problem. AUC is a good way for evaluation for this type of problems. (?)

In [71]:
from sklearn.metrics import roc_curve, auc, mean_absolute_error, precision_score

In [68]:

false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.8058129370629371

We can also use Mean Absolute Value. With the MAE metric, we take the absolute value of each error. This converts each error to a positive number. We then take the average of those absolute errors. This is our measure of model quality. 

In [70]:
mean_absolute_error(y_test, y_pred) #I GUESS THIS DOES NOT REALLY MAKE SENSE

0.17488789237668162

In [72]:
precision_score(y_test, y_pred) #PRECISION is tp/tp+fp. A score of 1 is therefoe no false positive

0.7662337662337663