## Random Forest for Titanic

In [1]:
import numpy as np
np.random.seed(10)

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from decisiontree import *

In [2]:
# Relative paths of the two Titanic CSV splits.
TRAIN_CSV = "datasets/titanic/titanic_training.csv"
TEST_CSV = "datasets/titanic/titanic_testing_data.csv"

# Read both splits into DataFrames; the following cells clean and encode them.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [3]:
# Impute missing numeric values with the training-set column means; the test
# split reuses the training means.
# numeric_only=True keeps the reduction to numeric columns: pandas >= 2.0 raises
# TypeError when mean() is asked to include string columns.
numeric_means = train.mean(numeric_only=True)
train = train.fillna(numeric_means)
test = test.fillna(numeric_means)

# Impute the listed non-numeric columns with their most frequent training value.
# Series.mode() skips NaN (so a missing value is never chosen as the fill value)
# and returns tied modes in sorted order, so the pick is the same on every run.
reps = {col: train[col].mode().iloc[0] for col in ("sex", "ticket", "embarked")}

train = train.fillna(reps)
test = test.fillna(reps)

In [4]:
def _deck_letters(frame):
    """Return a one-column copy of ``frame[["cabin"]]`` cut down to its first character.

    Missing entries are filled with the most frequent first character. If every
    entry is missing there is nothing to fill with and the column stays NaN.
    """
    deck = frame[["cabin"]].copy()
    deck["cabin"] = deck["cabin"].str.slice(0, 1)
    common = deck["cabin"].mode()  # mode() ignores NaN
    if not common.empty:
        deck["cabin"] = deck["cabin"].fillna(common.iloc[0])
    return deck


# NOTE(review): `one_hot` is computed below but never joined onto the frames.
# What gets concatenated is the single-letter `cabin` column, which has the same
# name as the existing column and is removed again by the duplicate filter, so
# both frames leave this cell with their original columns. Confirm whether the
# indicator columns were meant to become features.

#one hot cabin training set
cabin = _deck_letters(train)
one_hot = pd.get_dummies(cabin["cabin"])
train = pd.concat([train, cabin], axis=1, sort=False)
# Keep only the first column for each repeated column name.
train = train.loc[:, ~train.columns.duplicated()]

#one hot cabin test set
cabin = _deck_letters(test)
one_hot = pd.get_dummies(cabin["cabin"])
test = pd.concat([test, cabin], axis=1, sort=False)
# The mask must come from test's own columns. A mask built from train's
# (already de-duplicated) columns is all-True and removes nothing.
test = test.loc[:, ~test.columns.duplicated()]

In [5]:
# Swap the "sex" column of the training split for indicator columns named "sex_<value>".
sex_dummies = pd.get_dummies(train["sex"], prefix="sex")
train = pd.concat([train, sex_dummies], axis=1).drop(["sex"], axis=1)

# Same encoding for the test split. The indicators are derived from each split
# separately, so both splits are assumed to contain the same set of values.
sex_dummies = pd.get_dummies(test["sex"], prefix="sex")
test = pd.concat([test, sex_dummies], axis=1).drop(["sex"], axis=1)

In [6]:
# Swap the "embarked" column of the training split for indicator columns named "embarked_<value>".
embarked_dummies = pd.get_dummies(train["embarked"], prefix="embarked")
train = pd.concat([train, embarked_dummies], axis=1).drop(["embarked"], axis=1)

# Same encoding for the test split. The indicators are derived from each split
# separately, so both splits are assumed to contain the same set of values.
embarked_dummies = pd.get_dummies(test["embarked"], prefix="embarked")
test = pd.concat([test, embarked_dummies], axis=1).drop(["embarked"], axis=1)

In [7]:
def getnum(s):
    """Return the first run of digits found in ``str(s)`` as an int, or -1 if there is none."""
    match = re.search(r"\d+", str(s))
    if match is None:
        return -1
    return int(match.group())

# Reduce every raw ticket to the first run of digits it contains; getnum
# returns -1 for tickets without any digits.
train_ticket = train["ticket"].apply(getnum)
test_ticket = test["ticket"].apply(getnum)

# Fill value for tickets without digits: the mean over training tickets that do
# carry a number. The -1 placeholders are left out so they cannot bias the mean,
# it is computed once instead of once per placeholder row, and the same training
# statistic is applied to the test split (as with the numeric imputation above).
numbered = train_ticket[train_ticket != -1]
ticket_fill = numbered.mean() if len(numbered) else -1
train["ticket"] = train_ticket.apply(lambda t: ticket_fill if t == -1 else t)
test["ticket"] = test_ticket.apply(lambda t: ticket_fill if t == -1 else t)

# Split the labels off the training features.
train_y = train["survived"].values
train = train.drop(columns=["survived"])

# Drop the cabin column from both splits.
train = train.drop(columns=["cabin"])
test = test.drop(columns=["cabin"])

In [8]:
# Convert the prepared frames to plain arrays.
# Note that `train` is rebound from a DataFrame to an ndarray here.
train = train.values
x_test = test.values

# Hold out 20% of the labelled rows for validation.
split = train_test_split(train, train_y, test_size=0.2)
x_train, x_val, y_train, y_val = split

# Show the shapes of the training and test feature matrices.
x_train.shape, x_test.shape

((800, 11), (310, 11))

In [9]:
%%time
rf = RandomForest(num_trees=10,header="titanic_randomforest")
rf.fit(x_train, y_train)

2019-03-27 03:15:41,562	INFO node.py:423 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-03-27_03-15-41_59094/logs.
2019-03-27 03:15:41,672	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:60972 to respond...
2019-03-27 03:15:41,784	INFO services.py:363 -- Waiting for redis server at 127.0.0.1:49058 to respond...
2019-03-27 03:15:41,787	INFO services.py:760 -- Starting Redis shard with 3.44 GB max memory.
2019-03-27 03:15:41,798	INFO services.py:1384 -- Starting the Plasma object store with 5.15 GB memory using /tmp.


CPU times: user 99.2 ms, sys: 63.4 ms, total: 163 ms
Wall time: 3.07 s


In [10]:
%%time
train_preds = rf.predict(x_train,"train")
print("training accuracy:",np.mean(train_preds==y_train))
val_preds = rf.predict(x_val,"val")
print("validation accuracy:",np.mean(val_preds==y_val))
rf.predict(x_test,"test")

training accuracy: 0.62375
validation accuracy: 0.57
saved predictions
CPU times: user 63.9 ms, sys: 6.41 ms, total: 70.3 ms
Wall time: 77.5 ms
