In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from mpl_toolkits import mplot3d
%matplotlib inline
import matplotlib.pyplot as plt

from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import fixed

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

exp_data = pd.read_csv("./train.csv")
X = pd.get_dummies(exp_data[features])
age_column = [x for x in list(exp_data['Age']) if str(x) != 'nan']
X = X.fillna({'Age':np.median(age_column)})
y = exp_data['Survived']

In [3]:
exp_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
submition_data = pd.read_csv("./test.csv")
submition_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
test_per = 0.1

X_train_val, X_test, y_train_val, y_test \
    = train_test_split(X, y, test_size=test_per, random_state=1)

print('total experimental data', X.shape[0])
print('training and validation num rows:', X_train_val.shape[0])
print('test num rows:', X_test.shape[0])

total experimental data 891
training and validation num rows: 801
test num rows: 90


In [6]:
from sklearn.ensemble import RandomForestClassifier

num_folds = 10
folds = np.array_split(range(X_train_val.shape[0]), num_folds)

avg_errors = []
num_est_param1 = []
max_depth_param = []

for num_estimators in [10, 20, 30, 50, 70, 100]:
    for max_depth in range(1,20,2):
        errors = []
        for val_fold in range(len(folds)):
            val_indicies = folds[val_fold]
            X_val = X_train_val.iloc[val_indicies]
            y_val = y_train_val.iloc[val_indicies]

            train_indicies = sum([list(folds[fold_ind]) for fold_ind in range(len(folds))\
                                     if fold_ind != val_fold], [])

            X_train = X_train_val.iloc[train_indicies]
            y_train = y_train_val.iloc[train_indicies]

            model = RandomForestClassifier(n_estimators=num_estimators, max_depth=max_depth, random_state=1)
            model.fit(X_train, y_train)
            predictions = model.predict(X_val)
            assert(len(y_val) == len(predictions))
            errors.append(sum([np.abs(y_val.iloc[i] - predictions[i]) for i in range(len(y_val))])/len(y_val))
        num_est_param1.append(num_estimators)
        max_depth_param.append(max_depth)
        avg_errors.append(np.average(errors))
        
    print("Completed Iteration")
    print("num_estimators: ", num_estimators)
    print("Best accuracy so far:", 1 - min(avg_errors))
    print("==========================================")

min_index = np.argmin(avg_errors)
print("The highest average accuracy was produced with parameters,")
print("num_estimators: ", num_est_param1[min_index])
print("max_depth: ", max_depth_param[min_index])
print("Average Accuracy:", 1 - avg_errors[min_index])
            

Completed Iteration
num_estimators:  10
Best accuracy so far: 0.8264660493827161
Completed Iteration
num_estimators:  20
Best accuracy so far: 0.8302469135802469
Completed Iteration
num_estimators:  30
Best accuracy so far: 0.8302469135802469
Completed Iteration
num_estimators:  50
Best accuracy so far: 0.8364969135802469
Completed Iteration
num_estimators:  70
Best accuracy so far: 0.8364969135802469
Completed Iteration
num_estimators:  100
Best accuracy so far: 0.8364969135802469
The highest average accuracy was produced with parameters,
num_estimators:  50
max_depth:  7
Average Accuracy: 0.8364969135802469


In [10]:
opts1 = list(set(num_est_param1))
opts1.sort()
slider1 = widgets.SelectionSlider(description='estimators', options=opts1)

def display1(num_estimators):
    indicies = [i for i, val in enumerate(num_est_param1) if val == num_estimators]
    depth_sel = list(map(lambda ind: max_depth_param[ind], indicies))
    errs_sel = list(map(lambda ind: avg_errors[ind], indicies))
    fig1, ax1 = plt.subplots()
    ax1.set_xlabel('max depth')
    ax1.set_ylabel('error')
    ax1.plot(depth_sel, errs_sel)
    
interact(display1, num_estimators=slider1)

interactive(children=(SelectionSlider(description='estimators', options=(10, 20, 30, 50, 70, 100), value=10), …

<function __main__.display1(num_estimators)>

In [11]:
from sklearn.ensemble import AdaBoostClassifier

num_folds = 10
folds = np.array_split(range(X_train_val.shape[0]), num_folds)

avg_errors = []
num_est_param2 = []
lr_param = []

for num_estimators in [10, 20, 30, 50, 70, 100]:
    for learning_rate in np.linspace(0.01,2, 20):
        errors = []
        for val_fold in range(len(folds)):
            val_indicies = folds[val_fold]
            X_val = X_train_val.iloc[val_indicies]
            y_val = y_train_val.iloc[val_indicies]

            train_indicies = sum([list(folds[fold_ind]) for fold_ind in range(len(folds))\
                                     if fold_ind != val_fold], [])

            X_train = X_train_val.iloc[train_indicies]
            y_train = y_train_val.iloc[train_indicies]

            model = AdaBoostClassifier(n_estimators=num_estimators, learning_rate=learning_rate, random_state=1)
            model.fit(X_train, y_train)
            predictions = model.predict(X_val)
            assert(len(y_val) == len(predictions))
            errors.append(sum([np.abs(y_val.iloc[i] - predictions[i]) for i in range(len(y_val))])/len(y_val))
        num_est_param2.append(num_estimators)
        lr_param.append(learning_rate)
        avg_errors.append(np.average(errors))
        
    print("Completed Iteration")
    print("num_estimators: ", num_estimators)
    print("Best accuracy so far:", 1 - min(avg_errors))
    print("==========================================")

min_index = np.argmin(avg_errors)
print("The highest average accuracy was produced with parameters,")
print("num_estimators: ", num_est_param2[min_index])
print("lr: ", lr_param[min_index])
print("Average Accuracy:", 1 - avg_errors[min_index])

Completed Iteration
num_estimators:  10
Best accuracy so far: 0.8139814814814815
Completed Iteration
num_estimators:  20
Best accuracy so far: 0.8139814814814815
Completed Iteration
num_estimators:  30
Best accuracy so far: 0.8139814814814815
Completed Iteration
num_estimators:  50
Best accuracy so far: 0.8139814814814815
Completed Iteration
num_estimators:  70
Best accuracy so far: 0.8140277777777778
Completed Iteration
num_estimators:  100
Best accuracy so far: 0.8140277777777778
The highest average accuracy was produced with parameters,
num_estimators:  70
lr:  1.6857894736842105
Average Accuracy: 0.8140277777777778


In [14]:
opts2 = list(set(num_est_param2))
opts2.sort()
slider2 = widgets.SelectionSlider(description='estimators', options=opts2)

def display2(num_estimators):
    indicies = [i for i, val in enumerate(num_est_param2) if val == num_estimators]
    lr_sel = list(map(lambda ind: lr_param[ind], indicies))
    errs_sel = list(map(lambda ind: avg_errors[ind], indicies))
    #plt.xlabel('lr')
    #plt.ylabel('error')
    #plt.plot(lr_sel, errs_sel)
    fig2, ax2 = plt.subplots()
    ax2.set_xlabel('lr')
    ax2.set_ylabel('error')
    ax2.plot(lr_sel, errs_sel)
    
interact(display2, num_estimators=slider2)

interactive(children=(SelectionSlider(description='estimators', options=(10, 20, 30, 50, 70, 100), value=10), …

<function __main__.display2(num_estimators)>

Applying what I learned, I will instead try to use python libraries to perform hyper parameter search on random forest. According to the documentation, Cross validation is performed automatically. 

In [15]:
clf = RandomForestClassifier()
grid_search = {'max_depth': [5, 10], 'n_estimators':[1, 100]}
model = GridSearchCV(estimator = clf, param_grid = grid_search, cv = 4, verbose = 5, n_jobs = -1)
model.fit(X_train_val, y_train_val)
print("best max_depth:", model.best_estimator_.max_depth)
print("best n_estimators:", model.best_estimator_.n_estimators)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
best max_depth: 10
best n_estimators: 100


In [21]:
clf = RandomForestClassifier()
#grid_search = {'max_depth': [5, 10], 'n_estimators':[1, 100]}
param_grid = {'max_depth':range(1,20,2), 'n_estimators':range(1,100,5)}
model = RandomizedSearchCV(clf,param_grid, cv = 2)
model.fit(X_train_val, y_train_val)
print("best max_depth:", model.best_estimator_.max_depth)
print("best n_estimators:", model.best_estimator_.n_estimators)

best max_depth: 5
best n_estimators: 46
