# Random Forest

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.dropna(inplace=True)

## Data Preprocessing

In [5]:
X = df[['pclass', 'sex', 'age']]

In [6]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()

In [7]:
X['sex'] = lb.fit_transform(X['sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
y = df['survived']

## Training

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [11]:
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [12]:
def print_score(clf, X_train, X_test, y_train, y_test, train = True):
    '''
    print accuracy score and confusion matrix
    '''
    
    if train:
        '''
        train performance
        '''
        print('Train Result: \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_train, clf.predict(X_train))))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv = 10, scoring='accuracy')
        print('Average Accuracy: \t {0:.4f}'.format(np.mean(res)))
        print('Average SD: \t\t {0:.4f}'.format(np.std(res)))
        
    elif train == False:
        '''
        test performance
        '''
        print('Test Result \n')
        print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, clf.predict(X_test))))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test))))

In [13]:
rf_clf = RandomForestClassifier(random_state = 42)

In [14]:
rf_clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [15]:
print_score(rf_clf, X_train, X_test, y_train, y_test, train=True)

Train Result: 

Accuracy Score: 0.9134

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.93      0.88        44
           1       0.96      0.90      0.93        83

   micro avg       0.91      0.91      0.91       127
   macro avg       0.90      0.92      0.91       127
weighted avg       0.92      0.91      0.91       127
 

Confusion Matrix: 
 [[41  3]
 [ 8 75]] 

Average Accuracy: 	 0.7478
Average SD: 		 0.1003


In [16]:
print_score(rf_clf, X_train, X_test, y_train, y_test, train=False)

Test Result 

Accuracy Score: 0.7636

Classification Report: 
               precision    recall  f1-score   support

           0       0.54      0.87      0.67        15
           1       0.94      0.72      0.82        40

   micro avg       0.76      0.76      0.76        55
   macro avg       0.74      0.80      0.74        55
weighted avg       0.83      0.76      0.78        55
 

Confusion Matrix: 
 [[13  2]
 [11 29]] 



## Grid Search

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
rf_clf = RandomForestClassifier(random_state = 42)

In [25]:
params_grid = {'max_depth': [3, None],
              'min_samples_split' : [2,3,10],
              'min_samples_leaf' : [1,3,10],
              'bootstrap' : [True, False],
              'criterion' : ['gini', 'entropy']}

In [26]:
grid_search = GridSearchCV(rf_clf, params_grid,
                          n_jobs = -1, cv = 5,
                          verbose = 1, scoring = 'accuracy')

In [27]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    2.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, None], 'min_samples_split': [2, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [28]:
grid_search.best_score_

0.7795275590551181

In [30]:
grid_search.best_estimator_.get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [31]:
print_score(grid_search, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.8740

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.77      0.81        44
           1       0.89      0.93      0.91        83

   micro avg       0.87      0.87      0.87       127
   macro avg       0.87      0.85      0.86       127
weighted avg       0.87      0.87      0.87       127
 

Confusion Matrix: 
 [[34 10]
 [ 6 77]] 

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Average Accuracy: 	 0.7580
Average SD: 		 0.0836


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.7s finished


In [32]:
print_score(grid_search, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8545

Classification Report: 
               precision    recall  f1-score   support

           0       0.68      0.87      0.76        15
           1       0.94      0.85      0.89        40

   micro avg       0.85      0.85      0.85        55
   macro avg       0.81      0.86      0.83        55
weighted avg       0.87      0.85      0.86        55
 

Confusion Matrix: 
 [[13  2]
 [ 6 34]] 



# Extra-Trees Ensemble

In [33]:
from sklearn.ensemble import ExtraTreesClassifier

In [34]:
xt_clf = ExtraTreesClassifier(random_state = 42)

In [35]:
xt_clf.fit(X_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [36]:
print_score(xt_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.9370

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.98      0.91        44
           1       0.99      0.92      0.95        83

   micro avg       0.94      0.94      0.94       127
   macro avg       0.92      0.95      0.93       127
weighted avg       0.94      0.94      0.94       127
 

Confusion Matrix: 
 [[43  1]
 [ 7 76]] 

Average Accuracy: 	 0.7407
Average SD: 		 0.0584


In [37]:
print_score(xt_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.8182

Classification Report: 
               precision    recall  f1-score   support

           0       0.61      0.93      0.74        15
           1       0.97      0.78      0.86        40

   micro avg       0.82      0.82      0.82        55
   macro avg       0.79      0.85      0.80        55
weighted avg       0.87      0.82      0.83        55
 

Confusion Matrix: 
 [[14  1]
 [ 9 31]] 



In [38]:
xt_clf = ExtraTreesClassifier(random_state = 42, min_samples_leaf = 5)

In [39]:
xt_clf.fit(X_train, y_train)



ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [40]:
print_score(xt_clf, X_train, X_test, y_train, y_test, train = True)

Train Result: 

Accuracy Score: 0.7717

Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.57      0.63        44
           1       0.79      0.88      0.83        83

   micro avg       0.77      0.77      0.77       127
   macro avg       0.75      0.72      0.73       127
weighted avg       0.77      0.77      0.76       127
 

Confusion Matrix: 
 [[25 19]
 [10 73]] 

Average Accuracy: 	 0.7247
Average SD: 		 0.1281


In [41]:
print_score(xt_clf, X_train, X_test, y_train, y_test, train = False)

Test Result 

Accuracy Score: 0.7818

Classification Report: 
               precision    recall  f1-score   support

           0       0.64      0.47      0.54        15
           1       0.82      0.90      0.86        40

   micro avg       0.78      0.78      0.78        55
   macro avg       0.73      0.68      0.70        55
weighted avg       0.77      0.78      0.77        55
 

Confusion Matrix: 
 [[ 7  8]
 [ 4 36]] 

