# 1. Initialization

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, make_scorer, classification_report, confusion_matrix)

plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Load the cleaned crash dataset. keep_default_na=False stops pandas from
# converting its default NA marker strings into NaN while parsing.
crash_data_clean = pd.read_csv(
    'Crash_Analysis_System_CAS_data_clean.csv',
    keep_default_na=False,
)

# 2. Preprocessing

Talk about no need for feature scaling or transformation, nor normalizaton
Why one hot encoding and not label encoding, why not ordinal labels and the need to have categorical features binarized

Explore and choose the right metric

In [3]:
def parse_type(dtype):
    """Translate a dtype label from the features catalog into a pandas dtype.

    Parameters
    ----------
    dtype : str
        Dtype label; 'int' and 'float' are mapped to NumPy types, any other
        value is passed through untouched.

    Returns
    -------
    type or str
        ``np.int8`` for 'int', ``np.float64`` for 'float', otherwise the
        input value itself.
    """
    if dtype == 'int':
        # NOTE(review): int8 only holds -128..127 — confirm that every column
        # labelled 'int' in the catalog stays within that range.
        return np.int8
    elif dtype == 'float':
        # np.float was only an alias of the builtin float and has been removed
        # from NumPy; np.float64 is the dtype it resolved to.
        return np.float64
    else:
        return dtype

# Load the catalog that describes every feature (name, type, pandas dtype).
features_catalog = pd.read_table('features_description.tsv')

# Map each catalogued feature name to the dtype object pandas should use.
catalog_dtypes = (
    features_catalog
    .set_index('feature_name')['pandas_dtype']
    .apply(parse_type)
    .to_dict()
)

# Restrict the mapping to the columns still present in the cleaned dataframe.
present_columns = crash_data_clean.columns
features_dtypes = {
    name: dtype
    for name, dtype in catalog_dtypes.items()
    if name in present_columns
}

In [4]:
# Cast every catalogued column to its declared dtype; copy=False asks pandas
# to avoid copying data where it can.
crash_data_clean = crash_data_clean.astype(dtype=features_dtypes, copy=False)

In [5]:
def _remap_speed_limit(limit):
    """Return 999 for a speed limit of -1, otherwise the value unchanged."""
    if limit == -1:
        return 999
    return limit


# NOTE(review): -1 presumably marks an unknown speed limit — confirm against
# the data-cleaning step.
crash_data_clean['speedLimit'] = crash_data_clean['speedLimit'].apply(_remap_speed_limit)

In [6]:
# Collect the names of all features the catalog marks as categorical.
is_categorical = features_catalog['feature_type'] == 'categorical'
categorical_features = features_catalog.loc[is_categorical, 'feature_name'].tolist()

# crashSeverity is the prediction target, so it must not be one-hot encoded.
categorical_features.remove('crashSeverity')

# Expand each remaining categorical column into one indicator column per level.
crash_data_ohe = pd.get_dummies(crash_data_clean, columns=categorical_features)

In [7]:
# Remove the casualty-count columns from the encoded frame.
# NOTE(review): presumably dropped because they reveal crashSeverity — confirm.
crash_data_ohe = crash_data_ohe.drop(
    columns=['fatalCount', 'seriousInjuryCount', 'minorInjuryCount']
)

In [8]:
# Split the encoded frame into the feature matrix X and the target vector y.
X = crash_data_ohe.drop(columns='crashSeverity')
y = crash_data_ohe['crashSeverity']

In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# 3. Benchmark

also try:

* Naive Bayes
* sklearn.ensemble.GradientBoostingClassifier
* XGBoost
* LGBM
* Random Forest

In [10]:
def print_results(true, pred, betta=1, digits=2):
    """Print weighted classification metrics and a per-class report.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned with ``true``.
    betta : float, default 1
        Beta passed to ``fbeta_score``.
    digits : int, default 2
        Number of decimals used in the classification report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('Accuracy score: ', format(accuracy_score(true, pred)))
    print('Precision score: ', format(precision_score(true, pred, average='weighted')))
    print('Recall score: ', format(recall_score(true, pred, average='weighted')))
    print('F1 score: ', format(f1_score(true, pred, average='weighted')))
    # beta is passed by keyword: recent scikit-learn releases reject it positionally.
    print('F betta score with betta=%.2f: ' % betta,
          format(fbeta_score(true, pred, beta=betta, average='weighted')))
    # Report against the labels that were passed in, not the notebook-level y_test.
    print('\n', classification_report(true, pred, digits=digits))

In [11]:
# Benchmark model: multinomial Naive Bayes with default settings.
clf_multi = MultinomialNB()
clf_multi.fit(X=X_train, y=y_train)

In [11]:
# Reuse the persisted Naive Bayes model when it exists; otherwise keep the
# classifier already in memory so a fresh run does not fail on a missing file.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
if os.path.exists('multinomialNB.pkl'):
    clf_multi = joblib.load('multinomialNB.pkl')

In [12]:
# Score the Naive Bayes benchmark on the held-out set, using an F-beta with beta=2.
predictions_NB = clf_multi.predict(X_test)
print_results(true=y_test, pred=predictions_NB, betta=2)

Accuracy score:  0.6308358513852832
Precision score:  0.6907373501708025
Recall score:  0.6308358513852832
F1 score:  0.6425445046329308
F betta score with betta=2.00:  0.6320174104457815

              precision    recall  f1-score   support

          F       0.03      0.49      0.06      1728
          M       0.53      0.20      0.30     42522
          N       0.79      0.81      0.80    133170
          S       0.14      0.13      0.14     10556

avg / total       0.69      0.63      0.64    187976



In [12]:
# Persist the Naive Bayes model to disk so later sessions can reload it.
joblib.dump(value=clf_multi, filename='multinomialNB.pkl')

SyntaxError: EOL while scanning string literal (<ipython-input-12-dd5d06b9e238>, line 4)

One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. Also, it performs well even in the presence of irrelevant features and is relatively unaffected by them. The other major advantage it has is its relative simplicity. Naive Bayes works well right out of the box and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known.
It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!

Congratulations! You have successfully designed a model that can efficiently predict if an SMS message is spam or not!

Thank you for learning with us!

HOW TO DO ROC/AUC?
HOW TO IMPROVE THESE METRICS?

Feature selection is not a must-do task for RF algorithm

In [16]:
# Baseline random forest with default hyper-parameters, trained on all cores.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [19]:
# Reuse the persisted random forest when it exists; otherwise keep the
# classifier already in memory so a fresh run does not fail on a missing file.
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
if os.path.exists('randomForest.pkl'):
    clf = joblib.load('randomForest.pkl')

In [20]:
# Score the baseline random forest on the held-out set, using an F-beta with beta=2.
predictions_RF = clf.predict(X_test)
print_results(true=y_test, pred=predictions_RF, betta=2, digits=5)

Accuracy score:  0.7150221304847427
Precision score:  0.6700691397910647
Recall score:  0.7150221304847427
F1 score:  0.6847572385900031
F betta score with betta=2.00:  0.7015092971350111

              precision    recall  f1-score   support

          F    0.12438   0.02894   0.04695      1728
          M    0.44517   0.32536   0.37595     42522
          N    0.78149   0.89838   0.83587    133170
          S    0.25968   0.08384   0.12675     10556

avg / total    0.67007   0.71502   0.68476    187976



In [18]:
# Persist the random forest to disk so later sessions can reload it.
joblib.dump(value=clf, filename='randomForest.pkl')

['randomForest.pkl']

In [21]:
# Silences every warning for the rest of the session so the verbose
# grid-search log stays readable.
warnings.filterwarnings('ignore')

# Hyper-parameter grid: 5 * 10 * 4 = 200 candidate forests.
parameters = {
    'n_estimators': [20, 50, 100, 150, 200],
    'max_depth': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    'min_samples_split': [50, 100, 500, 1000]
}

clf = RandomForestClassifier(random_state=42, n_jobs=-1)
# Prioritize Recall over Precision
scorer = make_scorer(fbeta_score, beta=2, average='weighted')
# scoring is passed by keyword because recent scikit-learn releases reject it
# positionally; cv=3 pins the 3-fold split reported in the recorded output
# instead of relying on a version-dependent default.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=3, verbose=4)
grid_fit = grid_obj.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] max_depth=6, min_samples_split=50, n_estimators=20 ..............
[CV]  max_depth=6, min_samples_split=50, n_estimators=20, score=0.6935034529453695, total=   4.3s
[CV] max_depth=6, min_samples_split=50, n_estimators=20 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.2s remaining:    0.0s


[CV]  max_depth=6, min_samples_split=50, n_estimators=20, score=0.6937151657983909, total=   4.0s
[CV] max_depth=6, min_samples_split=50, n_estimators=20 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.0s remaining:    0.0s


[CV]  max_depth=6, min_samples_split=50, n_estimators=20, score=0.6889675225181106, total=   4.0s
[CV] max_depth=6, min_samples_split=50, n_estimators=50 ..............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   17.9s remaining:    0.0s


[CV]  max_depth=6, min_samples_split=50, n_estimators=50, score=0.6909789696159203, total=   7.0s
[CV] max_depth=6, min_samples_split=50, n_estimators=50 ..............
[CV]  max_depth=6, min_samples_split=50, n_estimators=50, score=0.6949253314026949, total=   7.1s
[CV] max_depth=6, min_samples_split=50, n_estimators=50 ..............
[CV]  max_depth=6, min_samples_split=50, n_estimators=50, score=0.6827890236994413, total=   7.7s
[CV] max_depth=6, min_samples_split=50, n_estimators=100 .............
[CV]  max_depth=6, min_samples_split=50, n_estimators=100, score=0.6938485081427701, total=  12.8s
[CV] max_depth=6, min_samples_split=50, n_estimators=100 .............
[CV]  max_depth=6, min_samples_split=50, n_estimators=100, score=0.6890481253355507, total=  11.6s
[CV] max_depth=6, min_samples_split=50, n_estimators=100 .............
[CV]  max_depth=6, min_samples_split=50, n_estimators=100, score=0.6857682834746833, total=  11.6s
[CV] max_depth=6, min_samples_split=50, n_estimators=1

[CV]  max_depth=6, min_samples_split=1000, n_estimators=100, score=0.6894420746966287, total=  11.7s
[CV] max_depth=6, min_samples_split=1000, n_estimators=100 ...........
[CV]  max_depth=6, min_samples_split=1000, n_estimators=100, score=0.6956299594814082, total=  11.7s
[CV] max_depth=6, min_samples_split=1000, n_estimators=150 ...........
[CV]  max_depth=6, min_samples_split=1000, n_estimators=150, score=0.6987083182064737, total=  16.6s
[CV] max_depth=6, min_samples_split=1000, n_estimators=150 ...........
[CV]  max_depth=6, min_samples_split=1000, n_estimators=150, score=0.694355559933672, total=  16.5s
[CV] max_depth=6, min_samples_split=1000, n_estimators=150 ...........
[CV]  max_depth=6, min_samples_split=1000, n_estimators=150, score=0.690971246794876, total=  18.0s
[CV] max_depth=6, min_samples_split=1000, n_estimators=200 ...........
[CV]  max_depth=6, min_samples_split=1000, n_estimators=200, score=0.6997138407463687, total=  20.0s
[CV] max_depth=6, min_samples_split=1000,

[CV]  max_depth=7, min_samples_split=500, n_estimators=150, score=0.6926123833477652, total=  21.9s
[CV] max_depth=7, min_samples_split=500, n_estimators=200 ............
[CV]  max_depth=7, min_samples_split=500, n_estimators=200, score=0.6968236950126165, total=  26.2s
[CV] max_depth=7, min_samples_split=500, n_estimators=200 ............
[CV]  max_depth=7, min_samples_split=500, n_estimators=200, score=0.6952782259654234, total=  25.5s
[CV] max_depth=7, min_samples_split=500, n_estimators=200 ............
[CV]  max_depth=7, min_samples_split=500, n_estimators=200, score=0.6921553833286305, total=  25.0s
[CV] max_depth=7, min_samples_split=1000, n_estimators=20 ............
[CV]  max_depth=7, min_samples_split=1000, n_estimators=20, score=0.6992734752685564, total=   4.5s
[CV] max_depth=7, min_samples_split=1000, n_estimators=20 ............
[CV]  max_depth=7, min_samples_split=1000, n_estimators=20, score=0.6982764871600218, total=   4.4s
[CV] max_depth=7, min_samples_split=1000, n_e

[CV]  max_depth=8, min_samples_split=500, n_estimators=20, score=0.7002338165081128, total=   4.6s
[CV] max_depth=8, min_samples_split=500, n_estimators=20 .............
[CV]  max_depth=8, min_samples_split=500, n_estimators=20, score=0.6992183630492874, total=   4.8s
[CV] max_depth=8, min_samples_split=500, n_estimators=20 .............
[CV]  max_depth=8, min_samples_split=500, n_estimators=20, score=0.7002547988351244, total=   5.1s
[CV] max_depth=8, min_samples_split=500, n_estimators=50 .............
[CV]  max_depth=8, min_samples_split=500, n_estimators=50, score=0.7005111826171834, total=   8.8s
[CV] max_depth=8, min_samples_split=500, n_estimators=50 .............
[CV]  max_depth=8, min_samples_split=500, n_estimators=50, score=0.6989684123196361, total=   8.9s
[CV] max_depth=8, min_samples_split=500, n_estimators=50 .............
[CV]  max_depth=8, min_samples_split=500, n_estimators=50, score=0.7003450639001619, total=   8.0s
[CV] max_depth=8, min_samples_split=500, n_estimato

[CV]  max_depth=9, min_samples_split=100, n_estimators=50, score=0.6998890146001703, total=  11.0s
[CV] max_depth=9, min_samples_split=100, n_estimators=50 .............
[CV]  max_depth=9, min_samples_split=100, n_estimators=50, score=0.7073990677046382, total=  12.3s
[CV] max_depth=9, min_samples_split=100, n_estimators=100 ............
[CV]  max_depth=9, min_samples_split=100, n_estimators=100, score=0.7014059303047323, total=  20.2s
[CV] max_depth=9, min_samples_split=100, n_estimators=100 ............
[CV]  max_depth=9, min_samples_split=100, n_estimators=100, score=0.6998238482968698, total=  17.9s
[CV] max_depth=9, min_samples_split=100, n_estimators=100 ............
[CV]  max_depth=9, min_samples_split=100, n_estimators=100, score=0.7043903697854496, total=  17.2s
[CV] max_depth=9, min_samples_split=100, n_estimators=150 ............
[CV]  max_depth=9, min_samples_split=100, n_estimators=150, score=0.7013427570808132, total=  24.4s
[CV] max_depth=9, min_samples_split=100, n_esti

[CV]  max_depth=10, min_samples_split=50, n_estimators=100, score=0.7053777801877995, total=  18.4s
[CV] max_depth=10, min_samples_split=50, n_estimators=100 ............
[CV]  max_depth=10, min_samples_split=50, n_estimators=100, score=0.7060997828114463, total=  18.3s
[CV] max_depth=10, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=10, min_samples_split=50, n_estimators=150, score=0.7047891123415496, total=  26.8s
[CV] max_depth=10, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=10, min_samples_split=50, n_estimators=150, score=0.7064818894755205, total=  26.6s
[CV] max_depth=10, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=10, min_samples_split=50, n_estimators=150, score=0.707563077177194, total=  27.4s
[CV] max_depth=10, min_samples_split=50, n_estimators=200 ............
[CV]  max_depth=10, min_samples_split=50, n_estimators=200, score=0.7056708628312454, total=  35.0s
[CV] max_depth=10, min_samples_split=50, n_est

[CV]  max_depth=10, min_samples_split=1000, n_estimators=150, score=0.7038183226011088, total=  25.7s
[CV] max_depth=10, min_samples_split=1000, n_estimators=150 ..........
[CV]  max_depth=10, min_samples_split=1000, n_estimators=150, score=0.7055861916866545, total=  25.8s
[CV] max_depth=10, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=10, min_samples_split=1000, n_estimators=200, score=0.7030032079211197, total=  33.7s
[CV] max_depth=10, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=10, min_samples_split=1000, n_estimators=200, score=0.7030159595189469, total=  33.7s
[CV] max_depth=10, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=10, min_samples_split=1000, n_estimators=200, score=0.7054650776429762, total=  33.6s
[CV] max_depth=11, min_samples_split=50, n_estimators=20 .............
[CV]  max_depth=11, min_samples_split=50, n_estimators=20, score=0.7060336116367578, total=   5.6s
[CV] max_depth=11, min_samples_split

[CV]  max_depth=11, min_samples_split=500, n_estimators=200, score=0.7055197833283141, total=  37.5s
[CV] max_depth=11, min_samples_split=500, n_estimators=200 ...........
[CV]  max_depth=11, min_samples_split=500, n_estimators=200, score=0.7075734701015632, total=  37.4s
[CV] max_depth=11, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=11, min_samples_split=1000, n_estimators=20, score=0.7059000053649117, total=   5.5s
[CV] max_depth=11, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=11, min_samples_split=1000, n_estimators=20, score=0.7048236814999082, total=   5.5s
[CV] max_depth=11, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=11, min_samples_split=1000, n_estimators=20, score=0.7056045752118848, total=   5.5s
[CV] max_depth=11, min_samples_split=1000, n_estimators=50 ...........
[CV]  max_depth=11, min_samples_split=1000, n_estimators=50, score=0.7066602830173826, total=  10.9s
[CV] max_depth=11, min_samples_split=10

[CV]  max_depth=12, min_samples_split=500, n_estimators=20, score=0.7073869367428005, total=   6.1s
[CV] max_depth=12, min_samples_split=500, n_estimators=20 ............
[CV]  max_depth=12, min_samples_split=500, n_estimators=20, score=0.7110544758346511, total=   6.1s
[CV] max_depth=12, min_samples_split=500, n_estimators=50 ............
[CV]  max_depth=12, min_samples_split=500, n_estimators=50, score=0.7106490781331123, total=  12.0s
[CV] max_depth=12, min_samples_split=500, n_estimators=50 ............
[CV]  max_depth=12, min_samples_split=500, n_estimators=50, score=0.7077303575269013, total=  12.0s
[CV] max_depth=12, min_samples_split=500, n_estimators=50 ............
[CV]  max_depth=12, min_samples_split=500, n_estimators=50, score=0.7076018401617206, total=  12.0s
[CV] max_depth=12, min_samples_split=500, n_estimators=100 ...........
[CV]  max_depth=12, min_samples_split=500, n_estimators=100, score=0.7101624518132096, total=  22.1s
[CV] max_depth=12, min_samples_split=500, n_

[CV]  max_depth=13, min_samples_split=100, n_estimators=50, score=0.7117945312812292, total=  12.9s
[CV] max_depth=13, min_samples_split=100, n_estimators=50 ............
[CV]  max_depth=13, min_samples_split=100, n_estimators=50, score=0.71133350489904, total=  13.3s
[CV] max_depth=13, min_samples_split=100, n_estimators=100 ...........
[CV]  max_depth=13, min_samples_split=100, n_estimators=100, score=0.7104524731678982, total=  23.8s
[CV] max_depth=13, min_samples_split=100, n_estimators=100 ...........
[CV]  max_depth=13, min_samples_split=100, n_estimators=100, score=0.7108558541299765, total=  23.2s
[CV] max_depth=13, min_samples_split=100, n_estimators=100 ...........
[CV]  max_depth=13, min_samples_split=100, n_estimators=100, score=0.711146636982286, total=  26.6s
[CV] max_depth=13, min_samples_split=100, n_estimators=150 ...........
[CV]  max_depth=13, min_samples_split=100, n_estimators=150, score=0.7104698632008462, total=20.0min
[CV] max_depth=13, min_samples_split=100, n_

[CV]  max_depth=14, min_samples_split=50, n_estimators=100, score=0.71256191820367, total=  30.5s
[CV] max_depth=14, min_samples_split=50, n_estimators=100 ............
[CV]  max_depth=14, min_samples_split=50, n_estimators=100, score=0.7134986565994967, total=  28.4s
[CV] max_depth=14, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=14, min_samples_split=50, n_estimators=150, score=0.7136989646318131, total=  42.3s
[CV] max_depth=14, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=14, min_samples_split=50, n_estimators=150, score=0.712099186013571, total=  41.5s
[CV] max_depth=14, min_samples_split=50, n_estimators=150 ............
[CV]  max_depth=14, min_samples_split=50, n_estimators=150, score=0.7135252524044515, total=  41.7s
[CV] max_depth=14, min_samples_split=50, n_estimators=200 ............
[CV]  max_depth=14, min_samples_split=50, n_estimators=200, score=0.7141551472420171, total=  58.1s
[CV] max_depth=14, min_samples_split=50, n_estim

[CV]  max_depth=14, min_samples_split=1000, n_estimators=150, score=0.710003034688219, total=  39.3s
[CV] max_depth=14, min_samples_split=1000, n_estimators=150 ..........
[CV]  max_depth=14, min_samples_split=1000, n_estimators=150, score=0.7116953649596383, total=  34.8s
[CV] max_depth=14, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=14, min_samples_split=1000, n_estimators=200, score=0.7112165966183257, total=  48.7s
[CV] max_depth=14, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=14, min_samples_split=1000, n_estimators=200, score=0.7101266900971498, total=  47.4s
[CV] max_depth=14, min_samples_split=1000, n_estimators=200 ..........
[CV]  max_depth=14, min_samples_split=1000, n_estimators=200, score=0.7111768410761556, total=  49.7s
[CV] max_depth=15, min_samples_split=50, n_estimators=20 .............
[CV]  max_depth=15, min_samples_split=50, n_estimators=20, score=0.7146953274493725, total=   7.4s
[CV] max_depth=15, min_samples_split=

[CV]  max_depth=15, min_samples_split=500, n_estimators=200, score=0.712406259460811, total=  56.6s
[CV] max_depth=15, min_samples_split=500, n_estimators=200 ...........
[CV]  max_depth=15, min_samples_split=500, n_estimators=200, score=0.7124161076124793, total= 1.0min
[CV] max_depth=15, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=15, min_samples_split=1000, n_estimators=20, score=0.7154135011048721, total=   7.4s
[CV] max_depth=15, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=15, min_samples_split=1000, n_estimators=20, score=0.7129646612752142, total=   7.5s
[CV] max_depth=15, min_samples_split=1000, n_estimators=20 ...........
[CV]  max_depth=15, min_samples_split=1000, n_estimators=20, score=0.7132898828087597, total=   7.7s
[CV] max_depth=15, min_samples_split=1000, n_estimators=50 ...........
[CV]  max_depth=15, min_samples_split=1000, n_estimators=50, score=0.7130568586028535, total=  16.0s
[CV] max_depth=15, min_samples_split=100

[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed: 254.7min finished


In [38]:
# Tabulate the grid-search results, best-ranked candidates first.
df = (
    pd.DataFrame(grid_obj.cv_results_)
    .drop('params', axis=1)
    .set_index('rank_test_score')
    .sort_index()
)
# Write the index too: it holds rank_test_score, which index=False would drop
# from the saved file.
df.to_csv('random_forest_grid_search_1.csv')

In [39]:
# Show only the ten best-ranked candidates rather than the whole results table.
df.head(10)

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,6.973879,0.668556,1.103236,0.068601,15,100,20,0.716024,0.713695,0.715222,0.714981,0.000966,0.716974,0.716978,0.716464,0.716805,0.000241
2,25.778031,0.027442,1.525331,0.003559,15,100,100,0.715495,0.714159,0.715278,0.714977,0.000586,0.716838,0.717463,0.716158,0.716820,0.000533
3,14.263440,0.436133,1.225039,0.004923,15,100,50,0.715786,0.713568,0.715522,0.714959,0.000989,0.716966,0.716574,0.716175,0.716572,0.000323
4,38.418590,0.321092,1.834619,0.006672,15,100,150,0.715801,0.713571,0.714725,0.714699,0.000911,0.717257,0.716716,0.715371,0.716448,0.000793
5,25.788893,2.105248,1.521120,0.013113,15,50,100,0.713983,0.713493,0.715958,0.714478,0.001065,0.715213,0.717372,0.717438,0.716674,0.001034
6,55.585708,0.263145,2.344001,0.034455,15,100,200,0.715616,0.713541,0.714160,0.714439,0.000870,0.716898,0.716861,0.714855,0.716205,0.000954
7,6.467996,0.093706,1.091619,0.042164,14,50,20,0.714068,0.713883,0.715131,0.714361,0.000550,0.715540,0.716875,0.716090,0.716168,0.000548
8,14.078852,0.192671,1.223864,0.001895,15,50,50,0.713703,0.712865,0.716069,0.714212,0.001357,0.715562,0.716535,0.717243,0.716446,0.000689
9,6.508589,0.216372,1.031606,0.015710,15,50,20,0.714695,0.712791,0.714905,0.714130,0.000951,0.716680,0.716152,0.716641,0.716491,0.000240
10,37.806803,2.561332,1.876461,0.057498,15,50,150,0.714455,0.712870,0.714912,0.714079,0.000875,0.715903,0.716867,0.716453,0.716407,0.000395


In [43]:
# Predict on the held-out set with the estimator the grid search selected.
best_rf = grid_obj.best_estimator_
predictions_RF = best_rf.predict(X_test)

In [44]:
# Report the tuned forest's metrics, using an F-beta with beta=2.
print_results(true=y_test, pred=predictions_RF, betta=2, digits=5)

Accuracy score:  0.7462335617312849
Precision score:  0.7125518513040267
Recall score:  0.7462335617312849
F1 score:  0.6791007108699543
F betta score with betta=2.00:  0.7139590474282128

              precision    recall  f1-score   support

          F    0.38462   0.00579   0.01140      1728
          M    0.62842   0.21111   0.31605     42522
          N    0.75626   0.98485   0.85555    133170
          S    0.55372   0.01269   0.02482     10556

avg / total    0.71255   0.74623   0.67910    187976



In [45]:
# Silences every warning for the rest of the session so the verbose
# grid-search log stays readable.
warnings.filterwarnings('ignore')

# Second grid: deeper trees plus class re-weighting, 3 * 4 * 3 * 2 = 72 candidates.
parameters = {
    'n_estimators': [20, 50, 100],
    'max_depth': [17, 19, 21, 23],
    'min_samples_split': [50, 100, 500],
    'class_weight': ['balanced', 'balanced_subsample']
}

clf = RandomForestClassifier(random_state=42, n_jobs=-1)
# Prioritize Recall over Precision
scorer = make_scorer(fbeta_score, beta=2, average='weighted')
# scoring is passed by keyword because recent scikit-learn releases reject it
# positionally; cv=3 pins the 3-fold split reported in the recorded output
# instead of relying on a version-dependent default.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=3, verbose=4)
grid_fit = grid_obj.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20 
[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20, score=0.5915569644913063, total=   8.9s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.0s remaining:    0.0s


[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20, score=0.5911472407987799, total=   8.3s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   21.5s remaining:    0.0s


[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=20, score=0.5946423963978782, total=   8.4s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   32.2s remaining:    0.0s


[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50, score=0.5986852820833006, total=  19.2s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50 
[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50, score=0.5979330940809496, total=  17.2s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50 
[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=50, score=0.5997543595216509, total=  18.7s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=100 
[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=100, score=0.600797622064334, total=  38.4s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=100 
[CV]  class_weight=balanced, max_depth=17, min_samples_split=50, n_estimators=100, score=0.6017379409881243, total=  34.3s
[CV] class_weight=balanced, max_depth=17, min_samples_split=50,

KeyboardInterrupt: 

In [47]:
# Display the best hyper-parameter combination of the fitted grid search.
# NOTE(review): the recorded output lists only max_depth and n_estimators,
# which matches neither parameter grid defined in this notebook — presumably
# stale from an earlier, out-of-order run; confirm by re-running from a fresh kernel.
grid_obj.best_params_

{'max_depth': 11, 'n_estimators': 100}

In [46]:
# Report metrics for whatever predictions_RF currently holds, using an F-beta with beta=2.
print_results(true=y_test, pred=predictions_RF, betta=2, digits=5)

Accuracy score:  0.74064774226497
Precision score:  0.7012972151359186
Recall score:  0.74064774226497
F1 score:  0.6671947692437938
F betta score with betta=2.00:  0.7054613768988003

              precision    recall  f1-score   support

          F    0.25000   0.00058   0.00115      1728
          M    0.64074   0.18233   0.28388     42522
          N    0.74761   0.98708   0.85082    133170
          S    0.43478   0.00189   0.00377     10556

avg / total    0.70130   0.74065   0.66719    187976



Talk about <span style="color: red">**Out-of-bag error**</span>