In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv("oasis_longitudinal.csv")

In [3]:
data.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [4]:
data = data.drop(['MRI ID', 'Visit', 'Hand'], axis=1,)

In [5]:
data.head()

Unnamed: 0,Subject ID,Group,MR Delay,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,Nondemented,0,M,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,Nondemented,457,M,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,Demented,0,M,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,Demented,560,M,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,Demented,1895,M,80,12,,22.0,0.5,1698,0.701,1.034


In [6]:
# Check missing values by each column
pd.isnull(data).sum() 
# The column, SES has 8 missing values

Subject ID     0
Group          0
MR Delay       0
M/F            0
Age            0
EDUC           0
SES           19
MMSE           2
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

In [7]:
#data.fillna(data.mean(), inplace=True)
df_dropna = data.dropna(axis=0, how='any')

In [8]:
df_dropna['M/F'] = df_dropna['M/F'].replace(['F','M'], [0,1])
df_dropna['Group'] = df_dropna['Group'].replace(['Converted'], ['Demented'])
df_dropna['Group'] = df_dropna['Group'].replace(['Demented', 'Nondemented'], [1,0])

In [9]:
x = df_dropna[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']]
y = df_dropna['Group']

In [10]:
from sklearn.preprocessing import scale
X = scale(x)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 3)

In [12]:
# confirm that splitting also has similar distribution of spam and ham 
# emails
print(y_train.mean())
print(y_test.mean())

0.46153846153846156
0.4672897196261682


In [13]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()

In [14]:
# fit
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
# Making predictions
predictions = rfc.predict(X_test)

In [16]:
# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [17]:
# Let's check the report of our default model
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90        57
           1       0.93      0.82      0.87        50

   micro avg       0.89      0.89      0.89       107
   macro avg       0.89      0.88      0.89       107
weighted avg       0.89      0.89      0.89       107



In [18]:
# Printing confusion matrix
print(confusion_matrix(y_test,predictions))

[[54  3]
 [ 9 41]]


In [19]:
print(accuracy_score(y_test,predictions))

0.8317757009345794


# Hyperperameter tuning

In [19]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_depth': range(2, 40, 5)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(2, 40, 5)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [122]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.013348,0.003127,0.002107,0.000788,5,{'min_samples_split': 5},0.86,0.84,0.76,0.795918,...,0.813765,0.034939,1,0.974619,0.964467,0.984772,0.974747,0.964824,0.972686,0.007529
1,0.008461,2e-05,0.001207,2.6e-05,10,{'min_samples_split': 10},0.8,0.76,0.68,0.795918,...,0.785425,0.069387,2,0.928934,0.944162,0.939086,0.959596,0.929648,0.940285,0.011237
2,0.008782,0.000527,0.001318,0.000104,15,{'min_samples_split': 15},0.76,0.8,0.7,0.755102,...,0.769231,0.044914,3,0.86802,0.898477,0.918782,0.90404,0.909548,0.899773,0.017234
3,0.008625,0.000293,0.001294,7.4e-05,20,{'min_samples_split': 20},0.8,0.82,0.68,0.693878,...,0.769231,0.069667,3,0.873096,0.878173,0.86802,0.89899,0.919598,0.887575,0.019163
4,0.00838,7.5e-05,0.001211,2.2e-05,25,{'min_samples_split': 25},0.76,0.78,0.68,0.734694,...,0.753036,0.044617,7,0.852792,0.873096,0.852792,0.853535,0.864322,0.859307,0.008167


In [21]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'n_estimators': range(500, 2500, 500)}

# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(500, 2500, 500)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [22]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.414377,0.013741,0.033901,0.001127,500,{'n_estimators': 500},0.82,0.82,0.72,0.755102,...,0.805668,0.066861,1,0.883249,0.903553,0.908629,0.888889,0.894472,0.895759,0.009293
1,0.884521,0.089905,0.079816,0.02378,1000,{'n_estimators': 1000},0.8,0.82,0.74,0.734694,...,0.797571,0.058606,2,0.888325,0.898477,0.923858,0.89899,0.869347,0.895799,0.017665
2,1.276496,0.094149,0.122997,0.043581,1500,{'n_estimators': 1500},0.8,0.8,0.7,0.734694,...,0.789474,0.073516,4,0.883249,0.903553,0.908629,0.893939,0.879397,0.893754,0.011259
3,1.669714,0.098946,0.160999,0.037828,2000,{'n_estimators': 2000},0.8,0.82,0.7,0.734694,...,0.793522,0.074528,3,0.888325,0.898477,0.913706,0.878788,0.864322,0.888723,0.016814


In [23]:
# GridSearchCV to find optimal max_features
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'max_features': [3,6]}

# instantiate the model
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_features': [3, 6]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [120]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.013348,0.003127,0.002107,0.000788,5,{'min_samples_split': 5},0.86,0.84,0.76,0.795918,...,0.813765,0.034939,1,0.974619,0.964467,0.984772,0.974747,0.964824,0.972686,0.007529
1,0.008461,2e-05,0.001207,2.6e-05,10,{'min_samples_split': 10},0.8,0.76,0.68,0.795918,...,0.785425,0.069387,2,0.928934,0.944162,0.939086,0.959596,0.929648,0.940285,0.011237
2,0.008782,0.000527,0.001318,0.000104,15,{'min_samples_split': 15},0.76,0.8,0.7,0.755102,...,0.769231,0.044914,3,0.86802,0.898477,0.918782,0.90404,0.909548,0.899773,0.017234
3,0.008625,0.000293,0.001294,7.4e-05,20,{'min_samples_split': 20},0.8,0.82,0.68,0.693878,...,0.769231,0.069667,3,0.873096,0.878173,0.86802,0.89899,0.919598,0.887575,0.019163
4,0.00838,7.5e-05,0.001211,2.2e-05,25,{'min_samples_split': 25},0.76,0.78,0.68,0.734694,...,0.753036,0.044617,7,0.852792,0.873096,0.852792,0.853535,0.864322,0.859307,0.008167


In [121]:
# plotting accuracies with max_features
plt.figure()
plt.plot(scores["param_max_features"], 
         scores["mean_train_score"], 
         label="training accuracy")
plt.plot(scores["param_max_features"], 
         scores["mean_test_score"], 
         label="test accuracy")
plt.xlabel("max_features")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

KeyError: 'param_max_features'

<Figure size 432x288 with 0 Axes>

In [25]:
# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'min_samples_leaf': range(5, 50, 5)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_leaf': range(5, 50, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [26]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.01004,0.001473,0.001434,0.00019,5,{'min_samples_leaf': 5},0.82,0.78,0.76,0.693878,...,0.781377,0.054252,1,0.888325,0.888325,0.898477,0.893939,0.884422,0.890698,0.004933
1,0.00837,9.2e-05,0.001258,5.6e-05,10,{'min_samples_leaf': 10},0.84,0.72,0.64,0.693878,...,0.753036,0.089013,2,0.847716,0.817259,0.827411,0.853535,0.788945,0.826973,0.023137
2,0.00811,0.000119,0.00127,8.9e-05,15,{'min_samples_leaf': 15},0.82,0.76,0.68,0.734694,...,0.753036,0.046105,2,0.807107,0.80203,0.832487,0.80303,0.824121,0.813755,0.012288
3,0.008039,8.3e-05,0.001214,2.5e-05,20,{'min_samples_leaf': 20},0.82,0.66,0.7,0.714286,...,0.740891,0.063772,6,0.781726,0.791878,0.812183,0.777778,0.763819,0.785477,0.016103
4,0.009751,0.002062,0.001656,0.000457,25,{'min_samples_leaf': 25},0.82,0.84,0.66,0.653061,...,0.744939,0.078166,5,0.751269,0.766497,0.781726,0.772727,0.753769,0.765198,0.011456


In [27]:
# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'min_samples_split': range(5, 50, 5)}

# instantiate the model
rf = RandomForestClassifier()


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_split': range(5, 50, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [28]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.013348,0.003127,0.002107,0.000788,5,{'min_samples_split': 5},0.86,0.84,0.76,0.795918,...,0.813765,0.034939,1,0.974619,0.964467,0.984772,0.974747,0.964824,0.972686,0.007529
1,0.008461,2e-05,0.001207,2.6e-05,10,{'min_samples_split': 10},0.8,0.76,0.68,0.795918,...,0.785425,0.069387,2,0.928934,0.944162,0.939086,0.959596,0.929648,0.940285,0.011237
2,0.008782,0.000527,0.001318,0.000104,15,{'min_samples_split': 15},0.76,0.8,0.7,0.755102,...,0.769231,0.044914,3,0.86802,0.898477,0.918782,0.90404,0.909548,0.899773,0.017234
3,0.008625,0.000293,0.001294,7.4e-05,20,{'min_samples_split': 20},0.8,0.82,0.68,0.693878,...,0.769231,0.069667,3,0.873096,0.878173,0.86802,0.89899,0.919598,0.887575,0.019163
4,0.00838,7.5e-05,0.001211,2.2e-05,25,{'min_samples_split': 25},0.76,0.78,0.68,0.734694,...,0.753036,0.044617,7,0.852792,0.873096,0.852792,0.853535,0.864322,0.859307,0.008167


# Tuning n_estimators

In [48]:
# GridSearchCV to find optimal n_estimators
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {'n_estimators': range(100, 1500, 400)}

# instantiate the model (note we are specifying a max_depth)
rf = RandomForestClassifier(max_depth=4)


# fit tree on training data
rf = GridSearchCV(rf, parameters, 
                    cv=n_folds, 
                   scoring="accuracy")
rf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(100, 1500, 400)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [257]:
# scores of GridSearch CV
scores = rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.013348,0.003127,0.002107,0.000788,5,{'min_samples_split': 5},0.86,0.84,0.76,0.795918,...,0.813765,0.034939,1,0.974619,0.964467,0.984772,0.974747,0.964824,0.972686,0.007529
1,0.008461,2e-05,0.001207,2.6e-05,10,{'min_samples_split': 10},0.8,0.76,0.68,0.795918,...,0.785425,0.069387,2,0.928934,0.944162,0.939086,0.959596,0.929648,0.940285,0.011237
2,0.008782,0.000527,0.001318,0.000104,15,{'min_samples_split': 15},0.76,0.8,0.7,0.755102,...,0.769231,0.044914,3,0.86802,0.898477,0.918782,0.90404,0.909548,0.899773,0.017234
3,0.008625,0.000293,0.001294,7.4e-05,20,{'min_samples_split': 20},0.8,0.82,0.68,0.693878,...,0.769231,0.069667,3,0.873096,0.878173,0.86802,0.89899,0.919598,0.887575,0.019163
4,0.00838,7.5e-05,0.001211,2.2e-05,25,{'min_samples_split': 25},0.76,0.78,0.68,0.734694,...,0.753036,0.044617,7,0.852792,0.873096,0.852792,0.853535,0.864322,0.859307,0.008167


# Grid Search to Find Optimal Hyperparameters

In [28]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [2,7,12],
    'min_samples_leaf': range(5, 65,20),
    'min_samples_split': range(5, 65,20),
    'n_estimators': [500,1300,1700], 
    'max_features': [3,6]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

In [29]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 486 out of 486 | elapsed:  4.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [2, 7, 12], 'min_samples_leaf': range(5, 65, 20), 'min_samples_split': range(5, 65, 20), 'n_estimators': [500, 1300, 1700], 'max_features': [3, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [22]:
# printing the optimal accuracy score and hyperparameters
from sklearn.model_selection import GridSearchCV


In [30]:
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

We can get accuracy of 0.805668016194332 using {'max_depth': 12, 'max_features': 3, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 500}


In [250]:
# model with the best hyperparameters
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=1300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [251]:
# fit
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=1300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [252]:
# predict
predictions = rfc.predict(X_test)

In [253]:
# evaluation metrics
from sklearn.metrics import classification_report,confusion_matrix

In [254]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.78      0.98      0.87        57
           1       0.97      0.68      0.80        50

   micro avg       0.84      0.84      0.84       107
   macro avg       0.87      0.83      0.83       107
weighted avg       0.87      0.84      0.84       107



In [255]:
print(confusion_matrix(y_test,predictions))

[[56  1]
 [16 34]]


In [256]:
print(accuracy_score(y_test,predictions))

0.8411214953271028
