In [29]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

In [13]:
subset = pd.read_pickle("../output/subset_Color_48x48NormalizedFilter.pkl")

In [14]:
subset.head()

Unnamed: 0,label,image_array,image_array_Denoise
0,0,"[[[208, 154, 195], [115, 66, 113], [195, 94, 1...","[[[0.8628318584070797, 0.6150442477876106, 0.7..."
1,0,"[[[195, 118, 169], [117, 55, 115], [211, 151, ...","[[[0.9512195121951219, 0.5756097560975609, 0.8..."
2,0,"[[[247, 210, 231], [254, 227, 239], [251, 213,...","[[[0.9686274509803922, 0.8235294117647058, 0.9..."
3,0,"[[[128, 64, 123], [102, 56, 109], [172, 116, 1...","[[[0.5432098765432098, 0.30864197530864196, 0...."
4,0,"[[[219, 170, 204], [169, 105, 159], [202, 125,...","[[[0.9864864864864865, 0.7657657657657657, 0.9..."


In [15]:
X = np.array(list(subset["image_array"]))
X.shape

(4000, 48, 48, 3)

In [16]:
X[0].shape

(48, 48, 3)

In [17]:
X = X.reshape(X.shape[0], -1)

In [18]:
X.shape

(4000, 6912)

In [19]:
y = np.array(subset["label"])
y

array([0, 0, 0, ..., 1, 1, 1])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
model = RandomForestClassifier()

In [23]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred=model.predict(X_test)

In [28]:
print("Accuracy", accuracy_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred))
print("Precision", precision_score(y_test, y_pred, average='weighted'))
print("Recall", recall_score(y_test, y_pred))
print("Recall", recall_score(y_test, y_pred, average='weighted'))
print("F1Score",f1_score(y_test, y_pred),"\n")

Accuracy 0.52
Precision 0.5327868852459017
Precision 0.5212374405076678
Recall 0.47794117647058826
Recall 0.52
F1Score 0.5038759689922481 



In [30]:
params = {'n_estimators':[100, 200, 500, 1000]}

"""
n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, 
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, 
n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None
"""



tuning_gradient = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=2, min_samples_leaf=1, bootstrap = True),
                               param_grid=params, scoring='accuracy',iid=False, cv=5, verbose=1)

tuning_gradient.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 16.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [31]:
tuning_gradient.best_params_

{'n_estimators': 500}

In [32]:
y_pred_grid = tuning_gradient.predict(X_test)

In [33]:
print("Accuracy", accuracy_score(y_test, y_pred_grid))
print("Precision", precision_score(y_test, y_pred_grid))
print("Precision", precision_score(y_test, y_pred_grid, average='weighted'))
print("Recall", recall_score(y_test, y_pred_grid))
print("Recall", recall_score(y_test, y_pred_grid, average='weighted'))
print("F1Score",f1_score(y_test, y_pred_grid),"\n")

Accuracy 0.5375
Precision 0.5533707865168539
Precision 0.5393587407632352
Recall 0.48284313725490197
Recall 0.5375
F1Score 0.5157068062827226 



In [34]:
params = {"criterion":["gini", "entropy"],
         "max_features":["auto", "sqrt", "log2", None]}

"""
n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, 
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, 
n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None
"""



tuning_gradient = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=2, min_samples_leaf=1, bootstrap = True, n_estimators = 500),
                               param_grid=params, scoring='accuracy',iid=False, cv=5, verbose=1)

tuning_gradient.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 730.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=500, n_jobs=None,
                                              oob_score=False,
                                              rando

In [35]:
tuning_gradient.best_params_

{'criterion': 'gini', 'max_features': None}

In [36]:
y_pred_grid = tuning_gradient.predict(X_test)

In [37]:
print("Accuracy", accuracy_score(y_test, y_pred_grid))
print("Precision", precision_score(y_test, y_pred_grid))
print("Precision", precision_score(y_test, y_pred_grid, average='weighted'))
print("Recall", recall_score(y_test, y_pred_grid))
print("Recall", recall_score(y_test, y_pred_grid, average='weighted'))
print("F1Score",f1_score(y_test, y_pred_grid),"\n")

Accuracy 0.545
Precision 0.5617977528089888
Precision 0.5469673043830348
Recall 0.49019607843137253
Recall 0.545
F1Score 0.5235602094240838 

