# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import numpy as np
from PIL import Image
import cv2
from  matplotlib import pyplot as plt
import matplotlib.image as mpimg

%matplotlib inline

# NOTE(review): `skip` is not read by any cell visible here -- presumably a
# toggle for optional steps; confirm before removing.
skip = True

# Project folder, expressed relative to the root of "My Drive".
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Colab Notebooks/Archie_Short_CW_Folder_UG'
# Anchor the path at the absolute mount point passed to drive.mount() above
# ('/content/drive') so it resolves whatever the kernel's working directory is.
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# Listing the folder doubles as a check that the mount and the path are valid.
print(os.listdir(GOOGLE_DRIVE_PATH))

Mounted at /content/drive
['.DS_Store', 'Personal_Dataset', 'Models', 'CW_Dataset', 'Code', 'test_functions.ipynb']


# Import and format training data

In [None]:
# Folder holding the training split of the coursework dataset.
TRAIN_FOLDER = os.path.join(GOOGLE_DRIVE_PATH, 'CW_Dataset/train')

# Read the refined training table, then keep a plain NumPy copy of its values
# for the row-wise processing done in the following cells.
train_csv_path = os.path.join(TRAIN_FOLDER, "train_data_refined.csv")
tData = pd.read_csv(train_csv_path)
trainData = tData.to_numpy()




In [None]:
# Quick look at the loaded matrix: its (rows, columns) shape, then the first row.
for preview in (trainData.shape, trainData[0]):
  print(preview)
#reformat input to data array, label array 
def formatTraining(trainingData):
  """Split rows of training data into feature vectors and integer labels.

  The last element of every row is taken as the class label and cast to
  ``int``; everything before it is kept as that row's feature vector.

  Args:
    trainingData: iterable of indexable rows (e.g. a 2-D NumPy array).

  Returns:
    A ``(features, labels)`` tuple of two lists, in the same order as the
    input rows. Each feature entry is ``row[:-1]``.
  """
  features, targets = [], []
  for row in trainingData:
    targets.append(int(row[-1]))
    features.append(row[:-1])
  return features, targets
  
from numpy.random import shuffle  # kept so any later cell that calls shuffle() still resolves it

# Shuffle the rows with a fixed seed so the ordering is reproducible.
# permutation() returns a shuffled copy (rows are permuted along axis 0), so:
#   * re-running this cell always yields the same ordering, and
#   * nothing is written into the array backing `tData` -- to_numpy() may
#     return a view of the DataFrame, which an in-place shuffle would scramble
#     (or fail on, when that view is read-only).
SHUFFLE_SEED = 42
trainData = np.random.default_rng(SHUFFLE_SEED).permutation(tData.to_numpy())


# x: list of feature vectors, y: list of integer labels (last column of each row).
x, y = formatTraining(trainData)

(12271, 8501)
[ 0.11125135  0.16247456  0.07009258 ... 31.         15.
  7.        ]


In [None]:
import sklearn.model_selection as model_selection

# Hold out 10% of the samples for evaluation. The split is seeded so that,
# for the same input order, the same rows land in the test set on every run;
# without a seed each session evaluates on a different held-out set.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x, y, train_size=0.90, test_size=0.1, random_state=SPLIT_SEED)


# Random Forest Model creation and training
The model below follows the code structure from https://machinelearningmastery.com/random-forest-ensemble-in-python/.

The approach to parameter tuning came from this guide: https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
#reference https://machinelearningmastery.com/random-forest-ensemble-in-python/
#reference https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/ 

# Hyperparameter values mirror the best_params_ output shown further down in
# this notebook (max_depth is left at its default here).
model = RandomForestClassifier(
    n_estimators=1600,
    # "auto" meant sqrt(n_features) for classifiers and was removed in
    # scikit-learn 1.3; "sqrt" is the equivalent spelling that still works.
    max_features="sqrt",
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=False,
    # Fixed seed so repeated fits on the same data build the same forest.
    random_state=42,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision with recall and reports the support of the
# predictions instead of the true class counts.
print("accuracy:", accuracy_score(y_test, y_pred))
#reference https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           1       0.42      0.77      0.54        71
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.95      0.63      0.76       728
           5       0.19      0.69      0.30        55
           6       0.07      1.00      0.13         5
           7       0.75      0.51      0.60       369

    accuracy                           0.60      1228
   macro avg       0.34      0.51      0.33      1228
weighted avg       0.82      0.60      0.67      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Hyperparameter Tuning**
To find the best parameters, I followed the article linked below and ran a randomized search over the candidate values. The code follows the article's implementation: all the code from here until the model is saved is taken from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
# Code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
# Candidate values for each hyperparameter; the randomized search samples
# combinations from these lists.
n_estimators = [int(value) for value in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is intentionally absent: for classifiers it was only an alias of
# 'sqrt' (so the grid held the same option twice) and scikit-learn 1.3+
# rejects it as an invalid parameter.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None lets each tree grow without a cap)
max_depth = [int(value) for value in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid: parameter name -> list of candidate values
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Code from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Randomized search for the best hyperparameters.
# Base estimator whose hyperparameters the search will vary.
rf = RandomForestClassifier()
# Sample 30 parameter combinations from random_grid, score each one with
# 3-fold cross-validation, and run the fits on all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=30,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the search on the first 1000 training samples only.
rf_random.fit(X_train[:1000], y_train[:1000])

Fitting 3 folds for each of 30 candidates, totalling 90 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=30,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [None]:
# Show the hyperparameter combination that scored best in the randomized search.
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 40,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [None]:
import joblib

# Folder on the mounted drive where trained models are stored.
MODEL_FOLDER = os.path.join(GOOGLE_DRIVE_PATH, 'Models')

#SAVE
# Manual switch: set to True to write the fitted `model` to disk. It is left
# False so that running the notebook does not write the model file.
model_good = False
if model_good:
  model_path = os.path.join(MODEL_FOLDER, "random_forest.joblib")
  joblib.dump(model, model_path)

In [None]:
#LOAD

# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that you created yourself.
loaded_rf = joblib.load(os.path.join(MODEL_FOLDER,"random_forest.joblib"))
y_pred = loaded_rf.predict(X_test)

# NOTE(review): X_test comes from this session's train/test split. Unless that
# split is identical to the one used when the saved model was trained, the
# model may already have seen these rows and the scores below are optimistic.
#
# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports the support of the predictions.
print("accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.99      1.00      0.99       135
           2       0.94      1.00      0.97        16
           3       0.96      1.00      0.98        70
           4       1.00      0.98      0.99       470
           5       0.98      1.00      0.99       222
           6       0.97      1.00      0.99        67
           7       0.99      0.97      0.98       248

    accuracy                           0.99      1228
   macro avg       0.98      0.99      0.98      1228
weighted avg       0.99      0.99      0.99      1228

