# Import packages

In [6]:
import pandas as pd
import numpy as np
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score
from sklearn import tree
from sklearn.model_selection import cross_val_score
import pylab as pl
from joblib import dump, load
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import image_lib as imlib
import cv2

# Random forest classifier

In [7]:
def RFclassification(df, getResult, crossval, testSize, pretrainedModel):
    # Assigning data
    X = df.drop(['label'], axis=1)   
    y = df['label'] 
    
    scaler = StandardScaler()
    X_norm = scaler.fit_transform(X)
    
    # Splitting the data in to training and test data
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size = testSize, random_state=10)
    print('[UPDATE] Data has been loaded successfully!')
    
    if pretrainedModel == 'yes':
        print('[UPDATE] Using pretrained model!')
        RFclassifier = load('PIPE4_RF.joblib')
        print('[UPDATE] Pretrained model loaded!')
    
    else:
        # Identification of hyperparameters
        #param_grid = {
        #'bootstrap': [True],
        #'max_depth': [2, 4, 8, 12, 20],
        #'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        #'min_samples_leaf': [4, 5, 6],
        #'min_samples_split': [2, 4, 8, 12],
        #'n_estimators': [2, 4, 6, 12, 24, 48]
        #}
        
        # The parameters used
        param_grid = {
        'bootstrap': [True],
        'max_depth': [4],
        'max_features': [3],
        'min_samples_leaf': [5],
        'min_samples_split': [8],
        'n_estimators': [12]
        }
        # Calling the Support Vector Classifier function
        RandomForest=RandomForestClassifier()

        # Discovering the optimum in the parameter grid
        print('[UPDATE] GridSearch initiated....')
        RFclassifier = GridSearchCV(RandomForest, param_grid)

        # Training the model using the discovered SVC function with the discovered parameters
        print('[UPDATE] Training model....')
        RFclassifier.fit(X_train,y_train)

        print('[UPDATE] Best hyperparameters found in gridsearch: ', RFclassifier.best_params_)

        dump(RFclassifier, 'PIPE4_RF.joblib') 
        print('[UPDATE] Model has been saved!')

        # Making Prediction
        y_pred=RFclassifier.predict(X_norm)

    if crossval == 'yes':
        print('[UPDATE] Cross-validating accuracy...')
        
        # !!! COMMENT CROSS_VAL_SCORE FOR FASTER COMPUTATION !!!
        scores = cross_val_score(RFclassifier, X_norm, y, cv=StratifiedKFold(10))
        print('------------- Cross validated accuracy ---------------')
        print(scores)
        print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
        print(' ')
        

    if getResult == 'yes': # Print results 
        print('[UPDATE] Printing Results')
        print('------------------ Confusion Matrix -------------------')
        print(confusion_matrix(y,y_pred))
        print(' ')
        print('---------------- Classification report ----------------')
        print(classification_report(y,y_pred))
        print(' ')
        print('--------------------- AUC score -----------------------')
        print(roc_auc_score(y, y_pred))
    elif getResult == 'no':
        print('[UPDATE] Results not requested')
    else:
        print('[UPDATE] !!!THE REQUEST FOR RESULTS WAS INVALID!!!!')

# Load images, feature extraction, and call random forest classifier
This code block is what was used to test the the random forest classifier.

(Import packages and random forest function before running this)

In [10]:
augmentation_type = 'dark005'
pipeline = imlib.pip1
directory ='darken/' + augmentation_type
path = "/home/sofus/deep/data/Augmented/"

pos = [cv2.cvtColor(cv2.imread(path + directory + "/Positive/{}_1.jpg".format(i)), cv2.COLOR_BGR2GRAY) for i in range(10502, 14501)]
neg = [cv2.cvtColor(cv2.imread(path + directory + "/Negative/{}.jpg".format(i)), cv2.COLOR_BGR2GRAY)  for i in range(10502, 14501)]

out_p = [pipeline(img) for img in pos]
out_n = [pipeline(img) for img in neg]

feat_pos = [imlib.feature_extraction(img, out_p[i]) for i, img in enumerate(pos)]
feat_neg = [imlib.feature_extraction(img, out_n[i]) for i, img in enumerate(neg)]

SAMPLE_SIZE = len(feat_pos)
print('SAMPLESIZE:', len(feat_pos))

# Generating the features
int_mean_p = [feat_pos[i][0] for i in range(len(feat_pos))]
int_mean_n = [feat_neg[i][0] for i in range(len(feat_neg))]
int_mean = np.concatenate((int_mean_p, int_mean_n))

int_stdev_p = [feat_pos[i][1] for i in range(len(feat_pos))]
int_stdev_n = [feat_neg[i][1] for i in range(len(feat_neg))]
int_stdev = np.concatenate((int_stdev_p, int_stdev_n))

ratio_p = [feat_pos[i][2] for i in range(len(feat_pos))]
ratio_n = [feat_neg[i][2] for i in range(len(feat_neg))]
ratio = np.concatenate((ratio_p, ratio_n))

grad_mean_p = [feat_pos[i][3] for i in range(len(feat_pos))]
grad_mean_n = [feat_neg[i][3] for i in range(len(feat_neg))]
grad_mean = np.concatenate((grad_mean_p, grad_mean_n))

grad_mag_p = [feat_pos[i][4] for i in range(len(feat_pos))]
grad_mag_n = [feat_neg[i][4] for i in range(len(feat_neg))]
grad_mag = np.concatenate((grad_mag_p, grad_mag_n))

grad_angle_p = [feat_pos[i][5] for i in range(len(feat_pos))]
grad_angle_n = [feat_neg[i][5] for i in range(len(feat_neg))]
grad_angle = np.concatenate((grad_angle_p, grad_angle_n))

grad_max_p = [feat_pos[i][6] for i in range(len(feat_pos))]
grad_max_n = [feat_neg[i][6] for i in range(len(feat_neg))]
grad_max = np.concatenate((grad_max_p, grad_max_n))

grad_mag_mean_p = [feat_pos[i][7] for i in range(len(feat_pos))]
grad_mag_mean_n = [feat_neg[i][7] for i in range(len(feat_neg))]
grad_mag_mean = np.concatenate((grad_mag_mean_p, grad_mag_mean_n))

grad_stand_p = [feat_pos[i][8] for i in range(len(feat_pos))]
grad_stand_n = [feat_neg[i][8] for i in range(len(feat_neg))]
grad_stand = np.concatenate((grad_stand_p, grad_stand_n))

mean_int_p = [feat_pos[i][9] for i in range(len(feat_pos))]
mean_int_n = [feat_neg[i][9] for i in range(len(feat_neg))]
mean_int = np.concatenate((mean_int_p, mean_int_n))

stdev_mean_p = [feat_pos[i][10] for i in range(len(feat_pos))]
stdev_mean_n = [feat_neg[i][10] for i in range(len(feat_neg))]
stdev_mean = np.concatenate((stdev_mean_p, stdev_mean_n))

# Generating corresponding labels
label_neg = np.array(np.zeros(SAMPLE_SIZE, dtype = int))
label_pos = np.array(np.ones(SAMPLE_SIZE, dtype = int))
labels = np.concatenate((label_pos, label_neg),0)


# Create dataframe
df = pd.DataFrame(np.column_stack([int_mean, int_stdev, ratio, grad_mean, grad_mag, grad_angle, grad_max, 
                                   grad_mag_mean, grad_stand, mean_int, stdev_mean, labels]),  
                  
                  
                  
                  columns=['Local IntensMean', 'Local IntensStDev', 'Ratio', 'GradMean', 
                           'Unique Grad Mag', 'Unique Grad Angle', 'Grad Max', 'Grad Mag Mean', 'Grad Stand',
                           'Global intensMean', 'Global Intens Stand', 'label'])
df.fillna(0, inplace=True)

# Running the SVM classifier
RFclassification(df, getResult = 'yes', crossval='yes' ,testSize = 0.2051, pretrainedModel = 'no')
print('----------------- Augmentation type -------------------')
print(augmentation_type)

SAMPLESIZE: 3999
[UPDATE] Data has been loaded successfully!
[UPDATE] GridSearch initiated....
[UPDATE] Training model....




[UPDATE] Best hyperparameters found in gridsearch:  {'max_depth': 4, 'min_samples_leaf': 5, 'bootstrap': True, 'n_estimators': 12, 'min_samples_split': 8, 'max_features': 3}
[UPDATE] Model has been saved!
[UPDATE] Cross-validating accuracy...




------------- Cross validated accuracy ---------------
[0.99125    0.98625    0.9925     0.99125    0.9875     0.985
 0.98       0.995      0.995      0.98621554]
Accuracy: 0.9890 (+/- 0.0091)
 
[UPDATE] Printing Results
------------------ Confusion Matrix -------------------
[[3986   13]
 [  43 3956]]
 
---------------- Classification report ----------------
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      3999
         1.0       1.00      0.99      0.99      3999

   micro avg       0.99      0.99      0.99      7998
   macro avg       0.99      0.99      0.99      7998
weighted avg       0.99      0.99      0.99      7998

 
--------------------- AUC score -----------------------
0.9929982495623906
----------------- Augmentation type -------------------
dark005
