In [1]:
#This is the version used to create the predictions file

#All imports used to create the classifier and preprocess the data.

from google.colab import files
from google.colab import drive
drive.mount('/content/gdrive') #Mounting the drive
import pandas as pd
import numpy as np
import io
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#Load the two training sets and the test set into pandas DataFrames.
#DATA_DIR is the Google Drive folder that holds the CSV files (the original author named it FML). If you are testing the code, this is the only value that might need to be changed in order to import the data.
#The path is absolute so that it matches the mount point passed to drive.mount('/content/gdrive') and does not depend on the notebook's current working directory.
DATA_DIR = '/content/gdrive/My Drive/Colab/FML'
training_data_1 = pd.read_csv(f'{DATA_DIR}/training1.csv')
training_data_2 = pd.read_csv(f'{DATA_DIR}/training2.csv')
testing_data_1 = pd.read_csv(f'{DATA_DIR}/test.csv')

#This version takes the two separate training sets, splits each one into its parts and then concatenates them again. Both feature groups train the same Random Forest model, which the original notes report as the best of the three models tested. This is the version that provides the predictions for the test data.

N_CNN_FEATURES = 2048 #number of leading columns that are treated as CNN features
CONFIDENCE_OFFSET = 0.66 #subtracted from the raw confidence rating
CONFIDENCE_SCALE = 0.34 #the shifted confidence rating is divided by this


def split_training_frame(frame):
    """Split one training DataFrame into CNN features, GIST features, labels and weights.

    Column layout used by the slices: the first N_CNN_FEATURES columns are the
    CNN features, the last column is the confidence rating, the second-to-last
    column is the label, and everything in between is taken as GIST features.

    The confidence rating is rescaled linearly as (rating - 0.66) / 0.34, so a
    rating of 0.66 becomes a weight of 0 and a rating of 1 becomes a weight of
    about 1.
    NOTE(review): presumably the ratings only take the values 0.66 and 1 - confirm
    against the data. Rows with weight 0 contribute nothing wherever these
    values are later passed as sample_weight.

    Returns:
        A tuple (cnn_features, gist_features, labels, confidence_weights).
    """
    cnn_features = frame.iloc[:, :N_CNN_FEATURES]
    gist_features = frame.iloc[:, N_CNN_FEATURES:-2]
    labels = frame.iloc[:, -2]
    weights = (frame.iloc[:, -1] - CONFIDENCE_OFFSET) / CONFIDENCE_SCALE
    return cnn_features, gist_features, labels, weights


#Split each training set. Keeping CNN and GIST apart is not strictly necessary in this version because both end up in the same model.
X_train_cnn_1, X_train_gist_1, Y_train_1, confidence_weights1 = split_training_frame(training_data_1)
X_train_cnn_2, X_train_gist_2, Y_train_2, confidence_weights2 = split_training_frame(training_data_2)

#Stack the rows of training set one on top of the rows of training set two, separately for each part.
X_train_cnn = np.concatenate((X_train_cnn_1, X_train_cnn_2), axis=0)#CNN features of both training sets
X_train_gist = np.concatenate((X_train_gist_1, X_train_gist_2), axis=0)#GIST features of both training sets
Y_train = np.concatenate((Y_train_1, Y_train_2), axis=0)#labels of both training sets
confidence_weights = np.concatenate((confidence_weights1, confidence_weights2), axis=0)#rescaled confidence weights of both training sets

#Put the CNN and GIST columns side by side, as both are used to train the same model.
X_train = np.concatenate((X_train_cnn, X_train_gist), axis=1)

#Hold out 20% of the rows as a validation set. The labels and the confidence weights are split together with the features so that the rows stay aligned.
#According to the original notes, both the seed and the split size were tuned for the best result.
(
    X_trainable,
    X_validation,
    Y_trainable,
    Y_validation,
    confidence_train,
    confidence_val,
) = train_test_split(
    X_train,
    Y_train,
    confidence_weights,
    test_size=0.2,
    random_state=6014,
)

#Imputation: missing values are replaced by the median of their own column. The medians are learned from the training split only and then reused for the validation split.
#The original notes report that the median generally gave a better result than the mean on different seeds and tunings.
IMP = SimpleImputer(strategy='median')
IMP.fit(X_trainable)
X_trainable = IMP.transform(X_trainable)
X_validation = IMP.transform(X_validation)

#Scaling: the scaler is fitted on the training split only, with the confidence weights passed as sample weights, and then applied to both splits. No parameter tuning on the scaler.
SCA = StandardScaler()
SCA.fit(X_trainable, sample_weight=confidence_train)
X_trainable = SCA.transform(X_trainable)
X_validation = SCA.transform(X_validation)#only transformed, never fitted on the validation set


#Prepare the test features with the imputer (IMP) and scaler (SCA) that were fitted on the training split.
#The test file is sliced from column 2048 to the end, i.e. every remaining column is taken as a GIST feature (assumes the test file has no label or confidence columns - confirm against the data).
X_test_cnn = testing_data_1.iloc[:, :2048]
X_test_gist = testing_data_1.iloc[:, 2048:]
X_test = np.concatenate([X_test_cnn, X_test_gist], axis=1)
#Impute first, then scale - the same order that was used for the training data.
X_test = SCA.transform(IMP.transform(X_test))

#Equal weights for both classes. According to the original notes these were tested in the Logistic Regression model; they are not passed to the random forest in this cell.
class_weights = {0: 1, 1: 1}

#Tuning history from the original notes:
#(n_estimators=152, max_depth=11, min_samples_split=5, random_state=9876543, max_features="sqrt") was the first best attempt, after using the grid search from SKLearn and manual tuning
#(n_estimators=300, max_depth=19, min_samples_split=25, random_state=6014, max_features="sqrt") was the best attempt, reached mainly by manual tuning - this is the one used below
forest_params = dict(
    n_estimators=300,
    max_depth=19,
    min_samples_split=25,
    random_state=6014,
    max_features="sqrt",
)
classifier_3 = RandomForestClassifier(**forest_params)

#Fit on the trainable split only (the validation split is held back), weighting every row by its rescaled confidence.
classifier_3.fit(X_trainable, Y_trainable, sample_weight=confidence_train)

#Predict the test data with the classifier and save the result to the same Drive folder the data was imported from.
#The path is absolute so that it matches the mount point passed to drive.mount('/content/gdrive').
PREDICTIONS_PATH = '/content/gdrive/My Drive/Colab/FML/predictions.csv'
Y_prediction = classifier_3.predict(X_test)
Y_prediction = pd.DataFrame(Y_prediction, columns = ['prediction'])#names the column prediction
#The path is handed straight to to_csv instead of a handle from open(..., 'w'): pandas then opens the file itself with the right newline handling, so rows do not get doubled line endings on platforms that translate newlines. The utf-8-sig encoding is kept.
Y_prediction.to_csv(PREDICTIONS_PATH, index=False, encoding='utf-8-sig')

#Predict on the trainable and the validation split: the training score is used to check for overfitting, the validation score as an estimate of the accuracy on unseen data.
trainable_prediction = classifier_3.predict(X_trainable)
validation_prediction = classifier_3.predict(X_validation)

#Accuracy is the fraction of rows whose prediction equals the label. The confidence weights are not used here, so every row counts equally.
accuracy = (Y_trainable == trainable_prediction).mean()
print(f"Accuracy on the training set FOREST: {accuracy*100}%") #prints the training accuracy

accuracy2 = (Y_validation == validation_prediction).mean()
print(f"Accuracy on the validation set FOREST: {accuracy2*100}%") #prints the validation accuracy

Mounted at /content/gdrive
Accuracy on the training set FOREST: 78.125%
Accuracy on the validation set FOREST: 81.16666666666667%
