<a href="https://colab.research.google.com/github/ElviraKonovalov/soen471-bigData/blob/main/Models_Max.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [217]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [218]:
#Prepping data.

file = 'clean_data_hot-one.csv'
df =  pd.read_csv(file, sep=',')

#CATEGORISES ANY COLUMNS REMAINING AS STRINGS
for i in range(len(df.columns) - 1):
  if not isinstance(df.iloc[0, i], np.int64):
    df[df.columns[i]] = pd.factorize(df[df.columns[i]])[0]

df = df.iloc[: , 1:] #REMOVES THE EXTRA ID COLUMN

print(df.columns)

Index(['ACCLASS', 'AG_DRIV', 'ALCOHOL', 'AUTOMOBILE', 'CYCLIST', 'DATE',
       'DISABILITY', 'EMERG_VEH', 'HOOD_ID', 'HOUR', 'INITDIR', 'INVAGE',
       'LIGHT', 'LOCCOORD', 'MANOEUVER', 'MOTORCYCLE', 'PASSENGER',
       'PEDESTRIAN', 'RDSFCOND', 'REDLIGHT', 'ROAD_CLASS', 'SPEEDING',
       'TRAFFCTL', 'TRSN_CITY_VEH', 'TRUCK', 'VISIBILITY'],
      dtype='object')


In [219]:
data = df.to_numpy()

X = np.delete(data, 0, 1) #Necesarry to remove the label column
y = data[:,0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [315]:
#Defining models.

dt = DecisionTreeClassifier(random_state=0, max_depth=5)
rf = RandomForestClassifier(random_state=0, max_depth=6)
mlp = MLPClassifier(random_state=0, max_iter=1000, activation='tanh', solver='adam', learning_rate='adaptive')

In [316]:
#Fitting decision tree classifier.
dt.fit(X_train_scaled, y_train)

DecisionTreeClassifier(max_depth=5, random_state=0)

In [308]:
#Fitting random forest classifier.
rf.fit(X_train_scaled, y_train)

RandomForestClassifier(max_depth=6, random_state=0)

In [247]:
#Fitting MLP classifier.
mlp.fit(X_train_scaled, y_train)

MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=1000,
              random_state=0)

In [317]:
#Predicting test set on decision tree.
predicted = dt.predict(X_test_scaled)

accuracy_dt = round(sklearn.metrics.accuracy_score(y_test, predicted)*100, 2)
precision_dt = round(sklearn.metrics.precision_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)
recall_dt = round(sklearn.metrics.recall_score(y_test, predicted, average='weighted') * 100, 2)
f1_dt = round(sklearn.metrics.f1_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)

In [309]:
#Predicting test set on random forest.
predicted = rf.predict(X_test_scaled)

accuracy_rf = round(sklearn.metrics.accuracy_score(y_test, predicted)*100, 2)
precision_rf = round(sklearn.metrics.precision_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)
recall_rf = round(sklearn.metrics.recall_score(y_test, predicted, average='weighted') * 100, 2)
f1_rf = round(sklearn.metrics.f1_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)

In [248]:
#Predicting test set on neural network.
predicted = mlp.predict(X_test_scaled)

accuracy_mlp = round(sklearn.metrics.accuracy_score(y_test, predicted)*100, 2)
precision_mlp = round(sklearn.metrics.precision_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)
recall_mlp = round(sklearn.metrics.recall_score(y_test, predicted, average='weighted') * 100, 2)
f1_mlp = round(sklearn.metrics.f1_score(y_test, predicted, average='weighted', labels=np.unique(predicted)) * 100, 2)

In [318]:
#Displaying metrics.

print('Decision Tree Metrics:')
print("\tAccuracy:  " + str(accuracy_dt)+'%')
print("\tPrecision: " + str(precision_dt)+'%')
print("\tRecall:    " + str(recall_dt)+'%')
print("\tF1 Score:  " + str(f1_dt)+'%')

print('\n')

print('Random Forest Metrics:')
print("\tAccuracy:  " + str(accuracy_rf)+'%')
print("\tPrecision: " + str(precision_rf)+'%')
print("\tRecall:    " + str(recall_rf)+'%')
print("\tF1 Score:  " + str(f1_rf)+'%')

print('\n')

print('MLP Neural Network Metrics:')
print("\tAccuracy:  " + str(accuracy_mlp)+'%')
print("\tPrecision: " + str(precision_mlp)+'%')
print("\tRecall:    " + str(recall_mlp)+'%')
print("\tF1 Score:  " + str(f1_mlp)+'%')

Decision Tree Metrics:
	Accuracy:  87.27%
	Precision: 82.08%
	Recall:    87.27%
	F1 Score:  82.83%


Random Forest Metrics:
	Accuracy:  87.64%
	Precision: 87.64%
	Recall:    87.64%
	F1 Score:  93.41%


MLP Neural Network Metrics:
	Accuracy:  82.01%
	Precision: 82.21%
	Recall:    82.01%
	F1 Score:  82.11%
