In [1]:
# Some common data manipulation imports
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

#some other imports
import os
# Raise the pandas row-display limit to 100000 rows.
pd.set_option('display.max_rows', 100000)

# Seed NumPy's global random generator for stable output on each run.
np.random.seed(3)

# To plot easily visible pictures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for axis labels (14) and for x/y tick labels (12).
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
import time
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, precision_recall_fscore_support
import csv
from statistics import mean
from memory_profiler import memory_usage

#normalizing the data
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from pandas import DataFrame

import time

from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC # Support Vector Machine, two types of kernels
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier


In [3]:
def normalize(train, test):
    """Standardize the feature columns of a train/test split.

    The scaler is fitted on the training features only and then applied to the
    test features, so test-set statistics never influence the scaling.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the feature columns plus a ``'target'`` column.

    Returns
    -------
    tuple
        ``(train_p, Y_train, test_p, Y_test)``: the scaled feature frames and
        the single-column target frames. The scaled frames keep the column
        names *and the row index* of their inputs, so each one stays aligned
        row-for-row with the target frame returned next to it.
    """
    train_features = train.loc[:, train.columns != 'target']
    test_features = test.loc[:, test.columns != 'target']
    Y_train = train.loc[:, train.columns == 'target']
    Y_test = test.loc[:, test.columns == 'target']

    scaler = StandardScaler()
    # The scaler returns a bare array; rebuild the frame with the original
    # index, otherwise it would get a fresh 0..n-1 index that no longer
    # matches the (shuffled) index carried by Y_train / Y_test.
    train_p = DataFrame(
        scaler.fit_transform(train_features),
        columns=train_features.columns,
        index=train_features.index,
    )
    test_p = DataFrame(
        scaler.transform(test_features),
        columns=test_features.columns,
        index=test_features.index,
    )

    return train_p, Y_train, test_p, Y_test

In [4]:
from imblearn.over_sampling import SMOTE
# Load the dataset and drop the 'date' and 'name' columns before modelling.
original = pd.read_csv('200_nodes.csv')
original = original.drop(['date', 'name'], axis=1)
# Hold out 20% of the rows as the test split.
# NOTE(review): no random_state is passed here or to SMOTE below; run-to-run
# stability presumably relies on the np.random.seed call in the first cell — confirm.
train, test = train_test_split(original, test_size=0.2)
# Keep copies of both splits as they were before scaling.
train_original = train.copy()
test_original = test.copy()
# Standardize the features and separate the 'target' column.
train_s, Y_train, test_s, Y_test = normalize(train, test)
# NOTE(review): this expression is not the last line of the cell, so its value is not displayed.
len(test_s.columns)

smote = SMOTE()
# Apply SMOTE to the training split only; the test split is not resampled.
train_s, Y_train = smote.fit_resample(train_s, Y_train)
# Number of feature columns in the test frame.
len(test_s.columns)

152

# 1. Logistic Regression

In [5]:
# Load the memory_profiler IPython extension (the %memit magic is used in the training cells below).
%load_ext memory_profiler

In [6]:
def train_model(X, y):
    """Fit a logistic regression (SAGA solver) and print how long fitting took.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(solver='saga', random_state=0)

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the logistic regression under %memit to measure memory usage of the call.
# The fitted model is bound to `lr`, which the evaluation cell below reads.
%memit lr = train_model(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  162.1067090034485 seconds
peak memory: 7030.39 MiB, increment: 1737.53 MiB




In [7]:
# Evaluate the model currently bound to `lr` on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[8647    0    0  434    0    1    0    2    0    0]
 [   0  624    0    0    0    4    0    0    0    0]
 [   0    0  457    0    0    0    0    0    0    0]
 [ 119    0    3 2252    8    1  182   84  172  104]
 [   0    0    0    1  217    0    0    0    0   11]
 [   0    3    0    0    0  401    0    0    0    0]
 [   3    1    0  124    0    0 5136    0    2    0]
 [  10    0    0  219    1    0    0 9339    2    6]
 [   2    1    0  360    3    0    0    0 6435    0]
 [   2    0    0  189   90    0    0    0    0 8148]]
Accouracy:  0.9510502283105022
Averages: 
Precision: 0.9180176163048538
Recall: 0.9518920274379202
F1 score: 0.9319852971911045
Category 0:
	Precision: 0.9845155413867699
	Recall: 0.9518934390136504
	F1 score: 0.9679297028040522
Category 1:
	Precision: 0.9920508744038156
	Recall: 0.9936305732484076
	F1 score: 0.9928400954653938
Category 2:
	Precision: 0.9934782608695653
	Recall: 1.0
	F1 score: 0.9967284623773174
Category 3:
	Precision: 0.6292260407935177
	Recall: 0

# 2. K-Nearest Neighbours (K-NN)

In [8]:
# def train_model_knn(X, y):
#     knn = KNeighborsClassifier(metric = 'minkowski', p=3)
#     # Train model and time it
#     start_time = time.time()
#     knn.fit(X, y)

#     end_time = time.time()
#     # Calculate time taken to train
#     time_taken = end_time - start_time
#     print("Time taken to train the model: ", time_taken, "seconds")
#     return knn

# # train the model and measure the memory usage
# %memit lr = train_model_knn(train_s, Y_train)


In [9]:
# y_pred_lr = lr.predict(test_s)
# cm_lr = confusion_matrix(Y_test, y_pred_lr)
# print(cm_lr)
# acc_lr = accuracy_score(Y_test, y_pred_lr)
# print ("Accouracy: ", acc_lr)

# precision, recall, f1_score, _ = precision_recall_fscore_support(Y_test, y_pred_lr, average=None)

# print("Averages: ")
# print("Precision:", mean(precision))
# print("Recall:", mean(recall))
# print("F1 score:", mean(f1_score))
# # print the metrics for each category
# for i, label in enumerate(lr.classes_):
#     print(f"Category {label}:")
#     print(f"\tPrecision: {precision[i]}")
#     print(f"\tRecall: {recall[i]}")
#     print(f"\tF1 score: {f1_score[i]}")

# 3. SVM: Linear Kernel

In [10]:
def train_model_svm_l(X, y):
    """Fit a linear-kernel support vector classifier and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    classifier = SVC(kernel='linear', random_state=0)

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the linear-kernel SVC under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`; from here on `lr`
# is the SVC, not a logistic regression.
%memit lr = train_model_svm_l(train_s, Y_train)

  return f(*args, **kwargs)


Time taken to train the model:  3320.6942369937897 seconds
peak memory: 5436.53 MiB, increment: 2693.31 MiB


In [11]:
# Evaluate the model currently bound to `lr` (the linear-kernel SVC trained
# in the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[8678    0    0  406    0    0    0    0    0    0]
 [   0  623    0    0    0    5    0    0    0    0]
 [   0    0  455    2    0    0    0    0    0    0]
 [ 114    0    3 2285    7    0  194   80  142  100]
 [   0    0    0    4  210    0    0    0    0   15]
 [   0    3    0    1    0  400    0    0    0    0]
 [   0    0    0   47    0    0 5219    0    0    0]
 [   3    0    0  198    0    0    0 9376    0    0]
 [   0    0    0  377    0    0    0    0 6424    0]
 [   0    0    0  135   63    0    0    1    0 8230]]
Accouracy:  0.95662100456621
Averages: 
Precision: 0.9294557107180337
Recall: 0.952233908308963
F1 score: 0.9394297523196398
Category 0:
	Precision: 0.9866969869243889
	Recall: 0.9553060325847644
	F1 score: 0.9707478046870631
Category 1:
	Precision: 0.9952076677316294
	Recall: 0.9920382165605095
	F1 score: 0.9936204146730463
Category 2:
	Precision: 0.9934497816593887
	Recall: 0.9956236323851203
	F1 score: 0.994535519125683
Category 3:
	Precision: 0.6613603473227206

# 4. SVM: RBF Kernel

In [12]:
def train_model_svm_p(X, y):
    """Fit an RBF-kernel support vector classifier and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    classifier = SVC(kernel='rbf', random_state=0)

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the RBF-kernel SVC under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_svm_p(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  1130.1083681583405 seconds
peak memory: 5186.91 MiB, increment: 3000.19 MiB


In [13]:
# Evaluate the model currently bound to `lr` (the RBF-kernel SVC trained in
# the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[8993    0    0   91    0    0    0    0    0    0]
 [   0  628    0    0    0    0    0    0    0    0]
 [   0    0  457    0    0    0    0    0    0    0]
 [  31    0    3 2640    7    0  128   29   72   15]
 [   0    0    0    2  212    0    0    0    0   15]
 [   0    3    0    0    0  401    0    0    0    0]
 [   0    2    0    0    0    0 5264    0    0    0]
 [   0    0    0   59    0    0    0 9518    0    0]
 [   0    0    0  146    0    0    0    0 6655    0]
 [   0    0    0   12   92    0    0    0    0 8325]]
Accouracy:  0.9838584474885844
Averages: 
Precision: 0.9517661086261833
Recall: 0.9770538763808116
F1 score: 0.9625310405735865
Category 0:
	Precision: 0.9965647163120568
	Recall: 0.9899823866138265
	F1 score: 0.9932626463441574
Category 1:
	Precision: 0.9921011058451816
	Recall: 1.0
	F1 score: 0.9960348929421095
Category 2:
	Precision: 0.9934782608695653
	Recall: 1.0
	F1 score: 0.9967284623773174
Category 3:
	Precision: 0.8949152542372881
	Recall: 0.90256410256410

# 5. Naïve Bayes

In [14]:
def train_model_nb(X, y):
    """Fit a Gaussian naive Bayes classifier and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``GaussianNB`` instance.
    """
    classifier = GaussianNB()

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the Gaussian naive Bayes model under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_nb(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  1.1128098964691162 seconds
peak memory: 4161.11 MiB, increment: 2812.48 MiB


In [15]:
# Evaluate the model currently bound to `lr` (the Gaussian naive Bayes model
# trained in the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[8188    0    0  737    0    0    5  128    2   24]
 [   0  623    0    0    0    0    0    0    0    5]
 [   0    0  456    0    0    0    0    1    0    0]
 [ 147    0    3 1860    0    0  293  182  223  217]
 [   0    0    0    0  202    0    0    0    0   27]
 [   0    3    0    0    0  401    0    0    0    0]
 [  23    0    0    2    0    0 5212   29    0    0]
 [  98    0    0  272    0    0    0 8927   93  187]
 [   0    0    0  340    0    0    1    8 6452    0]
 [ 146    0    0  241    0    0    0    8   46 7988]]
Accouracy:  0.9202968036529681
Averages: 
Precision: 0.9278902143252489
Recall: 0.9220022073452974
F1 score: 0.9240315769421122
Category 0:
	Precision: 0.9518716577540107
	Recall: 0.9013650374284456
	F1 score: 0.9259301142146331
Category 1:
	Precision: 0.9952076677316294
	Recall: 0.9920382165605095
	F1 score: 0.9936204146730463
Category 2:
	Precision: 0.9934640522875817
	Recall: 0.9978118161925602
	F1 score: 0.9956331877729258
Category 3:
	Precision: 0.538818076477

# 6. Decision Tree

In [16]:
def train_model_dt(X, y):
    """Fit a decision tree classifier (default settings) and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``DecisionTreeClassifier`` instance.
    """
    classifier = DecisionTreeClassifier()

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the decision tree under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_dt(train_s, Y_train)


Time taken to train the model:  28.11222505569458 seconds
peak memory: 4384.69 MiB, increment: 900.23 MiB


In [17]:
# Evaluate the model currently bound to `lr` (the decision tree trained in
# the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[9039    0    0   45    0    0    0    0    0    0]
 [   0  621    0    0    0    7    0    0    0    0]
 [   0    0  447   10    0    0    0    0    0    0]
 [  27    0    2 2662    1    0   86   65   80    2]
 [   0    0    0    3  205    0    0    0    0   21]
 [   0    3    0    0    0  401    0    0    0    0]
 [   0    0    0  100    0    0 5166    0    0    0]
 [   0    1    0   55    0    0    0 9521    0    0]
 [   0    0    0   76    0    0    0    0 6725    0]
 [   0    0    0    2   26    0    0    0    0 8401]]
Accouracy:  0.986027397260274
Averages: 
Precision: 0.971644567766185
Recall: 0.9720540353960211
F1 score: 0.9718325092059994
Category 0:
	Precision: 0.9970218398411648
	Recall: 0.9950462351387054
	F1 score: 0.9960330578512396
Category 1:
	Precision: 0.9936
	Recall: 0.9888535031847133
	F1 score: 0.9912210694333599
Category 2:
	Precision: 0.9955456570155902
	Recall: 0.9781181619256017
	F1 score: 0.9867549668874172
Category 3:
	Precision: 0.9014561462919065
	Recall: 

# 7. Random Forest

In [18]:
def train_model_rf(X, y):
    """Fit a random forest classifier (default settings) and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` instance.
    """
    classifier = RandomForestClassifier()

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the random forest under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_rf(train_s, Y_train)


  """


Time taken to train the model:  150.1497302055359 seconds
peak memory: 4620.59 MiB, increment: 1081.88 MiB


In [19]:
# Evaluate the model currently bound to `lr` (the random forest trained in
# the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[9065    0    0   19    0    0    0    0    0    0]
 [   0  624    0    0    0    0    0    0    0    4]
 [   0    0  457    0    0    0    0    0    0    0]
 [  60    0    3 2513    0    0  132   39  142   36]
 [   0    0    0    1  202    0    0    0    0   26]
 [   0    3    0    0    0  401    0    0    0    0]
 [   0    1    0    1    0    0 5264    0    0    0]
 [   0    0    0   56    0    0    0 9521    0    0]
 [   0    0    0   14    0    0    0    0 6787    0]
 [   0    0    0    8    0    0    0    0    0 8421]]
Accouracy:  0.9875570776255708
Averages: 
Precision: 0.9885819260847802
Recall: 0.9716119847489372
F1 score: 0.9794091817636522
Category 0:
	Precision: 0.9934246575342466
	Recall: 0.9979084103918978
	F1 score: 0.995661486078313
Category 1:
	Precision: 0.9936305732484076
	Recall: 0.9936305732484076
	F1 score: 0.9936305732484076
Category 2:
	Precision: 0.9934782608695653
	Recall: 1.0
	F1 score: 0.9967284623773174
Category 3:
	Precision: 0.9620980091883614
	Recall: 0.

# 8. AdaBoost Classifier

In [20]:
def train_model_ada(X, y):
    """Fit an AdaBoost classifier (default settings) and print the fit time.

    The model is fitted on the ``X`` and ``y`` arguments. (Previously the body
    ignored them and fitted on the notebook globals ``train_s`` / ``Y_train``,
    so calling it with any other data silently trained on the wrong set.)

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``AdaBoostClassifier`` instance.
    """
    abc = AdaBoostClassifier()

    # Only the fit call is timed; constructing the estimator is excluded.
    start_time = time.time()
    abc.fit(X, y)
    end_time = time.time()

    time_taken = end_time - start_time
    print("Time taken to train the model: ", time_taken, "seconds")
    return abc

# Train the AdaBoost classifier under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_ada(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  152.28145933151245 seconds
peak memory: 4601.19 MiB, increment: 2378.11 MiB


In [21]:
# Evaluate the model currently bound to `lr` (the AdaBoost classifier trained
# in the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[   0  499    0    0 8585    0    0    0    0    0]
 [   0  199  211    0  218    0    0    0    0    0]
 [   0    0  457    0    0    0    0    0    0    0]
 [   0  360  751    0 1814    0    0    0    0    0]
 [   0    0    0    0  229    0    0    0    0    0]
 [   0  404    0    0    0    0    0    0    0    0]
 [   0    0 1092    0 4174    0    0    0    0    0]
 [   0    0 9577    0    0    0    0    0    0    0]
 [   0    0    0    0 6801    0    0    0    0    0]
 [   0    0    0    0 8429    0    0    0    0    0]]
Accouracy:  0.020205479452054795
Averages: 
Precision: 0.018149124769758704
Recall: 0.23168789808917198
F1 score: 0.02783150739679518
Category 0:
	Precision: 0.0
	Recall: 0.0
	F1 score: 0.0
Category 1:
	Precision: 0.13611491108071136
	Recall: 0.31687898089171973
	F1 score: 0.19043062200956937
Category 2:
	Precision: 0.037806088682991394
	Recall: 1.0
	F1 score: 0.07285771223595057
Category 3:
	Precision: 0.0
	Recall: 0.0
	F1 score: 0.0
Category 4:
	Precision: 0.0075

  _warn_prf(average, modifier, msg_start, len(result))


# 9. Quadratic Discriminant Analysis

In [22]:
def train_model_qda(X, y):
    """Fit a quadratic discriminant analysis model and print the fit time.

    The model is fitted on the ``X`` and ``y`` arguments. (Previously the body
    ignored them and fitted on the notebook globals ``train_s`` / ``Y_train``,
    so calling it with any other data silently trained on the wrong set.)

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``QuadraticDiscriminantAnalysis`` instance.
    """
    qda = QuadraticDiscriminantAnalysis()

    # Only the fit call is timed; constructing the estimator is excluded.
    start_time = time.time()
    qda.fit(X, y)
    end_time = time.time()

    time_taken = end_time - start_time
    print("Time taken to train the model: ", time_taken, "seconds")
    return qda

# Train the quadratic discriminant analysis model under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_qda(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  2.524500846862793 seconds
peak memory: 4654.77 MiB, increment: 1137.39 MiB


In [23]:
# Evaluate the model currently bound to `lr` (the quadratic discriminant
# analysis model trained in the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[8565    0    0  519    0    0    0    0    0    0]
 [   0  623    0    0    0    0    0    0    0    5]
 [   0    0  450    3    0    0    0    4    0    0]
 [  75    0    3 2248   10    0  256   83  138  112]
 [   0    0    0    0  223    0    0    0    0    6]
 [   0    3    0    0    0  401    0    0    0    0]
 [   1    0    0    9    0    0 5255    1    0    0]
 [   0    0    0  199    0    0    0 9378    0    0]
 [   0    0    0  375    0    0    0    0 6426    0]
 [   0    0    0  129  120    0    0    0    0 8180]]
Accouracy:  0.9531735159817352
Averages: 
Precision: 0.9165537224307788
Recall: 0.9546960258585119
F1 score: 0.9312928969419045
Category 0:
	Precision: 0.9912047216757319
	Recall: 0.9428665785997358
	F1 score: 0.9664315937940762
Category 1:
	Precision: 0.9952076677316294
	Recall: 0.9920382165605095
	F1 score: 0.9936204146730463
Category 2:
	Precision: 0.9933774834437086
	Recall: 0.9846827133479212
	F1 score: 0.9890109890109889
Category 3:
	Precision: 0.645605973578

# 10. MLP Classifier

In [24]:
def train_model_mlp(X, y):
    """Fit a multi-layer perceptron classifier (default settings) and print the fit time.

    The model is fitted on the ``X`` and ``y`` arguments. (Previously the body
    ignored them and fitted on the notebook globals ``train_s`` / ``Y_train``,
    so calling it with any other data silently trained on the wrong set.)

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``MLPClassifier`` instance.
    """
    mlp = MLPClassifier()

    # Only the fit call is timed; constructing the estimator is excluded.
    start_time = time.time()
    mlp.fit(X, y)
    end_time = time.time()

    time_taken = end_time - start_time
    print("Time taken to train the model: ", time_taken, "seconds")
    return mlp

# Train the MLP classifier under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_mlp(train_s, Y_train)


  return f(*args, **kwargs)


Time taken to train the model:  261.6060299873352 seconds
peak memory: 5120.50 MiB, increment: 0.00 MiB


In [25]:
# Evaluate the model currently bound to `lr` (the MLP classifier trained in
# the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[9048    0    0   34    0    0    1    0    1    0]
 [   0  619    0    0    0    3    0    0    0    6]
 [   0    0  456    1    0    0    0    0    0    0]
 [  51    0    3 2522    3    0  122   85  116   23]
 [   0    0    0    4  205    0    0    0    0   20]
 [   0    3    0    0    0  401    0    0    0    0]
 [   0    1    0   82    0    0 5181    2    0    0]
 [   0    0    0   68    0    0    0 9508    1    0]
 [   0    0    0   91    0    0    1    1 6708    0]
 [   1    0    0   22   26    0    0    0    1 8379]]
Accouracy:  0.982351598173516
Averages: 
Precision: 0.9687241938395885
Recall: 0.9686558173790292
F1 score: 0.9686486665048817
Category 0:
	Precision: 0.9942857142857143
	Recall: 0.9960369881109643
	F1 score: 0.9951605807303124
Category 1:
	Precision: 0.9935794542536116
	Recall: 0.9856687898089171
	F1 score: 0.9896083133493205
Category 2:
	Precision: 0.9934640522875817
	Recall: 0.9978118161925602
	F1 score: 0.9956331877729258
Category 3:
	Precision: 0.8930594900849

# 11. Gradient Boosting Classifier

In [26]:
def train_model_gradientBoost(X, y):
    """Fit a gradient boosting classifier (default settings) and print the fit time.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    The fitted ``GradientBoostingClassifier`` instance.
    """
    classifier = GradientBoostingClassifier()

    # Only the fit call is timed; constructing the estimator is excluded.
    fit_began = time.time()
    classifier.fit(X, y)
    elapsed = time.time() - fit_began

    print("Time taken to train the model: ", elapsed, "seconds")
    return classifier

# Train the gradient boosting classifier under %memit to measure memory usage of the call.
# NOTE(review): the result overwrites the shared name `lr`, which the
# evaluation cell below reads.
%memit lr = train_model_gradientBoost(train_s, Y_train)

  return f(*args, **kwargs)


Time taken to train the model:  7040.160327911377 seconds
peak memory: 5252.14 MiB, increment: 655.72 MiB


In [27]:
# Evaluate the model currently bound to `lr` (the gradient boosting classifier
# trained in the previous cell) on the held-out test split.
y_pred_lr = lr.predict(test_s)

# Pin the label order to the classifier's own classes so the matrix
# rows/columns and the per-class score arrays below are guaranteed to line up
# with `lr.classes_` (without `labels=` the order is derived from the labels
# that happen to occur in Y_test / y_pred_lr).
cm_lr = confusion_matrix(Y_test, y_pred_lr, labels=lr.classes_)
print(cm_lr)
acc_lr = accuracy_score(Y_test, y_pred_lr)
print("Accouracy: ", acc_lr)

precision, recall, f1_score, _ = precision_recall_fscore_support(
    Y_test, y_pred_lr, labels=lr.classes_, average=None
)

# Unweighted means of the per-class scores.
print("Averages: ")
print("Precision:", mean(precision))
print("Recall:", mean(recall))
print("F1 score:", mean(f1_score))
# Per-class breakdown: position i of each score array belongs to lr.classes_[i].
for i, label in enumerate(lr.classes_):
    print(f"Category {label}:")
    print(f"\tPrecision: {precision[i]}")
    print(f"\tRecall: {recall[i]}")
    print(f"\tF1 score: {f1_score[i]}")

[[9051    0    0   33    0    0    0    0    0    0]
 [   0  628    0    0    0    0    0    0    0    0]
 [   0    0  455    0    0    0    1    1    0    0]
 [  40    0    3 2563    0    0  114   46  113   46]
 [   0    0    0    2  208    0    0    0    0   19]
 [   0    3    0    0    0  401    0    0    0    0]
 [   0    0    0    7    0    0 5259    0    0    0]
 [   0    0    0   53    0    0    0 9524    0    0]
 [   0    0    0   98    0    0    0    0 6703    0]
 [   0    0    0   11   24    0    0    0    0 8394]]
Accouracy:  0.9859817351598174
Averages: 
Precision: 0.975654837486742
Recall: 0.9743676037222331
F1 score: 0.9749252427071653
Category 0:
	Precision: 0.99560004399956
	Recall: 0.9963672391017173
	F1 score: 0.9959834938101788
Category 1:
	Precision: 0.9952456418383518
	Recall: 1.0
	F1 score: 0.9976171564733916
Category 2:
	Precision: 0.9934497816593887
	Recall: 0.9956236323851203
	F1 score: 0.994535519125683
Category 3:
	Precision: 0.926273942898446
	Recall: 0.8762