ClassifAI - Notebook to compare machine learning classifiers

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate


## Data selection

In [None]:
# Load the source CSV into `data0`, then build the working frame `data`
# through pd.concat so further frames could be appended to the list.
data0 = pd.read_csv('diabetes.csv')
data = pd.concat([data0])


In [None]:
# Rename both columns in a single pass: the feature 'Glucose' becomes
# 'Glucose2' and the target 'Outcome' becomes 'label'.
data = data.rename(columns={
    'Glucose': 'Glucose2',
    'Outcome': 'label',
})


In [None]:
# Discard the 'Pregnancies' feature column.
data = data.drop('Pregnancies', axis=1)


In [None]:
# Discard the rows whose index labels are 1, 2 and 3.
data = data.drop(index=[1, 2, 3])


In [None]:
# Keep only rows that have no missing value in any column.
data = data.dropna(axis='index', how='any')


In [None]:
# Outlier filter: keep a row only if every column's absolute z-score is
# below 3.
z_scores = stats.zscore(data)
within_three_sigma = (np.abs(z_scores) < 3).all(axis=1)
data = data[within_three_sigma]


In [None]:
# Keep the first occurrence of each fully identical row and drop the repeats.
repeated_rows = data.duplicated(keep='first')
data = data[~repeated_rows]


## Preprocessing

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range.
# This is applied to every column currently in `data`, so 'label' is
# rescaled as well.
column_min = data.min()
column_range = data.max() - column_min
data = (data - column_min) / column_range


In [None]:
# Separate the target column from the features, then hold out 20% of the
# rows for evaluation. The fixed random_state makes the split reproducible.
y = data['label']
X = data.drop(columns=['label'])
X_train, X_test, y_train, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)


## Initializing the hyperparameters

In [None]:
# Hyperparameters shared by the SVM variants below.
# `C` is the grid of regularisation strengths: 0.1 up to (but excluding) 2
# in steps of 0.4.
kernel, gamma = "linear", 0.2
C = np.arange(start=0.1, stop=2, step=0.4)


## Setting the classifiers

In [None]:
# One SVC per regularisation strength; kernel and gamma are the shared
# hyperparameters defined earlier in the notebook.
svm1_C_1, svm1_C_2, svm1_C_3, svm1_C_4, svm1_C_5 = (
    svm.SVC(C=penalty, kernel=kernel, gamma=gamma)
    for penalty in (0.1, 0.5, 0.9, 1.3, 1.7000000000000002)
)


In [None]:
# Fit each classifier on the training split and record when the fit started
# and ended, so later cells can compute `end_* - start_*`.
#
# time.perf_counter() is used instead of time.time(): it is monotonic and
# uses the highest-resolution clock available, so a fast fit cannot come out
# as a zero or negative duration. Only differences between readings are
# meaningful; the values are not epoch timestamps.
start_svm1_C_1 = time.perf_counter()
svm1_C_1.fit(X_train, y_train)
end_svm1_C_1 = time.perf_counter()

start_svm1_C_2 = time.perf_counter()
svm1_C_2.fit(X_train, y_train)
end_svm1_C_2 = time.perf_counter()

start_svm1_C_3 = time.perf_counter()
svm1_C_3.fit(X_train, y_train)
end_svm1_C_3 = time.perf_counter()

start_svm1_C_4 = time.perf_counter()
svm1_C_4.fit(X_train, y_train)
end_svm1_C_4 = time.perf_counter()

start_svm1_C_5 = time.perf_counter()
svm1_C_5.fit(X_train, y_train)
end_svm1_C_5 = time.perf_counter()


In [None]:
# Predict labels for the held-out features with every fitted classifier.
(
    y_pred_svm1_C_1,
    y_pred_svm1_C_2,
    y_pred_svm1_C_3,
    y_pred_svm1_C_4,
    y_pred_svm1_C_5,
) = (
    classifier.predict(X_test)
    for classifier in (svm1_C_1, svm1_C_2, svm1_C_3, svm1_C_4, svm1_C_5)
)


## Visualisation of the results

In [None]:
# Accuracy on the held-out split, drawn as a line across the classifiers.
# A classifier is plotted only if its accuracy is strictly positive.
_accuracy_by_model = [
    ('svm1_C_1', metrics.accuracy_score(ytest, y_pred_svm1_C_1)),
    ('svm1_C_2', metrics.accuracy_score(ytest, y_pred_svm1_C_2)),
    ('svm1_C_3', metrics.accuracy_score(ytest, y_pred_svm1_C_3)),
    ('svm1_C_4', metrics.accuracy_score(ytest, y_pred_svm1_C_4)),
    ('svm1_C_5', metrics.accuracy_score(ytest, y_pred_svm1_C_5)),
]
algorithms = [name for name, score in _accuracy_by_model if score > 0]
accuracies = [score for name, score in _accuracy_by_model if score > 0]

plt.figure(figsize=(10, 10))
plt.plot(algorithms, accuracies)
plt.title('Accuracy of the algorithms')
plt.show()


In [None]:
# Log loss of each classifier on the held-out split, shown as a pie chart.
#
# The loss is computed from the hard class predictions returned by
# predict(), not from predicted probabilities.
#
# Log loss is never negative, so the previous `< 0` test could not succeed
# and the chart was always empty. The filter now keeps strictly positive
# losses, matching the `> 0` convention of the other metric cells; a zero
# loss would contribute no wedge to the pie anyway.
_loss_by_model = [
    ('svm1_C_1', metrics.log_loss(ytest, y_pred_svm1_C_1)),
    ('svm1_C_2', metrics.log_loss(ytest, y_pred_svm1_C_2)),
    ('svm1_C_3', metrics.log_loss(ytest, y_pred_svm1_C_3)),
    ('svm1_C_4', metrics.log_loss(ytest, y_pred_svm1_C_4)),
    ('svm1_C_5', metrics.log_loss(ytest, y_pred_svm1_C_5)),
]
algorithms = [name for name, loss in _loss_by_model if loss > 0]
losses = [loss for name, loss in _loss_by_model if loss > 0]

plt.figure(figsize=(10, 10))
plt.pie(losses, labels=algorithms, autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Loss of the algorithms')
plt.show()


In [None]:
# Precision on the held-out split, one bar per classifier.
# A classifier is plotted only if its precision is strictly positive.
_precision_by_model = [
    ('svm1_C_1', metrics.precision_score(ytest, y_pred_svm1_C_1)),
    ('svm1_C_2', metrics.precision_score(ytest, y_pred_svm1_C_2)),
    ('svm1_C_3', metrics.precision_score(ytest, y_pred_svm1_C_3)),
    ('svm1_C_4', metrics.precision_score(ytest, y_pred_svm1_C_4)),
    ('svm1_C_5', metrics.precision_score(ytest, y_pred_svm1_C_5)),
]
algorithms = [name for name, score in _precision_by_model if score > 0]
precisions = [score for name, score in _precision_by_model if score > 0]

plt.figure(figsize=(10, 10))
plt.bar(algorithms, precisions)
plt.title('Precision of the algorithms')
plt.show()


In [None]:
# Recall on the held-out split, one bar per classifier.
# A classifier is plotted only if its recall is strictly positive.
_recall_by_model = [
    ('svm1_C_1', metrics.recall_score(ytest, y_pred_svm1_C_1)),
    ('svm1_C_2', metrics.recall_score(ytest, y_pred_svm1_C_2)),
    ('svm1_C_3', metrics.recall_score(ytest, y_pred_svm1_C_3)),
    ('svm1_C_4', metrics.recall_score(ytest, y_pred_svm1_C_4)),
    ('svm1_C_5', metrics.recall_score(ytest, y_pred_svm1_C_5)),
]
algorithms = [name for name, score in _recall_by_model if score > 0]
recalls = [score for name, score in _recall_by_model if score > 0]

plt.figure(figsize=(10, 10))
plt.bar(algorithms, recalls)
plt.title('Recall of the algorithms')
plt.show()


In [None]:
# F1 score on the held-out split, drawn as a line across the classifiers.
# A classifier is plotted only if its F1 score is strictly positive.
_f1_by_model = [
    ('svm1_C_1', metrics.f1_score(ytest, y_pred_svm1_C_1)),
    ('svm1_C_2', metrics.f1_score(ytest, y_pred_svm1_C_2)),
    ('svm1_C_3', metrics.f1_score(ytest, y_pred_svm1_C_3)),
    ('svm1_C_4', metrics.f1_score(ytest, y_pred_svm1_C_4)),
    ('svm1_C_5', metrics.f1_score(ytest, y_pred_svm1_C_5)),
]
algorithms = [name for name, score in _f1_by_model if score > 0]
f1s = [score for name, score in _f1_by_model if score > 0]

plt.figure(figsize=(10, 10))
plt.plot(algorithms, f1s)
plt.title('F1_score of the algorithms')
plt.show()


In [None]:
# Time spent fitting each classifier (end reading minus start reading),
# drawn as a line across the classifiers. A classifier is plotted only if
# its measured duration is strictly positive.
_elapsed_by_model = [
    ('svm1_C_1', end_svm1_C_1 - start_svm1_C_1),
    ('svm1_C_2', end_svm1_C_2 - start_svm1_C_2),
    ('svm1_C_3', end_svm1_C_3 - start_svm1_C_3),
    ('svm1_C_4', end_svm1_C_4 - start_svm1_C_4),
    ('svm1_C_5', end_svm1_C_5 - start_svm1_C_5),
]
algorithms = [name for name, elapsed in _elapsed_by_model if elapsed > 0]
durations = [elapsed for name, elapsed in _elapsed_by_model if elapsed > 0]

plt.figure(figsize=(10, 10))
plt.plot(algorithms, durations)
plt.title('Training duration of the algorithms')
plt.show()


In [None]:
# Summary table: one row per classifier with every metric and the training
# duration.
#
# Fixes relative to the previous version:
#   * header cells are plain strings (they used to be one-element lists and
#     printed as "['accuracy']");
#   * the header row has a leading 'algorithm' column and each row carries a
#     loss value, so every value sits under its own heading;
#   * each row is computed directly from that classifier's predictions and
#     timing instead of indexing into the plotting lists, which drop entries
#     that are not > 0 and could therefore shift values to the wrong row or
#     raise IndexError.
_runs = [
    ('svm1_C_1', y_pred_svm1_C_1, end_svm1_C_1 - start_svm1_C_1),
    ('svm1_C_2', y_pred_svm1_C_2, end_svm1_C_2 - start_svm1_C_2),
    ('svm1_C_3', y_pred_svm1_C_3, end_svm1_C_3 - start_svm1_C_3),
    ('svm1_C_4', y_pred_svm1_C_4, end_svm1_C_4 - start_svm1_C_4),
    ('svm1_C_5', y_pred_svm1_C_5, end_svm1_C_5 - start_svm1_C_5),
]

# The first row is the header; tabulate consumes it via headers='firstrow'.
table = [[
    'algorithm',
    'accuracy',
    'loss',
    'precision',
    'recall',
    'f1',
    'training_duration',
]]
table.extend(
    [
        name,
        metrics.accuracy_score(ytest, predicted),
        metrics.log_loss(ytest, predicted),
        metrics.precision_score(ytest, predicted),
        metrics.recall_score(ytest, predicted),
        metrics.f1_score(ytest, predicted),
        elapsed,
    ]
    for name, predicted, elapsed in _runs
)
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))
