# Binary

In [323]:
import pandas as pd
import numpy as np
trainData = pd.read_csv("TrainingDataBinary.csv") # Dataset is loaded in from CSV files
testData = pd.read_csv("TestingDataBinary.csv")

#Creating DataFrames for training and testing
train_Data = pd.DataFrame(data = trainData)
test_Data = pd.DataFrame(data = testData)


In [324]:
train_Data.info() 
test_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999 entries, 0 to 5998
Columns: 129 entries, 70.399324 to 0.15
dtypes: float64(113), int64(16)
memory usage: 5.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Columns: 128 entries, 12.118057 to 0.51
dtypes: float64(112), int64(16)
memory usage: 99.1 KB


In [325]:
from sklearn.model_selection import train_test_split
X = train_Data.iloc[:, :-1]  # Features (all columns except the last one)
y = train_Data.iloc[:, -1]   # Target variable (last column)

# Split the data into training and testing sets following the 80/20 split rule
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [297]:
#F1 score
# Calculated f1 score manually so I make sure it kept to the formula
def f1score(prediction):
    TP = 0 
    FP = 0
    TN = 0
    FN = 0

    for true, pred in zip(y_test, prediction): 
        if(true == 0 and pred == 0):
            TP = TP + 1
        if(true == 0 and pred == 1): 
            FN = FN + 1   
        if(true == 1 and pred == 0):
            FP = FP + 1
        if(true == 1 and pred == 1):
            TN = TN + 1 

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    print("precision : " + str(precision))
    print("recall: " + str(recall))
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("TruePositive: " + str(TP) + " FalseNegative: " + str(FN) + " FalsePositive: " + str(FP) + " TrueNegative: " + str(TN) )
    print("f1-score: " + str(f1))

In [298]:
#Logistic regression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression() # Create an instance of the LogisticRegression classifier 
clf.fit(X_train,y_train) # Train the classifier on the training data
prediction = clf.predict(X_test) # Make predictions on the testing data
# Evaluation metrics
print(accuracy_score(y_test, prediction)) # Accuracy score is calculated
print(precision_score(y_test, prediction)) # Precision score is calculated
f1score(prediction) # f1score is calculated
# This process is the same for all of the classifiers

0.8558333333333333
0.9289940828402367
precision : 0.8023088023088023
recall: 0.9391891891891891
TruePositive: 556 FalseNegative: 36 FalsePositive: 137 TrueNegative: 471
f1-score: 0.8653696498054475


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [299]:
#Linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction)) 
f1score(prediction)

0.8766666666666667
0.9291044776119403
precision : 0.8343373493975904
recall: 0.9358108108108109
TruePositive: 554 FalseNegative: 38 FalsePositive: 110 TrueNegative: 498
f1-score: 0.8821656050955415


In [300]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB 
clf = GaussianNB()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))   
f1score(prediction)

0.5216666666666666
0.84
precision : 0.5078260869565218
recall: 0.9864864864864865
TruePositive: 584 FalseNegative: 8 FalsePositive: 566 TrueNegative: 42
f1-score: 0.6704936854190585


In [301]:
# C-Support Vector Classification
from sklearn.svm import SVC
clf = SVC(kernel='linear',C=1.9, gamma='auto')
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))        
f1score(prediction)

0.8708333333333333
0.9415204678362573
precision : 0.8180494905385735
recall: 0.9493243243243243
TruePositive: 562 FalseNegative: 30 FalsePositive: 125 TrueNegative: 483
f1-score: 0.8788115715402659


In [302]:
#  Multi-layer Perceptron classifier
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))    
f1score(prediction)

0.6041666666666666
0.67828418230563
precision : 0.5707376058041113
recall: 0.7972972972972973
TruePositive: 472 FalseNegative: 120 FalsePositive: 355 TrueNegative: 253
f1-score: 0.6652572233967582


In [303]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))  
f1score(prediction)

0.9616666666666667
0.9652317880794702
precision : 0.9580536912751678
recall: 0.964527027027027
TruePositive: 571 FalseNegative: 21 FalsePositive: 25 TrueNegative: 583
f1-score: 0.9612794612794613


In [305]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=150) # These were the best hyper-parameters found for random forest

clf.fit(X_train,y_train)

prediction = clf.predict(X_test)

pred_p = clf.predict_proba(X_test)[:,1] # The prediction probabilites for each entry in the model are obtained
threshold = 0.6 # 0.6 is chosen to produce more false postives than false negatives
for i in range(len(pred_p)): # The values are then re-classified given the threshold
    if pred_p[i] >= threshold:
        prediction[i] = 1
    else:
        prediction[i] = 0

print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))    
f1score(prediction)

0.9891666666666666
0.996661101836394
precision : 0.9816971713810316
recall: 0.9966216216216216
TruePositive: 590 FalseNegative: 2 FalsePositive: 11 TrueNegative: 597
f1-score: 0.9891031014249791


In [322]:
# Shows the predicted values from the test data, and then performs re-classification on these predictions
prediction = clf.predict(test_Data)
print(prediction)
# Reclassifies the values based on a threshold again
pred_p = clf.predict_proba(test_Data)[:,1]
threshold = 0.6
for i in range(len(pred_p)):
    if pred_p[i] >= threshold:
        prediction[i] = 1
    else:
        prediction[i] = 0
print(prediction)

[2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0]


Feature names unseen at fit time:
- -0.18182
- -0.20197
- -100.061349
- -100.072809
- -100.124375
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...

Feature names unseen at fit time:
- -0.18182
- -0.20197
- -100.061349
- -100.072809
- -100.124375
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...



# Multi-class

In [306]:
trainData = pd.read_csv("TrainingDataMulti.csv")
testData = pd.read_csv("TestingDataMulti.csv")

#Creating DataFrames for training and testing
train_Data = pd.DataFrame(data = trainData)
test_Data = pd.DataFrame(data = testData)


In [307]:
train_Data.info()
test_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999 entries, 0 to 5998
Columns: 129 entries, 70.399324 to 0.15
dtypes: float64(113), int64(16)
memory usage: 5.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Columns: 128 entries, -100.141563 to 0.35
dtypes: float64(104), int64(24)
memory usage: 99.1 KB


In [308]:
X = train_Data.iloc[:, :-1]  
y = train_Data.iloc[:, -1]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [319]:
from sklearn.metrics import f1_score, confusion_matrix # Have to use new method to calculate f1 score as old method only accounts for binary labels
clf = LogisticRegression(max_iter=100000) # To allow for convergence max_iter is added
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro')) # Macro is used because the prediction is no longer binary
f1_score(y_test,prediction, average="weighted")

0.7158333333333333
0.6660695553941741


0.6963103500522466

In [310]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro')) 
f1_score(y_test,prediction, average="weighted")

0.7033333333333334
0.6558584810207017
precision : 0.8951612903225806
recall: 0.9685863874345549
TruePositive: 555 FalseNegative: 18 FalsePositive: 65 TrueNegative: 130
f1-score: 0.9304274937133278


In [311]:

clf = GaussianNB()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))   
f1_score(y_test,prediction, average="weighted")

0.5191666666666667
0.4990531852906379
precision : 0.7025703794369645
recall: 0.9982608695652174
TruePositive: 574 FalseNegative: 1 FalsePositive: 243 TrueNegative: 12
f1-score: 0.824712643678161


In [312]:
clf = SVC(kernel='linear',C=1.9, gamma='auto')
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))        
f1_score(y_test,prediction, average="weighted")

0.7025
0.6592871607296419
precision : 0.877643504531722
recall: 0.9880952380952381
TruePositive: 581 FalseNegative: 7 FalsePositive: 81 TrueNegative: 140
f1-score: 0.9296


In [313]:
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))  
f1_score(y_test,prediction, average="weighted")


0.8975
0.8812404062432124
precision : 0.9773123909249564
recall: 0.9739130434782609
TruePositive: 560 FalseNegative: 15 FalsePositive: 13 TrueNegative: 248
f1-score: 0.975609756097561


In [320]:
# In the multi dataset there are:
# 3000 , 0's
# 1500 , 1's
# 1500 , 2's
# a class_weight = {0: 0.6667, 1: 1.3333, 2: 1.3333} was used to help convey this to the model,  but doesn't help the accuracy
clf = RandomForestClassifier(n_estimators=150)
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))  
print(confusion_matrix(y_test,prediction)) # Produces the 3 x 3 confusion matrix for the multi data
f1_score(y_test,prediction, average="weighted")
# The predictions did not need to be reclassified as there are a similar amount of false Negatives to False Postives

0.96
0.9506305081182642
[[582   5   4]
 [  3 264  17]
 [  3  16 306]]


0.9600652448694115

In [321]:
# Produces the final prediction results for the test data
prediction = clf.predict(test_Data)
print(prediction)

Feature names unseen at fit time:
- -0.18182
- -0.20197
- -100.061349
- -100.072809
- -100.124375
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...



[2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
