# Binary

In [177]:
import pandas as pd
import numpy as np
trainData = pd.read_csv("TrainingDataBinary.csv") # Dataset is loaded in from CSV files
testData = pd.read_csv("TestingDataBinary.csv")

#Creating DataFrames for training and testing
train_Data = pd.DataFrame(data = trainData)
test_Data = pd.DataFrame(data = testData)


In [178]:
train_Data.info()
test_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999 entries, 0 to 5998
Columns: 129 entries, 70.399324 to 0.15
dtypes: float64(113), int64(16)
memory usage: 5.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Columns: 128 entries, 12.118057 to 0.51
dtypes: float64(112), int64(16)
memory usage: 99.1 KB


In [179]:
from sklearn.model_selection import train_test_split
X = train_Data.iloc[:, :-1]  # Features (all columns except the last one)
y = train_Data.iloc[:, -1]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [180]:
#F1 score
#Just did manually so I could keep to the formula instead of macro or micro
def f1score(prediction):
    TP = 0 
    FP = 0
    TN = 0
    FN = 0

    for true, pred in zip(y_test, prediction): 
        if(true == 0 and pred == 0):
            TP = TP + 1
        if(true == 0 and pred == 1):
            FN = FN + 1   
        if(true == 1 and pred == 0):
            FP = FP + 1
        if(true == 1 and pred == 1):
            TN = TN + 1 

    print("TruePositive: " + str(TP) + " FalseNegative: " + str(FN) + " FalsePositive: " + str(FP) + " TrueNegative: " + str(TN) )


    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    print("precision : " + str(precision))
    print("recall: " + str(recall))
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("TruePositive: " + str(TP) + " FalseNegative: " + str(FN) + " FalsePositive: " + str(FP) + " TrueNegative: " + str(TN) )
    print("f1-score: " + str(f1))

In [181]:
#Classification problem
#Logistic regression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))    
f1score(prediction)

0.8558333333333333
0.9289940828402367
TruePositive: 556 FalseNegative: 36 FalsePositive: 137 TrueNegative: 471
precision : 0.8023088023088023
recall: 0.9391891891891891
TruePositive: 556 FalseNegative: 36 FalsePositive: 137 TrueNegative: 471
f1-score: 0.8653696498054475


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [183]:
#Linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction)) 
f1score(prediction)

0.8766666666666667
0.9291044776119403
TruePositive: 554 FalseNegative: 38 FalsePositive: 110 TrueNegative: 498
precision : 0.8343373493975904
recall: 0.9358108108108109
TruePositive: 554 FalseNegative: 38 FalsePositive: 110 TrueNegative: 498
f1-score: 0.8821656050955415


In [205]:
#Gaussian NB
from sklearn.naive_bayes import GaussianNB 
clf = GaussianNB()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))   
f1score(prediction)

0.5216666666666666
0.84
TruePositive: 584 FalseNegative: 8 FalsePositive: 566 TrueNegative: 42
precision : 0.5078260869565218
recall: 0.9864864864864865
TruePositive: 584 FalseNegative: 8 FalsePositive: 566 TrueNegative: 42
f1-score: 0.6704936854190585


In [185]:
from sklearn.svm import SVC
clf = SVC(kernel='linear',C=1.9, gamma='auto')
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))        
f1score(prediction)

0.8708333333333333
0.9415204678362573
TruePositive: 562 FalseNegative: 30 FalsePositive: 125 TrueNegative: 483
precision : 0.8180494905385735
recall: 0.9493243243243243
TruePositive: 562 FalseNegative: 30 FalsePositive: 125 TrueNegative: 483
f1-score: 0.8788115715402659


In [186]:
# MLP - perhaps this could be fine-tuned but wouldn't be worth it
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))    
f1score(prediction)

0.7041666666666667
0.7744034707158352
TruePositive: 488 FalseNegative: 104 FalsePositive: 251 TrueNegative: 357
precision : 0.6603518267929634
recall: 0.8243243243243243
TruePositive: 488 FalseNegative: 104 FalsePositive: 251 TrueNegative: 357
f1-score: 0.7332832456799399


In [187]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))  
f1score(prediction)

0.9591666666666666
0.9589490968801314
TruePositive: 567 FalseNegative: 25 FalsePositive: 24 TrueNegative: 584
precision : 0.9593908629441624
recall: 0.9577702702702703
TruePositive: 567 FalseNegative: 25 FalsePositive: 24 TrueNegative: 584
f1-score: 0.9585798816568047


In [232]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=150) # 150 allows for a bit more speed aswell

clf.fit(X_train,y_train)

prediction = clf.predict(X_test)
# Class skewing
pred_p = clf.predict_proba(X_test)[:,1]
threshold = 0.4
for i in range(len(pred_p)):
    if pred_p[i] >= threshold:
        prediction[i] = 1
    else:
        prediction[i] = 0

print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction))    
f1score(prediction)

0.9825
0.9696
TruePositive: 573 FalseNegative: 19 FalsePositive: 2 TrueNegative: 606
precision : 0.9965217391304347
recall: 0.9679054054054054
TruePositive: 573 FalseNegative: 19 FalsePositive: 2 TrueNegative: 606
f1-score: 0.9820051413881749


In [220]:
#Producing the output file
prediction = clf.predict(test_Data)
print(prediction)
pred_p = clf.predict_proba(test_Data)[:,1]
threshold = 0.4
for i in range(len(pred_p)):
    if pred_p[i] >= threshold:
        prediction[i] = 1
    else:
        prediction[i] = 0
print(prediction)
# Combination of without and with class skew

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


Feature names unseen at fit time:
- -107.847846
- -107.915958
- -109.664122
- -109.690249
- -113.49721
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...

Feature names unseen at fit time:
- -107.847846
- -107.915958
- -109.664122
- -109.690249
- -113.49721
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...



In [230]:
testing_data = np.genfromtxt("TestingDataBinary.csv", delimiter=',')
test_Data = testing_data[:,:128]
with open("TestingResultsBinary.csv","w") as f:
    i = 0
    temp = ""
    for result in prediction:
        p_list = (test_Data[i]).tolist()
        p_list.append(int(result))
        string = "\n"
        string = str(p_list) + string
        string = string.replace("]", "")
        string = string.replace("[","")
        f.write(string)
        i = i + 1

# Multi-class

In [163]:
trainData = pd.read_csv("TrainingDataMulti.csv")
testData = pd.read_csv("TestingDataMulti.csv")

#Creating DataFrames for training and testing
train_Data = pd.DataFrame(data = trainData)
test_Data = pd.DataFrame(data = testData)


In [164]:
train_Data.info()
test_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5999 entries, 0 to 5998
Columns: 129 entries, 70.399324 to 0.15
dtypes: float64(113), int64(16)
memory usage: 5.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Columns: 128 entries, -100.141563 to 0.35
dtypes: float64(104), int64(24)
memory usage: 99.1 KB


In [165]:
X = train_Data.iloc[:, :-1]  # Features (all columns except the last one)
y = train_Data.iloc[:, -1]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [166]:
clf = LogisticRegression()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))    
f1score(prediction)

0.7
0.6716764193855532
TruePositive: 591 FalseNegative: 0 FalsePositive: 100 TrueNegative: 117
precision : 0.8552821997105644
recall: 1.0
TruePositive: 591 FalseNegative: 0 FalsePositive: 100 TrueNegative: 117
f1-score: 0.921996879875195


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [168]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro')) 
f1score(prediction)

0.7033333333333334
0.6558584810207017
TruePositive: 555 FalseNegative: 18 FalsePositive: 65 TrueNegative: 130
precision : 0.8951612903225806
recall: 0.9685863874345549
TruePositive: 555 FalseNegative: 18 FalsePositive: 65 TrueNegative: 130
f1-score: 0.9304274937133278


In [198]:

clf = GaussianNB()
clf.fit(X_normalized,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))   
f1score(prediction)

0.5025
0.7489486963835156
TruePositive: 592 FalseNegative: 0 FalsePositive: 597 TrueNegative: 11
precision : 0.4978973927670311
recall: 1.0
TruePositive: 592 FalseNegative: 0 FalsePositive: 597 TrueNegative: 11
f1-score: 0.6647950589556428




In [170]:
clf = SVC(kernel='linear',C=1.9, gamma='auto')
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))        
f1score(prediction)

0.7025
0.6592871607296419
TruePositive: 581 FalseNegative: 7 FalsePositive: 81 TrueNegative: 140
precision : 0.877643504531722
recall: 0.9880952380952381
TruePositive: 581 FalseNegative: 7 FalsePositive: 81 TrueNegative: 140
f1-score: 0.9296


In [171]:
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))  
f1score(prediction)


0.8975
0.8792675251322831
TruePositive: 562 FalseNegative: 14 FalsePositive: 10 TrueNegative: 245
precision : 0.9825174825174825
recall: 0.9756944444444444
TruePositive: 562 FalseNegative: 14 FalsePositive: 10 TrueNegative: 245
f1-score: 0.9790940766550522


In [201]:
# 3000 , 0's
# 1500 , 1's
# 1500 , 2's
# class_weight={0:1,1:4,2:4} but doesn't help accuracy
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
print(precision_score(y_test, prediction, average='macro'))    
f1score(prediction)
# No point skewing the classes due to them being the same

0.9875
0.9875416459421082
TruePositive: 583 FalseNegative: 9 FalsePositive: 6 TrueNegative: 602
precision : 0.9898132427843803
recall: 0.9847972972972973
TruePositive: 583 FalseNegative: 9 FalsePositive: 6 TrueNegative: 602
f1-score: 0.9872988992379339


In [173]:
#Producing the ouput file
prediction = clf.predict(test_Data)
print(prediction)

[2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]


Feature names unseen at fit time:
- -0.18182
- -0.20197
- -100.061349
- -100.072809
- -100.124375
- ...
Feature names seen at fit time, yet now missing:
- -100.86922
- -102.060972
- -119.550481
- -119.753909
- -120.341499
- ...



In [None]:
testing_data = np.genfromtxt("TestingDataMulti.csv", delimiter=',')
test_Data = testing_data[:,:128]
with open("TestingResultsMulti.csv","w") as f:
    i = 0
    temp = ""
    for result in prediction:
        p_list = (test_Data[i]).tolist()
        p_list.append(int(result))
        string = "\n"
        string = str(p_list) + string
        string = string.replace("]", "")
        string = string.replace("[","")
        f.write(string)
        i = i + 1