In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw table, then separate the target column from the feature columns.
data = pd.read_csv('ML_Project_data.csv')
labels = data['label']
data_model = data.drop(columns=['label'])

In [3]:
# Rescale every feature column into [0, 1].
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell, so held-out rows influence the min/max used
# for scaling. Consider fitting on the training split only.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_model_scaled = scaler.fit_transform(data_model)

In [4]:
# Keep the 50 features with the highest chi-squared score against the labels.
# chi2 requires non-negative inputs, which the [0, 1]-scaled matrix provides.
# NOTE(review): the selector is fitted on all rows (train and test alike), so
# the held-out labels take part in choosing the features.
features = [column for column in data_model.columns]
selectModel = SelectKBest(score_func=chi2, k=50).fit(data_model_scaled, labels)
selected_features = selectModel.get_feature_names_out(features)
selected_features

array(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
       'f20', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f33', 'f34', 'f37', 'f40', 'f41', 'f42', 'f44', 'f46',
       'f50', 'f52', 'f56', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65',
       'f66', 'f67', 'f68'], dtype=object)

In [5]:
# Reduce the scaled matrix to the columns chosen by the feature selector.
#
# Columns are matched by *name* against `data_model.columns` instead of parsing
# an integer out of each feature name, so the cell no longer depends on the
# columns being called exactly "f<position>".
kept_names = set(selected_features)

# NOTE: the historical variable names are kept for backward compatibility and
# are counter-intuitive: `deleted_cols` is keyed by the positions that are
# KEPT, while `selected_cols` lists the positions that are DROPPED.
deleted_cols = {}
selected_cols = []
for position, name in enumerate(data_model.columns):
    if name in kept_names:
        deleted_cols[position] = 0
    else:
        selected_cols.append(position)

# Prune only while the matrix still has its full width. Without this guard,
# re-running the cell would apply the same positions to the already-reduced
# array and either raise IndexError or silently drop the wrong columns.
if data_model_scaled.shape[1] == len(data_model.columns):
    data_model_scaled = np.delete(data_model_scaled, selected_cols, 1)

In [6]:
# Hold out 10% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE: a further train/validation split (test_size=0.15, random_state=42) was
# tried here previously and is currently disabled.
trainData, testData, trainLabels, testLabels = train_test_split(
    data_model_scaled,
    labels,
    test_size=0.1,
    random_state=42,
)

In [7]:
def print_report(model):
    """Print a classification report for `model` on the train and test splits.

    Reads the notebook-level `trainData`/`trainLabels` and
    `testData`/`testLabels`, so it must be called after the split cell has run.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.
    """
    splits = (
        ("TrainData: ", trainData, trainLabels),
        ("TestData: ", testData, testLabels),
    )
    for heading, samples, targets in splits:
        print(heading)
        predictions = model.predict(samples)
        print(classification_report(targets, predictions))

In [8]:
# Multi-layer perceptron with four hidden layers, trained by mini-batch SGD
# with momentum. verbose=True prints the loss after every iteration.
model = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32, 16),
    solver='sgd',
    batch_size=32,
    random_state=4,
    verbose=True,
    momentum=0.85,
    max_iter=350,
    learning_rate_init=0.01,
)
model.fit(X=trainData, y=trainLabels)

Iteration 1, loss = 1.56047502
Iteration 2, loss = 1.47224479
Iteration 3, loss = 1.42586080
Iteration 4, loss = 1.37974651
Iteration 5, loss = 1.34643339
Iteration 6, loss = 1.31424212
Iteration 7, loss = 1.28042896
Iteration 8, loss = 1.25766729
Iteration 9, loss = 1.23098971
Iteration 10, loss = 1.20510741
Iteration 11, loss = 1.18338763
Iteration 12, loss = 1.16194079
Iteration 13, loss = 1.13526774
Iteration 14, loss = 1.11784326
Iteration 15, loss = 1.09858100
Iteration 16, loss = 1.07407899
Iteration 17, loss = 1.06065167
Iteration 18, loss = 1.03517444
Iteration 19, loss = 1.01618276
Iteration 20, loss = 1.00227939
Iteration 21, loss = 0.97861959
Iteration 22, loss = 0.96990347
Iteration 23, loss = 0.94273121
Iteration 24, loss = 0.93786440
Iteration 25, loss = 0.92143999
Iteration 26, loss = 0.90140890
Iteration 27, loss = 0.89234286
Iteration 28, loss = 0.87903041
Iteration 29, loss = 0.86062311
Iteration 30, loss = 0.85130761
Iteration 31, loss = 0.83564563
Iteration 32, los

MLPClassifier(batch_size=32, hidden_layer_sizes=(128, 64, 32, 16),
              learning_rate_init=0.01, max_iter=350, momentum=0.85,
              random_state=4, solver='sgd', verbose=True)

In [9]:
# Mean accuracy of the current `model` on the training split.
model.score(X=trainData, y=trainLabels)

0.9549745824255628

In [10]:
# Mean accuracy of the current `model` on the held-out test split.
model.score(X=testData, y=testLabels)

0.7892156862745098

In [11]:
# Per-class precision/recall/F1 for the current `model` on both splits.
print_report(model=model)

TrainData: 
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2243
           1       0.98      0.94      0.96      2082
           2       0.97      0.94      0.95      2320
           3       0.94      0.97      0.96      2384
           4       0.95      0.95      0.95      1987

    accuracy                           0.95     11016
   macro avg       0.96      0.95      0.96     11016
weighted avg       0.96      0.95      0.96     11016

TestData: 
              precision    recall  f1-score   support

           0       0.81      0.83      0.82       254
           1       0.84      0.76      0.80       239
           2       0.83      0.79      0.81       286
           3       0.71      0.83      0.76       241
           4       0.77      0.73      0.75       204

    accuracy                           0.79      1224
   macro avg       0.79      0.79      0.79      1224
weighted avg       0.79      0.79      0.79      1224


In [12]:
# k-nearest neighbours: 2 neighbours, Manhattan distance, votes weighted by
# inverse distance, ball-tree index for the neighbour search.
# With distance weighting each training row is its own zero-distance
# neighbour, so a training accuracy of ~1.0 is expected and not informative.
model = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    metric='manhattan',
    algorithm='ball_tree',
)
model.fit(X=trainData, y=trainLabels)

KNeighborsClassifier(algorithm='ball_tree', metric='manhattan', n_neighbors=2,
                     weights='distance')

In [13]:
# Mean accuracy of the current `model` on the training split.
model.score(X=trainData, y=trainLabels)

1.0

In [14]:
# Mean accuracy of the current `model` on the held-out test split.
model.score(X=testData, y=testLabels)

0.9240196078431373

In [15]:
# Per-class precision/recall/F1 for the current `model` on both splits.
print_report(model=model)

TrainData: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2243
           1       1.00      1.00      1.00      2082
           2       1.00      1.00      1.00      2320
           3       1.00      1.00      1.00      2384
           4       1.00      1.00      1.00      1987

    accuracy                           1.00     11016
   macro avg       1.00      1.00      1.00     11016
weighted avg       1.00      1.00      1.00     11016

TestData: 
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       254
           1       0.93      0.93      0.93       239
           2       0.94      0.90      0.92       286
           3       0.93      0.93      0.93       241
           4       0.91      0.92      0.91       204

    accuracy                           0.92      1224
   macro avg       0.92      0.92      0.92      1224
weighted avg       0.92      0.92      0.92      1224


In [16]:
# Nu-parameterised support vector classifier with an RBF kernel (gamma=4) and
# a one-vs-one decision function. verbose=True enables the LibSVM log output.
model = NuSVC(
    kernel="rbf",
    gamma=4,
    decision_function_shape='ovo',
    random_state=42,
    verbose=True,
)
model.fit(X=trainData, y=trainLabels)

[LibSVM]

NuSVC(decision_function_shape='ovo', gamma=4, random_state=42, verbose=True)

In [17]:
# Mean accuracy of the current `model` on the training split.
model.score(X=trainData, y=trainLabels)

0.89960058097313

In [18]:
# Mean accuracy of the current `model` on the held-out test split.
model.score(X=testData, y=testLabels)

0.7826797385620915

In [19]:
# Per-class precision/recall/F1 for the current `model` on both splits.
print_report(model=model)

TrainData: 
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2243
           1       0.94      0.88      0.91      2082
           2       0.94      0.93      0.94      2320
           3       0.90      0.90      0.90      2384
           4       0.88      0.92      0.90      1987

    accuracy                           0.90     11016
   macro avg       0.90      0.90      0.90     11016
weighted avg       0.90      0.90      0.90     11016

TestData: 
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       254
           1       0.86      0.74      0.80       239
           2       0.86      0.79      0.82       286
           3       0.70      0.83      0.76       241
           4       0.74      0.77      0.75       204

    accuracy                           0.78      1224
   macro avg       0.79      0.78      0.78      1224
weighted avg       0.79      0.78      0.78      1224


In [20]:
# C-parameterised support vector classifier: RBF kernel with gamma=8, C=1 and
# a one-vs-one decision function.
model = SVC(
    kernel='rbf',
    C=1,
    gamma=8,
    decision_function_shape='ovo',
    random_state=42,
)
model.fit(X=trainData, y=trainLabels)

SVC(C=1, decision_function_shape='ovo', gamma=8, random_state=42)

In [21]:
# Mean accuracy of the current `model` on the training split.
model.score(X=trainData, y=trainLabels)

0.9954611474219317

In [22]:
# Mean accuracy of the current `model` on the held-out test split.
model.score(X=testData, y=testLabels)

0.8521241830065359

In [23]:
# Per-class precision/recall/F1 for the current `model` on both splits.
print_report(model=model)

TrainData: 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2243
           1       1.00      0.99      1.00      2082
           2       1.00      0.99      1.00      2320
           3       1.00      0.99      1.00      2384
           4       0.99      1.00      0.99      1987

    accuracy                           1.00     11016
   macro avg       1.00      1.00      1.00     11016
weighted avg       1.00      1.00      1.00     11016

TestData: 
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       254
           1       0.92      0.81      0.86       239
           2       0.93      0.85      0.88       286
           3       0.68      0.94      0.79       241
           4       0.90      0.80      0.85       204

    accuracy                           0.85      1224
   macro avg       0.87      0.85      0.85      1224
weighted avg       0.87      0.85      0.86      1224


In [24]:
# Random forest of 80 bootstrapped trees, each limited to depth 30 and
# requiring at least 3 samples to split a node.
# oob_score=True additionally computes an out-of-bag accuracy estimate during
# fit. warm_start=True only has an effect if fit() is called again on this
# same instance, which this cell does not do.
model = RandomForestClassifier(
    max_depth=30,
    n_estimators=80,
    random_state=42,
    warm_start=True,
    min_samples_split=3,
    bootstrap=True,
    oob_score=True,
)
model.fit(X=trainData, y=trainLabels)

RandomForestClassifier(max_depth=30, min_samples_split=3, n_estimators=80,
                       oob_score=True, random_state=42, warm_start=True)

In [25]:
# Mean accuracy of the current `model` on the training split.
model.score(X=trainData, y=trainLabels)

1.0

In [26]:
# Mean accuracy of the current `model` on the held-out test split.
model.score(X=testData, y=testLabels)

0.7924836601307189

In [27]:
# Per-class precision/recall/F1 for the current `model` on both splits.
print_report(model=model)

TrainData: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2243
           1       1.00      1.00      1.00      2082
           2       1.00      1.00      1.00      2320
           3       1.00      1.00      1.00      2384
           4       1.00      1.00      1.00      1987

    accuracy                           1.00     11016
   macro avg       1.00      1.00      1.00     11016
weighted avg       1.00      1.00      1.00     11016

TestData: 
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       254
           1       0.89      0.77      0.82       239
           2       0.82      0.80      0.81       286
           3       0.72      0.79      0.75       241
           4       0.74      0.76      0.75       204

    accuracy                           0.79      1224
   macro avg       0.79      0.79      0.79      1224
weighted avg       0.80      0.79      0.79      1224
