In [1]:
import sys, os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from scipy.io import arff
import data_processing as dp
import warnings
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
    classification_report,
    precision_score,
)

warnings.filterwarnings("ignore")

data = arff.loadarff("../../data/3year.arff")
df = pd.DataFrame(data[0])
df_origin = df.copy()

In [2]:
# Disable
def blockPrint():
    sys.stdout = open(os.devnull, "w")


# Restore
def enablePrint():
    sys.stdout = sys.__stdout__

In [3]:
kernel_type_ = "linear"  # global


def SVM_return_model(*args, kernel_type_):
    X_train = args[0]
    X_test = args[1]
    y_train = args[2]
    y_test = args[3]

    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    # Training the SVM model
    svm_model = SVC(kernel=kernel_type_)
    print(f"\nkernel_type: {kernel_type_}")
    svm_model.fit(X_train, y_train)

    # Predictions on the testing set
    y_pred_train = svm_model.predict(X_train)
    y_pred_test = svm_model.predict(X_test)

    # Evaluating the model
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    precision_score_ = precision_score(y_test, y_pred_test)
    recall_score_ = recall_score(y_test, y_pred_test)
    f1_score_ = f1_score(y_test, y_pred_test)

    # print(classification_report(y_test, y_pred_test))
    print(f"train_accuracy: {train_accuracy}")
    print(f"test_accuracy: {test_accuracy}")
    print(f"precision_score: {precision_score_}")
    print(f"recall_score: {recall_score_}")
    print(f"f1_score: {f1_score_}")

    return svm_model

In [4]:
train_test_dataset = dp.pre_process(df)
print(train_test_dataset[0].shape)

(13978, 30)


Prelim check on different kernel_types

In [5]:
kernel_list = ["linear", "poly", "rbf", "sigmoid"]
kernel_dict = dict()

for kernel_type in kernel_list:
    print(f"\033[96m{kernel_type}\033[00m")
    model = SVM_return_model(*train_test_dataset, kernel_type_=kernel_type)

# we found that sigmoid gives the best test accuracy
best_kernel_type = "sigmoid"

[96mlinear[00m

kernel_type: linear


train_accuracy: 0.537201316354271
test_accuracy: 0.5728340209457315
precision_score: 0.04708520179372197
recall_score: 0.4701492537313433
f1_score: 0.08559782608695651
[96mpoly[00m

kernel_type: poly
train_accuracy: 0.5633853197882387
test_accuracy: 0.510631545541098
precision_score: 0.04813863928112965
recall_score: 0.5597014925373134
f1_score: 0.08865248226950355
[96mrbf[00m

kernel_type: rbf
train_accuracy: 0.5304764630133066
test_accuracy: 0.5147572199301809
precision_score: 0.04795852235904083
recall_score: 0.5522388059701493
f1_score: 0.08825283243887895
[96msigmoid[00m

kernel_type: sigmoid
train_accuracy: 0.5241808556302762
test_accuracy: 0.5893367185020628
precision_score: 0.04258675078864353
recall_score: 0.40298507462686567
f1_score: 0.07703281027104138


Using ANOVA test for feature selection to find the best number of k features for our dataset  
Result: 
- Best k for train_accuracy: 23
- Best k for test_accuracy: 25

Hence, we use k = 25

In [6]:
# get the best number of features that gives the highest test accuracy for SVM model
def SVM_sigmoid_model(*args):  # for ANOVA
    X_train = args[0]
    X_test = args[1]
    y_train = args[2]
    y_test = args[3]

    # Reset indices to ensure alignment
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)

    model = SVM_return_model(*args, kernel_type_="sigmoid")
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Evaluating the model
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    return train_accuracy, test_accuracy


# best_train_test_dataset = dp.find_best_k_features_from_ANOVA(
#    SVM_sigmoid_model, *train_test_dataset
# )

# print(len(best_train_test_dataset[0].columns))
# Best k for train_accuracy: 23
# Best k for test_accuracy: 25

# for quick run
k_features = 25
best_train_test_dataset = dp.get_df_with_top_k_features(k_features, *train_test_dataset)

Now we create a SVM model based on the top 25 features after ANOVA test

In [7]:
# current best SVM model after ANOVA test
SVM_model2 = SVM_return_model(*best_train_test_dataset, kernel_type_=best_kernel_type)

X_train1, X_test1, y_train1, y_test1 = best_train_test_dataset
conf_matrix = confusion_matrix(y_test1, y_test1)


kernel_type: sigmoid
train_accuracy: 0.528544856202604
test_accuracy: 0.6013963821009204
precision_score: 0.041666666666666664
recall_score: 0.3805970149253731
f1_score: 0.0751104565537555


To improve model accuracy, we perform GridSearch to find the best model hyperparameters for SVM

GridSearchCV
https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

In [1]:
# defining parameter range
from sklearn.model_selection import GridSearchCV

param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}

# grid = GridSearchCV(SVM_model2, param_grid, refit=True, verbose=3)

# fitting the model for grid search
# grid.fit(X_train1, y_train1)

# print best parameter after tuning
# print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
# print(grid.best_estimator_)

From grid-search above, we found that the best hyperparameters are:
C=1, gamma=0.1, kernel=poly
Where the average accuracy score after cross-validation is 0.5122

In [None]:
# Create the SVC model with specified parameters
svm_model_after_gridsearch = SVC(C=1, gamma=0.1, kernel="poly")

In [None]:
grid_predictions = grid.predict(X_test1)

# print classification report (without grid search)
print("SVM model without grid-search")
y_pred_test = SVM_model2.predict(X_test1)
print(confusion_matrix(y_test1, y_pred_test))
clf = SVM_return_model(
    *best_train_test_dataset, best_kernel_type
)  # to print accuracy score

# print classification report with grid search
print("\nSVM model with grid-search")
print(confusion_matrix(y_test1, grid_predictions))
print(classification_report(y_test1, grid_predictions))

SVM model without grid-search
[[1744 1273]
 [  71   63]]
              precision    recall  f1-score   support

           0       0.96      0.58      0.72      3017
           1       0.05      0.47      0.09       134

    accuracy                           0.57      3151
   macro avg       0.50      0.52      0.40      3151
weighted avg       0.92      0.57      0.69      3151

precision_score: 0.04715568862275449
recall_score: 0.4701492537313433
train_accuracy: 0.5372728573472599
test_accuracy: 0.5734687400825135
f1score: 0.0857142857142857

SVM model with grid-search
              precision    recall  f1-score   support

           0       0.79      0.54      0.64      6989
           1       0.65      0.86      0.74      6989

    accuracy                           0.70     13978
   macro avg       0.72      0.70      0.69     13978
weighted avg       0.72      0.70      0.69     13978

[[1606 1411]
 [  31  103]]
              precision    recall  f1-score   support

           0

Bagging to improve prediction accuracy

In [None]:
from sklearn.ensemble import BaggingClassifier

# W/O GRID

# Define the BaggingClassifier
bagging_svm = BaggingClassifier(
    SVM_model2, n_estimators=10, random_state=42
)  # Adjust n_estimators as needed

# Train the BaggingClassifier on your training data
bagging_svm.fit(X_train1, y_train1)

# Evaluate the model
accuracy = bagging_svm.score(X_test1, y_test1)
print("Accuracy:", accuracy)  # 0.55950491907331

Accuracy: 0.55950491907331
