In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from scipy.io import arff
import data_processing as dp
import warnings
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    confusion_matrix,
    classification_report,
    precision_score,
)

# Suppress every warning raised from this point on (this is kernel-wide).
warnings.filterwarnings(action="ignore")

# The first element of the value returned by loadarff holds the records;
# that is what gets turned into the working DataFrame.
data = arff.loadarff("../../data/3year.arff")
df = pd.DataFrame(data=data[0])

# Keep an independent copy of the frame exactly as it was loaded.
df_origin = df.copy(deep=True)

In [8]:
def SVM_model(*args, kernel="linear"):
    """Fit an SVC on the training split and report metrics on both splits.

    Parameters
    ----------
    *args
        Positional values in the order ``X_train, X_test, y_train, y_test``.
        Only the first four are read; any extras are ignored.
        ``X_train`` and ``y_train`` must provide ``reset_index`` (i.e. be
        pandas objects).
    kernel : str, default "linear"
        Kernel handed to ``SVC``. The default keeps the previous hard-coded
        behaviour; pass e.g. ``kernel="rbf"`` to compare kernels without
        editing this function.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy, y_pred_test)``.

    Raises
    ------
    IndexError
        If fewer than four positional arguments are supplied.

    Notes
    -----
    Prints the classification report for the test split followed by
    precision, recall, both accuracies and the F1 score.
    """
    if len(args) < 4:
        raise IndexError(
            "SVM_model expects X_train, X_test, y_train, y_test "
            f"(got {len(args)} positional argument(s))"
        )
    X_train, X_test, y_train, y_test = args[:4]

    # Re-index copies rather than resetting in place, so the caller's
    # training data is not modified as a side effect of evaluating a model.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    # Train the SVM with the requested kernel.
    svm_model = SVC(kernel=kernel)
    svm_model.fit(X_train, y_train)

    # Predict on both splits: train predictions feed the train accuracy,
    # test predictions feed every other metric.
    y_pred_train = svm_model.predict(X_train)
    y_pred_test = svm_model.predict(X_test)

    # Evaluate the model.
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    precision_score_ = precision_score(y_test, y_pred_test)
    recall_score_ = recall_score(y_test, y_pred_test)
    f1_score_ = f1_score(y_test, y_pred_test)

    print(classification_report(y_test, y_pred_test))
    print(f"precision_score: {precision_score_}")
    print(f"recall_score: {recall_score_}")
    print(f"train_accuracy: {train_accuracy}")
    print(f"test_accuracy: {test_accuracy}")
    print(f"f1score: {f1_score_}")

    return train_accuracy, test_accuracy, y_pred_test

In [9]:
# Number of top-ranked features to keep. 25 was found to be the best k in an
# earlier check (not part of this notebook section).
BEST_K = 25

# The result is unpacked later as (X_train, X_test, y_train, y_test).
train_test_dataset = dp.pre_process(df)  # with SMOTE
print(f"X_train.shape: {train_test_dataset[0].shape}")

# Same splits, passed through the project's top-k feature selection.
best_train_test_dataset = dp.get_df_with_top_k_features(BEST_K, *train_test_dataset)

X_train.shape: (13978, 30)


TODO:
- Check whether the classes are linearly separable.

In [10]:
# Baseline run on the full pre-processed splits. SVM_model prints its own
# metric summary; the two accuracies are echoed again below it.
train_accuracy, test_accuracy, y_pred_test = SVM_model(*train_test_dataset)
print(train_accuracy, test_accuracy, sep="\n")

# Accuracies noted from previous runs:
# kernel = rbf , [train, test] = [0.5304764630133066, 0.5147572199301809]
# kernel = linear, [train, test] = [0.537201316354271, 0.5728340209457315]

              precision    recall  f1-score   support

           0       0.96      0.58      0.72      3017
           1       0.05      0.47      0.09       134

    accuracy                           0.57      3151
   macro avg       0.50      0.52      0.40      3151
weighted avg       0.92      0.57      0.69      3151

precision_score: 0.04708520179372197
recall_score: 0.4701492537313433
train_accuracy: 0.537201316354271
test_accuracy: 0.5728340209457315
f1score: 0.08559782608695651
0.537201316354271
0.5728340209457315


In [11]:
X_train, X_test, y_train, y_test = best_train_test_dataset

# Compare the true test labels against the model's predictions. Passing
# y_test for both arguments (as before) always yields a purely diagonal
# matrix and says nothing about the classifier.
# NOTE(review): y_pred_test comes from the model fitted on train_test_dataset;
# this assumes the top-k feature selection leaves the test rows/labels
# unchanged -- confirm against dp.get_df_with_top_k_features.
conf_matrix = confusion_matrix(y_test, y_pred_test)

Hyperparameter tuning with GridSearchCV
Reference: https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

In [12]:
# Parameter grid for an RBF-kernel SVC: every (C, gamma) pair below is
# evaluated, i.e. 5 x 5 = 25 candidates.
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf"],
}

# GridSearchCV is already imported in the notebook's import cell, so it is
# not re-imported here.
# cv=3 -> 3 folds per candidate; refit=True refits the best candidate on the
# whole of X_train so that `grid` can be used directly for prediction.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, cv=3)

# Run the search on the top-k-feature training split.
grid.fit(X_train, y_train)

# Best parameter combination found by the search.
print(grid.best_params_)

# The refitted estimator built with those parameters.
print(grid.best_estimator_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV 1/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.519 total time=  11.0s
[CV 2/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.526 total time=  10.8s
[CV 3/3] END ........C=0.1, gamma=1, kernel=rbf;, score=0.517 total time=   9.8s
[CV 1/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.524 total time=  10.2s
[CV 2/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=  10.9s
[CV 3/3] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=  10.1s
[CV 1/3] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.530 total time=   9.9s
[CV 2/3] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.500 total time=   9.9s
[CV 3/3] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.500 total time=   9.9s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.529 total time=   9.7s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.500 total time=   9.7s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;

In [13]:
# Predict the test split with the estimator refitted by the grid search.
grid_predictions = grid.predict(X_test)

# Per-class precision / recall / F1 of those predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=grid_predictions))

              precision    recall  f1-score   support

           0       0.98      0.53      0.69      3017
           1       0.07      0.77      0.13       134

    accuracy                           0.54      3151
   macro avg       0.52      0.65      0.41      3151
weighted avg       0.94      0.54      0.67      3151

