In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from scipy.io import arff
import data_processing as dp
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the
# libraries used further down — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Read the ARFF file; only element 0 of the loaded result is used to build
# the DataFrame (the remainder of the result is ignored).
data = arff.loadarff("../data/3year.arff")
df = pd.DataFrame(data[0])
# Keep an independent copy of the freshly loaded frame.
df_origin = df.copy()

In [11]:
def SVM_model(*args):
    """Fit a linear-kernel SVC and report accuracy, precision and recall.

    Parameters
    ----------
    *args
        Positional values, in order: ``X_train``, ``X_test``, ``y_train``,
        ``y_test``.  Anything beyond the first four is ignored.  The training
        pair must provide ``reset_index`` (i.e. pandas objects).

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``.

    Notes
    -----
    Test-set precision and recall are printed, not returned.  The caller's
    training objects are never modified: the index reset is applied to
    copies, so the same split can be passed to this function repeatedly.
    """
    X_train = args[0]
    X_test = args[1]
    y_train = args[2]
    y_test = args[3]

    # Re-index copies rather than resetting in place, so the objects owned
    # by the caller keep their original index.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    # Train the classifier (other kernels can be substituted here).
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train, y_train)

    # Predict on both the training and the testing set.
    y_pred_train = svm_model.predict(X_train)
    y_pred_test = svm_model.predict(X_test)

    # Accuracy on both splits; precision and recall on the test split only.
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    precision_score_ = precision_score(y_test, y_pred_test)
    recall_score_ = recall_score(y_test, y_pred_test)

    print(f"precision_score: {precision_score_}")
    print(f"recall_score: {recall_score_}")

    return train_accuracy, test_accuracy

In [12]:
# Run the project's preprocessing helper on the loaded frame.
# NOTE(review): the helper's return layout is not visible in this notebook;
# element 0 is presumably X_train (as the label printed below says) — confirm
# against data_processing.pre_process.
train_test_dataset = dp.pre_process(df)  # with SMOTE
print(f"X_train.shape: {train_test_dataset[0].shape}")

X_train.shape: (13978, 30)


**TODO**
- Check whether the classes are linearly separable, to guide the choice of SVM kernel.

In [13]:
# Fit and score the SVM on the preprocessed split; SVM_model prints the
# test-set precision and recall itself and returns the two accuracies.
train_accuracy, test_accuracy = SVM_model(*train_test_dataset)
print(train_accuracy)
print(test_accuracy)

# Accuracies recorded from earlier runs, per kernel:
#   kernel = rbf,    [train, test] = [0.5304764630133066, 0.5147572199301809]
#   kernel = linear, [train, test] = [0.537201316354271, 0.5728340209457315]

precision_score: 0.04708520179372197
recall_score: 0.4701492537313433
0.537201316354271
0.5728340209457315


In [15]:
# Hand SVM_model and the current split to the project's ANOVA helper, then
# print how many columns element 0 of its result has.
# NOTE(review): presumably the helper scores candidate feature counts with
# SVM_model and returns the best split — confirm against data_processing.
best_train_test_dataset = dp.find_best_k_features_from_ANOVA(SVM_model, *train_test_dataset)
print(len(best_train_test_dataset[0].columns))

precision_score: 0.04259927797833935
recall_score: 0.44029850746268656
precision_score: 0.04404332129963899
recall_score: 0.4552238805970149
precision_score: 0.044543429844097995
recall_score: 0.44776119402985076
precision_score: 0.04491017964071856
recall_score: 0.44776119402985076
precision_score: 0.04432757325319309
recall_score: 0.44029850746268656
precision_score: 0.045722713864306784
recall_score: 0.4626865671641791
precision_score: 0.045925925925925926
recall_score: 0.4626865671641791
precision_score: 0.04595997034840623
recall_score: 0.4626865671641791
precision_score: 0.04758364312267658
recall_score: 0.47761194029850745
precision_score: 0.0475482912332838
recall_score: 0.47761194029850745
precision_score: 0.047725577926920205
recall_score: 0.47761194029850745
precision_score: 0.04776119402985075
recall_score: 0.47761194029850745
precision_score: 0.04776119402985075
recall_score: 0.47761194029850745
precision_score: 0.04776119402985075
recall_score: 0.47761194029850745
precisi