In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import os

%matplotlib inline

In [2]:
# NOTE(review): hard-coded, machine-specific working directory. The relative
# paths used later in this notebook (the CSV inputs, 'SVM_Final.pkl',
# 'svm_final.db') all resolve against it.
os.chdir(r'C:\Users\alanw\OneDrive\Varsity\Research\Code')

In [3]:
# Loading data
def import_data(type):
    """Load one of the project's CSV datasets into a DataFrame.

    Parameters
    ----------
    type : str
        Which dataset to read. "Balanced" reads 'data_balanced_cv.csv' and
        "Original" reads 'data_not_cleaned.csv'; both paths are relative to
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected CSV file.

    Raises
    ------
    ValueError
        If `type` is not one of the supported dataset names. (Previously an
        unknown name fell through both branches and failed with an
        UnboundLocalError on `return df`.)
    """
    # The parameter name shadows the builtin `type`; it is kept so existing
    # positional and keyword callers continue to work.
    if type == "Balanced":
        return pd.read_csv('data_balanced_cv.csv')
    if type == "Original":
        return pd.read_csv('data_not_cleaned.csv')
    raise ValueError(
        "Unknown dataset type %r; expected 'Balanced' or 'Original'" % (type,)
    )

In [4]:
df = import_data("Balanced")

# Data Prep

Scaling Data

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [6]:
scaler = StandardScaler()

In [7]:
scaler.fit(df.drop('Y',axis=1))

StandardScaler()

In [8]:
# Fit the scaler on every column except the target 'Y' and transform them.
# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split performed in a later cell, so the held-out rows contribute
# to the scaling statistics. Consider fitting on the training portion only.
scaled_features = scaler.fit_transform(df.drop('Y',axis=1))

Splitting labels and features

In [9]:
# Rebuild a labelled feature frame from the scaled array. The column names are
# taken from the same frame that was scaled (df without 'Y') instead of
# df.columns[:-1], so they stay correct even when 'Y' is not the last column.
# Reusing df's index keeps the rows of X aligned with the labels in y.
feature_columns = df.drop('Y', axis=1).columns
X = pd.DataFrame(scaled_features, columns=feature_columns, index=df.index)
y = df['Y']

In [10]:
# Hold out 5% of the rows as a final test set (fixed random_state). X and y are
# rebound to the remaining rows, which the later cells use for the grid search,
# cross-validation and final fit.
# NOTE(review): because X and y are overwritten, re-running this cell on its
# own splits the already-reduced data again.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.05, random_state=69)

In [11]:
# Print the class balance of the full dataset as two integer percentages.
# The unpacking requires exactly two distinct labels in 'Y'; int() truncates,
# so the two figures need not add up to 100.
first_count, second_count = df['Y'].value_counts()
total_count = first_count + second_count
first_pct = int(first_count / total_count * 100)
second_pct = int(second_count / total_count * 100)
print(f"Data Split: {first_pct}:{second_pct}")

Data Split: 52:47


# Setting Up and Training The Model

In [12]:
from sklearn.svm import SVC

In [13]:
svm = SVC()

# Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Search space for the SVC grid search: every combination of the values below
# is a candidate.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'linear'],
}

In [16]:
# define search: grid search over param_grid for the SVC created above,
# selecting by accuracy with 3-fold cross-validation (cv=3); n_jobs=-1 requests
# parallel execution on all available processors.
grid = GridSearchCV(svm, param_grid, scoring='accuracy', n_jobs=-1, cv=3)

In [None]:
# Run the grid search on the training split and bracket it with wall-clock
# timestamps so the elapsed time can be reported in the next cell.
t0 = datetime.now()
grid_results = grid.fit(X, y)
t1 = datetime.now()

In [None]:
# Report the best cross-validated score, the parameters that achieved it, and
# how long the search took.
print(f"Best: {grid_results.best_score_:f} using {grid_results.best_params_}")
print("Execution Time: ", t1 - t0)

# Cross-Validate Best Model

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate

In [None]:
# Build a fresh, unfitted SVC from the winning grid-search parameters.
# best_params_ holds exactly the keys searched over ('C', 'gamma', 'kernel'),
# so unpacking it passes the same three keyword arguments.
best_svm = SVC(**grid_results.best_params_)

In [None]:
# Re-evaluate the selected model on the training split with stratified 10-fold
# cross-validation repeated 5 times (seeded), collecting five test metrics per
# fold. t0/t1 are reused to time this step.
t0 = datetime.now()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=69)
scores = cross_validate(best_svm, X, y, scoring=['accuracy','roc_auc','precision','recall','f1'], cv=cv, n_jobs=-1)
t1 = datetime.now()

In [None]:
# Report the mean and standard deviation (both as percentages) of every
# cross-validation metric. The accuracy line used to be printed under the
# label "Mean ROC AUC", duplicating the label of the real ROC AUC line.
metric_labels = [
    ('Accuracy', 'test_accuracy'),
    ('ROC AUC', 'test_roc_auc'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1 Score', 'test_f1'),
]
for label, score_key in metric_labels:
    pct_scores = scores[score_key] * 100
    print('Mean %s: %.2f%% Std Dev: (+/- %.3f%%)' % (label, np.mean(pct_scores), np.std(pct_scores)))
print("Execution Time: ", (t1 - t0))

### Final Evaluations

In [None]:
from sklearn.metrics import plot_confusion_matrix, cohen_kappa_score, plot_roc_curve, classification_report
import pickle
import shap

In [None]:
# Fit the selected model on the whole training split (X, y) and report how
# long the fit took.
t0 = datetime.now()
best_svm.fit(X,y)
t1 = datetime.now()
print("Training Time: ", (t1 - t0))

In [None]:
# KernelExplainer expects a callable that maps a feature matrix to model
# outputs, not the estimator object itself, so the fitted SVC's
# decision_function is passed. X serves both as the background dataset and as
# the set of rows to explain.
# NOTE(review): Kernel SHAP gets expensive with a large background set;
# consider summarising X (e.g. shap.sample or shap.kmeans) if this is too slow.
explainer = shap.KernelExplainer(best_svm.decision_function, X)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Predicted labels for the held-out test set; used for Cohen's kappa below.
predictions = best_svm.predict(X_test)

In [None]:
# Confusion matrix of the fitted model on the held-out test set.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator; verify against
# the installed version.
plot_confusion_matrix(best_svm,X_test,y_test,cmap=plt.cm.Blues)

In [None]:
# Draw the ROC curve on an explicitly created 7x5 axes. Previously
# plt.figure(figsize=...) was followed by plot_roc_curve without an `ax`
# argument; that helper opens its own figure in that case, so the requested
# size was not applied to the curve and an empty figure was left behind.
fig, ax = plt.subplots(figsize=(7, 5))
plot_roc_curve(best_svm, X_test, y_test, ax=ax)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--', color='black')

fig.tight_layout()
# NOTE(review): hard-coded, machine-specific output path.
fig.savefig(r'C:\Users\alanw\OneDrive\Varsity\Research\Paper Itself\LaTeX\images\svm_roc.png')
plt.show()

In [None]:
# Cohen's kappa between the held-out labels and the model's predictions,
# printed to two decimal places.
kappa = cohen_kappa_score(y_test, predictions)
print(f"Cohen's Kappa: {kappa:.2f}")

In [None]:
import pickle

In [None]:
# Persist the fitted model to disk; the file name is relative to the current
# working directory.
Pkl_Filename = "SVM_Final.pkl"

with open(Pkl_Filename, mode='wb') as model_file:
    pickle.dump(best_svm, model_file)

In [None]:
import dill
# Save the current interpreter session to 'svm_final.db' (relative to the
# current working directory) so it can be restored later.
dill.dump_session('svm_final.db')

In [None]:
import dill
# Restore a previously saved session from a hard-coded absolute path.
# NOTE(review): presumably meant to be run in a fresh kernel instead of
# re-running the notebook; under Run All it executes right after the dump
# in the preceding cell. Confirm the intent.
dill.load_session(r'C:\Users\alanw\OneDrive\Varsity\Research\Code\svm_final.db')