In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the preprocessed dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='preprocess.csv')
df.head(n=5)

In [None]:
# Experiment configuration.
n = 50  # NOTE(review): not referenced by any cell visible here -- confirm before removing
random_state = 42  # seed used for the train/test split below

# Features are every column except the target column 'prob'.
X = df.loc[:, df.columns != 'prob']
y = df['prob']

# Hold out 30% of the rows for testing. The seed comes from `random_state`
# (previously a second hard-coded 42 that silently ignored the setting above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)

# Fit the [-1, 1] min-max scaler on the training split only, then apply that
# same fitted transform to both splits so no test statistics enter the fit.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Linear

In [None]:
# Baseline: linear-kernel SVC with library-default hyperparameters.
# (A LinearSVC() variant was tried here earlier and is left disabled.)
knc = SVC(kernel='linear').fit(X_train, y_train)
pred = knc.predict(X_test)

# Report test-set metrics for the baseline model.
print(
    "Confusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)
print(
    "\nF1 score on test set: ",
    sk.metrics.f1_score(y_true=y_test, y_pred=pred, average='micro'),
)

In [None]:
# Candidate C values: ten powers of ten from 1e-3 to 1e6, so the search
# spans several orders of magnitude.
Cs = np.logspace(start=-3, stop=6, num=10, base=10.0)

# Grid search over C for the linear kernel: 5-fold CV scored with micro-F1,
# run on all available cores.
param_grid = dict(C=Cs)
grid_search = GridSearchCV(
    estimator=SVC(kernel='linear'),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_micro',
    verbose=10,
)
grid_search.fit(X_train, y_train)

# Mean 5-fold cross-validation F1 score for each C value (plotted below).
scores = grid_search.cv_results_['mean_test_score']

In [None]:
# Mean CV F1 score against C (log-scaled x axis), saved as a PDF figure.
plt.semilogx(Cs, scores)
plt.xlabel('C')
plt.ylabel('F1 score')
plt.savefig('../figures/svm_linear_C_cv.pdf')

# Re-run 5-fold CV on the training data with the best C from the grid search.
parval = grid_search.best_params_
cvacc = cross_val_score(
    SVC(C=parval['C'], kernel='linear'),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
    verbose=10,
)
print('F1 score 5-fold cross on train data= ', cvacc.mean())

# Let's apply the best C parameter found to the test set

In [None]:
# Refit the linear SVC on the full training split with the selected C and
# evaluate it on the held-out test split.
svcl = SVC(C=parval['C'], kernel='linear').fit(X_train, y_train)
pred = svcl.predict(X_test)

print(
    "\nConfusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)
print(
    "\nF1 score on test set: ",
    sk.metrics.f1_score(y_true=y_test, y_pred=pred, average='micro'),
)
print("\nBest value of parameter C found: ", parval)

# Support-vector summary: total count, how many dual coefficients have a
# magnitude exactly equal to C (reported as "have slacks"), and the share of
# training rows that are support vectors.
print(
    "\nNumber of supports: ",
    svcl.n_support_.sum(),
    "(",
    (np.abs(svcl.dual_coef_) == parval['C']).sum(),
    "of them have slacks)",
)
print("Prop. of supports: ", svcl.n_support_.sum() / X_train.shape[0])

## Poly

In [None]:
# Baseline: degree-2 polynomial kernel, all other hyperparameters at defaults.
svcp2 = SVC(kernel='poly', degree=2).fit(X_train, y_train)
pred = svcp2.predict(X_test)

print(
    "Confusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)

In [None]:
# Same C grid as before: ten powers of ten from 1e-3 to 1e6.
Cs = np.logspace(start=-3, stop=6, num=10, base=10.0)

# 5-fold, micro-F1 grid search over C for the degree-2 polynomial kernel.
param_grid = dict(C=Cs)
grid_search = GridSearchCV(
    estimator=SVC(kernel='poly', degree=2),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_micro',
    verbose=10,
)
grid_search.fit(X_train, y_train)

# Mean cross-validation F1 score for each C value.
scores = grid_search.cv_results_['mean_test_score']

In [None]:
# Mean CV F1 score against C (log-scaled x axis) for the degree-2 kernel.
plt.semilogx(Cs, scores)
plt.xlabel('C')
plt.ylabel('F1 score')
plt.savefig('../figures/svm_poly_C_cv.pdf')

In [None]:
# Best C found by the degree-2 grid search.
parval = grid_search.best_params_

# 5-fold CV micro-F1 on the training data for that C.
cvacc = cross_val_score(
    SVC(kernel='poly', degree=2, C=parval['C']),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
)
print('F1 score 5-fold cross on train data= ', cvacc.mean())

# Refit on the whole training split and predict the held-out test split.
svcp2 = SVC(kernel='poly', degree=2, C=parval['C']).fit(X_train, y_train)
pred = svcp2.predict(X_test)

In [None]:
# Test-set metrics for the tuned degree-2 polynomial SVC.
print(
    "\nConfusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)
print(
    "\nF1 score on test set: ",
    sk.metrics.f1_score(y_true=y_test, y_pred=pred, average='micro'),
)
print("\nBest combination of parameters found: ", parval)

# Support-vector summary; "have slacks" counts dual coefficients whose
# magnitude is exactly C.
print(
    "\nNumber of supports: ",
    svcp2.n_support_.sum(),
    "(",
    (np.abs(svcp2.dual_coef_) == parval['C']).sum(),
    "of them have slacks)",
)
print("Prop. of supports: ", svcp2.n_support_.sum() / X_train.shape[0])

## Poly3

In [None]:
# Baseline: degree-3 polynomial kernel, all other hyperparameters at defaults.
svcp3 = SVC(kernel='poly', degree=3).fit(X_train, y_train)
pred = svcp3.predict(X_test)

print(
    "Confusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)

In [None]:
# Same C grid as before: ten powers of ten from 1e-3 to 1e6.
Cs = np.logspace(start=-3, stop=6, num=10, base=10.0)

# 5-fold, micro-F1 grid search over C for the degree-3 polynomial kernel.
param_grid = dict(C=Cs)
grid_search = GridSearchCV(
    estimator=SVC(kernel='poly', degree=3),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_micro',
    verbose=10,
)
grid_search.fit(X_train, y_train)

# Mean cross-validation F1 score for each C value.
scores = grid_search.cv_results_['mean_test_score']

In [None]:
# Mean CV F1 score against C (log-scaled x axis) for the degree-3 kernel.
plt.semilogx(Cs, scores)
plt.xlabel('C')
plt.ylabel('F1 score')
plt.savefig('../figures/svm_poly3_C_cv.pdf')

In [None]:
# Best C found by the degree-3 grid search.
parval = grid_search.best_params_

# 5-fold CV micro-F1 on the training data for that C.
cvacc = cross_val_score(
    SVC(kernel='poly', degree=3, C=parval['C']),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
)
print('F1 score 5-fold cross on train data= ', cvacc.mean())

# Refit on the whole training split and predict the held-out test split.
svcp3 = SVC(kernel='poly', degree=3, C=parval['C']).fit(X_train, y_train)
pred = svcp3.predict(X_test)

In [None]:
# Test-set metrics for the tuned degree-3 polynomial SVC.
print(
    "\nConfusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)
print(
    "\nF1 score on test set: ",
    sk.metrics.f1_score(y_true=y_test, y_pred=pred, average='micro'),
)
print("\nBest combination of parameters found: ", parval)

# Support-vector summary; "have slacks" counts dual coefficients whose
# magnitude is exactly C.
print(
    "\nNumber of supports: ",
    svcp3.n_support_.sum(),
    "(",
    (np.abs(svcp3.dual_coef_) == parval['C']).sum(),
    "of them have slacks)",
)
print("Prop. of supports: ", svcp3.n_support_.sum() / X_train.shape[0])

## RBF Kernel

In [None]:
# Baseline for this section: SVC with every hyperparameter left at its default
# (this relies on the library's default kernel rather than naming one).
svcrbf = SVC().fit(X_train, y_train)
pred = svcrbf.predict(X_test)

print(
    "Confusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)

In [None]:
# Candidate values for each hyperparameter. If the best score lands on an edge
# of either range, widen that range and search again.
gammas = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
Cs = np.logspace(start=-1, stop=6, num=8, base=10.0)

# Joint 5-fold, micro-F1 grid search over (C, gamma) with a default SVC.
param_grid = {'C': Cs, 'gamma': gammas}
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_micro',
    verbose=10,
)
grid_search.fit(X_train, y_train)
parval = grid_search.best_params_

# Arrange the mean CV F1 scores as a (len(C), len(gamma)) matrix for the heat
# map. This relies on the grid being enumerated with C as the slow index and
# gamma as the fast one.
scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape((len(Cs), len(gammas)))

In [None]:
# Heat map of the mean CV F1 score: one row per C value, one column per gamma.
plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar(label='F1 score')
# The `b` keyword of grid() was renamed to `visible` in newer Matplotlib and
# `b=None` now raises there. Passing None positionally keeps the original
# behaviour (toggle the grid state) on both old and new releases.
# NOTE(review): if the intent is simply "no grid", use plt.grid(False) instead.
plt.grid(None)
# Label each column/row with the actual hyperparameter value it represents.
plt.xticks(np.arange(len(param_grid['gamma'])), param_grid['gamma'], rotation='vertical')
plt.yticks(np.arange(len(param_grid['C'])), param_grid['C'])
plt.savefig("../figures/svm_rbf_C_cv.pdf")

# Best (C, gamma) pair from the grid search.
parval = grid_search.best_params_
print("\nBest combination of parameters found: ", parval)

# 5-fold CV micro-F1 on the training data for that pair.
cvacc = cross_val_score(
    SVC(C=parval['C'], gamma=parval['gamma']),
    X=X_train,
    y=y_train,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
)
print('\nF1 score 5-fold cross on train data= ', cvacc.mean())

In [None]:
# Refit with the selected (C, gamma) on the full training split and evaluate
# on the held-out test split.
svcrbf = SVC(C=parval['C'], gamma=parval['gamma']).fit(X_train, y_train)
pred = svcrbf.predict(X_test)

print(
    "Confusion matrix on test set:\n",
    sk.metrics.confusion_matrix(y_true=y_test, y_pred=pred),
)
print(
    "\nAccuracy on test set: ",
    sk.metrics.accuracy_score(y_true=y_test, y_pred=pred),
)
print(
    "\nF1 score on test set: ",
    sk.metrics.f1_score(y_true=y_test, y_pred=pred, average='micro'),
)

# Support-vector summary; "have slacks" counts dual coefficients whose
# magnitude is exactly C.
print(
    "\nNumber of supports: ",
    svcrbf.n_support_.sum(),
    "(",
    (np.abs(svcrbf.dual_coef_) == parval['C']).sum(),
    "of them have slacks)",
)
print("Prop. of supports: ", svcrbf.n_support_.sum() / X_train.shape[0])