In [55]:
import numpy as np
# Load the binary-classification data: label vectors (y) and feature
# matrices (X) for the training and test splits, stored as comma-separated text.
# The file names carry no trailing whitespace, so the paths also resolve on
# platforms that do not silently strip trailing blanks from file names.
y_train = np.loadtxt("y_train_binary.csv", delimiter=",")
y_test = np.loadtxt("y_test_binary.csv", delimiter=",")
X_train = np.loadtxt("X_train_binary.csv", delimiter=",")
X_test = np.loadtxt("X_test_binary.csv", delimiter=",")
# .size of the label arrays gives the number of examples in each split.
print("Number of training examples=", y_train.size, ",Number of test examples=",y_test.size)

Number of training examples= 150 ,Number of test examples= 164


In [56]:
# Get frequencies of each class label in the training set.
# np.unique(..., return_counts=True) yields the tuple (sorted labels, counts).
train_count = np.unique(y_train, return_counts=True)
# Relative frequency = count of a label / total number of training labels.
freq = train_count[1] / train_count[1].sum()
print('(label,frequency):', list(zip(train_count[0], freq)))

(label,frequency): [(-1.0, 0.4533333333333333), (1.0, 0.5466666666666666)]


In [57]:
# Per-feature statistics of the training set (column-wise, axis=0).
mu = X_train.mean(axis=0)  # feature means
sd = X_train.std(axis=0, ddof=0)  # in-sample standard deviations (divide by N, not N-1)
def normalize(X, mean, sd):
    """Standardise the features of ``X`` using the supplied statistics.

    Each feature (last axis of ``X``) is shifted by the matching entry of
    ``mean`` and scaled by the matching entry of ``sd``.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features) or (n_features,)
        Data to normalise.
    mean : array_like, shape (n_features,)
        Per-feature offsets to subtract.
    sd : array_like, shape (n_features,)
        Per-feature scales to divide by.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / sd``, with the same shape as ``X``.

    Notes
    -----
    The ``mean`` argument is what gets subtracted; the function does not
    depend on any module-level variable. Zero entries in ``sd`` are not
    guarded against and produce inf/nan in the affected features.
    """
    # Broadcasting applies the per-feature shift/scale to every row without
    # building an (n_features x n_features) diagonal matrix.
    return (np.asarray(X) - np.asarray(mean)) / np.asarray(sd)

# Standardise both splits with the statistics (mu, sd) of the training split.
X_train_norm = normalize(X_train, mu, sd)
X_test_norm = normalize(X_test, mu, sd)

# Per-feature mean and variance (ddof=0) of the normalised test data.
means = X_test_norm.mean(axis=0)
# NOTE(review): `vars` shadows the Python builtin of the same name; the name is
# kept here because other cells may refer to it.
vars = X_test_norm.var(axis=0, ddof=0)

# One row per feature, columns = (mean, variance); written to CSV for later use.
mean_var = np.column_stack((means, vars))
np.savetxt("mean_var.csv", mean_var, delimiter=",")


In [58]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# RBF ("Gaussian") kernel SVM whose hyper-parameters are chosen by grid search.
gauss_svm = SVC(kernel="rbf")

# Candidate values: 7 log-spaced points each, C in [1e-2, 1e4], gamma in [1e-4, 1e2].
C = np.logspace(-2, 4, num=7)
gamma = np.logspace(-4, 2, num=7)
paramgrid = dict(C=C, gamma=gamma)

# Exhaustive search over the 7 x 7 grid, scored by classification accuracy.
gaussian_cv = GridSearchCV(
    estimator=gauss_svm,
    param_grid=paramgrid,
    scoring="accuracy",
    verbose=1,
)
# NOTE(review): the search is fitted on the raw X_train, not on X_train_norm
# computed earlier -- confirm this is intended (later scoring also uses raw data).
gaussian_cv.fit(X_train, y_train)

Fitting 5 folds for each of 49 candidates, totalling 245 fits


In [59]:
# Report the selected model and its 0-1 loss (1 - accuracy, since the grid
# search was configured with scoring="accuracy") on both splits.
print(f"Best parameters: {gaussian_cv.best_estimator_}")
print(f"Training loss: {1 - gaussian_cv.score(X_train, y_train)}")
print(f"Test loss: {1 - gaussian_cv.score(X_test, y_test)}")

Best parameters: SVC(C=1000.0, gamma=0.0001)
Training loss: 0.013333333333333308
Test loss: 0.33536585365853655


In [60]:
def count_support_vectors(C, X, y, gamma):
    """Fit an RBF-kernel SVC and count its bounded and free support vectors.

    A support vector is counted as *bounded* when the absolute value of its
    dual coefficient is at least ``C``; all remaining support vectors are
    counted as *free*.

    Parameters
    ----------
    C : regularisation parameter passed to ``SVC``.
    X, y : training data and labels the classifier is fitted on.
    gamma : RBF kernel coefficient passed to ``SVC``.

    Returns
    -------
    numpy.ndarray of shape (1, 3)
        A single row ``[C, n_bounded, n_free]``.
    """
    clf = SVC(C=C, kernel="rbf", gamma=gamma)
    clf.fit(X, y)

    # dual_coef_ holds the signed dual coefficients of the support vectors.
    dual = clf.dual_coef_
    at_bound = np.abs(dual) >= C
    n_bounded = at_bound.sum()
    n_free = dual.size - n_bounded

    return np.array([[C, n_bounded, n_free]])

# Values of C (1e-3 ... 1e3, 7 log-spaced points) for which bounded/free
# support vectors are counted.
Cs = np.logspace(-3, 3, 7)

# Collect one (1, 3) row [C, #bounded, #free] per value of C. gamma is fixed
# at 0.0001, the value shown for the best estimator in the grid-search output.
# Note: the loop variable rebinds the notebook-level name `C`.
sv_rows = []
for C in Cs:
    sv_rows.append(count_support_vectors(C, X_train, y_train, 0.0001))

# Stack the rows into a (len(Cs), 3) table, round to 3 decimals and save.
arr = np.round(np.vstack(sv_rows), 3)
np.savetxt("support_vectors.csv", arr, delimiter=",")  # Send to csv for later use
