## Sparse Vector Machine for K-Classes


In [3]:
%matplotlib inline
import numpy as np 
import sklearn.preprocessing
import sklearn.datasets
import pandas as pd
import sklearn.model_selection
import numpy.random
import math
import sklearn.metrics

In [4]:
#X, y = sklearn.datasets.load_iris(return_X_y=True)
X, y = sklearn.datasets.load_wine(return_X_y=True)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)
standard = sklearn.preprocessing.StandardScaler()
X_train = standard.fit_transform(X_train)
training_data = np.c_[X_train, y_train]#All of the features are continuous, so, no need to use one-hot encoder and we can directly standard normalize the features of the data set

X_test = standard.transform(X_test)
test_data = np.c_[X_test, y_test]
print(training_data.shape)
print(test_data.shape)
k = len(set(y_train))
y_train#It needs to be labeled from 0 to k


(133, 14)
(45, 14)


array([0, 1, 1, 2, 0, 1, 0, 0, 2, 2, 1, 1, 0, 1, 0, 2, 1, 1, 2, 0, 0, 0,
       2, 0, 0, 1, 2, 1, 0, 2, 1, 0, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 2, 1,
       1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 0, 1, 2, 2, 1, 2, 1, 1,
       1, 0, 0, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2, 1, 1, 1, 2, 2,
       1, 0, 0, 1, 2, 2, 0, 1, 2, 2, 2, 2, 1, 0, 1, 0, 2, 0, 0, 1, 0, 0,
       2, 1, 0, 2, 2, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 1,
       1])

In [10]:
class SVM2CLASS(object):

    def __init__(self, X_train, y_train, C = 10, tol = 0.001, max_passes = 5, passes = 0, classType = 0):
        self.classType = classType
        self.X_train = X_train
        self.y_train = y_train
        self.C = C
        self.tol = tol
        self.max_passes = max_passes
        self.passes = passes
        self.b = 0
        self.alphas = np.zeros((self.X_train.shape[0], 1))#need to be of size nx1
        self.coefficients = ()
        self.kernel_type ={"poly": self.polynomial_kernel, "gauss": self.gaussian_kernel}
        self.kernel_parameters = []
        self.kernel_choice = ""

    def polynomial_kernel(self, x_i, x_j, a):
        return np.power(np.dot(x_i.T, x_j) + a[0], a[1])

    def gaussian_kernel(self, x, x_star, sigma):
        return np.exp(np.divide(-1*(np.linalg.norm(x-x_star)**2), 2*sigma**2))
    
    def SMO(self, kernel_choice, parameters):

        choices = np.arange(0, self.y_train.shape[0])
        count = 0
        self.kernel_choice = kernel_choice
        #Better to construct the Kernel matrix from the scratch for efficiency, but this method will prevent the simplified SMO from working on large datasets, like, the mnist dataset 
        if kernel_choice == "poly":
            exponent = parameters[0]
            intercept = parameters[1]
            self.kernel_parameters = [intercept, exponent] 
            
            K = np.zeros((self.X_train.shape[0], self.X_train.shape[0]))
            for i in range(0, self.X_train.shape[0]):
                for j in range(0, self.X_train.shape[0]):
                    K[i, j] = self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[j, :], self.kernel_parameters)
            assert(np.any(np.linalg.eig(K)[0] == 0)  == False)#Test for PSD

        elif kernel_choice == "gauss":
            self.kernel_parameters = 2

            K = np.zeros((self.X_train.shape[0], self.X_train.shape[0]))
            for i in range(0, self.X_train.shape[0]):
                for j in range(0, self.X_train.shape[0]):
                    K[i, j] =  self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[j, :], self.kernel_parameters)
            assert(np.linalg.det(K) != 0)#Test for PSD
        
        else:
            print("Wrong entry")
            return -1

        while(self.passes <= self.max_passes):
        #begin while
            num_changed_alphas = 0
            for i in range(0, self.X_train.shape[0]):
            #begin for
                #compute kernel function for every iteration from scratch
                #f_x_i = np.sum( list(map(lambda x, alpha, y: alpha * y * self.kernel_type[self.kernel_choice](x, self.X_train[i, :], self.kernel_parameters) , self.X_train, self.alphas, self.y_train) ) )#instead of calculating the Kernel matrix from the start, we will calculate the inner product with each iteration in order to mitigate the problem of having a large kernel matrix that will raise an exception
                #print(f_x_i)
                f_x_i = np.sum(self.alphas.reshape(-1, 1) * (self.y_train.reshape(-1, 1) *  K[:, i].reshape(-1, 1)).reshape(-1, 1)) 
                E_i = f_x_i + self.b - self.y_train[i]
                #print(f_x_i)
                #Check if we satisfy the condition for the dual problem
                if (((self.y_train[i] * E_i) < -self.tol) and (self.alphas[i] < self.C)) or (((self.y_train[i] * E_i) > self.tol) and (self.alphas[i] > 0)):
                #begin if
                    j = np.random.choice( list(filter(lambda v: v == v, list(map(lambda c: c if c != i else np.nan, choices)))) ) 
                    #only nan will generate False at its equlaity, and the filter object will end up filtering out these wrong values
                    assert( i != j)
                    #f_x_j = np.sum( list(map(lambda x, alpha, y: alpha * y * self.kernel_type[self.kernel_choice](x, self.X_train[j, :], self.kernel_parameters) , self.X_train, self.alphas, self.y_train) ) )
                    #print(f_x_j)
                    f_x_j = np.sum(self.alphas.reshape(-1, 1) * (self.y_train.reshape(-1, 1) * K[:, j].reshape(-1, 1)).reshape(-1, 1)) 
                    #print((alphas.reshape(-1, 1) * (y_train.reshape(-1, 1) * K[i, :].reshape(-1, 1)).reshape(-1, 1)).shape)
                    #print(f_x_j)
                    E_j = f_x_j + self.b - self.y_train[j]

                    alpha_i_old = self.alphas[i].copy()#Needs to copy the value because otherwise they would be pointing to the same address
                    alpha_j_old = self.alphas[j].copy()

                    #Computing L and H
                    if(self.y_train[i] != self.y_train[j]):
                        L = max(0, self.alphas[j] - self.alphas[i])
                        H = min(self.C, self.C + self.alphas[j] - self.alphas[i])
                    else:
                        L = max(0, self.alphas[j] + self.alphas[i] - self.C)
                        H = min(self.C, self.alphas[j] + self.alphas[i])
                    
                    #Checking if L=H which indicate that the alpha would certainly wouldn't change 
                    if L == H:
                        continue

                    #eta = 2 * self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[j, :], self.kernel_parameters) - self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[i, :], self.kernel_parameters) - self.kernel_type[self.kernel_choice](self.X_train[j, :], self.X_train[j, :], self.kernel_parameters)
                    eta = 2 * K[i, j] - K[i, i] - K[j, j] 

                    #eta = 0 if the similarity between x_i and x_j is as the combination of the similarity of x_i with itself and same goes for x_j, will cause an exception to happend and this indicate we are dealing with the same x.
                    #eta > 0 if if the similarity between x_i and x_j is higher than the combination of the similarity of x_i with itself and same goes for x_j, so, this update step would have little effect on the converging to the optimal minimum and may leads to diverging the algorithm patht to a worse path
                    #eta < 0 there are small simialrity between x_i and x_j, so, this would help in discoverign the interaction of those observations in the feature space

                    if eta >= 0:
                        #print("The two vectors are too similar")
                        continue

                    alpha_j_clip = alpha_j_old - (1/eta) * self.y_train[j] * (E_i - E_j)

                    if alpha_j_clip > H:
                        self.alphas[j] = H
                    elif alpha_j_clip < L:
                        self.alphas[j] = L
                    else:
                        self.alphas[j]  = alpha_j_clip
                    
                    #print(alphas[j], alpha_j_old)
                    #Check if it is worth to update alpha_i
                    if(abs(self.alphas[j] - alpha_j_old) < 1e-3):
                        #print("No noticeable changes happened to alpha")
                        continue
                    self.alphas[i] = alpha_i_old + self.y_train[i] * self.y_train[j] * (alpha_j_old - self.alphas[j])#The signs changed from  the negative sign for updating alpha_i

                    ##KKT constrains convergence test
                    #b1 = self.b - E_i - self.y_train[i]*(self.alphas[i] - alpha_i_old) * self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[i, :], self.kernel_parameters) - self.y_train[j]*(self.alphas[j] - alpha_j_old) * self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[j, :], self.kernel_parameters)
                    b1 = self.b - E_i - self.y_train[i] * (self.alphas[i] - alpha_i_old) * K[i, i] - self.y_train[j] * (self.alphas[j] - alpha_j_old) * K[i, j]

                    #b2 = self.b - E_j - self.y_train[i]*(self.alphas[i] - alpha_i_old) * self.kernel_type[self.kernel_choice](self.X_train[i, :], self.X_train[j, :], self.kernel_parameters) - self.y_train[j]*(self.alphas[j] - alpha_j_old) * self.kernel_type[self.kernel_choice](self.X_train[j, :], self.X_train[j, :], self.kernel_parameters)
                    b2 = self.b - E_j - self.y_train[i] * (self.alphas[i] - alpha_i_old) * K[i, j] - self.y_train[j] * (self.alphas[j] - alpha_j_old) * K[j, j]

                    if (self.alphas[j] > 0) and (self.alphas[j] < self.C):
                        self.b = b2
                    elif (self.alphas[i] > 0) and (self.alphas[i] < self.C):
                        self.b = b1
                    else:
                        self.b = (b1 + b2)/2
                    
                    num_changed_alphas  =  num_changed_alphas + 1
                #end if
            #end for
            print(f"class:{self.classType}, count:{count}, passes:{self.passes}, max_passes:{self.max_passes}, b:{self.b}")
            count+=1
            if(num_changed_alphas == 0):
                self.passes = self.passes + 1
            else:
                self.passes = 0
        #end while
        #Only store in the memory the support vectors
        support_indeces = np.argwhere(self.alphas != 0)[:, 0]
        support_vectors = self.X_train[support_indeces, :]
        support_alphas = self.alphas[support_indeces]
        support_target = self.y_train[support_indeces]
        self.importantParameters = (support_vectors, support_alphas, support_target, self.b)

        return self.importantParameters

    def prediction_dataset(self, X):
        pred = list(map(lambda x: self.prediction(x), X))
        return pred

    def prediction(self, x):
        t1 = np.sum( list(map(lambda x1, alpha, y: y * alpha * self.kernel_type[self.kernel_choice](x, x1, self.kernel_parameters), self.importantParameters[0], self.importantParameters[1], self.importantParameters[2])) )
        pred = t1 + self.b
        #print(pred)
        if pred >=0:
            return 1
    
        return -1

In [11]:
class SVM_kCLasses(object):
    
    #To simplify things I will assume that I will use the same kernel function for all of the combination of 1 vs K models
    def __init__(self, X_train, y_train, C = 10, tol = 0.001, max_passes = 5, kernel_choice = "poly", parameters=[1, 1], k=None):
        assert(k != None)
        self.C = C
        self.tol = tol
        self.max_passes = max_passes
        self.kernel_choice = kernel_choice
        self.parameters = parameters
        self.K = k 
        self.alphas = []
        self.b = []
        self.coefficients = []
        self.classes = []
        self.models = []
        self.X_train = X_train
        for c in range(0, self.K):
            temp = np.array(list(map(lambda y: 1 if y == c else -1, y_train)))
            self.classes.append(temp)
        
    def fit(self):
        for k in range(0, self.K):
            self.models.append(SVM2CLASS(self.X_train, self.classes[k], self.C, self.tol, self.max_passes, 0, k))
            support_vectors, support_alphas, support_target, b = self.models[k].SMO(self.kernel_choice, self.parameters)
            self.alphas.append(support_alphas)
            self.b.append(b)
            self.coefficients.append((support_vectors, support_alphas, support_target, b))
        return self.coefficients

    def prediction(self, X):
        pred = np.zeros((X.shape[0], self.K))
        for k in range(0, self.K):
            pred[:, k] = self.models[k].prediction_dataset(X)
            
        final_prediction = np.argmax(pred, axis=1)#along the rows
        return final_prediction

In [12]:
exponent = 2
intercept =1
svm_model =SVM_kCLasses(X_train, y_train, C = 10, tol = 0.001, max_passes = 5, kernel_choice = "poly", parameters=[exponent, intercept], k=k)
support_vectors, support_alphas, support_target = svm_model.fit()
pred = svm_model.prediction(X_train)
print("Performance on the training set")
print(sklearn.metrics.confusion_matrix(y_train, pred))

class:0, count:0, passes:0, max_passes:5, b:[0.0588624]
class:0, count:1, passes:0, max_passes:5, b:[-0.04423419]
class:0, count:2, passes:0, max_passes:5, b:[0.02909278]
class:0, count:3, passes:0, max_passes:5, b:[-0.18266851]
class:0, count:4, passes:0, max_passes:5, b:[0.29532926]
class:0, count:5, passes:0, max_passes:5, b:[0.16962927]
class:0, count:6, passes:0, max_passes:5, b:[0.12942534]
class:0, count:7, passes:0, max_passes:5, b:[0.15209673]
class:0, count:8, passes:0, max_passes:5, b:[-0.4153345]
class:0, count:9, passes:0, max_passes:5, b:[0.19504563]
class:0, count:10, passes:0, max_passes:5, b:[0.09146349]
class:0, count:11, passes:0, max_passes:5, b:[0.21389233]
class:0, count:12, passes:0, max_passes:5, b:[0.12600444]
class:0, count:13, passes:0, max_passes:5, b:[0.35687334]
class:0, count:14, passes:0, max_passes:5, b:[0.1434446]
class:0, count:15, passes:0, max_passes:5, b:[0.13019911]
class:0, count:16, passes:0, max_passes:5, b:[0.20984433]
class:0, count:17, passe

In [13]:
pred = svm_model.prediction(X_test)
print("Performance on the test set")
print(sklearn.metrics.confusion_matrix(y_test, pred))

Performance on the test set
[[15  0  0]
 [ 1 17  0]
 [ 0  1 11]]


### References 
* Chapter 1, chapter 6 and Chapter 7 from Bishop, C. (2006). Pattern Recognition and Machine Learning. Cambridge: Springer.
* Andrew Ng, Lec 6: (https://www.youtube.com/watch?v=qyyJKd-zXRE)
* Andrew Ng, Lec 7: (https://www.youtube.com/watch?v=s8B4A5ubw6c)
* Andrew Ng, Lec 8: (https://www.youtube.com/watch?v=bUv9bfMPMb4)
* Simplified Sequential Minimal Optimization: (https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiRlObmw5_qAhW7ShUIHSjJAbYQFjAAegQIAhAB&url=http%3A%2F%2Fcs229.stanford.edu%2Fmaterials%2Fsmo.pdf&usg=AOvVaw201bQxVZY0MmUn_gGAu5O8)
