In [77]:
import numpy as np
import pandas as pd
import numpy.linalg 
from tqdm import tqdm
import numba
from numba import njit,vectorize, jit
import time
import scipy

In [78]:
X_train = pd.read_csv('data/Xtr0_mat100.csv', header=None, delimiter = ' ').to_numpy()
X_test = pd.read_csv('data/Xte0_mat100.csv', header=None, delimiter = ' ').to_numpy()
y_train = pd.read_csv('data/Ytr0.csv')['Bound'].to_numpy()
y_train_rademacher = 2*y_train-1

In [79]:
!pwd

/home/bastien/Documents/ENS/KM/Kernel_Methods/machine-learning-with-kernel-methods-2021


In [80]:
@njit
def GaussianKernel(x,y,sig2 = 1):
    return np.exp(-numpy.linalg.norm(x-y)**2/(2*sig2))

In [81]:
@njit
def to_Kernel_train(X, Kernel, sig2 = 1): 
    length = X.shape[0]
    mat_K = np.zeros((length,length))
    for i in range(length):
        x_i = X[i,:]
        for j in range(i,length): 
            x_j = X[j,:]
            value = Kernel(x_i,x_j,sig2)
            mat_K[i,j] = value
            mat_K[j,i] = value 
    return mat_K

@njit 
def to_Kernel_test(Xtrain,Xtest,Kernel,sig2=1):
    length_train = Xtrain.shape[0]
    length_test = Xtest.shape[0]
    bimat_K = np.zeros((length_train,length_test))
    for i in range(length_train):
        x_i = Xtrain[i,:]
        for j in range(length_test): 
            x_j = Xtest[j,:]
            value = Kernel(x_i,x_j,sig2)
            bimat_K[i,j] = value
    return bimat_K


lam = 0
sig2 = 1
Kernel_train = to_mat_K(X_train,GaussianKernel, 1)
Kernel_test = to_bimat_K(X_train,X_test,GaussianKernel)
def standardize(K): 
    U = np.full(K.shape,1/K.shape[0])
    I = np.eye(K.shape[0])
    return (I-U)@K@(I-U)

In [82]:
#@vectorize
def loss(u): 
    return np.log(1+np.exp(-u))
def sigmoid(u): 
    return 1/(1+np.exp(-u))


def grad_loss(u): 
    return -sigmoid(-u)

def hess_loss(u): 
    return sigmoid(u)*sigmoid(-u)

def J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = alpha.shape[0]
    regularizer = lam/2*alpha@mat_K@alpha
    vect = mat_K@alpha
    somme = 1/n*np.sum(loss(y*vect))
    return somme+regularizer
   
def grad_J(alpha, y = y_train, mat_K = mat_K, lam = lam): 
    n = y.shape[0]
    vect_P_alpha = grad_loss(y*(mat_K@alpha))
    return 1/n*mat_K@(vect_P_alpha*y)+ lam*mat_K@alpha

def hess_J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = mat_K.shape[0]
    vect_W = hess_loss(y*(mat_K@alpha))
    return 1/n*mat_K +lam*mat_K

def Kernel_logistic_reg_fit(X= X_train, y = y_train, mat_K = mat_K, lam = lam, Niter =20):
    alpha = 0.00000*np.random.randn(X.shape[0])
    #alpha = np.ones(2000)
    mat_K = standardize(mat_K)
    lr = 5
    for i in tqdm(range(Niter)): 
        #inv = np.linalg.inv(hess_J(alpha, mat_K = mat_K))
        #alpha-= lr*inv@grad_J(alpha ,mat_K = mat_K)#, mat_K= mat_K)
        alpha-= lr*grad_J(alpha ,mat_K = mat_K)
        '''
        if i%1 ==0 : 
            print('alpha :', alpha)
            print('J :',J(alpha,mat_K = mat_K))
            print('grad :', grad_J(alpha,mat_K = mat_K))
    print('alpha_end :', alpha)
    print('J_end :',J(alpha,mat_K = mat_K))
    print('grad_end :', grad_J(alpha,mat_K = mat_K))'''
    return alpha

In [83]:
#1e-8 marche bien pour lambda
def fit_KRR(K_train,lam,y):
    #mat_K = standardize(mat_K) #marche pas si on standardise 
    n = K_train.shape[0]
    full_mat = K_train +n*lam*np.eye(n)
    alpha = np.linalg.solve(full_mat,y)
    return alpha

In [84]:
alpha_KRR = fit_KRR(mat_K,lam,y_train)
np.max(np.round(alpha_KRR@mat_K,3)-y_train)

0.0

In [85]:
def fit_WKRR(K_train,vect_W,lam,y): 
    '''
    Compute the Weighted Kernel Redge Regression. the Formula is given in the course. 
    The code is optimized, we do not take the diagonal matrix of the square root of W. Instead, 
    we only compute some np.multiply stuff. 
    
    args : 
    
            K_train : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            vect_W : the vector that contains the weight associated to each sample. here we need that all the 
            coefficient of this vector is 0. Otherwise we won't be able to compute the inverse of the square root
            lam : regularization factor 
            y : the vector we train on 
    
    returns :
            
            the vector alpha that satisfy the formula in the course. 
    alpha then needs to be transformed to a function in order to fit the data.
    '''
    min_W = np.min(vect_W)
    if (min_W < 0) or (min_W == 0) : 
        print('Non invertible Matrix W ')
    n = K_train.shape[0]
    vect_sqrt_W = np.sqrt(vect_W) # the square root of the original vector
    vect_neg_sqrt_W = 1/vect_sqrt_W # the negative square root of the original vector
    b = np.multiply(vect_sqrt_W,y) 
    big_mat = np.multiply(np.multiply(vect_sqrt_W.reshape(-1,1),K_train), vect_sqrt_W) +n*lam*np.eye(n)
    A = np.multiply(vect_neg_sqrt_W,big_mat)
    return scipy.linalg.solve(A,b)

vect_W_init = np.full(mat_K.shape[0],1)#/mat_K.shape[0])
fit_WKRR(Kernel_train,vect_W_init,lam,y_train)


array([-27046.69790471,  27039.37656427,   9038.83584669, ...,
       -17518.73700749,  -9465.23463377,  -5803.08970443])

In [86]:


def IRLS(K, y, alpha):
        """
        Iterative step to update alpha when training the classifier
        :param K: np.array, kernel
        :param y: np.array, labels
        :param alpha: np.array
        :return: - W: np.array
                 - z: np.array
        """
        m = np.dot(K, alpha)
        W = sigmoid(m) * sigmoid(-m)
        z = m + y/sigmoid(-y*m)
        return W, z

def WKRR_af(K, W, z):
        """
        Compute new alpha
        :param K: np.array, kernel
        :param W: np.array
        :param z: np.array
        :return: np.array, new alpha
        """
        n = K.shape[0]
        W_s = np.diag(np.sqrt(W))
        A = np.dot(np.dot(W_s, K), W_s) + n * lam * np.eye(n)
        A = np.dot(np.dot(W_s, np.linalg.inv(A)), W_s)
        return np.dot(A, z)
    
    
def recoding_KRL(mat_K,lam,y, max_iter = 10): 
    n = mat_K.shape[0]
    old_alpha = 0*np.ones(n)
    for i in range(max_iter): 
        W,z = IRLS(mat_K,y,old_alpha)
        alpha = fit_WKRR(mat_K, W, lam, z)
        f = f_from_alpha(alpha, GaussianKernel, X_train)
        old_alpha = np.copy(alpha)
    return alpha



In [87]:
def compute_m(mat_K,alpha): 
    return mat_K@alpha

def compute_P(y,m): 
    return -sigmoid(-np.multiply(y,m))

def compute_W(m):
    return np.multiply(sigmoid(m),sigmoid(m))

def compute_z(y,m): 
    return m + np.multiply(y,1/sigmoid(-np.multiply(y,m)))

In [88]:
def fit_KLR_IRLS(mat_K, lam, y, max_iter = 10): 
    '''
    Fonction qui optimise la loss définie par la la Kernel Logistic Regression. 
    
    args : 
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            
            lam : regularization factor 
            
            y : the vector we train on. Must be -1 or 1 
            
            max_iter : the maximum number of iteration we are ready to do 
    returns : 
            the vector alpha optimized 
            alpha then needs to be transformed to a function in order to fit the data.
    '''
    alpha = np.zeros(mat_K.shape[0])
    m = compute_m(mat_K,alpha)    
    W = compute_W(m)
    z = compute_z(y,m)
    for i in range(max_iter): 
        alpha = fit_WKRR(mat_K,W,lam,z)
        m = compute_m(mat_K,alpha)
        W = compute_W(m)
        z = compute_z(y,m)
        f = f_from_alpha(alpha, GaussianKernel, X_train)
        for i in range(10): 
            print(i ,' :',np.round(f(X_train[i,:]),3), ' y :', y_train[i])
    return alpha



def cross_val_split(Xtrain,ytrain, cv):
    idx = np.arange(Xtrain.shape[0])
    np.random.shuffle(idx) # we shuffle the indices to get random samples
    sample_size = Xtrain.shape[0]//cv
    Xtrainsplit = []# a list that wil contain each X_train vector. Each element will be smaller than 
                    #X_train. If cv = 3 for example, the size (on the x axis) will be 2/3 the original size 
    ytrainsplit = []
    Xvalsplit = []
    yvalsplit = []
    for i in range(cv-1): 
        #we add the new indices. Here, takes the original vector and returns the vector without the 
        # indices passes in argument 
        Xtrainsplit.append(np.delete(Xtrain,idx[i*sample_size:(i+1)*sample_size],axis = 0))
        ytrainsplit.append(np.delete(ytrain,idx[i*sample_size:(i+1)*sample_size],axis = 0))
        
        # we add the rest 
        # note that here, we keep the same labels for X ( we do not shuffle independantly X and y)
        Xvalsplit.append( Xtrain[idx[i*sample_size:(i+1)*sample_size],:])
        yvalsplit.append(ytrain[idx[i*sample_size:(i+1)*sample_size]])
    # we add the last round. It is different since we can't take float proportion of an array, 
    # we have to take an integer. So, here we just add what remains. 
    Xtrainsplit.append(np.delete(Xtrain,idx[(cv-1)*sample_size:],axis = 0))
    ytrainsplit.append(np.delete(ytrain,idx[(cv-1)*sample_size:],axis = 0))
    Xvalsplit.append( Xtrain[idx[(cv-1)*sample_size:],:])
    yvalsplit.append(ytrain[idx[(cv-1)*sample_size:]])
    return Xtrainsplit,Xvalsplit,ytrainsplit,yvalsplit

In [89]:
class estimator(): 
    def __init__(self , Kernel, lam = 1e-8, sig2 = 1 ): 
        self.Kernel = Kernel
        self.lam = lam 
        self.sig2 = sig2 
        self.mat_K = None 
        self.alpha = None 
        
    def predict_proba(self,Xtest): 
        if (self.alpha == None).any()==True  : 
            print("Il faut d'abord fitter les données")
        else : 
            mat_K_test = to_bimat_K(self.X_train,Xtest,self.Kernel)
            return  sigmoid(self.alpha@mat_K_test)
    
    def predict(self,Xtest): 
        if (self.alpha == None).any()==True : 
            print("Il faut d'abord fitter les données")
        else : 
            prob = self.predict_proba(Xtest)
            return prob>0.5
    def cross_val(self, Xtrain,ytrain,cv): 
        mistake = 0
        Xtrainsplit,Xvalsplit,ytrainsplit,yvalsplit = cross_val_split(Xtrain,ytrain,cv)
        for xtrain,xval,ytrain,yval in tqdm(zip(Xtrainsplit,Xvalsplit, ytrainsplit, yvalsplit)):
            self.fit(xtrain,ytrain)
            pred = self.predict(xval)
            mistake+=np.sum(np.abs(pred-yval))
        print('Pourcentage of errors : ', mistake/Xtrain.shape[0])
        return mistake/Xtrain.shape[0]

In [90]:
#lam = 1e-8 est bien
class KRR(estimator): 
    def __init__(self , Kernel, lam = 1e-8, sig2 = 1 ): 
        super(KRR, self).__init__(Kernel, lam,sig2)
    def fit(self, X, y): 
        self.X_train = X
        if self.Kernel == GaussianKernel : 
            self.mat_K = to_mat_K(X, self.Kernel,self.sig2)
        self.alpha = fit_KRR(self.mat_K, self.lam, y)

In [91]:

for i in range(1) : 
    KRR_estim = KRR( Kernel = GaussianKernel, lam = 10**(-13))
    KRR_estim.cross_val(X_train,y_train,cv=10)

10it [00:10,  1.05s/it]

Pourcentage of errors :  0.4805





In [97]:
regressor = KRR(GaussianKernel)
regressor.fit(X_train,y_train_rademacher)

In [99]:
np.max(regressor.predict(X_train)-y_train)



0

In [231]:
#sanity check for the cross_val function 
'''
test = np.eye(9)
print('We test with this matrix : ', test)
testy = np.arange(9)
X_train_split,X_val_split, y_train_split, y_val_split = cross_val_split(test,testy, cv = 3) 

for i in range(3):
    print('train : ')
    print(X_train_split[i], y_train_split[i])
    print('val : ')
    print(X_val_split[i],y_val_split[i])'''

We test with this matrix :  [[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[[0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[2 3 5 6 7 8]
[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]]
[0 1 4]
[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[0 1 2 4 5 8]
[[0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]]
[3 6 7]
[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 

"\nfor i in range(3):\n    print('train : ')\n    print(X_train_split[i], y_train_split[i])\n    print('val : ')\n    print(X_val_split[i],y_val_split[i])"

In [131]:
X_train

array([[0.01086957, 0.01086957, 0.04347826, ..., 0.01086957, 0.        ,
        0.01086957],
       [0.        , 0.        , 0.01086957, ..., 0.0326087 , 0.        ,
        0.        ],
       [0.02173913, 0.01086957, 0.02173913, ..., 0.02173913, 0.02173913,
        0.01086957],
       ...,
       [0.01086957, 0.        , 0.        , ..., 0.0326087 , 0.        ,
        0.        ],
       [0.01086957, 0.01086957, 0.        , ..., 0.        , 0.        ,
        0.01086957],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01086957,
        0.        ]])

In [38]:
(mat_K >0.9).any()

True

In [37]:
class KLR(estimator): 
    def __init__(self , Kernel = GaussianKernel, 
                 lam = 1e-9 , sig2 = 1): 
        super().__init__(Kernel, lam, sig2)
        
    def fit(self,X,y,max_iter = 10): 
        self.X_train = X
        if self.Kernel == GaussianKernel: 
            self.mat_K = to_mat_K(X,self.Kernel,self.sig2)
        self.alpha = fit_KLR_IRLS(self.mat_K, self.lam, y,max_iter)
        self.f = f_from_alpha(self.alpha,self.Kernel,X)


In [38]:
def evaluate_MSE_from_alpha(alpha,X,y,lam,mat_K, Kernel = GaussianKernel):
    '''
    Function that computes the MSE of the vector computed alpha. 
    
    args : 
            alpha : this is the final value we compute. We do not look directly for a function but for some 
            parameter that will completely determined the function. alpha is this parameter
            X : training data 
            y : target data 
            lam : regularization factor
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            Kernel : the kernel we are using. Normally, mat_K has been computed with the kernel K
            
    returns : 
            the MSE of the data plus the regularization factor
    '''
    n = X.shape[0]
    f_alpha = f_from_alpha(alpha,Kernel,X)
    loss = 0
    for i in range(n): 
        loss+= (y[i]-f_alpha(X[i,:]))**2.0
    loss/= n
    print(' loss without regularization : ', np.round(loss,4)) 
    reg = lam*alpha@mat_K@alpha
    print('regularization :', np.round(reg,4))
    return loss + reg 


print('WKRR :',evaluate_MSE_from_alpha(alpha_KLR, X_train, y_train, lam, mat_K))

NameError: name 'alpha_KLR' is not defined