In [8]:
import numpy as np
import pandas as pd
import numpy.linalg 
from tqdm import tqdm
import numba
from numba import njit,vectorize, jit
import time
import scipy

In [17]:
X_train = pd.read_csv('data/Xtr0_mat100.csv', header=None, delimiter = ' ').to_numpy()
y_train = pd.read_csv('data/Ytr0.csv')['Bound'].to_numpy()
y_train = 2*y_train-1

In [18]:
!pwd

/home/bastien/Documents/ENS/KM/Kernel_Methods/machine-learning-with-kernel-methods-2021


In [24]:
@njit
def GaussianKernel(x,y,sig2 = 1):
    return np.exp(-numpy.linalg.norm(x-y)**2/(2*sig2))

In [20]:


def f_for_data(alpha,mat_K,j): 
    return np.sum([alpha[i]*mat_K[i,j] for i in range(mat_K.shape[0])])

def f_from_alpha(alpha, Kernel, X):
    '''
    Calcule f à partir d'alpha. On utilise ici la forumule du representer thm : 
    f(x) = sum(alpha_i*K(x_i,x))
    
    args : 
        alpha : vecteur de taille (nombre de données dans le dataset). 
        Kernel : n'importe quel kernel 
        X : Matrice contenant les données. X.shape[0] doit etre eégal à la taille de alpha
        
    return : la fonction donnée par le representer theorem
    '''
    return  lambda x : np.sum([alpha[i]*(Kernel(X[i,:],x)) for i in range(X.shape[0])])



In [25]:
@njit
def to_mat_K(X, Kernel, sig2 = 1): 
    length = X.shape[0]
    mat_K = np.zeros((length,length))
    for i in range(length):
        x_i = X[i,:]
        for j in range(i,length): 
            x_j = X[j,:]
            value = Kernel(x_i,x_j,sig2)
            mat_K[i,j] = value
            mat_K[j,i] = value 
    return mat_K

lam = 0
sig2 = 1
alpha = np.ones(X_train.shape[0])
mat_K = to_mat_K(X_train,GaussianKernel, 1)
alpha_init = np.ones(mat_K.shape[0])/mat_K.shape[0]
vect_W_init = np.ones(mat_K.shape[0])

def standardize(K): 
    U = np.full(K.shape,1/K.shape[0])
    I = np.eye(K.shape[0])
    return (I-U)@K@(I-U)

In [26]:
#@vectorize
def loss(u): 
    return np.log(1+np.exp(-u))
def sigmoid(u): 
    return 1/(1+np.exp(-u))


def grad_loss(u): 
    return -sigmoid(-u)

def hess_loss(u): 
    return sigmoid(u)*sigmoid(-u)

def J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = alpha.shape[0]
    regularizer = lam/2*alpha@mat_K@alpha
    vect = mat_K@alpha
    somme = 1/n*np.sum(loss(y*vect))
    return somme+regularizer
   
def grad_J(alpha, y = y_train, mat_K = mat_K, lam = lam): 
    n = y.shape[0]
    vect_P_alpha = grad_loss(y*(mat_K@alpha))
    return 1/n*mat_K@(vect_P_alpha*y)+ lam*mat_K@alpha

def hess_J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = mat_K.shape[0]
    vect_W = hess_loss(y*(mat_K@alpha))
    return 1/n*mat_K +lam*mat_K

def Kernel_logistic_reg_fit(X= X_train, y = y_train, mat_K = mat_K, lam = lam, Niter =20):
    alpha = 0.00000*np.random.randn(X.shape[0])
    #alpha = np.ones(2000)
    mat_K = standardize(mat_K)
    lr = 5
    for i in tqdm(range(Niter)): 
        #inv = np.linalg.inv(hess_J(alpha, mat_K = mat_K))
        #alpha-= lr*inv@grad_J(alpha ,mat_K = mat_K)#, mat_K= mat_K)
        alpha-= lr*grad_J(alpha ,mat_K = mat_K)
        '''
        if i%1 ==0 : 
            print('alpha :', alpha)
            print('J :',J(alpha,mat_K = mat_K))
            print('grad :', grad_J(alpha,mat_K = mat_K))
    print('alpha_end :', alpha)
    print('J_end :',J(alpha,mat_K = mat_K))
    print('grad_end :', grad_J(alpha,mat_K = mat_K))'''
    return alpha

In [28]:


f = f_from_alpha(first_alpha, GaussianKernel, X_train)
for i in range(10): 
        print(i ,' :',np.round(f(X_train[i,:]),3), ' y :', y_train[i])



NameError: name 'first_alpha' is not defined

In [29]:
#1e-8 marche bien pour lambda
def fit_KRR(mat_K,lam,y):
    #mat_K = standardize(mat_K) #marche pas si on standardise 
    n = mat_K.shape[0]
    full_mat = mat_K +n*lam*np.eye(n)
    alpha = np.linalg.solve(full_mat,y)
    return alpha

In [30]:
# meme fonction que WKRR, mais on est environ 10 fois plus lent
'''
def fit_WKRR(mat_K,vect_W, lam, y): 
    # pour l'instant on suppose que W est bien inversible, i.e. aucune valeur à zéro
    n = mat_K.shape[0]
    mat_sqrt_W = np.diag(np.sqrt(vect_W))
    mat_neg_sqrt_W = np.diag(1/np.sqrt(vect_W))
    big_mat = mat_sqrt_W@mat_K@mat_sqrt_W + n*lam*np.eye(n)
    return scipy.linalg.solve(big_mat@mat_neg_sqrt_W,mat_sqrt_W@y)

'''

"\ndef fit_WKRR(mat_K,vect_W, lam, y): \n    # pour l'instant on suppose que W est bien inversible, i.e. aucune valeur à zéro\n    n = mat_K.shape[0]\n    mat_sqrt_W = np.diag(np.sqrt(vect_W))\n    mat_neg_sqrt_W = np.diag(1/np.sqrt(vect_W))\n    big_mat = mat_sqrt_W@mat_K@mat_sqrt_W + n*lam*np.eye(n)\n    return scipy.linalg.solve(big_mat@mat_neg_sqrt_W,mat_sqrt_W@y)\n\n"

In [31]:
def fit_WKRR(mat_K,vect_W,lam,y): 
    '''
    Compute the Weighted Kernel Redge Regression. the Formula is given in the course. 
    The code is optimized, we do not take the diagonal matrix of the square root of W. Instead, 
    we only compute some np.multiply stuff. 
    
    args : 
    
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            vect_W : the vector that contains the weight associated to each sample. here we need that all the 
            coefficient of this vector is 0. Otherwise we won't be able to compute the inverse of the square root
            lam : regularization factor 
            y : the vector we train on 
    
    returns :
            
            the vector alpha that satisfy the formula in the course. 
    alpha then needs to be transformed to a function in order to fit the data.
    '''
    min_W = np.min(vect_W)
    if (min_W < 0) or (min_W == 0) : 
        print('Non invertible Matrix W ')
    n = mat_K.shape[0]
    vect_sqrt_W = np.sqrt(vect_W) # the square root of the original vector
    vect_neg_sqrt_W = 1/vect_sqrt_W # the negative square root of the original vector
    b = np.multiply(vect_sqrt_W,y) 
    big_mat = np.multiply(np.multiply(vect_sqrt_W.reshape(-1,1),mat_K), vect_sqrt_W) +n*lam*np.eye(n)
    A = np.multiply(vect_neg_sqrt_W,big_mat)
    return scipy.linalg.solve(A,b)

vect_W_init = np.full(mat_K.shape[0],1)#/mat_K.shape[0])



In [32]:


def IRLS(K, y, alpha):
        """
        Iterative step to update alpha when training the classifier
        :param K: np.array, kernel
        :param y: np.array, labels
        :param alpha: np.array
        :return: - W: np.array
                 - z: np.array
        """
        m = np.dot(K, alpha)
        W = sigmoid(m) * sigmoid(-m)
        z = m + y/sigmoid(-y*m)
        return W, z

def WKRR_af(K, W, z):
        """
        Compute new alpha
        :param K: np.array, kernel
        :param W: np.array
        :param z: np.array
        :return: np.array, new alpha
        """
        n = K.shape[0]
        W_s = np.diag(np.sqrt(W))
        A = np.dot(np.dot(W_s, K), W_s) + n * lam * np.eye(n)
        A = np.dot(np.dot(W_s, np.linalg.inv(A)), W_s)
        return np.dot(A, z)
    
    
def recoding_KRL(mat_K,lam,y, max_iter = 10): 
    n = mat_K.shape[0]
    old_alpha = 0*np.ones(n)
    for i in range(max_iter): 
        W,z = IRLS(mat_K,y,old_alpha)
        alpha = fit_WKRR(mat_K, W, lam, z)
        f = f_from_alpha(alpha, GaussianKernel, X_train)
        old_alpha = np.copy(alpha)
    return alpha



In [33]:
def compute_m(mat_K,alpha): 
    return mat_K@alpha

def compute_P(y,m): 
    return -sigmoid(-np.multiply(y,m))

def compute_W(m):
    return np.multiply(sigmoid(m),sigmoid(m))

def compute_z(y,m): 
    return m + np.multiply(y,1/sigmoid(-np.multiply(y,m)))

In [34]:
def fit_KLR_IRLS(mat_K, lam, y, max_iter = 10): 
    '''
    Fonction qui optimise la loss définie par la la Kernel Logistic Regression. 
    
    args : 
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            
            lam : regularization factor 
            
            y : the vector we train on. Must be -1 or 1 
            
            max_iter : the maximum number of iteration we are ready to do 
    returns : 
            the vector alpha optimized 
            alpha then needs to be transformed to a function in order to fit the data.
    '''
    alpha = np.zeros(mat_K.shape[0])
    m = compute_m(mat_K,alpha)    
    W = compute_W(m)
    z = compute_z(y,m)
    for i in range(max_iter): 
        alpha = fit_WKRR(mat_K,W,lam,z)
        m = compute_m(mat_K,alpha)
        W = compute_W(m)
        z = compute_z(y,m)
        f = f_from_alpha(alpha, GaussianKernel, X_train)
        for i in range(10): 
            print(i ,' :',np.round(f(X_train[i,:]),3), ' y :', y_train[i])
    return alpha



#def fit_KLR_IRLS(mat_K, lam, y, max_iter = 10):
alpha_KLR = fit_KLR_IRLS(mat_K, lam,y_train)  
f_KLR = f_from_alpha(alpha_KLR, GaussianKernel, X_train)
for i in range(10): 
        print(i ,' :',np.round(f_KLR(X_train[i,:]),3), ' y :', y_train[i])

0  : -2.0  y : -1
1  : 2.0  y : 1
2  : 2.0  y : 1
3  : 2.0  y : 1
4  : 2.0  y : 1
5  : -2.0  y : -1
6  : -2.0  y : -1
7  : -2.0  y : -1
8  : -2.0  y : -1
9  : 2.0  y : 1
0  : -10.389  y : -1
1  : 10.389  y : 1
2  : 10.389  y : 1
3  : 10.389  y : 1
4  : 10.389  y : 1
5  : -10.389  y : -1
6  : -10.389  y : -1
7  : -10.389  y : -1
8  : -10.389  y : -1
9  : 10.389  y : 1
0  : -32513.364  y : -1
1  : 32513.359  y : 1
2  : 32513.367  y : 1
3  : 32513.361  y : 1
4  : 32513.36  y : 1
5  : -32513.363  y : -1
6  : -32513.364  y : -1
7  : -32513.365  y : -1
8  : -32513.365  y : -1
9  : 32513.362  y : 1
Non invertible Matrix W 


  return 1/(1+np.exp(-u))
  return m + np.multiply(y,1/sigmoid(-np.multiply(y,m)))
  vect_neg_sqrt_W = 1/vect_sqrt_W # the negative square root of the original vector
  b = np.multiply(vect_sqrt_W,y)
  A = np.multiply(vect_neg_sqrt_W,big_mat)


ValueError: array must not contain infs or NaNs

In [35]:
class estimator(): 
    def __init__(self , Kernel = GaussianKernel, lam = 1e-8, sig2 = 1 ): 
        self.Kernel = Kernel
        self.lam = lam 
        self.sig2 = sig2 
        self.mat_K = None 
        self.alpha = None 
        self.f = None 
        
    def predict_proba(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            probs = np.empty(X.shape[0])
            for i in range(X.shape[0]): 
                probs[i] = self.f(X[i,:])
            return probs 
    
    def predict(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            prob = self.predict_proba(X)
            return prob>0.5

In [36]:
#lam = 1e-8 est bien
class KRR(): 
    def __init__(self , Kernel = GaussianKernel, lam = 1e-8, sig2 = 1 ): 
        self.Kernel = Kernel
        self.lam = lam 
        self.sig2 = sig2 
        self.mat_K = None 
        self.alpha = None 
        self.f = None 
        #self.vect_W = vect_W 
    def fit(self, X, y): 
        if self.Kernel == GaussianKernel : 
            self.mat_K = to_mat_K(X, self.Kernel,self.sig2)
        self.alpha = fit_KRR(self.mat_K, self.lam, y)
        self.f = f_from_alpha(self.alpha,self.Kernel,X)
    
    def predict_proba(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            probs = np.empty(X.shape[0])
            for i in range(X.shape[0]): 
                probs[i] = self.f(X[i,:])
            return probs 
    def predict(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            prob = self.predict_proba(X)
            return prob>0.5
        

In [37]:
class KLR(estimator): 
    def __init__(self , Kernel = GaussianKernel, 
                 lam = 1e-9 , sig2 = 1): 
        super().__init__(Kernel, lam, sig2)
        
    def fit(self,X,y,max_iter = 10): 
        if self.Kernel == GaussianKernel: 
            self.mat_K = to_mat_K(X,self.Kernel,self.sig2)
        self.alpha = fit_KLR_IRLS(self.mat_K, self.lam, y,max_iter)
        self.f = f_from_alpha(self.alpha,self.Kernel,X)


In [38]:
def evaluate_MSE_from_alpha(alpha,X,y,lam,mat_K, Kernel = GaussianKernel):
    '''
    Function that computes the MSE of the vector computed alpha. 
    
    args : 
            alpha : this is the final value we compute. We do not look directly for a function but for some 
            parameter that will completely determined the function. alpha is this parameter
            X : training data 
            y : target data 
            lam : regularization factor
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            Kernel : the kernel we are using. Normally, mat_K has been computed with the kernel K
            
    returns : 
            the MSE of the data plus the regularization factor
    '''
    n = X.shape[0]
    f_alpha = f_from_alpha(alpha,Kernel,X)
    loss = 0
    for i in range(n): 
        loss+= (y[i]-f_alpha(X[i,:]))**2.0
    loss/= n
    print(' loss without regularization : ', np.round(loss,4)) 
    reg = lam*alpha@mat_K@alpha
    print('regularization :', np.round(reg,4))
    return loss + reg 


print('WKRR :',evaluate_MSE_from_alpha(alpha_KLR, X_train, y_train, lam, mat_K))

NameError: name 'alpha_KLR' is not defined