In [1]:
import numpy as np
import pandas as pd 
import numpy.linalg
from tqdm import tqdm
import numba 
from numba import njit,vectorize, jit
import time 
import scipy

In [2]:
X_train = pd.read_csv('data/Xtr0_mat100.csv', header=None, delimiter = ' ').to_numpy()
y_train = pd.read_csv('data/Ytr0.csv')['Bound'].to_numpy()

In [3]:
X_train

array([[0.01086957, 0.01086957, 0.04347826, ..., 0.01086957, 0.        ,
        0.01086957],
       [0.        , 0.        , 0.01086957, ..., 0.0326087 , 0.        ,
        0.        ],
       [0.02173913, 0.01086957, 0.02173913, ..., 0.02173913, 0.02173913,
        0.01086957],
       ...,
       [0.01086957, 0.        , 0.        , ..., 0.0326087 , 0.        ,
        0.        ],
       [0.01086957, 0.01086957, 0.        , ..., 0.        , 0.        ,
        0.01086957],
       [0.        , 0.        , 0.        , ..., 0.        , 0.01086957,
        0.        ]])

In [4]:
y_train.shape

(2000,)

In [5]:
def test(required, **kwargs): 
    print(kwargs)
    
def other(**kwargs): 
    test(0,**kwargs)
    
other(kw = 1)    

{'kw': 1}


In [6]:
@njit
def GaussianKernel(x,y,sig2 = 1): 
    return np.exp(-numpy.linalg.norm(x-y)**2/(2*sig2))

x1 = X_train[0,:]
x2 = X_train[1,:]

In [7]:
def f_from_alpha(alpha, Kernel, X):
    '''
    Calcule f à partir d'alpha. On utilise ici la forumule du representer thm : 
    f(x) = sum(alpha_i*K(x_i,x))
    
    args : 
        alpha : vecteur de taille (nombre de données dans le dataset). 
        Kernel : n'importe quel kernel 
        X : Matrice contenant les données. X.shape[0] doit etre eégal à la taille de alpha
        
    return : la fonction donnée par le representer theorem
    '''
    
    return  lambda x : np.sum([alpha[i]*Kernel(X[i,:],x) for i in range(X.shape[0])])

In [8]:
@njit
def to_mat_K(X, Kernel, sig2 = 1): 
    length = X.shape[0]
    mat_K = np.zeros((length,length))
    for i in range(length):
        x_i = X[i,:]
        for j in range(i,length): 
            x_j = X[j,:]
            value = Kernel(x_i,x_j,sig2)
            mat_K[i,j] = value
            mat_K[j,i] = value 
    return mat_K     
to_mat_K(X_train,GaussianKernel, sig2 = 1)

array([[1.        , 0.98475818, 0.98371162, ..., 0.98848832, 0.98942306,
        0.98895558],
       [0.98475818, 1.        , 0.9799995 , ..., 0.98545651, 0.98173781,
        0.97988373],
       [0.98371162, 0.9799995 , 1.        , ..., 0.97826428, 0.98196982,
        0.97699373],
       ...,
       [0.98848832, 0.98545651, 0.97826428, ..., 1.        , 0.98708787,
        0.98545651],
       [0.98942306, 0.98173781, 0.98196982, ..., 0.98708787, 1.        ,
        0.98650493],
       [0.98895558, 0.97988373, 0.97699373, ..., 0.98545651, 0.98650493,
        1.        ]])

In [9]:
lam = 0
alpha = np.ones(X_train.shape[0])
mat_K = to_mat_K(X_train,GaussianKernel, 1)
alpha_init = np.ones(mat_K.shape[0])/mat_K.shape[0]

In [10]:
def standardize(K): 
    U = np.full(K.shape,1/K.shape[0])
    I = np.eye(K.shape[0])
    return (I-U)@K@(I-U)

np.mean(standardize(mat_K), axis = 1)

array([ 1.97758476e-19, -2.88831459e-19,  1.03389519e-18, ...,
        1.31838984e-19, -1.38777878e-19,  4.16333634e-20])

On implémente quelques fonctions. Ici, on fait simplement une descente de gradient pour commencer. 

In [11]:
#@vectorize
def loss(u): 
    return np.log(1+np.exp(-u))
def sigmoid(u): 
    return 1/(1+np.exp(-u))


def grad_loss(u): 
    return -sigmoid(-u)

def hess_loss(u): 
    return sigmoid(u)*sigmoid(-u)

def J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = alpha.shape[0]
    regularizer = lam/2*alpha@mat_K@alpha
    vect = mat_K@alpha
    somme = 1/n*np.sum(loss(y*vect))
    return somme+regularizer
   
def grad_J(alpha, y = y_train, mat_K = mat_K, lam = lam): 
    n = y.shape[0]
    vect_P_alpha = grad_loss(y*(mat_K@alpha))
    return 1/n*mat_K@(vect_P_alpha*y)+ lam*mat_K@alpha

def hess_J(alpha, y = y_train, mat_K = mat_K, lam = lam):
    n = mat_K.shape[0]
    vect_W = hess_loss(y*(mat_K@alpha))
    return 1/n*mat_K +lam*mat_K

def Kernel_logistic_reg_fit(X= X_train, y = y_train, mat_K = mat_K, lam = lam, Niter =400):
    alpha = 0.001*np.random.randn(X.shape[0])
    #alpha = np.ones(2000)
    #mat_K = standardize(mat_K)
    lr = 0.2
    for i in range(Niter): 
        #print('alpha :', alpha)
        #print('grad : ', grad_J(alpha,mat_K = mat_K))
        alpha-= lr*grad_J(alpha,mat_K = mat_K)#, mat_K= mat_K)
        #print('J ##########:', J(alpha))
    print(J(alpha)-lam/2*alpha@mat_K@alpha)
    return alpha
lam = 0
first_alpha = Kernel_logistic_reg_fit()
f = f_from_alpha(first_alpha, GaussianKernel, X_train)
f(X_train[3,:])

0.3597433867106116


91.91108501701223

In [33]:
y_train

array([0, 1, 1, ..., 0, 0, 0])

## Kernel Ridge Regression

In [31]:
#1e-8 marche bien pour lambda
def fit_KRR(mat_K,lam,y):
    #mat_K = standardize(mat_K) #marche pas si on standardise 
    n = mat_K.shape[0]
    full_mat = mat_K +n*lam*np.eye(n)
    alpha = np.linalg.solve(full_mat,y)
    return alpha 
lam = 1e-8
%time alpha_KRR = fit_KRR(mat_K,lam,y_train)
f_KRR = f_from_alpha(alpha_KRR, GaussianKernel, X_train)
print('test :', f_KRR(X_train[0,:]))
print('test :', f_KRR(X_train[1,:]))
print('test :', f_KRR(X_train[2,:]))
evaluate_MSE_from_alpha(alpha_KRR,X_train,y_train,lam, mat_K)

CPU times: user 1.18 s, sys: 301 ms, total: 1.48 s
Wall time: 255 ms
test : 0.2557912422635127
test : 0.7252261043031467
test : 0.8535851730994182
 loss without regularization :  0.0357
regularization : 0.0378


0.0734819399593179

In [15]:
# meme fonction que WKRR, mais on est environ 10 fois plus lent
'''
def WKRR(mat_K,vect_W, lam, y): 
    # pour l'instant on suppose que W est bien inversible, i.e. aucune valeur à zéro
    n = mat_K.shape[0]
    mat_sqrt_W = np.diag(np.sqrt(vect_W))
    mat_neg_sqrt_W = np.diag(1/np.sqrt(vect_W))
    big_mat = mat_sqrt_W@mat_K@mat_sqrt_W + n*lam*np.eye(n)
    return scipy.linalg.solve(big_mat@mat_neg_sqrt_W,mat_sqrt_W@y)
'''


vect_W_init = np.ones(mat_K.shape[0])
def fit_WKRR(mat_K,vect_W,lam,y): 
    '''
    Compute the Weighted Kernel Redge Regression. the Formula is given in the course. 
    The code is optimized, we do not take the diagonal matrix of the square root of W. Instead, 
    we only compute some np.multiply stuff. 
    
    args : 
    
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            vect_W : the vector that contains the weight associated to each sample. here we need that all the 
            coefficient of this vector is 0. Otherwise we won't be able to compute the inverse of the square root
            lam : regularization factor 
            y : the vector we train on 
    
    returns :
            
            the vector alpha that satisfy the formula in the course. 
    alpha then needs to be transformed to a function in order to fit the data.
    '''
    min_W = np.min(vect_W)
    if (min_W < 0) or (min_W == 0) : 
        print('Non invertible Matrix W ')
        return None 
    
    
    n = mat_K.shape[0]
    vect_sqrt_W = np.sqrt(vect_W) # the square root of the original vector
    vect_neg_sqrt_W = 1/vect_sqrt_W # the negative square root of the original vector
    
    b = np.multiply(vect_sqrt_W,y_train) 
    #here we compute the matrix that needs to be inverted. We just compute it and will solve a linear system 
    #instead of computing the inverse (more efficient)
    big_mat = np.multiply(np.multiply(vect_sqrt_W.reshape(-1,1),mat_K), vect_sqrt_W) +n*lam*np.eye(n)
    A = np.multiply(vect_neg_sqrt_W,big_mat)
    return scipy.linalg.solve(A,b)


%time alpha_WKRR = fit_WKRR(mat_K,vect_W_init,lam,y_train)

CPU times: user 474 ms, sys: 17.6 ms, total: 491 ms
Wall time: 213 ms


In [55]:
class estimator(): 
    def __init__(self , Kernel = GaussianKernel, lam = 1e-8, sig2 = 1 ): 
        self.Kernel = Kernel
        self.lam = lam 
        self.sig2 = sig2 
        self.mat_K = None 
        self.alpha = None 
        self.f = None 
        
    def predict_proba(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            probs = np.empty(X.shape[0])
            for i in range(X.shape[0]): 
                probs[i] = self.f(X[i,:])
            return probs 
    
    
    def predict(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            prob = self.predict_proba(X)
            return prob>0.5
    

In [39]:
#lam = 1e-8 est bien
class KRR(): 
    def __init__(self , Kernel = GaussianKernel, lam = 1e-8, sig2 = 1 ): 
        self.Kernel = Kernel
        self.lam = lam 
        self.sig2 = sig2 
        self.mat_K = None 
        self.alpha = None 
        self.f = None 
        #self.vect_W = vect_W 
    def fit(self, X, y): 
        if self.Kernel == GaussianKernel : 
            self.mat_K = to_mat_K(X, self.Kernel,self.sig2)
        self.alpha = fit_KRR(self.mat_K, self.lam, y)
        self.f = f_from_alpha(self.alpha,self.Kernel,X)
    
    def predict_proba(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            probs = np.empty(X.shape[0])
            for i in range(X.shape[0]): 
                probs[i] = self.f(X[i,:])
            return probs 
    def predict(self,X): 
        if self.f == None : 
            print("Il faut d'abord fitter les données")
        else : 
            prob = self.predict_proba(X)
            return prob>0.5
        
# test = KRR(sig2=1)
# test.fit(X_train,y_train)
# np.sum(np.abs(test.predict(X_train)-y_train))

9

In [69]:
class KRL(estimator): 
    def __init__(self , Kernel = GaussianKernel, 
                 lam = 1e-8 , sig2 = 1): 
        super().__init__(Kernel, lam, sig2)
        
    def fit(self,X,y,max_iter = 10): 
        if self.Kernel == GaussianKernel: 
            self.mat_K = to_mat_K(X,self.Kernel,self.sig2)
        vect_W_init = np.ones(self.mat_K.shape[0])
        self.alpha = fit_KLR_IRLS(self.mat_K, self.lam, y,vect_W_init,max_iter)
        self.f = f_from_alpha(self.alpha,self.Kernel,X)
        
test = KRL()
test.fit(X_train,y_train)

100%|██████████| 20/20 [00:05<00:00,  3.94it/s]


In [20]:
y_train 


array([0, 1, 1, ..., 0, 0, 0])

In [21]:
def evaluate_MSE_from_alpha(alpha,X,y,lam,mat_K, Kernel = GaussianKernel):
    '''
    Function that computes the MSE of the vector computed alpha. 
    
    args : 
            alpha : this is the final value we compute. We do not look directly for a function but for some 
            parameter that will completely determined the function. alpha is this parameter
            X : training data 
            y : target data 
            lam : regularization factor
            mat_K : Kernel Matrix that contains the information in the data (K_ij=K(x_i,x_j))
            Kernel : the kernel we are using. Normally, mat_K has been computed with the kernel K
            
    returns : 
            the MSE of the data plus the regularization factor
    '''
    n = X.shape[0]
    f_alpha = f_from_alpha(alpha,Kernel,X)
    loss = 0
    for i in range(n): 
        loss+= (y[i]-f_alpha(X[i,:]))**2.0
    loss/= n
    print(' loss without regularization : ', np.round(loss,4)) 
    reg = lam*alpha@mat_K@alpha
    print('regularization :', np.round(reg,4))
    return loss + reg 


print('WKRR :',evaluate_MSE_from_alpha(alpha_WKRR, X_train, y_train, lam, mat_K))

 loss without regularization :  0.0
regularization : 0.0
WKRR : 1.7515959306586573e-08


In [65]:
def fit_KLR_IRLS(mat_K, lam, y,  vect_W = vect_W_init, max_iter = 10): 
    alpha_init = 0.0001*np.random.randn(mat_K.shape[0])
    m_t = mat_K@alpha_init
    sigmoid_negy_m = sigmoid(np.multiply(-y,m_t))
    P_t = -sigmoid(sigmoid_negy_m)
    W_t = np.multiply(sigmoid(m_t),sigmoid(-m_t))
    z_t = m_t + y/sigmoid_negy_m
    for i in tqdm(range(max_iter)): 
        alpha_t = fit_WKRR(mat_K, W_t, lam, z_t)
        m_t = mat_K@alpha_t
        sigmoid_negy_m = sigmoid(np.multiply(-y,m_t))
        P_t = -sigmoid(sigmoid_negy_m)
        W_t = np.multiply(sigmoid(m_t),sigmoid(-m_t))
        z_t = m_t + y/sigmoid_negy_m
        #print(J(alpha_t))
    return alpha_t
        
lam = 1e-9
alpha_KLR = fit_KLR_IRLS(mat_K,lam,y_train)
f_KLR = f_from_alpha(alpha_KLR, GaussianKernel,X_train)
print('test :', f_KLR(X_train[0,:]))
print('test :', f_KLR(X_train[1,:]))
print('test :', f_KLR(X_train[2,:]))
evaluate_MSE_from_alpha(alpha_KLR, X_train,y_train,lam, mat_K)

100%|██████████| 10/10 [00:02<00:00,  4.78it/s]


test : 0.1484107417636551
test : 0.820260876629618
test : 0.9131023120571626
 loss without regularization :  0.0158
regularization : 0.0068


0.022552382939882546

In [23]:
evaluate_MSE_from_alpha(alpha_KRL, X_train,y_train,lam, mat_K)

 loss without regularization :  0.0158
regularization : 0.0068


0.02255238294801606

In [61]:
print(vect_W_init)

[1. 1. 1. ... 1. 1. 1.]
