In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np
from itertools import product, permutations

import numba
from numba import njit,vectorize, jit
from tqdm import tqdm

import Estimators

In [3]:
d = 2
c = 0

In [4]:
#@njit
sig2 = 1
@njit
def GaussianKernel(x,y):
    return np.exp(-np.linalg.norm(x-y)**2/(2*sig2))

@njit
def PolynomialKernel(x,y): 
    return (x.dot(y)+c)**d
@njit
def LinearKernel(x,y): 
    return x.dot(y)

@njit
def Laplace_kernel(x, y, gamma=1):
    return 0.5 * np.exp(-gamma * np.linalg.norm(x-y))

In [5]:
def importation(): 
    X_train = []
    X_test = []
    Y_train = []
    for i in range(3): 
        xtrain = pd.read_csv('data/Xtr'+str(i)+'.csv',delimiter= ',', header= None)
        xtrain = xtrain.iloc[1:,1].to_numpy()
        X_train.append(xtrain)
    
        xtest = pd.read_csv('data/Xte'+str(i)+'.csv',delimiter= ',', header= None)
        xtest = xtest.iloc[1:,1]
        X_test.append(xtest)
    
        Y_train.append(pd.read_csv('data/Ytr'+str(i)+'.csv',delimiter= ',')['Bound'].to_numpy())
    return X_train,X_test,Y_train

X_train,X_test,Y_train = importation()

In [6]:
X_mat100_0 = pd.read_csv('data/Xtr0_mat100.csv',delimiter= ' ', header= None).values
Y_mat100_0 = pd.read_csv('data/Ytr0.csv',delimiter= ',')['Bound'].to_numpy()

In [7]:
X_traintest = []
for i in range(3): 
    X_traintest.append(np.concatenate((X_train[i],X_test[i])))


In [8]:
def to_Kernel_train(X, Kernel): 
    length = X.shape[0]
    mat_K = np.zeros((length,length))
    for i in range(length):
        x_i = np.squeeze(X[i])
        for j in range(i,length): 
            x_j = np.squeeze(X[j])
            value = Kernel(x_i,x_j)
            mat_K[i,j] = value
            mat_K[j,i] = value 
    return mat_K

#@njit 
def to_Kernel_test(Xtrain,Xtest,Kernel):
    length_train = Xtrain.shape[0]
    length_test = Xtest.shape[0]
    bimat_K = np.zeros((length_train,length_test))
    for i in range(length_train):
        x_i = np.squeeze(Xtrain[i])
        for j in range(length_test): 
            x_j = np.squeeze(Xtest[j])
            value = Kernel(x_i,x_j)
            bimat_K[i,j] = value
    return bimat_K


def standardize(K): 
    U = np.full(K.shape,1/K.shape[0])
    I = np.eye(K.shape[0])
    return (I-U)@K@(I-U)

In [9]:
# Compute phi for Spectrum kernel
def phi_spectrum(x,k,U):
    """U: list of the sequences of size k to look for in x"""
    phi_spec = np.zeros(len(U))
    for j, u in enumerate(U):
        for i in range(len(x)-k+1):
            if x[i:i+k] == u:
                phi_spec[j] +=1
    return phi_spec

#Compute phi for Substring kernel
def l(i):
    return i[-1] - i[0] + 1

def I(k,n):
    I = set()
    for ele in permutations(range(0,n),k):
        I.add(tuple(sorted(list(ele))))
    return list(I)

def phi_substring(x,k,U,lamb=0.5): #fonctionne (testé avec l'exemple page 392)
    """U: list of the sequences of size k to look for in x"""
    phi = np.zeros(len(U))
    I_kn = I(k,len(x))
    for j, u in enumerate(U):
        for i in I_kn:
            x_i = "".join([x[idx]  for idx in i])
            if x_i==u:
                phi[j] += lamb**l(i)
    return phi


def make_dict_phi(X, phi,k):
    U = [''.join(letter) for letter in product('ACGT', repeat=k)]
    phi_dict = {seq:phi(seq,k,U) for seq in tqdm(X)}
    return phi_dict
    

# Compute K(x_i,x) 
def K_substring_2(x, y):
    """
    Compute K(x_i,x) with K a string kernel
    
    Parameters:
    Ne prend pas en argument X_train mais seulement phi_X_train(qui a été calculé précédement)
    et i .
    
    phi_X_train: list of phi(X_train) which is also needed to construct the matrix K 
    phi: the function phi
    k: hyperparameter of the spectrum kernel (size of sequences in U)
    """
    value = np.sum([dict_sub_traintest_2[str(x)] * dict_sub_traintest_2[str(y)]])  
    return value


    



In [10]:
def get_list_K_train(Kernel) : 
    '''
    only works for the dataset with strings inside
    '''
    list_K_train = list()
    for i in range(3): 
        list_Kernel_train.append(to_Kernel_train(X_train[i], Kernel))
        


In [11]:
##CONSTRUCTION DES DICTIONNAIRES. On fait ça pour eviter de les recalculer à chaque fois
dict_spectrum_traintest = [0]*7

#dict_spectrum_traintest[3] = make_dict_phi(X_traintest[0],phi_spectrum,3)
#dict_spectrum_traintest[4] = make_dict_phi(X_traintest[0],phi_spectrum,4)

dict_spectrum_traintest[5] = make_dict_phi(np.concatenate(X_traintest),phi_spectrum,5)
dict_spectrum_traintest[6] = make_dict_phi(np.concatenate(X_traintest),phi_spectrum,6)
dict_sub_traintest_2 = make_dict_phi(X_traintest[0], phi_substring,2)

#dict_sub_traintest_3 = make_dict_phi(X_traintest[0], phi_substring,3)

100%|██████████| 9000/9000 [01:40<00:00, 89.18it/s] 
100%|██████████| 9000/9000 [06:57<00:00, 21.56it/s]


In [12]:
def make_K_spectrum(k): 
    
    def K_spectrum(x,y): 
        value = np.sum([dict_spectrum_traintest[k][str(x)] * dict_spectrum_traintest[k][str(y)]])
    return K_spectrum
K_spectrum_5 = make_K_spectrum(5)
K_spectrum_6 = make_K_spectrum(6)


In [11]:
Kernel_train_Gaussian = to_Kernel_train(X_mat100_0,GaussianKernel)
Kernel_test_Gaussian = to_Kernel_test(X_mat100_0,X_mat100_0,GaussianKernel)

In [19]:
Kernel_train_Linear = to_Kernel_train(X_mat100_0,LinearKernel)
Kernel_test_Linear = to_Kernel_test(X_mat100_0,X_mat100_0,LinearKernel)

  return x.dot(y)


In [13]:
Kernel_train_spectrum_3 = to_Kernel_train(X_train[0],K_spectrum_3)
Kernel_test_spectrum_3 = to_Kernel_test(X_train[0], X_train[0],K_spectrum_3)

NameError: name 'dict_spectrum_traintest_3' is not defined

In [14]:
Kernel_train_spectrum_5 = to_Kernel_train(X_train[0],K_spectrum_5)
Kernel_test_spectrum_5 = to_Kernel_test(X_train[0], X_train[0],K_spectrum_5)

In [17]:
Kernel_train_spectrum_6 = to_Kernel_train(X_train[0],K_spectrum_6)
Kernel_test_spectrum_6 = to_Kernel_test(X_train[0], X_train[0],K_spectrum_6)

In [12]:
def try_Kernel(Kernel,X,Kernel_train = 0,Kernel_test = 0,pre_computed = False): 
    if pre_computed == False : 
        Kernel_train = to_Kernel_train(X,Kernel)
        Kernel_test = to_Kernel_test(X,X,Kernel)
    regressor = Estimators.KRR(Kernel = Kernel)
    regressor.fit(Kernel_train,Y_train[0])
    pred = regressor.predict(Kernel_test)
    print('predictions-true : ', (pred-Y_train[0])[0:100])
    print('predictions : ', pred[0:100]*1)
    print('true : ', Y_train[0][0:100])
    print('SCORE : ', 1-np.sum(np.abs(pred-Y_train[0]))/pred.shape[0])

In [18]:
regressor_Gaussian = Estimators.KRR(Kernel = GaussianKernel)
regressor_Gaussian.cross_val(standardize(Kernel_train_Gaussian),Y_train[0], 5)

5it [00:00, 13.51it/s]

Score :  0.571





0.429

In [22]:
regressor_Linear = Estimators.KRR(Kernel = LinearKernel,lam = 1e-12)
regressor_Linear.cross_val(Kernel_train_Linear,Y_train[0],5)

5it [00:00, 12.70it/s]

Score :  0.5895





0.4105

In [25]:
regressor_Laplace = Estimators.KRR(Kernel = Laplace_kernel)
regressor_Linear.cross_val(Kernel_train_Linear,Y_train[0],5)

5it [00:00, 13.23it/s]

Score :  0.5865





0.4135

In [30]:
regressor_spectrum = Estimators.KRR(Kernel = K_spectrum_3,lam = 0.002)
regressor_spectrum.cross_val(Kernel_train_spectrum_3,Y_train[0],5)

5it [00:00,  8.56it/s]

Score :  0.5845





0.4155

In [18]:
def grid_search_cv(model, parameters, Kernel_train): 
    for parameter in parameters : 
        model.set_parameter = parameter
        model.cross_val(Kernel_train,5)
for i in range(10): 
    print('lambda = ', 10**(-i+5))
    regressor_spectrum_6 = Estimators.KRR(Kernel = K_spectrum_6,lam = 10**(-i+5))
    regressor_spectrum_6.cross_val(Kernel_train_spectrum_6,Y_train[0],5)
    

0it [00:00, ?it/s]

lambda =  100000


5it [00:00,  6.08it/s]
0it [00:00, ?it/s]

Score :  0.5685
lambda =  10000


5it [00:00,  8.52it/s]


Score :  0.5549999999999999
lambda =  1000


5it [00:00,  8.29it/s]


Score :  0.5705
lambda =  100


5it [00:00,  6.97it/s]


Score :  0.5665
lambda =  10


5it [00:00,  5.41it/s]


Score :  0.5935
lambda =  1


5it [00:00,  7.91it/s]


Score :  0.6365000000000001
lambda =  0.1


5it [00:00,  7.30it/s]


Score :  0.6405000000000001
lambda =  0.01


5it [00:00,  7.03it/s]


Score :  0.613
lambda =  0.001


5it [00:00,  5.82it/s]


Score :  0.5860000000000001
lambda =  0.0001


5it [00:01,  3.20it/s]

Score :  0.5885





In [None]:
for i in range(10): 
    print('lambda = ', 10**(-i+5))
    regressor_spectrum_5 = Estimators.KRR(Kernel = K_spectrum_5,lam = 10**(-i+5))
    regressor_spectrum_5.cross_val(Kernel_train_spectrum_5,Y_train[0],5)

In [32]:
for i in range(10): 
    print('lambda = ', 10**(-i-3))
    regressor_Linear = Estimators.KRR(Kernel = LinearKernel, lam = 10**(-i-3))
    regressor_Linear.cross_val(Kernel_train_Linear,Y_train[0],5)

0it [00:00, ?it/s]

lambda =  0.001


5it [00:00, 11.25it/s]
0it [00:00, ?it/s]

Score :  0.5665
lambda =  0.0001


5it [00:00, 10.99it/s]
0it [00:00, ?it/s]

Score :  0.5935
lambda =  1e-05


5it [00:00,  9.23it/s]
0it [00:00, ?it/s]

Score :  0.5875
lambda =  1e-06


5it [00:00, 12.95it/s]
0it [00:00, ?it/s]

Score :  0.5945
lambda =  1e-07


5it [00:00, 12.83it/s]
0it [00:00, ?it/s]

Score :  0.5945
lambda =  1e-08


5it [00:00, 12.11it/s]
0it [00:00, ?it/s]

Score :  0.5855
lambda =  1e-09


5it [00:00, 12.02it/s]
0it [00:00, ?it/s]

Score :  0.5855
lambda =  1e-10


5it [00:00, 12.82it/s]
0it [00:00, ?it/s]

Score :  0.5825
lambda =  1e-11


5it [00:00, 12.61it/s]
0it [00:00, ?it/s]

Score :  0.5740000000000001
lambda =  1e-12


5it [00:00,  7.79it/s]

Score :  0.5860000000000001





In [None]:
def download_results(Kernel , cs,name_dossier):
    Y_predicted = []
    for i in range(3) : 
        model = SVM_1(Kernel, C = cs[i])
        model.fit(X_train[i], Y_train[i])
        Y_predicted.append(model.predict(X_test[i]))
        print('Model {} Predicted'.format(i))

    d = { 'Id' : np.arange(3000), 'Bound' : np.concatenate(Y_predicted)}
    out = pd.DataFrame(data=d)
    out.to_csv('predictions_KM'+name_dossier+'.csv', index=False)