# Kernel methods: Data Challenge (TF)

## Packages import

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import os
import datetime
import pickle as pkl
import warnings
import utils  
import kernels
import SVM
import SVM2
from scipy.optimize import minimize

# Map each method name 'WD1'..'WD5' to the integer in its name.
# NOTE(review): presumably the degree of the weighted-degree kernel - confirm in utils/kernels.
param = {'WD{}'.format(degree): degree for degree in range(1, 6)}

## Getting training data

If you have not generated the data yet, set ``all=True``. Once the data have been created, running the same line of code again loads them directly from the file where they were stored.

In [6]:
# Load (or build) the train/validation/test splits together with the kernel
# matrix K and the identifiers ID, using the 'WD3' method.
X_train, y_train, X_val, y_val, X_test, K, ID = utils.get_training_datas(
    method="WD3",
    all=True,
    param=param,
    replace=False,
)

k = 0
Number of training samples: 1499 (74.95%), testing samples: 501 (25.05%)
Count train : -1 (51.1674%), 1 (48.8326%)
Count val : -1 (51.0978%), 1 (48.9022%)

k = 1
Number of training samples: 1499 (74.95%), testing samples: 501 (25.05%)
Count train : -1 (50.1001%), 1 (49.8999%)
Count val : -1 (50.0998%), 1 (49.9002%)

k = 2
Number of training samples: 1499 (74.95%), testing samples: 501 (25.05%)
Count train : -1 (50.0334%), 1 (49.9666%)
Count val : -1 (50.0998%), 1 (49.9002%)

Final shape: train (4497, 3), val (1503, 3), test (3000, 3)


HBox(children=(IntProgress(value=0, description='Building kernel', max=9000, style=ProgressStyle(description_w…

In [7]:
# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,Id,k,seq
710,230,1,ACCTTGTCTTTTTGGTCTTGGAAAGTGTGTGTGAGTGGTTGTTTTG...
3837,5302,3,GAGGAAAGAAAATAGTTTCCCTCGACCCCCACCCCCATCCTAAGAT...
4416,5944,3,GTGGAGCAGGGGTCAGCACAGGTTGGGGAGGCTCAGGCAGCACAGG...
1764,2210,2,CGGGGCGGGGGGAGGGGAGAGGACGCTTGCAGAAGCGTCCTCAGTT...
4458,5744,3,TAGAACTAGAAATACCATTTGACCCAGCAATCCCATTACTGGATTC...


In [14]:
# Pull the third column (position 2) of the first two training rows.
s1, s2 = (X_train.iloc[row, 2] for row in (0, 1))

In [15]:
# Display the first sampled sequence.
s1

'ACCTTGTCTTTTTGGTCTTGGAAAGTGTGTGTGAGTGGTTGTTTTGTTGGGTAAACATGTGGCAAAGAGGAGCTTCTTGTCCTGGGTTAGGGCATGCATCA'

In [13]:
# Evaluate the kernel on one pair of training sequences (rows 0 and 2).
# NOTE(review): 3 and 101 are presumably the kernel degree and the sequence
# length - confirm against kernels.get_WD_k.
seq_a = X_train.iloc[0, 2]
seq_b = X_train.iloc[2, 2]
kernels.get_WD_k(seq_a, seq_b, 3, 101)

12.166666666666666

Restrict the data to a single TF data set (k = 1, 2 or 3) if needed.

In [3]:
# The same full-data arguments are handed to every select_k call; only the
# data-set number k changes.
full_data = (X_train, y_train, X_val, y_val, X_test, K, ID)

X_train_1, y_train_1, X_val_1, y_val_1, X_test_1, K_1, id_1 = utils.select_k(1, *full_data)
X_train_2, y_train_2, X_val_2, y_val_2, X_test_2, K_2, id_2 = utils.select_k(2, *full_data)
X_train_3, y_train_3, X_val_3, y_val_3, X_test_3, K_3, id_3 = utils.select_k(3, *full_data)

## C-SVM with Weighted Degree Kernel (WDK)

### Cross validation on constant C

#### Data set 1

In [4]:
# Cross-validate the constant C of the C-SVM on data set 1.
# Candidate values of C to compare.
Cs_1 = [0.0001, 0.001]
# Only the first 100 rows of each split are passed to the cross-validation.
data_1 = [X_train_1.iloc[:100], y_train_1.iloc[:100], X_val_1.iloc[:100], y_val_1.iloc[:100]]
svm_1 = SVM.C_SVM(K, ID)
# Base name only, without extension: the warning printed by this cell shows the
# save step opening os.path.join('./Data', pickleName+'.pkl'), so a name that
# already ends in '.pkl' is written as '*.pkl.pkl'.
# NOTE(review): confirm in SVM.C_SVM.cv that '.pkl' is always appended.
pickleName = 'cv_C_SVM_k1'
kfolds = 5
C_opt_1, scores_tr_1, scores_te_1, mean_scores_tr_1, mean_scores_te_1 = svm_1.cv(Cs_1, data_1, kfolds, pickleName)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Fold 0
C=0.0001, accuracy on train (0.4938) and val (0.4750)
C=0.001, accuracy on train (0.4938) and val (0.4750)
Fold 1
C=0.0001, accuracy on train (0.4688) and val (0.5750)
C=0.001, accuracy on train (0.4688) and val (0.5750)
Fold 2
C=0.0001, accuracy on train (0.5000) and val (0.4500)
C=0.001, accuracy on train (0.5000) and val (0.4500)
Fold 3
C=0.0001, accuracy on train (0.4750) and val (0.5500)
C=0.001, accuracy on train (0.4750) and val (0.5500)
Fold 4
C=0.0001, accuracy on train (1.0000) and val (1.0000)
C=0.001, accuracy on train (0.5125) and val (0.4000)

Best constant C: 0.0001, accuracy on val 0.6100


  open(os.path.join('./Data', pickleName+'.pkl'), 'wb'))


#### Data set 2

In [5]:
# Cross-validate the constant C of the C-SVM on data set 2.
# Candidate values of C to compare.
Cs_2 = [0.0001, 0.001]
# Only the first 100 rows of each split are passed to the cross-validation.
data_2 = [X_train_2.iloc[:100], y_train_2.iloc[:100], X_val_2.iloc[:100], y_val_2.iloc[:100]]
svm_2 = SVM.C_SVM(K, ID)
# Base name only, without extension: the warning printed by this cell shows the
# save step opening os.path.join('./Data', pickleName+'.pkl'), so a name that
# already ends in '.pkl' is written as '*.pkl.pkl'.
# NOTE(review): confirm in SVM.C_SVM.cv that '.pkl' is always appended.
pickleName = 'cv_C_SVM_k2'
kfolds = 5
C_opt_2, scores_tr_2, scores_te_2, mean_scores_tr_2, mean_scores_te_2 = svm_2.cv(Cs_2, data_2, kfolds, pickleName)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Fold 0
C=0.0001, accuracy on train (0.5312) and val (0.6000)
C=0.001, accuracy on train (0.5312) and val (0.6000)
Fold 1
C=0.0001, accuracy on train (0.5125) and val (0.6750)
C=0.001, accuracy on train (0.5125) and val (0.6750)
Fold 2
C=0.0001, accuracy on train (0.5750) and val (0.4250)
C=0.001, accuracy on train (0.5750) and val (0.4250)
Fold 3
C=0.0001, accuracy on train (0.5563) and val (0.5000)
C=0.001, accuracy on train (0.5563) and val (0.5000)
Fold 4
C=0.0001, accuracy on train (0.5500) and val (0.5250)
C=0.001, accuracy on train (0.5500) and val (0.5250)

Best constant C: 0.0001, accuracy on val 0.5450


  open(os.path.join('./Data', pickleName+'.pkl'), 'wb'))


#### Data set 3

In [6]:
# Cross-validate the constant C of the C-SVM on data set 3.
# Candidate values of C to compare.
Cs_3 = [0.0001, 0.001]
# Only the first 100 rows of each split are passed to the cross-validation.
data_3 = [X_train_3.iloc[:100], y_train_3.iloc[:100], X_val_3.iloc[:100], y_val_3.iloc[:100]]
svm_3 = SVM.C_SVM(K, ID)
# Base name only, without extension: the warning printed by this cell shows the
# save step opening os.path.join('./Data', pickleName+'.pkl'), so a name that
# already ends in '.pkl' is written as '*.pkl.pkl'.
# NOTE(review): confirm in SVM.C_SVM.cv that '.pkl' is always appended.
pickleName = 'cv_C_SVM_k3'
kfolds = 5
C_opt_3, scores_tr_3, scores_te_3, mean_scores_tr_3, mean_scores_te_3 = svm_3.cv(Cs_3, data_3, kfolds, pickleName)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Fold 0
C=0.0001, accuracy on train (0.5188) and val (0.4500)
C=0.001, accuracy on train (0.5188) and val (0.4500)
Fold 1
C=0.0001, accuracy on train (1.0000) and val (1.0000)
C=0.001, accuracy on train (0.4938) and val (0.5500)
Fold 2
C=0.0001, accuracy on train (0.4938) and val (0.5500)
C=0.001, accuracy on train (0.4938) and val (0.5500)
Fold 3
C=0.0001, accuracy on train (0.5125) and val (0.4750)
C=0.001, accuracy on train (0.5125) and val (0.4750)
Fold 4
C=0.0001, accuracy on train (0.5062) and val (0.5000)
C=0.001, accuracy on train (0.5062) and val (0.5000)

Best constant C: 0.0001, accuracy on val 0.5950


  open(os.path.join('./Data', pickleName+'.pkl'), 'wb'))


### Predict with best constant C

#### Data set 1

In [4]:
# Refit a C-SVM with the cross-validated C on the first 100 training rows of
# data set 1, then score it on the full train and validation splits.
svm_1 = SVM.C_SVM(K, ID, C=C_opt_1, print_callbacks=False)
res = svm_1.fit(X_train_1.iloc[:100, :], y_train_1.iloc[:100, :])

pred_tr_1 = svm_1.predict(X_train_1)
acc_tr_1 = svm_1.score(pred_tr_1, y_train_1)
print('Accuracy on train set: {:0.4f}'.format(acc_tr_1))

pred_val_1 = svm_1.predict(X_val_1)
acc_val_1 = svm_1.score(pred_val_1, y_val_1)
print('Accuracy on val set: {:0.4f}'.format(acc_val_1))

NameError: name 'K' is not defined

#### Data set 2

In [8]:
# Refit a C-SVM with the cross-validated C on the first 100 training rows of
# data set 2, then score it on the full train and validation splits.
svm_2 = SVM.C_SVM(K, ID, C=C_opt_2, print_callbacks=False)
res = svm_2.fit(X_train_2.iloc[:100, :], y_train_2.iloc[:100, :])

pred_tr_2 = svm_2.predict(X_train_2)
acc_tr_2 = svm_2.score(pred_tr_2, y_train_2)
print('Accuracy on train set: {:0.4f}'.format(acc_tr_2))

pred_val_2 = svm_2.predict(X_val_2)
acc_val_2 = svm_2.score(pred_val_2, y_val_2)
print('Accuracy on val set: {:0.4f}'.format(acc_val_2))

Accuracy on train set: 1.0000
Accuracy on val set: 1.0000


#### Data set 3

In [9]:
# Refit a C-SVM with the cross-validated C on the first 100 training rows of
# data set 3, then score it on the full train and validation splits.
svm_3 = SVM.C_SVM(K, ID, C=C_opt_3, print_callbacks=False)
res = svm_3.fit(X_train_3.iloc[:100, :], y_train_3.iloc[:100, :])

pred_tr_3 = svm_3.predict(X_train_3)
acc_tr_3 = svm_3.score(pred_tr_3, y_train_3)
print('Accuracy on train set: {:0.4f}'.format(acc_tr_3))

pred_val_3 = svm_3.predict(X_val_3)
acc_val_3 = svm_3.score(pred_val_3, y_val_3)
print('Accuracy on val set: {:0.4f}'.format(acc_val_3))

Accuracy on train set: 1.0000
Accuracy on val set: 1.0000


### Export predictions

In [3]:
# Predict the labels of each test set with the classifier fitted on the
# matching data set, cast to int for the export.
pred_test_1 = svm_1.predict(X_test_1).astype(int)
pred_test_2 = svm_2.predict(X_test_2).astype(int)
pred_test_3 = svm_3.predict(X_test_3).astype(int)

# One (Id, Bound) frame per data set, stacked into a single frame.
y_test_1 = pd.DataFrame({'Id': X_test_1.Id, 'Bound': pred_test_1})
y_test_2 = pd.DataFrame({'Id': X_test_2.Id, 'Bound': pred_test_2})
y_test_3 = pd.DataFrame({'Id': X_test_3.Id, 'Bound': pred_test_3})
y_test = pd.concat((y_test_1, y_test_2, y_test_3))

# NOTE(review): presumably the test Ids were stored as -(Id + 1) when the data
# were built, and this maps them back - confirm in utils.
y_test['Id'] = -y_test['Id'] - 1

# Create the output folder if it does not exist yet, so to_csv cannot fail on
# a missing directory.
predictions_dir = './Predictions/'
os.makedirs(predictions_dir, exist_ok=True)

# Date and time without ':' characters: the previous time-only stamp produced
# file names that are invalid on Windows and could collide across days.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
y_test.to_csv(os.path.join(predictions_dir, 'y_test_' + timestamp + '.csv'), index=False)

NameError: name 'svm_1' is not defined