In [None]:
from src.utils import *
from src.kgrams import *
from src.levenshtein import *
from sklearn import svm
import time
from tqdm import tqdm_notebook
import os

## Evaluation functions

### Grid Search

In [None]:
def grid_search(K, Y, folds=5, C_min=-7, C_max=6):
    """
    Cross-validated grid search over the SVM regularisation parameter C.

    The candidate values are the powers of ten 10^C_min, ..., 10^C_max (both included).
    For every candidate, an `svm.SVC(kernel='precomputed')` is scored with `evaluate`
    on the precomputed kernel matrix `K` and labels `Y`, and one summary line is printed.

    Parameters
    ----------
    K : precomputed train kernel matrix, forwarded to `evaluate`.
    Y : labels, forwarded to `evaluate`.
    folds : number of cross-validation folds, forwarded to `evaluate`.
    C_min, C_max : integer exponents bounding the grid.

    Returns
    -------
    (C_grid, results) : the array of tested C values and a list holding, for each C,
        the array of scores returned by `evaluate`.
        NOTE(review): the print format suggests four scores per C
        (validation mean, validation spread, train mean, train spread) -- confirm in `evaluate`.
    """
    C_grid = 10. ** np.arange(C_min, C_max + 1)

    results = []
    for C in C_grid:
        classifier = svm.SVC(kernel='precomputed', C=C)
        scores = np.array(evaluate(classifier, K, Y, folds=folds))
        results.append(scores)
        # Scores are reported as percentages.
        print('%.0e \t ' % C,
              'Validation %.2f%% +- %.2f\t Train %.2f%%\t +- %.2f' % tuple(100 * scores))

    return C_grid, results

### Test predictions

In [None]:
def test(model, train_Xs, train_Ys, test_Xs, Cs, file_name='predictions', **params):
    """
    For each data set, trains the model on the train data, then computes predictions on the test data.
    Saves the predictions to `predictions/<file_name>.csv` (columns `Id,Bound`).

    Also evaluates the performance of the specified model using cross-validation on the train data
    (via `global_evaluate`) and prints the result.

    NB: train_Xs and test_Xs can be either precomputed kernel matrices, or simply the train and test features,
    depending on the model and parameters.

    Parameters
    ----------
    model : callable building an estimator; called as `model(C=C, **params)`. The result must
        expose `fit(X, Y)` and `predict(X)`.
    train_Xs, train_Ys, test_Xs : sequences with one entry per data set.
    Cs : sequence with one C value per data set.
    file_name : base name (without extension) of the CSV file written in the `predictions` directory.
    **params : extra keyword arguments forwarded to `model` and to `global_evaluate`.

    Returns
    -------
    The predictions of all data sets concatenated in order (the same values written to the file).
    """
    # Evaluation using k-fold validation.
    print('Evaluation on train data (using cross validation)')
    # `np.asarray` mirrors grid_search: scaling to percent must be element-wise.
    scores = np.asarray(global_evaluate(model, train_Xs, train_Ys, Cs, **params))
    print('Validation %.2f%% +- %.2f\t Train %.2f%%\t +- %.2f' % tuple(100 * scores))

    # One independent model per data set, each with its own C.
    predictions = []
    for X_train, Y_train, X_test, C in zip(train_Xs, train_Ys, test_Xs, Cs):
        model_k = model(C=C, **params)
        model_k.fit(X_train, Y_train)
        predictions.append(model_k.predict(X_test))

    predictions = np.concatenate(predictions)

    predictions_dir = 'predictions'
    # Race-free replacement for the isdir/mkdir pair.
    os.makedirs(predictions_dir, exist_ok=True)

    # Ids are the positions in the concatenated prediction vector.
    np.savetxt(os.path.join(predictions_dir, '%s.csv' % file_name),
               np.stack((np.arange(len(predictions)),
                         predictions), axis=1),
               header='Id,Bound', comments='', fmt='%d', delimiter=',')
    return predictions

## Baseline using Spectrum kernel

The computations become noticeably slow for $k > 13$.

In [None]:
# Value of `k` passed to `cum_spectrum` when building the kernel matrices;
# it is also embedded in the names of the spectrum prediction files.
spectrum_k = 12

### Data loading

In [None]:
# One entry per dataset (ids 0, 1, 2). `load` comes from a star import
# (presumably src.utils -- confirm).
train_Xs = [load(k=dataset) for dataset in range(3)]               # train inputs
train_Ys = [load(X=False, k=dataset) for dataset in range(3)]      # train labels
test_Xs = [load(k=dataset, train=False) for dataset in range(3)]   # test inputs

### Kernel Computing

In [None]:
start = time.time()
tqdm = True  # flag forwarded to `cum_spectrum` (presumably toggles a progress bar -- confirm)

# Train kernels: each training set against itself.
train_Ks = [
    cum_spectrum(sequences, k=spectrum_k, tqdm=tqdm)
    for sequences in train_Xs
]
# Test kernels: each test set against the matching training set.
test_Ks = [
    cum_spectrum(test_sequences, train_sequences, k=spectrum_k, tqdm=tqdm)
    for test_sequences, train_sequences in zip(test_Xs, train_Xs)
]

print("Total Duration: %.1f seconds." % (time.time() - start))

### Grid-search to find the best values for parameter C

#### Dataset 0

In [None]:
# Grid search over C on dataset 0; gs0 is the (C values, per-C scores) pair returned by grid_search.
gs0 = grid_search(train_Ks[0], train_Ys[0])

#### Dataset 1

In [None]:
# Grid search over C on dataset 1; gs1 is the (C values, per-C scores) pair returned by grid_search.
gs1 = grid_search(train_Ks[1], train_Ys[1])

#### Dataset 2

In [None]:
# Grid search over C on dataset 2; gs2 is the (C values, per-C scores) pair returned by grid_search.
gs2 = grid_search(train_Ks[2], train_Ys[2])

#### Best values

In [None]:
# For each dataset, keep the C that maximises (score column 0 - score column 1);
# judging by the grid_search printout this is the validation score minus its spread.
greedy_Cs = [
    C_grid[np.argmax(score_table[:, 0] - score_table[:, 1])]
    for C_grid, score_table in (
        (candidates, np.array(scores)) for candidates, scores in (gs0, gs1, gs2)
    )
]

# Hand-picked values without strong overfitting (strong overfitting = perfect training accuracy).
Cs = (1, 1, 1)

In [None]:
# Show the C value selected by the grid search for each dataset.
print(greedy_Cs)

### Generating test predictions

#### Greedy values of C

In [None]:
# Train one SVC per dataset with the grid-searched C values and write the predictions file.
# The result is bound to `_` so the returned array is not echoed by the cell.
_ = test(
    svm.SVC, train_Ks, train_Ys, test_Ks,
    Cs=greedy_Cs,
    file_name='spectrum_greedy_predictions_%d' % spectrum_k,
    kernel='precomputed',
)

#### Soft values of C

In [None]:
# Same as the greedy run, but with the hand-picked `Cs` values.
# The result is bound to `_` so the returned array is not echoed by the cell.
_ = test(
    svm.SVC, train_Ks, train_Ys, test_Ks,
    Cs=Cs,
    file_name='spectrum_soft_predictions_%d' % spectrum_k,
    kernel='precomputed',
)

## Levenshtein Kernel

### Data loading

In [None]:
# Reload the three datasets with numeric=False for the inputs
# (presumably raw strings rather than a numeric encoding -- confirm in `load`).
train_Xs = [load(k=dataset, numeric=False) for dataset in range(3)]               # train inputs
train_Ys = [load(X=False, k=dataset) for dataset in range(3)]                     # train labels
test_Xs = [load(k=dataset, train=False, numeric=False) for dataset in range(3)]   # test inputs

### Kernel Computing

In [None]:
start = time.time()
tqdm = True  # flag forwarded to `levenshtein_distance` (presumably toggles a progress bar -- confirm)

# Train distances: each training set against itself.
train_ds = [
    levenshtein_distance(sequences, tqdm=tqdm)
    for sequences in train_Xs
]
# Test distances: each test set against the matching training set.
test_ds = [
    levenshtein_distance(test_sequences, train_sequences, tqdm=tqdm)
    for test_sequences, train_sequences in zip(test_Xs, train_Xs)
]

print("Total Duration: %.1f seconds." % (time.time() - start))

In [None]:
def distance_to_kernel(d, power=.66):
    """
    Turn a distance (scalar or array) into a similarity: 1 / (1 + d ** power).

    A distance of 0 maps to 1 and the value decreases towards 0 as the distance grows.
    The operation is element-wise when `d` is a numpy array.

    Parameters
    ----------
    d : non-negative distance(s).
    power : exponent applied to the distance before inversion (default .66, the value
        previously hard-coded).

    Returns
    -------
    The similarity value(s), with the same shape as `d`.
    """
    return 1 / (1 + d ** power)
# Alternative Gaussian form, kept for reference:
#     σ = 10
#     return np.exp(-(d / σ)**2)

# Convert every distance matrix into a kernel matrix, dataset by dataset.
train_Ks = list(map(distance_to_kernel, train_ds))
test_Ks = list(map(distance_to_kernel, test_ds))

### Grid-search to find the best values for parameter C

#### Dataset 0

In [None]:
# Grid search over C on dataset 0; gs0 is the (C values, per-C scores) pair returned by grid_search.
gs0 = grid_search(train_Ks[0], train_Ys[0])

#### Dataset 1

In [None]:
# Grid search over C on dataset 1; gs1 is the (C values, per-C scores) pair returned by grid_search.
gs1 = grid_search(train_Ks[1], train_Ys[1])

#### Dataset 2

In [None]:
# Grid search over C on dataset 2; gs2 is the (C values, per-C scores) pair returned by grid_search.
gs2 = grid_search(train_Ks[2], train_Ys[2])

#### Best values

In [None]:
# For each dataset, keep the C that maximises (score column 0 - score column 1);
# judging by the grid_search printout this is the validation score minus its spread.
greedy_Cs = [
    C_grid[np.argmax(score_table[:, 0] - score_table[:, 1])]
    for C_grid, score_table in (
        (candidates, np.array(scores)) for candidates, scores in (gs0, gs1, gs2)
    )
]

# Hand-picked values without strong overfitting (strong overfitting = perfect training accuracy).
Cs = (1, 1, 1)

In [None]:
# Show the C value selected by the grid search for each dataset.
print(greedy_Cs)

### Generating test predictions

#### Greedy values of C

In [None]:
# Train one SVC per dataset on the Levenshtein kernels with the grid-searched C values.
# The file name is specific to this kernel: the previous 'spectrum_greedy_predictions_%d'
# name overwrote the predictions written for the spectrum kernel.
_ = test(svm.SVC,
         train_Ks, train_Ys, test_Ks,
         Cs=greedy_Cs, kernel='precomputed',
         file_name='levenshtein_greedy_predictions')

#### Soft values of C

In [None]:
# Same as the greedy run on the Levenshtein kernels, but with the hand-picked `Cs` values.
# The file name is specific to this kernel: the previous 'spectrum_soft_predictions_%d'
# name overwrote the predictions written for the spectrum kernel.
_ = test(svm.SVC,
         train_Ks, train_Ys, test_Ks,
         Cs=Cs, kernel='precomputed',
         file_name='levenshtein_soft_predictions')