In [1]:
import pickle
import pandas as pd
import numpy as np

# Input artefacts, named once so the paths are easy to find and retarget.
META_PATH = 'kernels_meta.csv'
ENC_STATES_PATH = 'enc_states_sync_all.p'

# Per-kernel metadata; later code reads its `kernel_id` and `kaggle_score` columns.
meta = pd.read_csv(META_PATH)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load this artefact if it was produced by a trusted pipeline.
# `all_comps` is iterated later as records exposing "kernel_id",
# "encoded_sequence" and "target".
with open(ENC_STATES_PATH, 'rb') as fp:
    all_comps = pickle.load(fp)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Scorer object wrapping r2_score; passed as `scoring` to GridSearchCV so
# hyper-parameter candidates are ranked by R^2.
r2_scorer = make_scorer(r2_score)

def sigmoid(x):
    """Squash ``x`` into the range [-1, 1] via ``x / (1 + |x|)``.

    Despite the name, this is the "softsign" function rather than the
    logistic sigmoid: it is odd-symmetric, maps 0 to 0 and keeps the sign
    of the input. Works element-wise on NumPy arrays as well as on scalars.
    """
    magnitude = np.abs(x)
    return x / (magnitude + 1.0)

def learning(model, params, X_train, X_test, y_train, y_test):
    """Grid-search ``model`` behind a StandardScaler and report test metrics.

    The estimator is wrapped in a two-step pipeline (``'scaler'`` then
    ``'model'``), so keys of ``params`` must address the estimator through
    the ``model__`` prefix (e.g. ``"model__alpha"``).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor used as the final pipeline step.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train : array-like
        Data used for the 4-fold cross-validated search and the final refit.
    X_test, y_test : array-like
        Held-out data used only for the reported metrics.

    Returns
    -------
    float
        R^2 of the best refitted pipeline on the held-out data.

    Side effects
    ------------
    Prints the best parameter combination and the test R^2, MSE and MAE.
    """
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    gs = GridSearchCV(pipe, params, scoring = r2_scorer, verbose = 1, n_jobs = -1, cv = 4)
    gs.fit(X_train, y_train)

    best = gs.best_estimator_

    # Predict once and reuse the result for every metric instead of running
    # the whole pipeline again for each printed number.
    y_pred = best.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
    # symmetric in its arguments, so the order matters here.
    res = r2_score(y_test, y_pred)

    print("Best params:", gs.best_params_)
    print("R^2:", res)
    print("MSE:", MSE(y_test, y_pred))
    print("MAE:", MAE(y_test, y_pred))
    return res
    
def transform_target(target, n_classes=78, coef=100.0, gamma=0.5, min_coef=0.001):
    """Encode an ordered sequence of indices as a position-weighted vector.

    The first element of ``target`` contributes ``coef`` to its slot, the
    second ``coef * gamma``, the third ``coef * gamma**2`` and so on.
    Repeated indices accumulate. Processing stops as soon as the *next*
    weight would fall below ``min_coef``, so with the defaults only the
    first 17 elements can contribute.

    Parameters
    ----------
    target : iterable of int
        Indices into the output vector, most important first.
    n_classes : int, default 78
        Length of the output vector (presumably the number of distinct
        snippet classes -- confirm against the data).
    coef : float, default 100.0
        Weight given to the first element.
    gamma : float, default 0.5
        Multiplicative decay applied after each element.
    min_coef : float, default 0.001
        Cutoff below which remaining elements are ignored.

    Returns
    -------
    numpy.ndarray
        Float vector of shape ``(n_classes,)``.

    Raises
    ------
    IndexError
        If an index is outside ``[-n_classes, n_classes)``. Negative indices
        inside that range follow NumPy semantics and count from the end.
    """
    ans = np.zeros(n_classes)
    weight = coef

    for snip in target:
        ans[snip] += weight
        # Iterative multiplication (rather than gamma ** k) keeps results
        # bit-identical to the original hard-coded implementation.
        weight *= gamma
        if weight < min_coef:
            break

    return ans

def generate_data_pair(data):
    """Build the feature matrix and regression targets for a set of notes.

    For every record in ``data`` the feature row is the concatenation of:
    the record's ``"encoded_sequence"``, the position-weighted encoding of
    its ``"target"`` (see ``transform_target``), the length of ``"target"``,
    and a constant ``1``. The regression target is ``sigmoid`` applied to
    the ``kaggle_score`` of the first ``meta`` row whose ``kernel_id``
    matches the record.

    Parameters
    ----------
    data : iterable of mapping
        Records exposing ``"kernel_id"``, ``"encoded_sequence"`` and
        ``"target"``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with one row per record and ``y`` with one value per record.

    Raises
    ------
    IndexError
        If a record's ``kernel_id`` has no row in ``meta``. The exception
        type matches the previous ``.iloc[0]`` failure mode, but the message
        now names the offending id.
    """
    # Build the kernel_id -> score lookup once instead of scanning the whole
    # `meta` frame with a boolean mask for every record. keep="first"
    # reproduces the previous `.iloc[0]` choice when ids are duplicated.
    first_rows = meta.drop_duplicates(subset="kernel_id", keep="first")
    score_by_kernel = dict(zip(first_rows["kernel_id"], first_rows["kaggle_score"]))

    X, y = [], []

    for note in data:
        kernel_id = note["kernel_id"]
        if kernel_id not in score_by_kernel:
            raise IndexError(f"no row in meta for kernel_id {kernel_id!r}")
        score = score_by_kernel[kernel_id]

        X.append(np.concatenate([
            note["encoded_sequence"], transform_target(note["target"]), [len(note["target"]), 1]
        ]))

        y.append(sigmoid(score))

    return np.array(X), np.array(y)

In [3]:
from sklearn.model_selection import train_test_split

def test(data, model, params):
    """Split ``data`` 75/25, featurise both parts and evaluate ``model``.

    The split is seeded (``random_state = 14``) so every model is compared
    on the same train/test partition. Returns whatever ``learning`` returns
    for the grid-searched ``model`` with the parameter grid ``params``.
    """
    split = train_test_split(data, test_size = 0.25, random_state = 14)

    # Featurise the train part first, then the test part.
    (X_train, y_train), (X_test, y_test) = (generate_data_pair(part) for part in split)

    return learning(model, params, X_train, X_test, y_train, y_test)

In [4]:
from sklearn.linear_model import Ridge

# Ridge baseline: search over the regularisation strength `alpha`.
# The "model__" prefix routes each parameter to the pipeline's 'model' step.
params = dict(
    model__alpha=[0.01, 0.1, 1.0, 10.0, 100.0],
)

res1 = test(all_comps, Ridge(), params)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best params: {'model__alpha': 100.0}
R^2: 0.7645598416584544
MSE: 0.014466935402305762
MAE: 0.062342816650052896


In [5]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting: vary tree depth, early stopping patience and split criterion.
# The "model__" prefix routes each parameter to the pipeline's 'model' step.
params = dict(
    model__max_depth=[3, 5, None],
    model__n_iter_no_change=[None, 5, 10],
    model__criterion=("friedman_mse", "squared_error"),
)

res2 = test(all_comps, GradientBoostingRegressor(random_state = 14), params)

Fitting 4 folds for each of 18 candidates, totalling 72 fits
Best params: {'model__criterion': 'friedman_mse', 'model__max_depth': 5, 'model__n_iter_no_change': None}
R^2: 0.799568915197791
MSE: 0.012315756058238935
MAE: 0.05775271735308624


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: vary tree depth and split criterion.
# NOTE(review): scikit-learn documents the "poisson" criterion as requiring
# non-negative targets. Targets here are x / (1 + |x|) of kaggle_score, which
# is negative for negative scores -- confirm scores are >= 0, otherwise the
# "poisson" candidates will fail to fit.
params = dict(
    model__max_depth=[3, 5, None],
    model__criterion=("friedman_mse", "squared_error", "poisson"),
)

res3 = test(all_comps, RandomForestRegressor(random_state = 14), params)

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net: search jointly over penalty strength `alpha` and the L1/L2 mix
# `l1_ratio` (0.0 is a pure L2 penalty, 1.0 a pure L1 penalty).
params = dict(
    model__alpha=[0.01, 0.1, 1.0, 10.0],
    model__l1_ratio=[0.0, 0.25, 0.5, 0.75, 1.0],
)

res4 = test(all_comps, ElasticNet(random_state = 14, max_iter = 3000), params)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
