In [1]:
import pickle
import pandas as pd
import numpy as np

# Per-kernel metadata table; `generate_data_pair` reads `kaggle_score` from it by `kernel_id`.
meta = pd.read_csv('kernels_meta.csv')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load 'enc_states_sync_all.p' when it comes from a trusted source.
with open('enc_states_sync_all.p', 'rb') as pickle_file:
    all_comps = pickle.load(pickle_file)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Scorer object handed to GridSearchCV so candidates are ranked by R^2.
r2_scorer = make_scorer(r2_score)

# Hold out 25% of the loaded records; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(all_comps, test_size=0.25, random_state=14)

def transform_target(target, size=78, gamma=0.5, min_coef=0.0001):
    """Encode a sequence of indices as a fixed-length, position-discounted vector.

    The k-th element of ``target`` (0-based) adds ``gamma ** k`` to the output
    slot it names, so earlier elements weigh more and repeated indices
    accumulate. Iteration stops as soon as the running weight drops below
    ``min_coef``; with the defaults only the first 14 elements contribute.

    Parameters
    ----------
    target : iterable of int
        Indices into the output vector; each must be valid for an array of
        length ``size``.
    size : int, default 78
        Length of the returned vector.
    gamma : float, default 0.5
        Multiplicative decay applied to the weight after every element.
    min_coef : float, default 0.0001
        Once the weight falls below this value the rest of ``target`` is
        ignored.

    Returns
    -------
    numpy.ndarray
        Float vector of shape ``(size,)``.

    Raises
    ------
    IndexError
        If an element of ``target`` is out of range for the output vector.
    """
    ans = np.zeros(size)
    coef = 1.0

    for snip in target:
        ans[snip] += coef
        # Repeated multiplication (not gamma ** k) keeps results bit-identical
        # to the historical implementation.
        coef *= gamma
        if coef < min_coef:
            break

    return ans

def sigmoid(x):
    """Squash ``x`` with ``x / (1 + |x|)``.

    Despite the name, this is not the logistic function: the result is
    odd-symmetric, maps 0 to 0, and its magnitude never exceeds 1. Works
    element-wise on NumPy arrays as well as on scalars.
    """
    denominator = 1.0 + np.abs(x)
    return x / denominator

def generate_data_pair(data):
    """Build the feature matrix and target vector for a list of records.

    Each record is a mapping with the keys ``"kernel_id"``,
    ``"encoded_sequence"`` and ``"target"``. Its feature row is the
    concatenation of the encoded sequence, ``transform_target(target)`` and
    the length of ``target``. Its regression target is the ``kaggle_score``
    of the first row in the module-level ``meta`` frame whose ``kernel_id``
    matches, passed through ``sigmoid``.

    Parameters
    ----------
    data : iterable of mapping
        Records to convert.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with one row per record and ``y`` with one value per record.

    Raises
    ------
    KeyError
        If a record's ``kernel_id`` does not occur in ``meta``.
    """
    # Build the kernel_id -> score lookup once per call instead of filtering
    # the whole frame with a boolean mask for every record. keep="first"
    # reproduces "take the first matching row" when ids are duplicated.
    first_scores = (
        meta.drop_duplicates(subset="kernel_id", keep="first")
        .set_index("kernel_id")["kaggle_score"]
    )

    X, y = [], []

    for note in data:
        target = note["target"]
        score = first_scores.at[note["kernel_id"]]

        X.append(np.concatenate([
            note["encoded_sequence"], transform_target(target), [len(target)]
        ]))
        y.append(sigmoid(score))

    return np.array(X), np.array(y)

# Convert both splits into (features, target) arrays; train first, then test.
(X_train, y_train), (X_test, y_test) = (
    generate_data_pair(train_data),
    generate_data_pair(test_data),
)

def test(model, params):
    """Grid-search ``model`` behind a standard scaler and report held-out metrics.

    The estimator is wrapped in a two-step pipeline (``"scaler"`` then
    ``"model"``), so keys in ``params`` must use the ``model__`` prefix to
    reach the estimator's hyper-parameters. The search is fitted on the
    module-level ``X_train`` / ``y_train`` and ranks candidates with
    ``r2_scorer``; the refitted best pipeline is then evaluated on
    ``X_test`` / ``y_test``.

    Parameters
    ----------
    model : estimator
        Unfitted regressor placed in the ``"model"`` pipeline step.
    params : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.

    Side effects
    ------------
    Prints the best parameters and the test-set R^2, MSE and MAE.
    """
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    gs = GridSearchCV(pipe, params, scoring = r2_scorer, verbose = 3, n_jobs = 4)
    gs.fit(X_train, y_train)

    best = gs.best_estimator_

    # Predict once and reuse the result for every metric instead of running
    # the fitted pipeline over the test set separately for each line.
    y_pred = best.predict(X_test)

    print("Best params:", gs.best_params_)
    # Metrics take (y_true, y_pred) in that order; R^2 in particular is not
    # symmetric in its arguments.
    print("R^2:", r2_score(y_test, y_pred))
    print("MSE:", MSE(y_test, y_pred))
    print("MAE:", MAE(y_test, y_pred))
    return gs

# Registry of fitted grid searches, keyed by estimator name.
models = {}

In [3]:
from sklearn.linear_model import Ridge

# Ridge penalty strengths to try; the "model__" prefix addresses the pipeline
# step named "model" that `test` builds.
params = dict(model__alpha=[0.1, 1.0, 10.0, 100.0, 1000.0])

models["Ridge"] = test(Ridge(), params)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params: {'model__alpha': 100.0}
R^2: 0.7645602780282413
MSE: 0.014466908588980613
MAE: 0.06234268360718438


In [4]:
from sklearn.linear_model import ElasticNet

# 3 x 3 grid over the penalty strength and the L1/L2 mixing ratio.
params = dict(
    model__alpha=[0.001, 0.01, 0.1],
    model__l1_ratio=[0.1, 0.5, 0.9],
)

# NOTE(review): the recorded output of this cell contains
# `enet_coordinate_descent` warning fragments — presumably convergence
# warnings; confirm and consider a larger max_iter.
models["ElasticNet"] = test(ElasticNet(random_state=14, max_iter=3000), params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best params: {'model__alpha': 0.001, 'model__l1_ratio': 0.1}
R^2: 0.7624323090637526
MSE: 0.014597664487907213
MAE: 0.06264463800733885


  model = cd_fast.enet_coordinate_descent(


In [5]:
from sklearn.ensemble import GradientBoostingRegressor

# 3 x 3 grid over tree depth and number of boosting stages.
params = dict(
    model__max_depth=[4, 6, 8],
    model__n_estimators=[50, 100, 150],
)

models["GradientBoostingRegressor"] = test(GradientBoostingRegressor(random_state=14), params)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ..................model__alpha=0.1;, score=0.780 total time=   0.7s
[CV 2/5] END ..................model__alpha=1.0;, score=0.767 total time=   0.6s
[CV 4/5] END ..................model__alpha=1.0;, score=0.768 total time=   0.5s
[CV 3/5] END .................model__alpha=10.0;, score=0.785 total time=   0.5s
[CV 2/5] END ................model__alpha=100.0;, score=0.777 total time=   0.6s
[CV 1/5] END ...............model__alpha=1000.0;, score=0.766 total time=   0.5s
[CV 5/5] END ...............model__alpha=1000.0;, score=0.751 total time=   0.5s
[CV 4/5] END model__alpha=0.001, model__l1_ratio=0.1;, score=0.763 total time=  34.0s
[CV 5/5] END model__alpha=0.001, model__l1_ratio=0.1;, score=0.774 total time=  40.6s
[CV 1/5] END model__alpha=0.001, model__l1_ratio=0.9;, score=0.767 total time=  30.0s
[CV 3/5] END model__alpha=0.001, model__l1_ratio=0.9;, score=0.759 total time=  22.7s
[CV 3/5] END model__alpha=0.0

Best params: {'model__max_depth': 8, 'model__n_estimators': 150}
R^2: 0.8035986209225289
MSE: 0.012068145400733716
MAE: 0.05384242357521649


In [6]:
from sklearn.ensemble import RandomForestRegressor

# 4 x 3 grid over tree depth (None leaves depth unlimited) and forest size.
params = dict(
    model__max_depth=[4, 6, 8, None],
    model__n_estimators=[50, 100, 150],
)

models["RandomForestRegressor"] = test(RandomForestRegressor(random_state=14), params)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END model__max_depth=4, model__n_estimators=50;, score=0.542 total time=  34.0s
[CV 5/5] END model__max_depth=4, model__n_estimators=50;, score=0.512 total time=  33.8s
[CV 4/5] END model__max_depth=4, model__n_estimators=100;, score=0.498 total time= 1.1min
[CV 3/5] END model__max_depth=4, model__n_estimators=150;, score=0.535 total time= 1.7min
[CV 2/5] END model__max_depth=6, model__n_estimators=50;, score=0.598 total time=  50.2s
[CV 5/5] END model__max_depth=6, model__n_estimators=50;, score=0.607 total time=  49.9s
[CV 4/5] END model__max_depth=6, model__n_estimators=100;, score=0.593 total time= 1.7min
[CV 3/5] END model__max_depth=6, model__n_estimators=150;, score=0.637 total time= 2.5min
[CV 2/5] END model__max_depth=8, model__n_estimators=50;, score=0.670 total time= 1.1min
[CV 4/5] END model__max_depth=8, model__n_estimators=50;, score=0.670 total time= 1.1min
[CV 3/5] END model__max_depth=8, model__n_est

In [12]:
from sklearn.svm import LinearSVR

# 2 x 5 grid over the loss function and the regularisation parameter C.
params = dict(
    model__loss=("epsilon_insensitive", "squared_epsilon_insensitive"),
    model__C=[0.01, 0.1, 1.0, 10.0, 100.0],
)

models["LinearSVR"] = test(LinearSVR(random_state=14, max_iter=5000), params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best params: {'model__C': 0.01, 'model__loss': 'squared_epsilon_insensitive'}
R^2: 0.7659806957404518
MSE: 0.014379629123015294
MAE: 0.061805803990027386
[CV 4/5] END model__C=0.01, model__loss=epsilon_insensitive;, score=0.745 total time= 1.5min
[CV 3/5] END model__C=0.01, model__loss=squared_epsilon_insensitive;, score=0.788 total time=   9.3s
[CV 5/5] END model__C=0.01, model__loss=squared_epsilon_insensitive;, score=0.773 total time=   9.7s
[CV 3/5] END model__C=0.1, model__loss=epsilon_insensitive;, score=0.749 total time= 2.7min
[CV 1/5] END model__C=0.1, model__loss=squared_epsilon_insensitive;, score=0.783 total time= 1.6min
[CV 4/5] END model__C=0.1, model__loss=squared_epsilon_insensitive;, score=0.769 total time= 1.4min
[CV 3/5] END model__C=1.0, model__loss=epsilon_insensitive;, score=0.699 total time= 3.1min
[CV 2/5] END model__C=1.0, model__loss=squared_epsilon_insensitive;, score=0.764 total time= 3.0min
[CV 1/5] END model__C=10.0, model__loss=epsilon_insensitive;, score

In [13]:
# Tabulate the per-candidate cross-validation results of the Ridge search,
# save them to "Ridge.csv", and display the table as the cell output.
df = pd.DataFrame(models["Ridge"].cv_results_)
df.to_csv("Ridge.csv")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.652214,0.020995,0.029574,0.003468,0.1,{'model__alpha': 0.1},0.780254,0.76438,0.776462,0.767002,0.75307,0.768234,0.009576,4
1,0.595645,0.062688,0.025659,0.001189,1.0,{'model__alpha': 1.0},0.780223,0.767413,0.778555,0.7677,0.755455,0.769869,0.008957,3
2,0.575381,0.038133,0.024686,0.00173,10.0,{'model__alpha': 10.0},0.785315,0.77364,0.785186,0.769044,0.764571,0.775551,0.008423,2
3,0.578961,0.018919,0.024993,0.001552,100.0,{'model__alpha': 100.0},0.786389,0.777181,0.786678,0.762779,0.773515,0.777309,0.008897,1
4,0.553888,0.04863,0.021308,0.008039,1000.0,{'model__alpha': 1000.0},0.766342,0.753737,0.756758,0.733209,0.750895,0.752188,0.010821,5


In [14]:
# Tabulate the per-candidate cross-validation results of the ElasticNet search,
# save them to "ElasticNet.csv", and display the table as the cell output.
df = pd.DataFrame(models["ElasticNet"].cv_results_)
df.to_csv("ElasticNet.csv")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,35.716229,2.462536,0.028339,0.003555,0.001,0.1,"{'model__alpha': 0.001, 'model__l1_ratio': 0.1}",0.786949,0.775127,0.786845,0.762821,0.773938,0.777136,0.009053,1
1,39.755957,4.929681,0.034823,0.004319,0.001,0.5,"{'model__alpha': 0.001, 'model__l1_ratio': 0.5}",0.777588,0.763834,0.771933,0.742391,0.759348,0.763019,0.012095,2
2,23.93058,3.253267,0.036122,0.004659,0.001,0.9,"{'model__alpha': 0.001, 'model__l1_ratio': 0.9}",0.767092,0.749825,0.7589,0.724965,0.744107,0.748978,0.014336,3
3,24.365487,2.581859,0.030674,0.010755,0.01,0.1,"{'model__alpha': 0.01, 'model__l1_ratio': 0.1}",0.759515,0.742461,0.750817,0.71654,0.736601,0.741187,0.014552,4
4,2.365579,0.203261,0.032324,0.004178,0.01,0.5,"{'model__alpha': 0.01, 'model__l1_ratio': 0.5}",0.651266,0.625773,0.633303,0.605142,0.626992,0.628495,0.01481,5
5,1.301325,0.048831,0.035071,0.003406,0.01,0.9,"{'model__alpha': 0.01, 'model__l1_ratio': 0.9}",0.566047,0.53687,0.54353,0.525642,0.546595,0.543737,0.01327,6
6,1.099869,0.071947,0.033259,0.005486,0.1,0.1,"{'model__alpha': 0.1, 'model__l1_ratio': 0.1}",0.531104,0.502823,0.50888,0.493334,0.513138,0.509856,0.012535,7
7,0.666797,0.015189,0.032061,0.002425,0.1,0.5,"{'model__alpha': 0.1, 'model__l1_ratio': 0.5}",0.143762,0.14186,0.142403,0.145188,0.148243,0.144291,0.002288,8
8,0.614359,0.041245,0.028622,0.003744,0.1,0.9,"{'model__alpha': 0.1, 'model__l1_ratio': 0.9}",-0.000917,-2e-06,-0.00395,-1.8e-05,-0.000739,-0.001125,0.00146,9


In [15]:
# Tabulate the per-candidate cross-validation results of the LinearSVR search,
# save them to "LinearSVR.csv", and display the table as the cell output.
df = pd.DataFrame(models["LinearSVR"].cv_results_)
df.to_csv("LinearSVR.csv")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__loss,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,90.259837,1.320352,0.057826,0.021372,0.01,epsilon_insensitive,"{'model__C': 0.01, 'model__loss': 'epsilon_ins...",0.761934,0.732325,0.757505,0.744981,0.745794,0.748508,0.01042,4
1,9.569485,0.284682,0.016162,0.000988,0.01,squared_epsilon_insensitive,"{'model__C': 0.01, 'model__loss': 'squared_eps...",0.787689,0.777539,0.787956,0.766106,0.772761,0.77841,0.008501,1
2,159.295112,6.707989,0.019537,0.003786,0.1,epsilon_insensitive,"{'model__C': 0.1, 'model__loss': 'epsilon_inse...",0.754498,0.712556,0.748722,0.743867,0.726648,0.737258,0.01546,5
3,91.483284,5.11617,0.016444,0.001434,0.1,squared_epsilon_insensitive,"{'model__C': 0.1, 'model__loss': 'squared_epsi...",0.783267,0.771273,0.782579,0.768802,0.760735,0.773331,0.008575,2
4,182.356764,8.644955,0.020906,0.003263,1.0,epsilon_insensitive,"{'model__C': 1.0, 'model__loss': 'epsilon_inse...",0.734493,0.7095,0.698961,0.732278,0.705868,0.71622,0.014436,6
5,175.827229,11.167339,0.016459,0.001458,1.0,squared_epsilon_insensitive,"{'model__C': 1.0, 'model__loss': 'squared_epsi...",0.770812,0.763647,0.765426,0.764314,0.751483,0.763136,0.00635,3
6,161.023362,1.752178,0.01588,0.001246,10.0,epsilon_insensitive,"{'model__C': 10.0, 'model__loss': 'epsilon_ins...",0.589766,0.39249,0.145206,0.542638,0.403936,0.414807,0.155082,9
7,158.728495,0.860527,0.01644,0.001372,10.0,squared_epsilon_insensitive,"{'model__C': 10.0, 'model__loss': 'squared_eps...",0.661289,0.537052,0.371887,0.640227,0.560688,0.554228,0.102414,7
8,160.284822,1.114932,0.016532,0.001588,100.0,epsilon_insensitive,"{'model__C': 100.0, 'model__loss': 'epsilon_in...",0.589766,0.39249,0.145206,0.542638,0.403936,0.414807,0.155082,9
9,145.407548,13.92806,0.013573,0.002072,100.0,squared_epsilon_insensitive,"{'model__C': 100.0, 'model__loss': 'squared_ep...",0.598931,0.410548,0.17315,0.55554,0.424509,0.432536,0.148759,8


In [16]:
# Tabulate the per-candidate cross-validation results of the gradient-boosting
# search, save them to "GradientBoostingRegressor.csv", and display the table.
df = pd.DataFrame(models["GradientBoostingRegressor"].cv_results_)
df.to_csv("GradientBoostingRegressor.csv")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,52.02758,0.748877,0.027726,0.001582,4,50,"{'model__max_depth': 4, 'model__n_estimators':...",0.792115,0.774761,0.774254,0.745993,0.778461,0.773117,0.015028,9
1,103.970696,0.72533,0.03239,0.00041,4,100,"{'model__max_depth': 4, 'model__n_estimators':...",0.810389,0.79873,0.796236,0.76684,0.79833,0.794105,0.014508,8
2,158.188339,0.820744,0.036558,0.000372,4,150,"{'model__max_depth': 4, 'model__n_estimators':...",0.817855,0.806258,0.804243,0.77294,0.805043,0.801268,0.015004,6
3,73.431068,0.708227,0.030543,0.000637,6,50,"{'model__max_depth': 6, 'model__n_estimators':...",0.814558,0.801478,0.80249,0.773001,0.801524,0.79861,0.013725,7
4,154.791722,0.910157,0.036936,0.000568,6,100,"{'model__max_depth': 6, 'model__n_estimators':...",0.823518,0.809647,0.808897,0.779351,0.808739,0.80603,0.014465,5
5,236.340085,0.361171,0.042633,0.001349,6,150,"{'model__max_depth': 6, 'model__n_estimators':...",0.825907,0.812607,0.809128,0.780032,0.811237,0.807782,0.015071,3
6,93.786086,0.76714,0.032558,0.000955,8,50,"{'model__max_depth': 8, 'model__n_estimators':...",0.8213,0.812602,0.808815,0.78102,0.806524,0.806052,0.013489,4
7,202.546509,0.697256,0.041576,0.000632,8,100,"{'model__max_depth': 8, 'model__n_estimators':...",0.825275,0.815836,0.809512,0.784025,0.809474,0.808824,0.01368,2
8,295.853334,19.945219,0.041941,0.00658,8,150,"{'model__max_depth': 8, 'model__n_estimators':...",0.826283,0.816615,0.808535,0.785856,0.809489,0.809356,0.013358,1


In [17]:
# Tabulate the per-candidate cross-validation results of the random-forest
# search, save them to "RandomForestRegressor.csv", and display the table.
df = pd.DataFrame(models["RandomForestRegressor"].cv_results_)
df.to_csv("RandomForestRegressor.csv")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,34.15316,0.252268,0.027616,0.00227,4.0,50,"{'model__max_depth': 4, 'model__n_estimators':...",0.541868,0.498536,0.532178,0.498986,0.511606,0.516635,0.017565,12
1,67.942859,0.287103,0.033145,0.002024,4.0,100,"{'model__max_depth': 4, 'model__n_estimators':...",0.541919,0.499342,0.534423,0.498167,0.511671,0.517105,0.017996,11
2,102.135229,0.40276,0.036365,0.001387,4.0,150,"{'model__max_depth': 4, 'model__n_estimators':...",0.541591,0.500656,0.534694,0.500264,0.513089,0.518059,0.017174,10
3,50.157058,0.336967,0.026386,0.001088,6.0,50,"{'model__max_depth': 6, 'model__n_estimators':...",0.624369,0.598066,0.638092,0.595173,0.607387,0.612617,0.016312,8
4,100.250093,0.489691,0.034009,0.000638,6.0,100,"{'model__max_depth': 6, 'model__n_estimators':...",0.624894,0.597378,0.636771,0.593201,0.609436,0.612336,0.01645,9
5,149.471696,0.587328,0.042181,0.000372,6.0,150,"{'model__max_depth': 6, 'model__n_estimators':...",0.626845,0.598343,0.637497,0.592197,0.609761,0.612929,0.017028,7
6,64.125926,0.237221,0.028869,0.000455,8.0,50,"{'model__max_depth': 8, 'model__n_estimators':...",0.690941,0.66976,0.704055,0.670033,0.679074,0.682773,0.013153,5
7,128.197812,0.306128,0.041595,0.000748,8.0,100,"{'model__max_depth': 8, 'model__n_estimators':...",0.691143,0.670439,0.704133,0.668502,0.679289,0.682701,0.013373,6
8,192.188969,0.411613,0.052409,0.002175,8.0,150,"{'model__max_depth': 8, 'model__n_estimators':...",0.693599,0.671084,0.705705,0.668045,0.679477,0.683582,0.014172,4
9,157.689254,2.339161,0.085481,0.002341,,50,"{'model__max_depth': None, 'model__n_estimator...",0.814747,0.81325,0.815844,0.781531,0.809861,0.807047,0.012916,3
