In [1]:
import pickle
import pandas as pd
import numpy as np

meta = pd.read_csv('kernels_meta.csv')
    
with open('enc_states_sync_all.p', 'rb') as fp:
    all_comps = pickle.load(fp)

In [2]:
len(all_comps)

22249

In [3]:
all_comps[0].keys()

dict_keys(['kernel_id', 'encoded_sequence', 'encoded_tokens', 'target', 'classes', 'comp_name', 'weight'])

In [4]:
all_comps[0]["classes"]

[3, 8, 8, 6, 6, 2, 7, 7, 2, 2, 1, 4]

In [5]:
all_comps[0]["target"]

tensor([75, 22, 41, 35, 20, 51,  7, 12,  6, 23, 76])

In [6]:
names = set()

for comp in all_comps:
    names.add(comp["comp_name"])
    
print(len(names))

412


In [7]:
classes = set()

for comp in all_comps:
    classes |= set(comp["classes"])
    
print(len(classes))
print(classes)

12
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

def test(X_train, X_test, y_train, y_test):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', GradientBoostingRegressor(
            random_state = 14,
            n_iter_no_change = 10,
            tol = 1e-6,
            n_estimators = 200,
        ))
    ])

    pipe.fit(X_train, y_train)
    print("R^2:", pipe.score(X_test, y_test))
    print("MSE:", MSE(pipe.predict(X_test), y_test))
    print("MAE:", MAE(pipe.predict(X_test), y_test))

In [9]:
comps_by_loss = []

for _ in range(20):
    comps_by_loss.append([])

for note in all_comps:
    loss_num = np.argmax(note["encoded_sequence"][-20:]).item()
    comps_by_loss[loss_num].append(note)

In [10]:
assert(len(comps_by_loss[1]) == 4926)

In [11]:
def transform_target(target):
    ans = np.zeros(78)
    coef = 1.0
    gamma = 0.9
    
    for snip in target:
        ans[snip] += coef
        coef *= gamma
        if coef < 0.01:
            break

    return ans

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

In [12]:
from sklearn.model_selection import train_test_split

def test_loss_num(num): 
    if num != -1:
        train_data, test_data = train_test_split(
            comps_by_loss[num],
            test_size = 0.25,
            random_state = 14
        )
        back = 20
        print("Проверка номера", num)
    
    else:
        train_data, test_data = train_test_split(
            all_comps,
            test_size = 0.25,
            random_state = 14
        )
        back = 0
        print("Проверка номера на всех данных")
        
    
    X_train, X_test, y_train, y_test = [], [], [], []

    for note in train_data:
        score = meta[meta.kernel_id == note["kernel_id"]].kaggle_score.iloc[0]
        
        X_train.append(np.concatenate([
            note["encoded_sequence"][:-back], transform_target(note["target"]), [1]
        ]))

        y_train.append(sigmoid(score))

    for note in test_data:
        score = meta[meta.kernel_id == note["kernel_id"]].kaggle_score.iloc[0]

        X_test.append(np.concatenate([
            note["encoded_sequence"][:-back], transform_target(note["target"]), [1]
        ]))

        y_test.append(sigmoid(score))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    test(X_train, X_test, y_train, y_test)
    print('=' * 100)

In [13]:
test_loss_num(-1)

Проверка номера на всех данных
R^2: 0.12497980410285625
MSE: 0.015844819745134856
MAE: 0.08254130004136755


In [14]:
for i in range(20):
    test_loss_num(i)

Проверка номера 0
R^2: 0.6526084563521481
MSE: 0.0011834428500894008
MAE: 0.020329706871055196
Проверка номера 1
R^2: 0.43513189601998836
MSE: 0.001260560418729581
MAE: 0.016410443263264497
Проверка номера 2
R^2: 0.6689751214660202
MSE: 0.000515520752227089
MAE: 0.014708632464488334
Проверка номера 3
R^2: 0.5809398518966272
MSE: 0.002680506016459311
MAE: 0.033963015417023186
Проверка номера 4
R^2: 0.8299941617192232
MSE: 0.003881442097419606
MAE: 0.03318259390681577
Проверка номера 5
R^2: 0.17523020767778674
MSE: 0.0018591019714357864
MAE: 0.026318600405517107
Проверка номера 6
R^2: 0.7980455803439191
MSE: 0.005565342989251095
MAE: 0.04237346241909835
Проверка номера 7
R^2: 0.3894773133034244
MSE: 0.002136024888609946
MAE: 0.03382532976621094
Проверка номера 8
R^2: 0.32926758594334393
MSE: 0.00863772519760923
MAE: 0.05910240761487739
Проверка номера 9
R^2: 0.8502947121192279
MSE: 0.005990353970647399
MAE: 0.03536250560019678
Проверка номера 10
R^2: 0.9950698866867251
MSE: 0.00025342375