In [1]:
import pickle
import pandas as pd
import numpy as np

meta = pd.read_csv('kernels_meta.csv')
    
with open('enc_states_sync_all.p', 'rb') as fp:
    all_comps = pickle.load(fp)

In [2]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

def test(X_train, X_test, y_train, y_test):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', MLPRegressor(
            hidden_layer_sizes = (200, 150, 100, 50), 
            max_iter = 1000, 
            random_state = 14,
            early_stopping = True,
            tol = 1e-7,
            n_iter_no_change = 20,
        ))
    ])

    pipe.fit(X_train, y_train)
    print("R^2:", pipe.score(X_test, y_test))
    print("MSE:", MSE(pipe.predict(X_test), y_test))
    print("MAE:", MAE(pipe.predict(X_test), y_test))

In [3]:
comps_by_loss = []

for _ in range(20):
    comps_by_loss.append([])

for note in all_comps:
    loss_num = np.argmax(note["encoded_sequence"][-20:]).item()
    comps_by_loss[loss_num].append(note)

In [4]:
assert(len(comps_by_loss[1]) == 4926)

In [5]:
def transform_target(target):
    ans = np.zeros(78)
    coef = 1.0
    gamma = 0.9
    
    for snip in target:
        ans[snip] += coef
        coef *= gamma
        if coef < 0.01:
            break

    return ans

In [6]:
from sklearn.model_selection import train_test_split

def test_loss_num(num): 
    if num != -1:
        train_data, test_data = train_test_split(
            comps_by_loss[num],
            test_size = 0.25,
            random_state = 14
        )
        back = 20
        print("Проверка номера", num)
    
    else:
        train_data, test_data = train_test_split(
            all_comps,
            test_size = 0.25,
            random_state = 14
        )
        back = 0
        print("Проверка номера на всех данных")
        
    
    X_train, X_test, y_train, y_test = [], [], [], []

    for note in train_data:
        score = meta[meta.kernel_id == note["kernel_id"]].kaggle_score.iloc[0]

        X_train.append(np.append(
            note["encoded_sequence"][:-back], transform_target(note["target"])
        ))

        y_train.append(score)

    for note in test_data:
        score = meta[meta.kernel_id == note["kernel_id"]].kaggle_score.iloc[0]

        X_test.append(np.append(
            note["encoded_sequence"][:-back], transform_target(note["target"])
        ))

        y_test.append(score)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    test(X_train, X_test, y_train, y_test)
    print('=' * 100)

In [7]:
test_loss_num(-1)

Проверка номера на всех данных
R^2: -0.009122166927861164
MSE: 39149030575656.62
MAE: 600531.0762405951


In [8]:
for i in range(20):
    test_loss_num(i)

Проверка номера 0
R^2: 0.6435435980457294
MSE: 0.02317092018069148
MAE: 0.09347923464041695
Проверка номера 1
R^2: -0.24375926620597554
MSE: 0.054108527091837125
MAE: 0.09988047812691282
Проверка номера 2
R^2: 0.5989017603137994
MSE: 0.012451970053844399
MAE: 0.07325414986575857
Проверка номера 3
R^2: -0.47889933197845425
MSE: 0.17405110279677508
MAE: 0.20379288503364887
Проверка номера 4
R^2: 0.9573884315259129
MSE: 6983536217.850132
MAE: 14560.076821312161
Проверка номера 5
R^2: 0.16330899582005576
MSE: 0.03468645279666179
MAE: 0.11369033042822575
Проверка номера 6
R^2: 0.3931813799118944
MSE: 9.391415324637993
MAE: 1.0234647545001068
Проверка номера 7
R^2: 0.3625756141902001
MSE: 0.040042275498701985
MAE: 0.144805821690168
Проверка номера 8
R^2: -0.02498290161269856
MSE: 5.37189039384626
MAE: 0.8860318392536284
Проверка номера 9
R^2: 0.9090595235613775
MSE: 172851203895.88956
MAE: 67858.17804469004
Проверка номера 10
R^2: -0.006802721088435382
MSE: 1.8244653886248904e+76
MAE: 1.1102