In [1]:
import pickle
import pandas as pd
import numpy as np

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files from a trusted source.
    """
    with open(path, 'rb') as fp:
        return pickle.load(fp)


# Kernel metadata table; later cells read its kernel_id and kaggle_score columns.
meta = pd.read_csv('kernels_meta.csv')

# Pickled train/test records and the generated sequences before and after RL.
train_data = _load_pickle('train_untrained_bert_tabular.p')
test_data = _load_pickle('test_untrained_bert_tabular.p')
data_before = _load_pickle('generations-before-rl.p')
data_after = _load_pickle('generations-after-rl.p')

In [2]:
# Index the generated sequences by kernel id so they can be looked up per
# test record. If a kernel id occurs more than once, the last record wins.
dict_before = dict(
    (entry["kernel_id"], entry["sequence"]) for entry in data_before
)

dict_after = dict(
    (entry["kernel_id"], entry["sequence"]) for entry in data_after
)

In [3]:
def transform_target(target, n_classes=78, gamma=0.5, min_coef=0.0001):
    """Encode a snippet-id sequence as a position-discounted count vector.

    The first element of ``target`` contributes weight 1.0 to its slot, and
    each following element contributes ``gamma`` times the previous weight.
    Repeated ids accumulate in the same slot. Accumulation stops as soon as
    the *next* weight would fall below ``min_coef``, so with the defaults at
    most the first 14 elements contribute.

    Parameters
    ----------
    target : iterable of int
        Indices into the output vector; each must be a valid index for an
        array of length ``n_classes``.
    n_classes : int, default 78
        Length of the returned vector.
    gamma : float, default 0.5
        Multiplicative decay applied to the weight after every element.
    min_coef : float, default 0.0001
        Threshold below which the remaining elements are ignored.

    Returns
    -------
    numpy.ndarray
        Float vector of shape ``(n_classes,)``.
    """
    ans = np.zeros(n_classes)
    coef = 1.0

    for snip in target:
        ans[snip] += coef
        coef *= gamma
        # Later positions would add less than min_coef each; stop early.
        if coef < min_coef:
            break

    return ans

def sigmoid(x):
    """Squash ``x`` into the open interval (-1, 1) via ``x / (1 + |x|)``.

    Despite the name this is the softsign function, not the logistic
    sigmoid. Works on scalars and numpy arrays alike.
    """
    magnitude = np.abs(x)
    return x / (magnitude + 1.0)

def generate_data_pair(data):
    """Build a feature matrix and target vector from a list of records.

    Each record must provide ``"kernel_id"``, ``"encoded_sequence"`` and
    ``"target"``. A feature row is the encoded sequence, followed by the
    discounted target vector from ``transform_target`` and the target length.
    The regression target is the kernel's ``kaggle_score`` from the global
    ``meta`` table (first matching row) passed through ``sigmoid``.

    Raises ``IndexError`` if a record's kernel id has no row in ``meta``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` with one row / entry per record.
    """
    feature_rows = []
    squashed_scores = []

    for note in data:
        # Linear scan of meta per record; the first matching row supplies the score.
        kernel_rows = meta[meta.kernel_id == note["kernel_id"]]
        score = kernel_rows.kaggle_score.iloc[0]

        target = note["target"]
        feature_rows.append(
            np.concatenate([
                note["encoded_sequence"],
                transform_target(target),
                [len(target)],
            ])
        )
        squashed_scores.append(sigmoid(score))

    return np.array(feature_rows), np.array(squashed_scores)

# Training features and squashed-score targets built from the training records.
X_train, y_train = generate_data_pair(train_data)

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Random forest with unrestricted depth; the seed is fixed for reproducibility.
model = RandomForestRegressor(n_estimators=150, max_depth=None, random_state=14)

# Standardise the features, then fit the forest on the scaled matrix.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", model),
])

pipe.fit(X_train, y_train)

In [5]:
# Inspect the first record loaded from generations-after-rl.p to see its structure.
data_after[0]

{'kernel_id': 8179034, 'sequence': [75, 21, 41, 6, 43, 23, 76]}

In [6]:
# Print the first ten before/after sequence pairs side by side.
# zip() stops at the shortest input, so range(10) caps the preview without a
# hand-maintained counter and break.
# NOTE(review): pairs are matched by position, which assumes data_before and
# data_after list kernels in the same order - confirm.
for _pair_no, bef, aft in zip(range(10), data_before, data_after):
    print(bef["sequence"])
    print(aft["sequence"])
    print("=" * 100)

[75, 21, 41, 6, 43, 23, 76]
[75, 21, 41, 6, 43, 23, 76]
[75, 21, 41, 20, 12, 6, 43, 23, 76]
[75, 21, 41, 20, 43, 23, 76]
[75, 21, 41, 20, 12, 6, 23, 76]
[75, 21, 41, 61, 20, 76]
[75, 22, 41, 10, 7, 9, 19, 9, 19, 9, 20, 12, 3, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43]
[75, 21, 41, 9, 10, 9, 19, 9, 19, 20, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6, 43, 6]
[75, 21, 41, 20, 3, 6, 43, 23, 76]
[75, 21, 41, 15, 20, 43, 23, 76]
[75, 21, 41, 61, 41, 61, 20, 21, 12, 21, 6

In [7]:
# Evaluation set for the before-RL generations: same feature layout as the
# training rows, but the target part comes from the generated sequence.
# The score is looked up by kernel id only, so it does not depend on the
# generated sequence.
X_before, y_before = [], []

for note in test_data:
    kernel_id = note["kernel_id"]
    obj = dict_before.get(kernel_id)

    # Test records without a before-RL generation are skipped.
    if obj is not None:
        matching_rows = meta[meta.kernel_id == kernel_id]
        score = matching_rows.kaggle_score.iloc[0]

        feature_row = np.concatenate([
            note["encoded_sequence"], transform_target(obj), [len(obj)]
        ])
        X_before.append(feature_row)
        y_before.append(sigmoid(score))

X_before = np.array(X_before)
y_before = np.array(y_before)

In [8]:
# Evaluation set for the after-RL generations: same feature layout as the
# training rows, but the target part comes from the generated sequence.
# The score is looked up by kernel id only, so it does not depend on the
# generated sequence.
X_after, y_after = [], []

for note in test_data:
    kernel_id = note["kernel_id"]
    obj = dict_after.get(kernel_id)

    # Test records without an after-RL generation are skipped.
    if obj is not None:
        matching_rows = meta[meta.kernel_id == kernel_id]
        score = matching_rows.kaggle_score.iloc[0]

        feature_row = np.concatenate([
            note["encoded_sequence"], transform_target(obj), [len(obj)]
        ])
        X_after.append(feature_row)
        y_after.append(sigmoid(score))

X_after = np.array(X_after)
y_after = np.array(y_after)

In [9]:
# Pipeline.score delegates to the final regressor's score (R^2 for sklearn
# regressors); here it is evaluated on the before-RL feature rows.
print(pipe.score(X_before, y_before))

0.6893014765427417


In [10]:
# Pipeline.score delegates to the final regressor's score (R^2 for sklearn
# regressors); here it is evaluated on the after-RL feature rows.
print(pipe.score(X_after, y_after))

0.6794429511129919


In [11]:
from sklearn.metrics import mean_squared_error as MSE

# Mean predicted target (still in the squashed x / (1 + |x|) scale) for the before-RL rows.
print(pipe.predict(X_before).mean())

0.4397216771815016


In [12]:
# Mean predicted target (still in the squashed x / (1 + |x|) scale) for the after-RL rows.
print(pipe.predict(X_after).mean())

0.43671685272525906


In [13]:
def antisigmoid(x):
    """Invert ``x / (1 + |x|)``: map a value in (-1, 1) back to the real line.

    Computes ``x / (1 - |x|)``. The result grows without bound as ``|x|``
    approaches 1, so values close to +/-1 dominate any average taken over
    the output.
    """
    headroom = 1.0 - np.abs(x)
    return x / headroom

In [14]:
# Mean absolute un-squashed prediction for the before-RL rows.
arr = [abs(antisigmoid(elem)) for elem in pipe.predict(X_before)]

print(np.mean(arr))

14732.257042041985


In [15]:
# Mean absolute un-squashed prediction for the after-RL rows.
arr = [abs(antisigmoid(elem)) for elem in pipe.predict(X_after)]

print(np.mean(arr))

69888.55992367145


In [16]:
# Mean absolute un-squashed ground-truth score for the before-RL evaluation rows.
arr = [abs(antisigmoid(elem)) for elem in y_before]

print(np.mean(arr))

72136.89316984237


In [17]:
# Mean absolute un-squashed ground-truth score for the after-RL evaluation rows.
arr = [abs(antisigmoid(elem)) for elem in y_after]

print(np.mean(arr))

72136.89316984237
