In [19]:
from sklearn.linear_model import (
    LinearRegression,
    RidgeCV,
    BayesianRidge,
    Ridge,
    ElasticNetCV,
    ElasticNet,
    Lasso,
    LassoCV,
    LarsCV,
    SGDRegressor
)
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import cross_val_score, LeaveOneOut, KFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pathlib import Path

INPUT_PATH = Path("/mnt/storage_dimm2/kaggle_data/commonlitreadabilityprize")
OUTPUT_PATH = Path("/mnt/storage_dimm2/kaggle_output/commonlitreadabilityprize")

In [69]:
# model_folders = [
#     # complex-heron-of-science - roberta-base
#     "20210609-171109",
#     "20210609-174639",
#     "20210609-182121",
#     "20210609-192843",
#     "20210609-200242",
#     # impetuous-marvellous-cockle - roberta-large
#     "20210608-233655",
#     "20210609-004922",
#     "20210609-020213",
#     "20210609-205046",
#     "20210609-220344",
#     # zippy-caped-leech - albert-large
#     "20210609-125306",
#     "20210609-141352",
#     "20210609-154233",
#     "20210610-000227",
#     "20210610-013100",
#     # armored-cobalt-crow - distill roberta
#     "20210610-074205",
#     "20210610-080716",
#     "20210610-083206",
#     "20210610-085718",
#     "20210610-093912",
#     # big-slug-of-tranquility - funnel transformer
#     "20210610-100607",
#     "20210610-111551",
#     "20210610-122301",
#     "20210610-133140",
#     "20210610-144044",
#     # nocturnal-winged-lionfish - bert-base-uncased
#     "20210610-184414",
#     "20210610-191826",
#     "20210610-195230",
#     "20210610-202640",
#     # adaptable-scallop-of-anger - roberta-base (hidden)
#     "20210610-222705",
#     "20210610-230256",
#     "20210610-233847",
#     "20210611-001430",
#     "20210611-005055",
#     # fluffy-dandelion-skua - roberta-large (hidden)
#     "20210611-012655",
#     "20210611-024254",
#     "20210611-035830",
#     "20210611-051501",
#     "20210611-063430",
#     # gregarious-classic-yak - albert-large (hidden)
#     "20210611-080150",
#     "20210611-092625",
#     "20210611-105105",
#     "20210611-121556",
#     "20210611-134040",
#     # serious-outrageous-caribou - deberta-base
#     "20210611-151313",
#     "20210611-155942",  # This seed seems amazing
#     "20210611-164611",
#     "20210611-173302",
#     "20210611-182030",
#     # ostrich-of-abstract-art - deberta-large
#     "20210611-191919",
#     "20210611-212749",
#     "20210611-233646",
#     "20210612-014457",
#     "20210612-035134",
# ]

model_folders = [
    # cherubic-nifty-serval - deberta-large
    "20210614-173633",
    "20210614-203831",
    "20210614-234025",
    "20210615-024138",
    "20210615-054256",
    # scrupulous-mink-of-amplitude - deberta-base
    "20210615-084357",
    "20210615-094729",
    "20210615-105329",
    "20210615-120001",
    "20210615-130640",
    # hilarious-daffodil-turtle - funnel (failed), but keep them in just in case
    "20210615-141341",
    "20210615-154843",
    "20210615-172313",
    "20210615-190040",
    # notorious-sticky-gibbon - roberta-base (with hidden)
    "20210615-220146",
    "20210615-225055",
    "20210615-234038",
    "20210616-003038",
    "20210616-012048",
    # fortunate-cherry-mandrill - roberta-large
    "20210616-021135",
    "20210616-041221",
    "20210616-060255",
    "20210616-075451",
    "20210616-094506",
    # fortunate-cherry-mandrill - funnel (no swa) (failed)
    "20210616-093026",
    "20210616-110644",
    "20210616-124320",
    "20210616-142115",
    # mottled-certain-caracal - distilroberta-base
    "20210616-113626",
    "20210616-121203",
    "20210616-124738",
    "20210616-132341",
    "20210616-140300",
    
    # proud-smilodon-from-mars - funnel low LR
#     "20210616-230232",
#     "20210617-004504",
#     "20210617-022543",
#     "20210617-040816",
#     "20210617-055118",
    # gorgeous-rapid-camel - Albert low LR
    "20210616-230221",
#     "20210617-010530",
#     "20210617-030914",
#     "20210617-051322",
#     "20210617-071738",
]

dataset_paths = [OUTPUT_PATH / f for f in model_folders]

In [70]:
mpaths, oof_paths = [], []
for p in dataset_paths:
    mpaths.append(sorted(list(p.glob(f"*/*/*.ckpt"))))
    oof_paths.extend(sorted(list(p.glob(f"*.csv"))))
    
len(model_folders), len(oof_paths)

(39, 37)

In [71]:
oofs = pd.read_csv(INPUT_PATH / "train.csv", usecols=["id", "target", "standard_error"]).sort_values(
        by="id"
    )
for i, (p, f) in enumerate(zip(oof_paths, model_folders)):
    x = pd.read_csv(p).sort_values(by="id")
    oofs[f] = x["prediction"].values

# pred_cols = [f"model_{i}" for i in range(len(mpaths))]
pred_cols = model_folders
    
# oofs.head()

In [72]:
reg = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50, 100, 500, 1000))
reg.fit(oofs[pred_cols], oofs["target"])
print(f"Best score: {np.sqrt(-reg.best_score_):0.5f}. Alpha {reg.alpha_}")
reg.coef_

KeyError: "['20210617-055118', '20210616-230221'] not in index"

In [None]:
corr = oofs[pred_cols].corr()

mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=0.9, vmax=1.0, # center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
oofs.describe()

In [None]:
scores = cross_val_score(
#     LinearRegression(),
    Ridge(alpha=100),
#     BaggingRegressor(n_estimators=100),
    oofs[pred_cols],
    oofs["target"],
    scoring="neg_mean_squared_error",
#     cv=LeaveOneOut(),
    n_jobs=-1,
)
print(f"Best score: {np.sqrt(np.mean(-scores)):0.5f}")

In [57]:
# # reg = LassoCV(max_iter=5000, cv=LeaveOneOut(), n_jobs=-1)  # 0.45689
# reg = ElasticNetCV(l1_ratio=[0.01, 0.05, .1, .5, .7, .9, .95, .99, 1], max_iter=5000, n_jobs=-1, tol=1e-5)  # 0.45619

# reg.fit(oofs[pred_cols], oofs["target"])
# # print(f"Best score: {np.sqrt(reg.mse_path_[-1].mean()):0.5f}")
# print(reg.l1_ratio_)

# np.sqrt(reg.mse_path_[1, -1].mean())

# Brute force model selection

In [58]:
# def powerset(iterable):
#     "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
#     s = list(iterable)  # allows duplicate elements
#     return itertools.chain.from_iterable(itertools.combinations(s, r) for r in range(len(s)+1))

# subsets = list(powerset(pred_cols))
# len(subsets)

In [59]:
# best_score = 10
# best_cols = None

# for s in tqdm(subsets[1:]):
#     reg = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50, 100, 500, 1000))
#     reg.fit(oofs[list(s)], oofs["target"])
#     score = np.sqrt(-reg.best_score_)
    
#     if score < best_score:
#         best_score = score
#         best_cols = list(s)
        
# print(f"Best score: {best_score:0.5f}")
# print("Best cols", best_cols)

In [60]:
# ['model_1', 'model_2', 'model_4', 'model_5', 'model_6', 'model_7', 'model_8', 'model_9', 'model_11', 'model_13', 'model_16']

# Hill climbing

In [61]:
def get_score(X, y):
    reg = RidgeCV(alphas=(0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50, 100, 500, 1000))
    reg.fit(X, y)
    return np.sqrt(-reg.best_score_)

In [62]:
candidates = model_folders.copy()

best_score = np.inf
selection = []
# tol = 0.00001
tol = 0
y = oofs["target"]

# Find best initial model
initial_scores = [np.sqrt(mean_squared_error(oofs[c], oofs["target"])) for c in candidates]
idx = np.argmin(initial_scores)
best_score = initial_scores[idx]
selection.append(candidates[idx])
print(f"Initial {candidates[idx]}. Score {best_score:0.5f}")
del candidates[idx]


while True:
    scores = [get_score(oofs[selection + [c]], y) for c in candidates]
    idx = np.argmin(scores)
    if scores[idx] < best_score - tol:
        best_score = scores[idx]
        selection.append(candidates[idx])
        print(f"Added {candidates[idx]}. New best score {best_score:0.5f}")
        del candidates[idx]
    else:
        break
        
print(len(selection), "models selected")
        
selection

KeyError: '20210617-051322'

In [39]:
selection = [
#     # complex-heron-of-science - roberta-base
#     "20210609-171109",  # deleted
#     "20210609-174639",
    "20210609-182121",
#     "20210609-192843",  # deleted
    "20210609-200242",
#     # impetuous-marvellous-cockle - roberta-large
#     "20210608-233655",  # deleted
#     "20210609-004922",  # deleted
    "20210609-020213",
#     "20210609-205046",  # deleted
#     "20210609-220344",  # deleted
#     # zippy-caped-leech - albert-large
#     "20210609-125306",  # deleted
#     "20210609-141352",  # deleted
#     "20210609-154233",
    "20210610-000227",
    "20210610-013100",
#     # armored-cobalt-crow - distill roberta
    "20210610-074205",
    "20210610-080716",
#     "20210610-083206",  # deleted
    "20210610-085718",
#     "20210610-093912",  # deleted
#     # big-slug-of-tranquility - funnel transformer
    "20210610-100607",
#     "20210610-111551",  # deleted
#     "20210610-122301",  # deleted
    "20210610-133140",
    "20210610-144044",
#     # nocturnal-winged-lionfish - bert-base-uncased
#     "20210610-184414",
#     "20210610-191826",
#     "20210610-195230",
#     "20210610-202640",
#     # adaptable-scallop-of-anger - roberta-base (hidden)
    "20210610-222705",
#     "20210610-230256",
    "20210610-233847",
    "20210611-001430",
#     "20210611-005055",
#     # fluffy-dandelion-skua - roberta-large (hidden)
    "20210611-012655",
    "20210611-024254",
#     "20210611-035830",
#     "20210611-051501",
#     "20210611-063430",
#     # gregarious-classic-yak - albert-large (hidden)
#     "20210611-080150",
#     "20210611-092625",
#     "20210611-105105",
#     "20210611-121556",
#     "20210611-134040",
#     # serious-outrageous-caribou - deberta-base
#     "20210611-151313",
    "20210611-155942",  # This seed seems amazing
    "20210611-164611",
#     "20210611-173302",
#     "20210611-182030",
#     # ostrich-of-abstract-art - deberta-large
#     "20210611-191919",
    "20210611-212749",
    "20210611-233646",
#     "20210612-014457",
#     "20210612-035134",
]