In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import LeaveOneOut, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from pathlib import Path

INPUT_PATH = Path("/mnt/storage_dimm2/kaggle_data/commonlitreadabilityprize")
OUTPUT_PATH = Path("/mnt/storage_dimm2/kaggle_output/commonlitreadabilityprize")

torch.manual_seed(48)
np.random.seed(48)

In [2]:
model_folders = [
    "20210614-203831",
    "20210615-094729",
    "20210616-003038",
    "20210616-041221",
    "20210616-132341",
    "20210617-135233",
    "20210618-183719",
    "20210618-223208",
    "20210619-004022",
    "20210619-035747",
    "20210619-064351",
    "20210619-093050",
    "20210623-201514",
    "20210623-232231",
    "20210624-012102",
    "20210624-015812",
    "20210624-101855",
    "20210624-044356",
    "20210624-113506",
    "20210624-150250",
    "20210627-105133",
    "20210627-152616",
    "20210627-105144",
    "20210627-151904",
    "20210628-045559",
    "20210628-085322",
    "20210627-213946",
    "20210628-031447",
    "20210628-114738",
    "20210628-145921",
    "20210628-212819",
    "20210629-012726",
    "20210629-035901",
    "20210629-163239",
    "20210705-162253",
    "20210710-124531",
    "20210710-173710",
]

In [3]:
def build_oof_df(folders=model_folders):
    dataset_paths = [OUTPUT_PATH / f for f in folders]
    mpaths, oof_paths = [], []
    for p in dataset_paths:
        mpaths.append(sorted(list(p.glob(f"*/*/*.ckpt"))))
        oof_paths.extend(sorted(list(p.glob(f"*.csv"))))

    oofs = pd.read_csv(
        INPUT_PATH / "train.csv", usecols=["id", "target", "standard_error"]
    ).sort_values(by="id")
    for i, p in enumerate(oof_paths):
        x = pd.read_csv(p).sort_values(by="id")
        oofs[p.parent.name] = x["prediction"].values

    return oofs.reset_index(drop=True)


def create_folds(data, n_splits, random_state=None):
    # we create a new column called fold and fill it with -1
    data["fold"] = -1

    # the next step is to randomize the rows of the data
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # bin targets
    data.loc[:, "bins"] = pd.cut(data["target"], bins=num_bins, labels=False)

    # initiate the kfold class from model_selection module
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # fill the new kfold column
    # note that, instead of targets, we use bins!
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, "fold"] = f

    # drop the bins column
    data = data.drop("bins", axis=1)

    # return dataframe with folds
    return data

In [4]:
oofs = build_oof_df()
oofs = create_folds(oofs, 5, 48)

In [5]:
oofs["std_dev"] = oofs[model_folders].std(1)
oofs["mean"] = oofs[model_folders].mean(1)
oofs["min"] = oofs[model_folders].min(1)
oofs["max"] = oofs[model_folders].max(1)
oofs["range"] = oofs[model_folders].max(1) - oofs[model_folders].min(1)

stat_feats = ["std_dev", "min", "max", "range"]

# Linear baseline

In [6]:
fold_scores = []

for fold in range(5):
    trn_df = oofs.query(f"fold != {fold}")
    val_df = oofs.query(f"fold == {fold}")
#     trn_df = oofs_trfm.query(f"fold != {fold}")
#     val_df = oofs_trfm.query(f"fold == {fold}")

    train_X = torch.tensor(trn_df[model_folders].values, dtype=torch.float32)
    train_y = torch.tensor(trn_df["target"].values, dtype=torch.float32).view(-1, 1)
    valid_X = torch.tensor(val_df[model_folders].values, dtype=torch.float32)
    valid_y = torch.tensor(val_df["target"].values, dtype=torch.float32).view(-1, 1)

    W = torch.linalg.lstsq(train_X, train_y).solution
    y_pred = valid_X @ W
    rmse = torch.sqrt(torch.nn.functional.mse_loss(y_pred, valid_y)).numpy()

    print(f"Fold {fold} RMSE: {rmse:0.5f}")
    fold_scores.append(rmse)

print(f"Mean: {np.mean(fold_scores):0.5f}")

Fold 0 RMSE: 0.45144
Fold 1 RMSE: 0.43806
Fold 2 RMSE: 0.42641
Fold 3 RMSE: 0.46526
Fold 4 RMSE: 0.43669
Mean: 0.44357


# PyTorch

In [7]:
class Net(nn.Module):
    def __init__(self, n_inputs, n_hidden=64):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_inputs, n_hidden, bias=False),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden, bias=False),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        )
#         self.net = nn.Linear(n_inputs, 1, bias=False)

    def forward(self, x):
        return self.net(x)

In [8]:
EPOCHS = 500
fold_scores = []

for fold in range(5):
    trn_df = oofs.query(f"fold != {fold}")
    val_df = oofs.query(f"fold == {fold}")

    train_X = torch.tensor(trn_df[model_folders + stat_feats].values, dtype=torch.float32)
    train_y = torch.tensor(trn_df["target"].values, dtype=torch.float32).view(-1, 1)
    valid_X = torch.tensor(val_df[model_folders + stat_feats].values, dtype=torch.float32)
    valid_y = torch.tensor(val_df["target"].values, dtype=torch.float32).view(-1, 1)

    train_dataset = TensorDataset(train_X, train_y)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    model = Net(train_X.shape[1])
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    val_loss_curve = []

    for epoch in range(EPOCHS):
        # Training
        train_loss = []
        for features, target in train_loader:
            optimizer.zero_grad()
            y_pred = model(features)
            loss = loss_fn(y_pred, target)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.detach().numpy())
        scheduler.step()

        # Validation
        y_pred = model(valid_X)
        loss = loss_fn(y_pred, valid_y)
        valid_loss = loss.detach().numpy()
        val_loss_curve.append(np.sqrt(valid_loss))

#         print(
#             f"Fold: {fold}, Epoch: {epoch}, "
#             f"Train loss: {np.sqrt(np.mean(train_loss)):0.5f}, "
#             f"Valid loss: {np.sqrt(np.mean(valid_loss)):0.5f}",
#         )

    print(
        f"Fold {fold} best loss {np.min(val_loss_curve):0.5f} at epoch {np.argmin(val_loss_curve)}"
    )
    fold_scores.append(np.min(val_loss_curve))

print(f"Mean: {np.mean(fold_scores):0.5f}")

Fold 0 best loss 0.45220 at epoch 11
Fold 1 best loss 0.44303 at epoch 10
Fold 2 best loss 0.42409 at epoch 9
Fold 3 best loss 0.46575 at epoch 12
Fold 4 best loss 0.44276 at epoch 6
Mean: 0.44557


In [9]:
# Mean: 0.44111 - Linear only
# Mean: 0.44423 - 2 layers
# Mean: 0.44539 - 3 layers

# XGBoost

In [10]:
import xgboost as xgb

In [11]:
param = {
    "seed": 48,
    "max_depth": 3,
    "eta": 0.01,  # learning rate
    "gamma": 0.01,
    "objective": "reg:squarederror",
#     "colsample_bytree": 0.5,
#     "colsample_bylevel": 0.5,
#     "lambda": 10,
}


fold_scores = []

for fold in range(5):
    trn_df = oofs.query(f"fold != {fold}")
    val_df = oofs.query(f"fold == {fold}")

    train_dataset = xgb.DMatrix(trn_df[model_folders], label=trn_df["target"])
    valid_dataset = xgb.DMatrix(val_df[model_folders], val_df["target"])

    num_round = 1000
    bst = xgb.train(
        param,
        train_dataset,
        num_round,
        evals=[(train_dataset, "train"), (valid_dataset, "valid")],
        early_stopping_rounds=50,
        verbose_eval=10,
    )
    # make prediction
    y_pred = bst.predict(valid_dataset)

    rmse = np.sqrt(mean_squared_error(y_pred, val_df["target"]))
    print(f"Fold {fold} RMSE: {rmse:0.5f}")
    fold_scores.append(rmse)

print(f"Mean: {np.mean(fold_scores):0.5f}")

[0]	train-rmse:1.77438	valid-rmse:1.76026
[10]	train-rmse:1.61744	valid-rmse:1.60484
[20]	train-rmse:1.47635	valid-rmse:1.46531
[30]	train-rmse:1.34969	valid-rmse:1.33990
[40]	train-rmse:1.23604	valid-rmse:1.22753
[50]	train-rmse:1.13427	valid-rmse:1.12719
[60]	train-rmse:1.04335	valid-rmse:1.03753
[70]	train-rmse:0.96230	valid-rmse:0.95786
[80]	train-rmse:0.89026	valid-rmse:0.88737
[90]	train-rmse:0.82639	valid-rmse:0.82532
[100]	train-rmse:0.76996	valid-rmse:0.77085
[110]	train-rmse:0.72029	valid-rmse:0.72312
[120]	train-rmse:0.67675	valid-rmse:0.68167
[130]	train-rmse:0.63872	valid-rmse:0.64563
[140]	train-rmse:0.60567	valid-rmse:0.61469
[150]	train-rmse:0.57705	valid-rmse:0.58822
[160]	train-rmse:0.55236	valid-rmse:0.56564
[170]	train-rmse:0.53114	valid-rmse:0.54651
[180]	train-rmse:0.51299	valid-rmse:0.53031
[190]	train-rmse:0.49746	valid-rmse:0.51682
[200]	train-rmse:0.48426	valid-rmse:0.50554
[210]	train-rmse:0.47303	valid-rmse:0.49606
[220]	train-rmse:0.46345	valid-rmse:0.48813

[580]	train-rmse:0.39633	valid-rmse:0.43950
[582]	train-rmse:0.39625	valid-rmse:0.43951
Fold 2 RMSE: 0.43951
[0]	train-rmse:1.77094	valid-rmse:1.77404
[10]	train-rmse:1.61425	valid-rmse:1.61710
[20]	train-rmse:1.47340	valid-rmse:1.47571
[30]	train-rmse:1.34677	valid-rmse:1.34945
[40]	train-rmse:1.23319	valid-rmse:1.23634
[50]	train-rmse:1.13152	valid-rmse:1.13540
[60]	train-rmse:1.04064	valid-rmse:1.04564
[70]	train-rmse:0.95961	valid-rmse:0.96584
[80]	train-rmse:0.88757	valid-rmse:0.89523
[90]	train-rmse:0.82370	valid-rmse:0.83316
[100]	train-rmse:0.76723	valid-rmse:0.77882
[110]	train-rmse:0.71751	valid-rmse:0.73134
[120]	train-rmse:0.67391	valid-rmse:0.69005
[130]	train-rmse:0.63582	valid-rmse:0.65453
[140]	train-rmse:0.60270	valid-rmse:0.62390
[150]	train-rmse:0.57401	valid-rmse:0.59782
[160]	train-rmse:0.54928	valid-rmse:0.57569
[170]	train-rmse:0.52804	valid-rmse:0.55712
[180]	train-rmse:0.50987	valid-rmse:0.54156
[190]	train-rmse:0.49434	valid-rmse:0.52863
[200]	train-rmse:0.481

In [12]:
# Mean: 0.45378