In [1]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import random
import itertools

# Device string used for the model and every batch: "cuda" when torch reports
# an available GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(" Running on:", DEVICE)

# Competition input files: train/test rows (JSON), the list of metric names
# (JSON) and the metric-name embedding table (.npy).
TRAIN_PATH = "/kaggle/input/da5401-2025-data-challenge/train_data.json"
TEST_PATH = "/kaggle/input/da5401-2025-data-challenge/test_data.json"
METRIC_PATH = "/kaggle/input/da5401-2025-data-challenge/metric_names.json"
METRIC_EMB_PATH = "/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy"

# Precomputed text embeddings for the train and test rows (.npy).
# NOTE(review): presumably produced by a Gemma model, judging by the file
# names only — confirm against the notebook that generated them.
GEMMA_TRAIN_EMB = "/kaggle/input/embeddings-1/gemma_train_emb.npy"
GEMMA_TEST_EMB = "/kaggle/input/embeddings-1/gemma_test_emb.npy"


def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    The same ``seed`` is applied to ``random``, ``numpy.random`` and
    ``torch``; when CUDA is available, every CUDA device is seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Fix all RNG seeds before any data handling so the run is repeatable.
set_seed(42)

# Decode the JSON files explicitly as UTF-8 rather than relying on the
# platform's default text encoding, which differs between systems.
print(" Loading JSON data...")
with open(TRAIN_PATH, "r", encoding="utf-8") as f:
    train_json = json.load(f)
with open(TEST_PATH, "r", encoding="utf-8") as f:
    test_json = json.load(f)

print(" Loading metric names and embeddings...")
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open(METRIC_PATH, "r", encoding="utf-8") as f:
    metric_names_raw = json.load(f)
metric_name_emb_table = np.load(METRIC_EMB_PATH)

# Build metric_name_to_idx: metric name -> position in metric_names.json.
# That position is later used as the row index into metric_name_emb_table.
# Supported layouts:
#   * a list of strings
#   * a list of dicts carrying the name under "metric_name" or "name"
# Anything else (including an empty list) is rejected up front so the
# failure is a clear ValueError instead of an IndexError here or a
# NameError further down.
if not isinstance(metric_names_raw, list) or not metric_names_raw:
    raise ValueError("Unexpected format in metric_names.json")

first_metric_entry = metric_names_raw[0]
if isinstance(first_metric_entry, str):
    metric_name_to_idx = {name: i for i, name in enumerate(metric_names_raw)}
elif isinstance(first_metric_entry, dict):
    # Prefer "metric_name", fall back to "name" (same precedence as before).
    for name_key in ("metric_name", "name"):
        if name_key in first_metric_entry:
            metric_name_to_idx = {
                d[name_key]: i for i, d in enumerate(metric_names_raw)
            }
            break
    else:
        raise ValueError("Do not know how to read metric_names.json dict format.")
else:
    raise ValueError("Unexpected format in metric_names.json")

# Detect which key holds the metric name by inspecting the first train row;
# "metric_name" takes precedence over "metric".
sample_train_row = train_json[0]
for metric_field in ("metric_name", "metric"):
    if metric_field in sample_train_row:
        break
else:
    raise ValueError("Train JSON rows do not have 'metric_name' or 'metric' key.")

# For every data row, look up the embedding-table row of its metric so the
# resulting matrices are aligned one-to-one with train_json / test_json.
print(" Building metric embedding matrices aligned with rows...")
train_metric_ids = [metric_name_to_idx[row[metric_field]] for row in train_json]
train_metric_embs = np.stack(
    [metric_name_emb_table[table_row] for table_row in train_metric_ids]
).astype(np.float32)

test_metric_ids = [metric_name_to_idx[row[metric_field]] for row in test_json]
test_metric_embs = np.stack(
    [metric_name_emb_table[table_row] for table_row in test_metric_ids]
).astype(np.float32)

print(" Loading text (Gemma) embeddings...")
train_text_embs = np.load(GEMMA_TRAIN_EMB).astype(np.float32)
test_text_embs = np.load(GEMMA_TEST_EMB).astype(np.float32)

# Metric and text embeddings are paired by row index, so both splits must
# have matching row counts. Checking the test split as well avoids a late
# broadcasting error (or a silent broadcast) when features are built.
if len(train_metric_embs) != len(train_text_embs):
    raise ValueError(
        f"Train metric emb len {len(train_metric_embs)} "
        f"!= train text emb len {len(train_text_embs)}"
    )
if len(test_metric_embs) != len(test_text_embs):
    raise ValueError(
        f"Test metric emb len {len(test_metric_embs)} "
        f"!= test text emb len {len(test_text_embs)}"
    )

# Regression targets: the "score" field of every train row, as float32.
y_train = pd.DataFrame(train_json)["score"].values.astype(np.float32)

# Build three sets of synthetic "negative" rows from the real training rows
# and append them to the real data. The rng calls below are order-sensitive:
# reordering them changes every random draw.
print(" Performing data augmentation...")
rng = np.random.default_rng(42)
N = len(train_metric_embs)

# Negative set 1: original metric embeddings paired with shuffled texts.
# NOTE(review): a random permutation can leave some rows in place, so a few
# of these pairs may equal the original pair — confirm this is acceptable.
perm = rng.permutation(N)
neg_m1, neg_t1 = train_metric_embs, train_text_embs[perm]

# Negative set 2: original metric embeddings paired with the matching text
# embeddings perturbed by zero-mean Gaussian noise (std 0.6).
noise = rng.normal(scale=0.6, size=train_text_embs.shape).astype(np.float32)
neg_m2, neg_t2 = train_metric_embs, train_text_embs + noise

# Negative set 3: shuffled metric embeddings paired with the original texts.
perm2 = rng.permutation(N)
neg_m3, neg_t3 = train_metric_embs[perm2], train_text_embs

# Labels for all 3 * N synthetic rows: integers drawn uniformly from
# {0, 1, 2, 3} (upper bound 4 is exclusive), independent of the row content.
neg_y = rng.integers(0, 4, size=N * 3).astype(np.float32)

# Real rows come first (indices [0, N)), followed by the three negative sets
# in order; labels are concatenated in the same order.
m_all = np.vstack([train_metric_embs, neg_m1, neg_m2, neg_m3])
t_all = np.vstack([train_text_embs,   neg_t1, neg_t2, neg_t3])
y_all = np.concatenate([y_train,      neg_y])

print(" Final training set shapes ->",
      "metrics:", m_all.shape,
      "| texts:", t_all.shape,
      "| labels:", y_all.shape)


def build_features(metric_emb, text_emb):
    """Combine paired metric/text embeddings into one float32 feature matrix.

    Both inputs are 2-D arrays with one row per sample. The returned matrix
    stacks, column-wise and in this order: the metric embedding, the text
    embedding, their element-wise absolute difference, their element-wise
    product, and a single cosine-similarity column.
    """

    def _unit_rows(mat):
        # Row-wise L2 normalisation; the epsilon guards against zero rows.
        return mat / (np.linalg.norm(mat, axis=1, keepdims=True) + 1e-9)

    cosine = (_unit_rows(metric_emb) * _unit_rows(text_emb)).sum(axis=1, keepdims=True)
    columns = (
        metric_emb,
        text_emb,
        np.abs(metric_emb - text_emb),
        metric_emb * text_emb,
        cosine,
    )
    return np.concatenate(columns, axis=1).astype(np.float32)


# Turn the (metric, text) embedding pairs of both splits into model inputs.
print(" Building feature matrices...")
X_train_full, X_test_full = (
    build_features(metric_part, text_part)
    for metric_part, text_part in (
        (m_all, t_all),
        (test_metric_embs, test_text_embs),
    )
)

print("X_train_full:", X_train_full.shape, "| X_test_full:", X_test_full.shape)


class DynamicMLP(nn.Module):
    """Fully connected regressor with a configurable stack of hidden layers.

    Every hidden layer is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; the
    stack is followed by a final ``Linear`` layer with a single output unit.

    Args:
        input_dim: Number of input features per sample.
        hidden_layers: Widths of the hidden layers, in order. Any iterable
            of ints is accepted (list or tuple).
        dropout: Dropout probability applied after every hidden layer.
    """

    # The default is an immutable tuple: a list default would be a single
    # object shared by every call that omits the argument.
    def __init__(self, input_dim, hidden_layers=(1024, 512, 128), dropout=0.2):
        super().__init__()
        blocks = []
        in_features = input_dim
        for width in hidden_layers:
            blocks += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = width
        blocks.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Run the network and drop the trailing size-1 output dimension."""
        return self.net(x).squeeze(-1)


class TabularDataset(Dataset):
    """Map-style dataset over two parallel, index-aligned containers."""

    def __init__(self, X, y):
        # Keep references only; nothing is copied or converted here.
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair stored at position ``i``."""
        sample = self.X[i]
        target = self.y[i]
        return sample, target


def run_training(config, X, y, n_folds=3):
    """Cross-validate one hyperparameter configuration; return its mean RMSE.

    For every fold a fresh ``DynamicMLP`` is trained with AdamW and MSE loss.
    After each epoch the validation RMSE is computed; it feeds the
    ``ReduceLROnPlateau`` scheduler and drives early stopping, which ends the
    fold after 4 consecutive epochs without a new best RMSE.

    Args:
        config: Dict with the keys "batch_size", "layers", "dropout", "lr",
            "wd" and "epochs".
        X: Feature matrix, indexable by an array of row indices.
        y: Targets aligned row-by-row with ``X``.
        n_folds: Number of shuffled K-fold splits (split seed is fixed to 42).

    Returns:
        The mean, over folds, of each fold's best validation RMSE.
    """

    def _validation_rmse(net, loader, targets):
        # Score the network on the held-out fold without tracking gradients.
        net.eval()
        outputs = []
        with torch.no_grad():
            for features, _ in loader:
                outputs.append(net(features.to(DEVICE)).cpu().numpy())
        return np.sqrt(mean_squared_error(targets, np.concatenate(outputs)))

    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(splitter.split(X)):
        print(f" Fold {fold+1}/{n_folds} for this trial")
        valid_targets = y[valid_idx]

        fit_loader = DataLoader(
            TabularDataset(X[train_idx], y[train_idx]),
            batch_size=config["batch_size"],
            shuffle=True,
        )
        valid_loader = DataLoader(
            TabularDataset(X[valid_idx], valid_targets),
            batch_size=config["batch_size"],
            shuffle=False,
        )

        net = DynamicMLP(X.shape[1], config["layers"], config["dropout"]).to(DEVICE)
        optimizer = optim.AdamW(
            net.parameters(),
            lr=config["lr"],
            weight_decay=config["wd"],
        )
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", factor=0.5, patience=2
        )
        loss_fn = nn.MSELoss()

        best = float("inf")
        stale_epochs = 0

        for _ in range(config["epochs"]):
            net.train()
            for features, targets in fit_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)
                optimizer.zero_grad()
                batch_loss = loss_fn(net(features), targets)
                batch_loss.backward()
                optimizer.step()

            rmse = _validation_rmse(net, valid_loader, valid_targets)
            scheduler.step(rmse)

            # Early stopping: reset the counter on improvement, otherwise
            # give up after 4 non-improving epochs in a row.
            if rmse < best:
                best, stale_epochs = rmse, 0
                continue
            stale_epochs += 1
            if stale_epochs >= 4:
                break

        fold_scores.append(best)
        print(f"    RMSE for this fold: {best:.4f}")

    return np.mean(fold_scores)


# Random search: sample up to 10 configurations from the full grid and keep
# the one with the lowest mean cross-validated RMSE.
print(" Starting hyperparameter search...")

param_grid = {
    "layers": [
        [1024, 512, 128],
        [2048, 1024, 256],
        [1024, 1024, 512, 128],
    ],
    "dropout": [0.1, 0.2, 0.3],
    "lr": [1e-3, 5e-4, 1e-4],
    "batch_size": [128, 256],
    "wd": [1e-4, 1e-5],
    "epochs": [15],
}

# Expand the grid into one dict per combination, then subsample it.
keys, values = zip(*param_grid.items())
all_trials = [dict(zip(keys, v)) for v in itertools.product(*values)]
trials = random.sample(all_trials, k=min(10, len(all_trials)))

best_score = float("inf")
best_config = None

for i, cfg in enumerate(trials):
    print("\n====")
    print(f" Trial {i+1}/{len(trials)}")
    print("Config:", cfg)
    try:
        score = run_training(cfg, X_train_full, y_all)
    except Exception as e:
        # Deliberate best effort: a failing configuration (e.g. out of
        # memory) is reported and skipped so the search can continue.
        print(" Trial failed with error:", e)
        continue
    print(f" Trial RMSE: {score:.4f}")
    if score < best_score:
        best_score = score
        best_config = cfg
        print(" New best config so far!")

print("\n Hyperparameter search finished.")
print("Best RMSE:", best_score)
print("Best Config:", best_config)

# Fail here with a clear message rather than later with an opaque TypeError
# when code tries to index into a None configuration.
if best_config is None:
    raise RuntimeError(
        "Hyperparameter search did not produce a usable configuration: "
        "every trial failed or returned a non-improving score."
    )

# Retrain with the selected configuration using K-fold cross-validation.
# Each fold keeps its best-epoch checkpoint, which is then used to produce
# out-of-fold predictions and one set of test predictions.
print("\n Final training with best config...")
best_config["epochs"] = 25

# Single source of truth for the fold count (splitter, accumulator shape and
# progress message all depend on it).
N_FINAL_FOLDS = 5

kf = KFold(n_splits=N_FINAL_FOLDS, shuffle=True, random_state=42)
test_preds_accum = np.zeros((N_FINAL_FOLDS, len(X_test_full)))
oof = np.zeros(len(X_train_full))

# The test loader does not depend on the fold, so it is built once.
test_dl = DataLoader(
    torch.tensor(X_test_full),
    batch_size=best_config["batch_size"],
    shuffle=False,
)

for fold, (tr, va) in enumerate(kf.split(X_train_full)):
    print(f"\n Final Fold {fold+1}/{N_FINAL_FOLDS}")
    X_tr, y_tr = X_train_full[tr], y_all[tr]
    X_val, y_val = X_train_full[va], y_all[va]

    train_dl = DataLoader(
        TabularDataset(X_tr, y_tr),
        batch_size=best_config["batch_size"],
        shuffle=True,
    )
    val_dl = DataLoader(
        TabularDataset(X_val, y_val),
        batch_size=best_config["batch_size"],
        shuffle=False,
    )

    model = DynamicMLP(X_train_full.shape[1], best_config["layers"], best_config["dropout"]).to(DEVICE)
    opt = optim.AdamW(
        model.parameters(),
        lr=best_config["lr"],
        weight_decay=best_config["wd"],
    )
    sch = optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=3)
    crit = nn.MSELoss()
    best_rmse = float("inf")
    checkpoint_path = f"final_model_fold{fold}.pt"

    for epoch in range(best_config["epochs"]):
        model.train()
        for xb, yb in train_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            opt.step()

        model.eval()
        preds = []
        with torch.no_grad():
            for xb, yb in val_dl:
                preds.append(model(xb.to(DEVICE)).cpu().numpy())
        rmse = np.sqrt(mean_squared_error(y_val, np.concatenate(preds)))
        sch.step(rmse)

        # Checkpoint only on improvement so the file always holds the
        # weights of the best validation epoch seen so far.
        if rmse < best_rmse:
            best_rmse = rmse
            torch.save(model.state_dict(), checkpoint_path)

    # Restore the best-epoch weights before producing any predictions.
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.eval()

    # Out-of-fold predictions for the rows held out in this fold.
    preds = []
    with torch.no_grad():
        for xb, yb in val_dl:
            preds.append(model(xb.to(DEVICE)).cpu().numpy())
    oof[va] = np.concatenate(preds)

    # Test predictions of this fold's model; averaged across folds later.
    tpreds = []
    with torch.no_grad():
        for xb in test_dl:
            tpreds.append(model(xb.to(DEVICE)).cpu().numpy())
    test_preds_accum[fold] = np.concatenate(tpreds)

# Fit a one-feature linear map from out-of-fold predictions to the training
# labels, apply it to the fold-averaged test predictions, clip the result to
# [0, 10] and write the submission file.
print("\n Calibrating with simple linear regression on OOF...")
cal = LinearRegression().fit(oof[:, np.newaxis], y_all)

avg = np.mean(test_preds_accum, axis=0)
final = np.clip(cal.predict(avg[:, np.newaxis]), 0, 10)

# IDs are consecutive integers starting at 1, in test-row order.
ids = np.arange(1, len(final) + 1)
sub = pd.DataFrame({"ID": ids, "score": final})
sub.to_csv("submission.csv", index=False)

print("\n Saved submission.csv")

 Running on: cpu
 Loading JSON data...
 Loading metric names and embeddings...
 Building metric embedding matrices aligned with rows...
 Loading text (Gemma) embeddings...
 Performing data augmentation...
 Final training set shapes -> metrics: (20000, 768) | texts: (20000, 768) | labels: (20000,)
 Building feature matrices...
X_train_full: (20000, 3073) | X_test_full: (3638, 3073)
 Starting hyperparameter search...

====
 Trial 1/10
Config: {'layers': [1024, 1024, 512, 128], 'dropout': 0.1, 'lr': 0.0001, 'batch_size': 128, 'wd': 1e-05, 'epochs': 15}
 Fold 1/3 for this trial
    RMSE for this fold: 3.1462
 Fold 2/3 for this trial
    RMSE for this fold: 3.1382
 Fold 3/3 for this trial
    RMSE for this fold: 3.0989
 Trial RMSE: 3.1278
 New best config so far!

====
 Trial 2/10
Config: {'layers': [1024, 512, 128], 'dropout': 0.2, 'lr': 0.001, 'batch_size': 256, 'wd': 0.0001, 'epochs': 15}
 Fold 1/3 for this trial
    RMSE for this fold: 2.9463
 Fold 2/3 for this trial
    RMSE for this f