 # IRP-ResidualMLP Experimentation Notebook
 
 This notebook imports all logic from the `src/irp_refiner` package.

In [None]:
# Put the Kaggle input directory below on the module search path so that
# packages stored inside it can be imported by the cells that follow.
# NOTE(review): presumably this directory contains the `src` package imported
# later in the notebook, and "3" is the attached version -- confirm both.
import sys
sys.path.append("/kaggle/input/aml-irp/pytorch/default/3/AML-Competition-Notebook")


In [None]:
# %%
# Add src directory to Python path
import sys
import os
import gc
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# `__file__` is not defined inside a notebook, so the project root is located
# relative to the working directory instead: this appends the absolute path of
# the working directory's parent, which is the project root when the notebook
# is run from a direct sub-directory such as 'notebooks/'.
sys.path.append(os.path.abspath(os.pardir))

# Create the checkpoint directory through the standard library rather than an
# IPython shell escape, so the cell is plain Python. `exist_ok=True` makes
# re-running the cell a no-op, like `mkdir -p`.
os.makedirs("/kaggle/working/checkpoints", exist_ok=True)
# %%
# Import our modules
from src.irp_refiner import config
from src.irp_refiner.utils import set_seed
from src.irp_refiner.data_processing import load_and_clean_data
from src.irp_refiner.models.irp import IRPTranslator
from src.irp_refiner.models.mlp import ResidualMLP
from src.irp_refiner.training import train_model
from src.irp_refiner.evaluation import evaluate_retrieval
from src.irp_refiner.baseline_utils import load_data, prepare_train_data

In [None]:
# %%
# --- 1. Configuration & Setup ---
# Seed from the project config. The value returned by `set_seed` is kept
# because it is later passed to the training DataLoader as `worker_init_fn`
# (presumably a per-worker seeding hook -- confirm in `utils.set_seed`).
worker_init_fn = set_seed(config.SEED)
# Device comes from the project config; the model and the evaluation batches
# are moved onto it in later cells.
DEVICE = config.DEVICE
print(f"Using device: {DEVICE}")

In [None]:
# %%
# --- 2. Load and Clean Data ---
# We can load the full, clean dataset once
# `load_and_clean_data` receives the training-data path and the noise
# threshold from the project config and returns an (X, Y) pair.
# NOTE(review): the names suggest NumPy arrays with samples filtered according
# to NOISE_THRESHOLD -- confirm in `data_processing.load_and_clean_data`.
X_train_np_cleaned, Y_train_np_cleaned = load_and_clean_data(
    config.TRAIN_DATA_PATH, config.NOISE_THRESHOLD
)


In [None]:
# %%
# --- 3. Create a single Train/Val split for this experiment ---
# Split the cleaned (X, Y) pairs into train and validation parts. The
# validation size and the random state both come from the project config.
X_train_np, X_val_np, Y_train_np, Y_val_np = train_test_split(
    X_train_np_cleaned,
    Y_train_np_cleaned,
    random_state=config.SEED,
    test_size=config.VAL_SIZE,
)

print(f"Training samples: {len(X_train_np)}")
print(f"Validation samples: {len(X_val_np)}")

# Drop the names of the un-split arrays, then ask the garbage collector to
# reclaim whatever is no longer referenced.
del X_train_np_cleaned
del Y_train_np_cleaned
gc.collect()


In [None]:
# %%
# --- 4. Fit IRP Translator (on training data only) ---
from sklearn.preprocessing import StandardScaler
import joblib

# Draw K_ANCHORS distinct row indices from the training split; the same rows
# of X and Y are used as the paired anchors.
anchor_indices = np.random.choice(
    len(X_train_np), size=config.K_ANCHORS, replace=False
)
X_anchor, Y_anchor = X_train_np[anchor_indices], Y_train_np[anchor_indices]

# One scaler per space, each fitted on its anchor rows only.
scaler_X = StandardScaler()
scaler_X.fit(X_anchor)
scaler_Y = StandardScaler()
scaler_Y.fit(Y_anchor)

# The translator gets both fitted scalers plus its hyper-parameters from the
# project config, and is then fitted on the anchor pairs.
irp_translator = IRPTranslator(
    scaler_X,
    scaler_Y,
    omega=config.IRP_OMEGA,
    delta=config.IRP_DELTA,
    ridge=config.IRP_RIDGE,
    verbose=True,
)
irp_translator.fit(X_anchor, Y_anchor)

# Persist the fitted translator under the configured checkpoint location.
# NOTE(review): the file name is appended directly to CHECKPOINT_DIR, so the
# config value presumably ends with a path separator -- confirm.
irp_path = f"{config.CHECKPOINT_DIR}irp_translator_notebook.pkl"
joblib.dump(irp_translator, irp_path)
print(f"IRP Translator saved to {irp_path}")


In [None]:
# %%
# --- 5. Transform Data & Create DataLoaders ---
print("Transforming data with IRP...")
X_train_IRP = torch.from_numpy(irp_translator.translate(X_train_np)).float()
X_val_IRP = torch.from_numpy(irp_translator.translate(X_val_np)).float()

train_dataset = TensorDataset(X_train_IRP, torch.from_numpy(Y_train_np).float())
val_dataset = TensorDataset(X_val_IRP, torch.from_numpy(Y_val_np).float())

train_loader = DataLoader(
    train_dataset, 
    batch_size=config.BATCH_SIZE, 
    shuffle=True, 
    worker_init_fn=worker_init_fn
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=config.BATCH_SIZE, 
    shuffle=False
)

print("DataLoaders are ready.")


In [None]:
# %%
# --- 6. Define Model ---
# Every architecture hyper-parameter is read from the project config.
model = ResidualMLP(
    input_dim=config.D_X,
    output_dim=config.D_Y,
    hidden_dim=config.HIDDEN_DIM,
    num_hidden_layers=config.NUM_HIDDEN_LAYERS,
    dropout_p=config.DROPOUT_P,
)
model = model.to(DEVICE)

# Total number of elements across all parameter tensors.
total_params = sum(param.numel() for param in model.parameters())
print(f"Model parameters: {total_params:,}")


In [None]:
# %%
# --- 7. Run Training ---
# NOTE(review): the file name is appended directly to CHECKPOINT_DIR, so the
# config value presumably ends with a path separator -- confirm.
model_path = f"{config.CHECKPOINT_DIR}mlp_notebook.pth"

# Keyword options for the training run, all taken from the project config
# except `resume`.
training_options = {
    "epochs": config.EPOCHS,
    "lr": config.LR,
    "save_path": model_path,
    "patience": config.EARLY_STOP_PATIENCE,
    "min_delta": config.MIN_IMPROVEMENT_DELTA,
    "resume": True,  # Set to False to force re-training
}

best_loss = train_model(model, train_loader, val_loader, DEVICE, **training_options)

In [None]:
# %%
# --- 8. Evaluate Model ---
print("Loading best model for evaluation...")
# `map_location` remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can still be loaded in a CPU-only session.
checkpoint = torch.load(model_path, map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # switch to evaluation mode before predicting

# We already have X_val_IRP and Y_val_np
# One index per validation row, in order.
# NOTE(review): presumably row i of Y_val_np is the ground truth for
# prediction i -- confirm against `evaluate_retrieval`.
gt_indices_val = np.arange(len(Y_val_np))

# Get predictions: batched forward passes without gradient tracking, each
# result moved back to the CPU before the batches are concatenated.
eval_loader = DataLoader(TensorDataset(X_val_IRP), batch_size=config.BATCH_SIZE * 2)
with torch.no_grad():
    all_preds = [model(batch_X.to(DEVICE)).cpu() for (batch_X,) in eval_loader]

translated_embd_preds = torch.cat(all_preds, dim=0)

# Run evaluation
results = evaluate_retrieval(
    translated_embd_preds,
    Y_val_np,
    gt_indices_val,
    batch_size=config.BATCH_SIZE
)

# A real newline here; the escaped form printed the two characters "\n".
print("\n--- EVALUATION ---")
for metric, value in results.items():
    # Metrics with 'recall' in their name are shown as percentages, the rest
    # as plain floats.
    if 'recall' in metric:
        print(f"  {metric}: {value:.2%}")
    else:
        print(f"  {metric}: {value:.4f}")
print("------------------------------------------")