# Text-to-Image Model Stitching

## Clone repository

In [None]:
# Fetch the project code. The leading "!" is IPython syntax: the line is run as a shell command.
!git clone https://github.com/CristianApost0l/AML-Competition-Notebook.git
import sys
# Make the cloned repository importable so later cells can import from its `src` package.
# NOTE(review): assumes the clone landed in /kaggle/working (i.e. that is the cwd) — confirm.
sys.path.append("/kaggle/working/AML-Competition-Notebook")

## Setup and Import libraries

In [None]:
import os
import gc
import sys
import torch
import joblib
import numpy as np
import pandas as pd
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold

# Checkpoints are not used unless the optional download cell below sets this to True.
CHECKPOINTS_LOAD = False

# Put the parent of the current working directory on the import path.
# (The original expression took dirname of the quoted literal '__file__', which is
# empty, so the path has always been resolved relative to the working directory.)
sys.path.append(os.path.abspath(os.pardir))

## Install Checkpoints for both submissions
Run only if you want to use them!

In [None]:
# Optional cell: downloads the pre-trained checkpoints so the training cells can be skipped.
# Lines starting with "!" are run as shell commands by IPython.
!pip install gdown
# Identifier of the remote folder handed to gdown (presumably a Google Drive folder id — confirm).
folder_id = "1N7KO7zFjJ8PvtwABlRW7Ry5QsBlafTy8"
# `$folder_id` is expanded by IPython to the Python variable above; files go to ./checkpoints.
!gdown --folder $folder_id -O ./checkpoints
# Read by the final submission cell to decide whether to load models from the downloaded files.
CHECKPOINTS_LOAD = True

## Install the requirements

In [None]:
# Install the dependencies listed in the cloned repository's requirements file (shell command via "!").
!pip install -r /kaggle/working/AML-Competition-Notebook/requirements.txt

## K-Fold Ensemble (Submission 2)

### Import modules and setup

In [None]:
from src import config
from src.utils import set_seed, load_direct_ensemble
from src.data_processing import load_and_clean_data
from src.models.irp import IRPTranslator
from src.models.mlp import ResidualMLP
from src.ensembling import EnsembleWrapper
from src.evaluation import evaluate_retrieval
from src.baseline_utils import load_data, generate_submission
from src.training import train_irp_refiner

print("All modules imported successfully.")

# Seed once and keep whatever set_seed returns: it is handed to the training
# DataLoader as `worker_init_fn` further down.
worker_init_fn = set_seed(config.SEED)
DEVICE = config.DEVICE

# Report the run configuration.
for status_line in (
    f"Using device: {DEVICE}",
    f"Kaggle data path: {config.TRAIN_DATA_PATH}",
):
    print(status_line)

### Load, Clean, and Split Data 

Load the dataset from the path `TRAIN_DATA_PATH` specified in `config.py`, split it, and remove noisy captions using `NOISE_THRESHOLD`.

In [None]:
# Load the training pairs and filter them with the configured noise threshold.
# The helper returns the cleaned inputs and targets, in that order.
cleaned_pair = load_and_clean_data(config.TRAIN_DATA_PATH, config.NOISE_THRESHOLD)
X_train_np_cleaned, Y_train_np_cleaned = cleaned_pair

### Training the model

Skip if you want to use the checkpoints!

In [None]:
# K-Fold training: for every fold, fit an IRP translator on a random subset of
# anchor pairs, translate the fold's inputs with it, then train a ResidualMLP
# refiner on the translated inputs. One translator (.pkl) and one model (.pth)
# are saved per fold; the submission cell loads them back by the same names.

# 1. Initialize KFold (shuffled with a fixed seed so fold membership is reproducible)
kf = KFold(n_splits=config.K_FOLDS, shuffle=True, random_state=config.SEED)

# 2. K-Fold Training Loop
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_np_cleaned)):
    print("\n" + "="*80)
    print(f"=============== FOLD {fold+1}/{config.K_FOLDS} ===============")
    print("="*80)

    # --- Split data for this fold ---
    X_train_fold, X_val_fold = X_train_np_cleaned[train_idx], X_train_np_cleaned[val_idx]
    Y_train_fold, Y_val_fold = Y_train_np_cleaned[train_idx], Y_train_np_cleaned[val_idx]

    # --- IRP Stage ---
    print(f"--- FOLD {fold+1}: IRP Stage ---")
    # Anchors are drawn from the training part of the fold only, without replacement.
    anchor_indices = np.random.choice(len(X_train_fold), config.K_ANCHORS, replace=False)
    X_anchor = X_train_fold[anchor_indices]
    Y_anchor = Y_train_fold[anchor_indices]

    # The scalers are fitted on the anchors and handed to the translator.
    scaler_X = StandardScaler().fit(X_anchor)
    scaler_Y = StandardScaler().fit(Y_anchor)

    irp_translator_fold = IRPTranslator(
        scaler_X, scaler_Y,
        omega=config.IRP_OMEGA, delta=config.IRP_DELTA,
        ridge=config.IRP_RIDGE, verbose=False
    )
    irp_translator_fold.fit(X_anchor, Y_anchor)
    print(f"   ✓ IRP translator for fold {fold+1} fitted.")

    irp_path = f"{config.CHECKPOINT_DIR}irp_translator_fold_{fold}.pkl"
    # joblib.dump does not create missing directories, so make sure the target
    # directory exists (a bare file name has an empty dirname -> current dir).
    os.makedirs(os.path.dirname(irp_path) or ".", exist_ok=True)
    joblib.dump(irp_translator_fold, irp_path)
    print(f"   ✓ IRP translator saved to {irp_path}")

    X_train_IRP_fold = torch.from_numpy(irp_translator_fold.translate(X_train_fold)).float()
    X_val_IRP_fold = torch.from_numpy(irp_translator_fold.translate(X_val_fold)).float()
    print(f"   ✓ Train and Val data transformed for fold {fold+1}.")

    # --- DataLoader Stage ---
    # Inputs are the translated embeddings; targets are the untouched Y rows.
    train_ds_fold = TensorDataset(X_train_IRP_fold, torch.from_numpy(Y_train_fold).float())
    val_ds_fold = TensorDataset(X_val_IRP_fold, torch.from_numpy(Y_val_fold).float())

    train_loader_fold = DataLoader(train_ds_fold, batch_size=config.BATCH_SIZE, shuffle=True, worker_init_fn=worker_init_fn)
    val_loader_fold = DataLoader(val_ds_fold, batch_size=config.BATCH_SIZE, shuffle=False)

    # --- Model Training Stage ---
    print(f"--- FOLD {fold+1}: MLP Refiner Training Stage ---")
    model_fold = ResidualMLP(
        input_dim=config.D_X, output_dim=config.D_Y, hidden_dim=config.HIDDEN_DIM,
        num_hidden_layers=config.NUM_HIDDEN_LAYERS, dropout_p=config.DROPOUT_P
    ).to(config.DEVICE)

    model_path_fold = f"{config.CHECKPOINT_DIR}mlp_fold_{fold}.pth"

    # resume=False: every fold starts from a freshly initialised model.
    train_irp_refiner(
        model_fold, train_loader_fold, val_loader_fold, config.DEVICE,
        epochs=config.EPOCHS, lr=config.LR, save_path=model_path_fold,
        patience=config.EARLY_STOP_PATIENCE, min_delta=config.MIN_IMPROVEMENT_DELTA,
        resume=False
    )

    # --- Clean up memory ---
    # The datasets hold references to the translated tensors, so they must be
    # dropped too; otherwise deleting the tensors' names alone frees nothing.
    del model_fold, train_loader_fold, val_loader_fold
    del train_ds_fold, val_ds_fold, X_train_IRP_fold, X_val_IRP_fold
    gc.collect()
    torch.cuda.empty_cache()

print("\n" + "="*80)
print("K-Fold Training Complete. All models saved.")
print("="*80)

## Direct Model Experimentation (Submission 1)

### Import modules

In [None]:
from src import config
from src import baseline_utils
from src import evaluation
from src import training
from src import ensembling
from src.data_processing import load_and_prep_data_direct
from src.models import mlp_direct
from src.utils import set_seed

print("All modules imported successfully.")

# Re-seed for this section; the value returned by set_seed is kept under the
# same name the K-Fold section uses.
worker_init_fn = set_seed(config.SEED)
DEVICE = config.DEVICE

# Single print call, newline-separated, producing the same two output lines.
print(
    f"Using device: {DEVICE}",
    f"Kaggle data path: {config.TRAIN_DATA_PATH}",
    sep="\n",
)

### Load, Clean, and Split Data 

Load the dataset from the path `TRAIN_DATA_PATH` specified in `config.py`, split it, and remove noisy captions.

In [None]:
print("Loading, cleaning, and splitting data...")

# All loader settings come from config; collected here so the call stays short.
prep_options = {
    "train_path": config.TRAIN_DATA_PATH,
    "coco_path": config.MY_DATA_PATH,
    "use_coco": config.USE_COCO_DATASET,
    "noise_threshold": config.NOISE_THRESHOLD,
    "val_split_ratio": config.VAL_SIZE,
    "random_seed": config.SEED,
}
prepared = load_and_prep_data_direct(**prep_options)

# The helper returns seven items: the train/val pairs followed by the
# retrieval-evaluation pieces (validation text embeddings, gallery, ground truth).
X_train, y_train, X_val, y_val = prepared[:4]
val_text_embd, val_img_embd_unique, val_label_gt = prepared[4:]

print("\n--- Data Loading Complete ---")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val (queries) shape: {X_val.shape}")
print(f"val_img_embd_unique (gallery) shape: {val_img_embd_unique.shape}")
print(f"val_label_gt (ground truth) shape: {val_label_gt.shape}")


### Create DataLoaders

In [None]:
# Wrap each split in a TensorDataset first, then build the loaders from them.
train_pairs = TensorDataset(X_train, y_train)
val_pairs = TensorDataset(X_val, y_val)

# Only the training loader shuffles; validation keeps its original order.
train_dl_std = DataLoader(train_pairs, batch_size=config.MODERN_HPARAMS['batch_size'], shuffle=True)
val_dl_std = DataLoader(val_pairs, batch_size=config.MODERN_HPARAMS['batch_size'], shuffle=False)
print("DataLoaders created.")

### Model Training
Skip if you want to use the checkpoints!

In [None]:
print("\n--- 4. Training Modello E ---")
# Train the direct ensemble on the train/val tensors. Per the original note,
# the helper trains 3 different models and returns them as a list.
fit_tensors = (X_train, y_train, X_val, y_val)
models_E = training.create_direct_ensemble(*fit_tensors, DEVICE)

## Generate Submissions

In [None]:
# Build the K-Fold ensemble from the per-fold checkpoints and, if they are all
# present, write the Submission 2 CSV from the test captions.
print("\n--- Submissions Generation ---")

print("\n Loading K-Fold models and IRP translators...")

# One MLP checkpoint and one IRP translator per fold, named as in the training loop.
model_paths = [f"{config.CHECKPOINT_DIR}mlp_fold_{f}.pth" for f in range(config.K_FOLDS)]
irp_paths = [f"{config.CHECKPOINT_DIR}irp_translator_fold_{f}.pkl" for f in range(config.K_FOLDS)]

# Check every required file, not just the first model: a missing later fold or
# a missing IRP pickle would otherwise only surface as a crash inside the wrapper.
missing_paths = [p for p in model_paths + irp_paths if not os.path.exists(p)]
if missing_paths:
    print("="*80)
    print(f"ERROR: Model file not found at {missing_paths[0]}")
    if len(missing_paths) > 1:
        print(f"       ({len(missing_paths)} checkpoint files are missing in total.)")
    print("Please run 'python scripts/train.py' to train the K-Fold models before running this notebook.")
    print("="*80)
    ensemble_wrapper = None
else:
    ensemble_wrapper = EnsembleWrapper(model_paths, irp_paths, DEVICE)

# Explicit None test: do not depend on whether the wrapper object defines truthiness.
if ensemble_wrapper is not None:
    print("\n--- Generating Ensemble Submission ---")

    # 1. Load test data
    print("Loading test data...")
    test_data = load_data(config.TEST_DATA_PATH)
    test_embds_raw_np = test_data['captions/embeddings']
    print(f"Test data loaded: {len(test_embds_raw_np)} samples.")

    # 2. Generate predictions using the ensemble
    print("Applying ensemble pipeline to test data... (this may take a moment)")
    pred_embds_ensemble = ensemble_wrapper.translate(test_embds_raw_np)
    print("Ensemble predictions generated.")

    # 3. Save submission file
    submission_filename = 'submission_K_Fold_IRP_MLP_Refiner.csv'
    generate_submission(test_data['captions/ids'], pred_embds_ensemble, submission_filename)

    print("\n" + "="*50)
    print(f"✅ Submission file '{submission_filename}' generated.")
    print("="*50)
else:
    print("Skipping submission generation because ensemble was not loaded.")


## Ensemble model Submission 1

# Build the Submission 1 ensemble, either from the downloaded checkpoints or
# from the models produced by the "Model Training" cell, then write its CSV.
if CHECKPOINTS_LOAD:
    # Imported here because this section's own import cell does not bring in
    # load_direct_ensemble; without it the cell only works after the K-Fold imports ran.
    from src.utils import load_direct_ensemble

    checkpoint_dir = "/kaggle/working/checkpoints"
    paths = [f"{checkpoint_dir}/ensemble_m{i}.pth" for i in (1, 2, 3)]
    models_E = load_direct_ensemble(DEVICE, paths)
elif "models_E" not in globals():
    # Neither checkpoints nor freshly trained models are available: fail with
    # an actionable message (same exception type a bare lookup would raise).
    raise NameError(
        "models_E is not defined: run the 'Model Training' cell or the "
        "checkpoint download cell before generating this submission."
    )

# The wrapper is built the same way for both sources of models.
wrapper_E = ensembling.DirectEnsembleWrapper(models_E, DEVICE)
wrapper_submission = wrapper_E
# ---------------------------------

print("Using model: Diverse Ensemble (E)")

# 1. Load test data
print("Loading test data...")
test_data = baseline_utils.load_data(config.TEST_DATA_PATH)
X_test_np = test_data['captions/embeddings']
test_ids = test_data['captions/ids']
print(f"Test data loaded: {len(X_test_np)} samples.")

# 2. Generate predictions
print("Generating predictions on the test set...")
# The wrapper handles normalization internally
y_test_pred = wrapper_submission.translate(X_test_np, batch_size=512)

# 3. Save submission file
submission_filename = "submission_Ensemble_model.csv"
baseline_utils.generate_submission(test_ids, y_test_pred, submission_filename)

print(f"\n✅ Submission salvata: {submission_filename}")