In [None]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
class Config:
    """Single home for every tunable setting read by the pipeline."""

    # --- Embedding backbone -------------------------------------------
    EMBEDDING_MODEL = "intfloat/multilingual-e5-large"
    # NOTE(review): not read by any code visible in this notebook; presumably
    # the output width of EMBEDDING_MODEL - confirm before relying on it.
    EMBEDDING_DIM = 1024
    # Prefer the GPU when torch reports one, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Optimisation / cross-validation ------------------------------
    NUM_EPOCHS = 20
    BATCH_SIZE = 256
    LEARNING_RATE = 1e-3
    NUM_FOLDS = 5
    RANDOM_SEED = 42

    # --- Synthetic negative samples -----------------------------------
    # Standard deviation of the Gaussian noise added to text embeddings.
    NOISE_SCALE = 0.6
    # Intended number of negatives per positive.
    # NOTE(review): generate_negative_samples does not read this value.
    NEG_SAMPLE_RATIO = 3

In [None]:
# =====================================
# DATA LOADING & PREPROCESSING
# =====================================

def load_dataset_files():
    """Read the three JSON inputs from the current working directory.

    Returns:
        tuple: ``(train_df, test_df, metric_lookup)`` where the first two are
        DataFrames built from ``train_data.json`` / ``test_data.json`` and the
        third is the parsed content of ``metric_names.json`` as-is.
    """
    def _read_json(path):
        # Every input file is UTF-8 encoded JSON.
        with open(path, "r", encoding="utf8") as handle:
            return json.load(handle)

    train_records = _read_json("train_data.json")
    test_records = _read_json("test_data.json")
    metric_lookup = _read_json("metric_names.json")

    train_frame = pd.DataFrame(train_records)
    test_frame = pd.DataFrame(test_records)
    return train_frame, test_frame, metric_lookup

In [None]:
def create_combined_text(dataframe):
    """Add a ``full_text`` column joining system prompt, user prompt and response.

    Each row becomes ``"{system} <SYS> {user} <USER> {response} <RESP>"``.
    A field that is missing (``None``/NaN) or whose column does not exist
    contributes an empty string, so the literal words "None" or "nan" never
    end up in the text that is later embedded.

    Args:
        dataframe: DataFrame that may contain ``system_prompt``,
            ``user_prompt`` and ``response`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with the ``full_text`` column set.
    """
    def field_texts(column):
        # One string per row; absent columns behave like all-missing columns.
        if column not in dataframe.columns:
            return [""] * len(dataframe)
        return [
            # is_scalar keeps pd.isna from returning an array for list-like cells.
            "" if pd.api.types.is_scalar(value) and pd.isna(value) else str(value)
            for value in dataframe[column]
        ]

    # Building a plain list (instead of a row-wise apply) also works for an
    # empty frame, where apply(axis=1) would hand back a DataFrame.
    dataframe["full_text"] = [
        # Custom separator tokens for better semantic parsing
        f"{system} <SYS> {user} <USER> {response} <RESP>"
        for system, user, response in zip(
            field_texts("system_prompt"),
            field_texts("user_prompt"),
            field_texts("response"),
        )
    ]
    return dataframe

In [None]:
# =====================================
# EMBEDDING GENERATION
# =====================================

class EmbeddingGenerator:
    """Wraps a SentenceTransformer model and returns float32 embeddings."""

    def __init__(self, model_name, device='cuda'):
        """Load ``model_name`` and move the model to ``device``."""
        self.device = device
        self.model = SentenceTransformer(model_name).to(device)

    def encode_batch(self, texts, batch_size=64, show_progress=False):
        """Embed ``texts`` with the wrapped model.

        Args:
            texts: Sequence of strings to embed.
            batch_size: Batch size forwarded to the model's ``encode``.
            show_progress: Whether ``encode`` should display a progress bar.

        Returns:
            The model output converted to a ``np.float32`` array.
        """
        encode_options = dict(
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=show_progress,
            device=self.device,
        )
        vectors = self.model.encode(texts, **encode_options)
        return vectors.astype(np.float32)

    def encode_metrics(self, metric_list):
        """Embed each distinct metric name once.

        Args:
            metric_list: Iterable of metric names; duplicates are allowed.

        Returns:
            dict mapping every distinct metric name to its embedding.
        """
        distinct_names = list(set(metric_list))
        print(f"Encoding {len(distinct_names)} unique metrics...")

        chunk_size = 64
        lookup = {}
        for start in tqdm(range(0, len(distinct_names), chunk_size)):
            names = distinct_names[start:start + chunk_size]
            # Rows of the batch output line up with ``names`` by position.
            lookup.update(zip(names, self.encode_batch(names)))

        return lookup

In [None]:
# =====================================
# NEGATIVE SAMPLE GENERATION
# =====================================

def generate_negative_samples(metric_embs, text_embs, labels, config):
    """Append three synthetic sample sets to the original (metric, text) pairs.

    The generator is seeded from ``config.RANDOM_SEED``. Every synthetic set
    has as many rows as the input and receives labels drawn uniformly from
    {0, 1, 2}:

    1. original metric rows paired with permuted text rows,
    2. original metric rows paired with text rows plus Gaussian noise
       (standard deviation ``config.NOISE_SCALE``),
    3. permuted metric rows paired with the original text rows.

    Returns:
        tuple: ``(metric_embs, text_embs, labels)`` with the original rows
        first, followed by sets 1-3 in that order.
    """
    rng = np.random.default_rng(config.RANDOM_SEED)
    row_count = len(metric_embs)

    def draw_labels():
        return rng.integers(0, 3, size=row_count).astype(np.float32)

    # The sequence of draws from ``rng`` below determines the output; keep it.

    # Set 1: break the pairing by permuting the text side.
    text_order = rng.permutation(row_count)
    mismatched_text = text_embs[text_order]
    mismatched_labels = draw_labels()

    # Set 2: perturb the text embeddings with Gaussian noise.
    jitter = rng.normal(loc=0, scale=config.NOISE_SCALE, size=text_embs.shape)
    noisy_text = text_embs + jitter
    noisy_labels = draw_labels()

    # Set 3: break the pairing by permuting the metric side.
    metric_order = rng.permutation(row_count)
    swapped_metrics = metric_embs[metric_order]
    swapped_labels = draw_labels()

    # Stack: originals first, then the three synthetic sets.
    all_metric_embs = np.vstack([metric_embs, metric_embs, metric_embs, swapped_metrics])
    all_text_embs = np.vstack([text_embs, mismatched_text, noisy_text, text_embs])
    all_labels = np.concatenate([labels, mismatched_labels, noisy_labels, swapped_labels])

    print(f"Generated {len(all_labels)} total samples (original + negatives)")

    return all_metric_embs, all_text_embs, all_labels


In [None]:
# =====================================
# FEATURE ENGINEERING
# =====================================

def build_interaction_features(metric_embs, text_embs):
    """Turn row-aligned embedding pairs into one float32 feature matrix.

    For each row the columns are, in order: the metric embedding, the text
    embedding, their absolute element-wise difference, their element-wise
    product, and one cosine-similarity value.

    Args:
        metric_embs: 2-D array of metric embeddings.
        text_embs: 2-D array of text embeddings with the same shape.

    Returns:
        ``np.float32`` array with ``4 * dim + 1`` columns per row.
    """
    gap = np.abs(metric_embs - text_embs)
    product = metric_embs * text_embs

    # Cosine similarity; the epsilon keeps zero-norm rows from dividing by zero.
    norm_product = np.linalg.norm(metric_embs, axis=1) * np.linalg.norm(text_embs, axis=1)
    cosine = (product.sum(axis=1) / (norm_product + 1e-9))[:, np.newaxis]

    blocks = [metric_embs, text_embs, gap, product, cosine]
    return np.concatenate(blocks, axis=1).astype(np.float32)

In [None]:
# =====================================
# PYTORCH COMPONENTS
# =====================================

class ScoreDataset(Dataset):
    """In-memory dataset yielding ``(feature_row, target)`` float tensors."""

    def __init__(self, features, targets):
        """Convert both inputs to float tensors and keep them on the instance."""
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets)

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the feature row and its target at position ``idx``."""
        sample = self.features[idx]
        target = self.targets[idx]
        return sample, target


class ScoringNetwork(nn.Module):
    """Feed-forward regressor: input -> 1024 -> 512 -> 128 -> 1.

    ReLU follows every hidden layer; dropout (p=0.2) follows the first two.
    """

    def __init__(self, input_size):
        """Build the layer stack for feature vectors of width ``input_size``."""
        super().__init__()

        # Positions inside the Sequential determine the state_dict keys
        # (layers.0, layers.3, layers.6, layers.8), so keep this order.
        stack = [
            nn.Linear(input_size, 1024), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(512, 128), nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return one score per input row (the trailing size-1 dim is dropped)."""
        scores = self.layers(x)
        return scores.squeeze(-1)

In [None]:
# =====================================
# TRAINING & VALIDATION
# =====================================

def train_single_fold(train_data, val_data, config, fold_num):
    """Fit a fresh ScoringNetwork on one fold and checkpoint its best epoch.

    After every epoch the model is scored on ``val_data``; whenever the
    validation RMSE improves, the weights are written to
    ``best_model_fold{fold_num}.pth`` in the working directory.

    Args:
        train_data: Dataset exposing a ``features`` tensor (used for the
            input width) and yielding ``(features, target)`` pairs.
        val_data: Dataset yielding ``(features, target)`` pairs.
        config: Object providing BATCH_SIZE, DEVICE, LEARNING_RATE and
            NUM_EPOCHS.
        fold_num: Fold identifier used in log lines and the checkpoint name.

    Returns:
        The lowest validation RMSE seen over all epochs.
    """
    device = config.DEVICE
    checkpoint_path = f"best_model_fold{fold_num}.pth"

    # Only the training batches are shuffled.
    train_loader = DataLoader(train_data, batch_size=config.BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=config.BATCH_SIZE, shuffle=False)

    model = ScoringNetwork(train_data.features.shape[1]).to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=config.LEARNING_RATE, weight_decay=1e-5
    )
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    for epoch in range(config.NUM_EPOCHS):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            running_loss += batch_loss.item() * batch_x.size(0)

        # ---- validation pass ----
        model.eval()
        predicted, expected = [], []
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                predicted.extend(model(batch_x.to(device)).cpu().numpy())
                expected.extend(batch_y.numpy())

        val_rmse = np.sqrt(mean_squared_error(np.array(expected), np.array(predicted)))
        avg_train_loss = running_loss / len(train_data)
        print(f"Fold {fold_num} | Epoch {epoch + 1}/{config.NUM_EPOCHS} | "
              f"Train Loss: {avg_train_loss:.4f} | Val RMSE: {val_rmse:.4f}")

        # Keep only the weights of the best-scoring epoch.
        if val_rmse < best_val_loss:
            best_val_loss = val_rmse
            torch.save(model.state_dict(), checkpoint_path)

    return best_val_loss


def cross_validation_training(features, labels, config):
    """Train one model per stratified fold and collect out-of-fold predictions.

    Folds are stratified on the labels rounded to the nearest integer. After
    each fold is trained, its checkpoint ``best_model_fold{i}.pth`` is
    reloaded to predict that fold's validation rows.

    Args:
        features: 2-D feature array, one row per sample.
        labels: 1-D array of targets aligned with ``features``.
        config: Object providing NUM_FOLDS, RANDOM_SEED and DEVICE (plus
            whatever ``train_single_fold`` reads).

    Returns:
        tuple: ``(oof_predictions, fold_scores)`` - an array with one
        prediction per sample and the list of per-fold best validation RMSEs.
    """
    splitter = StratifiedKFold(
        n_splits=config.NUM_FOLDS,
        shuffle=True,
        random_state=config.RANDOM_SEED,
    )
    # Continuous targets are bucketed by rounding so they can be stratified.
    strata = np.round(labels).astype(int)

    n_features = features.shape[1]
    oof_predictions = np.zeros(len(features))
    fold_scores = []
    banner = '=' * 60

    for fold_idx, (train_indices, val_indices) in enumerate(splitter.split(features, strata)):
        print(f"\n{banner}")
        print(f"Training Fold {fold_idx + 1}/{config.NUM_FOLDS}")
        print(f"{banner}")

        X_train, y_train = features[train_indices], labels[train_indices]
        X_val, y_val = features[val_indices], labels[val_indices]

        fold_scores.append(
            train_single_fold(
                ScoreDataset(X_train, y_train),
                ScoreDataset(X_val, y_val),
                config,
                fold_idx,
            )
        )

        # Reload the best checkpoint of this fold for the held-out rows.
        best_model = ScoringNetwork(n_features).to(config.DEVICE)
        best_model.load_state_dict(torch.load(f"best_model_fold{fold_idx}.pth"))
        best_model.eval()

        with torch.no_grad():
            held_out = torch.FloatTensor(X_val).to(config.DEVICE)
            oof_predictions[val_indices] = best_model(held_out).cpu().numpy()

    overall_score = np.mean(fold_scores)
    print(f"\nAverage CV Score: {overall_score:.4f}")

    return oof_predictions, fold_scores


In [None]:
# =====================================
# MAIN EXECUTION PIPELINE
# =====================================

def main():
    """Run the whole pipeline: load, embed, augment, train, calibrate, predict.

    Reads ``train_data.json``, ``test_data.json`` and ``metric_names.json``
    from the working directory, writes one ``best_model_fold{i}.pth``
    checkpoint per fold and finally ``submission.csv`` with columns ``ID``
    (1-based row number) and ``score`` (clipped to [0, 10]).
    """

    config = Config()
    print(f"Using device: {config.DEVICE}\n")

    # Seed torch so weight init, dropout and batch shuffling are repeatable
    # across runs (numpy and sklearn randomness is already seeded explicitly).
    torch.manual_seed(config.RANDOM_SEED)

    # Step 1: Load data
    print("Loading datasets...")
    # metrics_info (metric_names.json) is loaded but not used further below.
    train_df, test_df, metrics_info = load_dataset_files()

    train_df = create_combined_text(train_df)
    test_df = create_combined_text(test_df)

    print(f"Training samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}\n")

    # Step 2: Generate embeddings
    print("Generating embeddings...")
    embedder = EmbeddingGenerator(config.EMBEDDING_MODEL, config.DEVICE)

    # Metric embeddings: cover train AND test names, otherwise a metric that
    # only occurs in the test set raises KeyError in the lookup below.
    metric_embedding_dict = embedder.encode_metrics(
        train_df["metric_name"].tolist() + test_df["metric_name"].tolist()
    )

    train_metric_embs = np.vstack([
        metric_embedding_dict[m] for m in train_df["metric_name"]
    ])

    test_metric_embs = np.vstack([
        metric_embedding_dict[m] for m in test_df["metric_name"]
    ])

    # Text embeddings
    print("Encoding training texts...")
    train_text_embs = embedder.encode_batch(
        train_df["full_text"].tolist(),
        show_progress=True
    )

    print("Encoding test texts...")
    test_text_embs = embedder.encode_batch(
        test_df["full_text"].tolist(),
        show_progress=True
    )

    # Step 3: Augment with negative samples
    print("\nGenerating negative samples...")
    train_labels = train_df["score"].values.astype(np.float32)

    aug_metric_embs, aug_text_embs, aug_labels = generate_negative_samples(
        train_metric_embs,
        train_text_embs,
        train_labels,
        config
    )

    # Step 4: Build features
    print("\nBuilding features...")
    train_features = build_interaction_features(aug_metric_embs, aug_text_embs)
    test_features = build_interaction_features(test_metric_embs, test_text_embs)

    print(f"Training feature shape: {train_features.shape}")
    print(f"Test feature shape: {test_features.shape}\n")

    # Step 5: Train models
    # NOTE(review): folds, OOF scores and the calibrator below all operate on
    # the augmented rows (including the randomly labelled negatives), so the
    # reported RMSE is not an estimate on real samples only - confirm intent.
    print("Starting cross-validation training...")
    oof_preds, fold_scores = cross_validation_training(
        train_features,
        aug_labels,
        config
    )

    # Step 6: Calibration (linear map from raw OOF prediction to label)
    print("\nApplying calibration...")
    calibrator = Ridge(alpha=1.0)
    calibrator.fit(oof_preds.reshape(-1, 1), aug_labels)

    oof_calibrated = calibrator.predict(oof_preds.reshape(-1, 1))
    final_oof_score = np.sqrt(mean_squared_error(aug_labels, oof_calibrated))

    print(f"Calibrated OOF RMSE: {final_oof_score:.4f}")

    # Step 7: Test predictions
    print("\nGenerating test predictions...")
    test_preds_all_folds = []

    # The test tensor is identical for every fold, so build/upload it once.
    test_tensor = torch.FloatTensor(test_features).to(config.DEVICE)

    for fold_idx in range(config.NUM_FOLDS):
        model = ScoringNetwork(train_features.shape[1]).to(config.DEVICE)
        # map_location lets the checkpoint load on whatever device is in use.
        model.load_state_dict(
            torch.load(f"best_model_fold{fold_idx}.pth", map_location=config.DEVICE)
        )
        model.eval()

        with torch.no_grad():
            fold_preds = model(test_tensor).cpu().numpy()
            test_preds_all_folds.append(fold_preds)

    # Average and calibrate
    test_preds_avg = np.mean(test_preds_all_folds, axis=0)
    test_preds_final = calibrator.predict(test_preds_avg.reshape(-1, 1))
    test_preds_final = np.clip(test_preds_final, 0, 10)

    # Step 8: Create submission
    submission = pd.DataFrame({
        "ID": range(1, len(test_df) + 1),
        "score": test_preds_final
    })

    submission.to_csv("submission.csv", index=False)
    print("\nSubmission file created: submission.csv")
    print(f"Prediction range: [{test_preds_final.min():.2f}, {test_preds_final.max():.2f}]")


# Run the full pipeline only when this code is executed directly (script or
# notebook cell), not when its definitions are imported from another module.
if __name__ == "__main__":
    main()