In [None]:
import gc
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, logging as hf_logging
# Silence Hugging Face log output below error level and suppress Python warnings.
hf_logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# Hyper-parameters and runtime settings shared by the training cells.
TRAIN_CONFIG = {
    'MODEL_NAME': 'sentence-transformers/all-mpnet-base-v2',
    'MAX_LENGTH': 512,
    'BATCH_SIZE': 32,
    'EPOCHS': 10,
    'LR_ENCODER': 2e-06,
    'LR_HEAD': 2e-05,
    'N_SPLITS': 5,
    'SEED': 42,
    # Use the GPU whenever one is visible to PyTorch, otherwise stay on the CPU.
    'DEVICE': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
}

# Width of one metric-name embedding vector.
METRIC_EMBEDDING_DIMENSION = 768
# Input width of the prediction head: four vectors of embedding width are concatenated.
INTERACTION_FEATURE_DIMENSION = 4 * METRIC_EMBEDDING_DIMENSION


Utilities:

In [None]:
def set_random_seed(seed_value: int) -> None:
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator
    and PyTorch (CPU, plus all CUDA devices when CUDA is available), and asks
    cuDNN for deterministic kernels.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    # The stdlib generator is seeded as well so that any library code relying on
    # ``random`` behaves reproducibly.
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
    # Prefer deterministic cuDNN algorithms and disable the auto-tuner, which
    # may pick different kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def compute_expected_score_from_logits(logits: torch.Tensor) -> torch.Tensor:
    """Turn per-class logits into a continuous score via the softmax expectation.

    Args:
        logits: Tensor whose dimension 1 indexes the 11 score classes (0 through 10).

    Returns:
        Tensor with one expected score per row: the sum over classes of
        ``probability * class_value``.
    """
    probabilities = logits.softmax(dim=1)
    # Class values 0..10, created on the same device as the probabilities.
    class_values = torch.arange(0, 11, dtype=torch.float, device=probabilities.device)
    return torch.sum(probabilities * class_values, dim=1)

# Fix all seeds up front, then report which device the run will use.
set_random_seed(seed_value=TRAIN_CONFIG['SEED'])
print('Using device: {}'.format(TRAIN_CONFIG['DEVICE']))


Training Data:

In [None]:
training_dataframe = pd.read_json('train_data.json')

# The metric names are read with the json module instead of pd.read_json:
# pd.read_json returns a DataFrame, and iterating a DataFrame (as zip does)
# yields its column labels rather than the stored names, which would leave the
# name -> embedding mapping without any real metric names.
# NOTE(review): assumes metric_names.json is a flat JSON array of names whose
# order matches the rows of metric_name_embeddings.npy — confirm.
with open('metric_names.json', 'r', encoding='utf-8') as metric_names_file:
    metric_name_list = json.load(metric_names_file)
metric_name_embedding_matrix = np.load('metric_name_embeddings.npy')

# zip() silently truncates to the shorter input, so a mismatch is reported loudly.
if len(metric_name_list) != len(metric_name_embedding_matrix):
    raise ValueError(
        f'metric_names.json has {len(metric_name_list)} entries but '
        f'metric_name_embeddings.npy has {len(metric_name_embedding_matrix)} rows.'
    )
metric_name_to_embedding = dict(zip(metric_name_list, metric_name_embedding_matrix))

# Normalise the three text fields to strings; a missing system prompt becomes the literal 'None'.
training_dataframe['user_prompt'] = training_dataframe['user_prompt'].astype(str)
training_dataframe['system_prompt'] = training_dataframe['system_prompt'].fillna('None').astype(str)
training_dataframe['response'] = training_dataframe['response'].astype(str)
training_dataframe['full_text'] = 'User: ' + training_dataframe['user_prompt'] + ' | System: ' + training_dataframe['system_prompt'] + ' | Response: ' + training_dataframe['response']

# Rows whose metric name has no embedding get NaN here; the dataset classes
# replace such entries with a zero vector, so the count is surfaced for review.
training_dataframe['metric_embedding'] = training_dataframe['metric_name'].map(metric_name_to_embedding)
unmapped_row_count = int(training_dataframe['metric_embedding'].isna().sum())
if unmapped_row_count:
    print(f'Warning: {unmapped_row_count} training rows have no metric embedding and will use a zero vector.')

# Regression target plus an integer class label (fractional scores are truncated toward zero).
training_dataframe['score_float'] = training_dataframe['score'].astype(float)
training_dataframe['score_label'] = training_dataframe['score_float'].astype(int)


Class Weights:

In [None]:
num_score_classes = 11
num_training_samples = len(training_dataframe)

# Occurrences of every label 0..10; labels that never appear get a count of 0.
score_label_counts = (
    training_dataframe['score_label']
    .value_counts()
    .sort_index()
    .reindex(range(11), fill_value=0)
)

# Inverse-frequency weights; the +1 keeps the denominator non-zero for absent classes.
raw_class_weight_values = num_training_samples / (num_score_classes * (score_label_counts + 1))

# Work in float32 so the boosted values equal those of a float32 tensor round trip.
class_weight_array = raw_class_weight_values.values.astype(np.float32)
class_weight_array[:6] *= 1.5  # manual boost for scores 0 through 5

class_weight_tensor = torch.tensor(class_weight_array, dtype=torch.float).to(TRAIN_CONFIG['DEVICE'])
print('Using class weights for imbalance compensation.')
print('NOTE: Manually boosted class weights for scores 0-5 by 1.5x.')

# Tokenizer matching the text encoder checkpoint.
text_tokenizer = AutoTokenizer.from_pretrained(TRAIN_CONFIG['MODEL_NAME'])


PyTorch Dataset and Model Definition:

In [None]:
class TrainingMetricScoringDataset(Dataset):
    """Dataset of labelled samples: tokenized text, metric embedding, class label and score."""

    def __init__(self, data_frame: pd.DataFrame, tokenizer, max_length: int) -> None:
        """Copy the required columns out of ``data_frame``.

        Args:
            data_frame: Frame with ``full_text``, ``metric_embedding``,
                ``score_label`` and ``score_float`` columns.
            tokenizer: Object exposing ``encode_plus``; called once per item.
            max_length: Length every sample is padded or truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sample_texts = data_frame['full_text'].values
        self.score_labels = data_frame['score_label'].values
        self.score_values = data_frame['score_float'].values
        # Any entry that is not an ndarray (e.g. a missing lookup) becomes a zero vector.
        self.metric_embeddings = np.stack([
            entry if isinstance(entry, np.ndarray) else np.zeros(METRIC_EMBEDDING_DIMENSION)
            for entry in data_frame['metric_embedding']
        ])

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.sample_texts)

    def __getitem__(self, index: int) -> dict:
        """Tokenize sample ``index`` and return its tensors in a dict.

        Returns:
            Dict with keys ``input_ids``, ``attention_mask``,
            ``metric_embedding`` (float), ``label`` (long) and ``score`` (float).
        """
        encoding = self.tokenizer.encode_plus(
            self.sample_texts[index],
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension; squeeze(0) removes it.
        sample = {field: encoding[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['metric_embedding'] = torch.tensor(self.metric_embeddings[index], dtype=torch.float)
        sample['label'] = torch.tensor(self.score_labels[index], dtype=torch.long)
        sample['score'] = torch.tensor(self.score_values[index], dtype=torch.float)
        return sample

class InferenceMetricScoringDataset(Dataset):
    """Dataset of unlabelled samples: tokenized text, metric embedding and sample ID."""

    def __init__(self, data_frame: pd.DataFrame, tokenizer, max_length: int) -> None:
        """Copy the required columns out of ``data_frame``.

        Args:
            data_frame: Frame with ``full_text``, ``metric_embedding`` and ``ID`` columns.
            tokenizer: Object exposing ``encode_plus``; called once per item.
            max_length: Length every sample is padded or truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sample_texts = data_frame['full_text'].values
        self.sample_ids = data_frame['ID'].values
        # Any entry that is not an ndarray (e.g. a missing lookup) becomes a zero vector.
        self.metric_embeddings = np.stack([
            entry if isinstance(entry, np.ndarray) else np.zeros(METRIC_EMBEDDING_DIMENSION)
            for entry in data_frame['metric_embedding']
        ])

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.sample_texts)

    def __getitem__(self, index: int) -> dict:
        """Tokenize sample ``index`` and return its tensors in a dict.

        Returns:
            Dict with keys ``input_ids``, ``attention_mask``,
            ``metric_embedding`` (float) and ``id`` (long).
        """
        encoding = self.tokenizer.encode_plus(
            self.sample_texts[index],
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension; squeeze(0) removes it.
        sample = {field: encoding[field].squeeze(0) for field in ('input_ids', 'attention_mask')}
        sample['metric_embedding'] = torch.tensor(self.metric_embeddings[index], dtype=torch.float)
        sample['id'] = torch.tensor(self.sample_ids[index], dtype=torch.long)
        return sample

class MetricScoringModel(nn.Module):
    """Sentence encoder plus an MLP head that classifies a (metric, text) pair into 11 score classes."""

    def __init__(self, model_name: str) -> None:
        """Build the text encoder and the prediction head.

        Args:
            model_name: Identifier passed to ``SentenceTransformer``.
        """
        super().__init__()
        self.text_encoder = SentenceTransformer(model_name)
        head_layers = [
            nn.Linear(INTERACTION_FEATURE_DIMENSION, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(1024, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 11),
        ]
        self.prediction_head = nn.Sequential(*head_layers)

    def forward(self, metric_embedding_batch: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Encode the text, combine it with the metric embedding and return class logits.

        Args:
            metric_embedding_batch: Metric embeddings, one row per sample.
            input_ids: Token ids handed to the text encoder.
            attention_mask: Attention mask handed to the text encoder.

        Returns:
            Output of the prediction head for each sample.
        """
        encoder_output = self.text_encoder({'input_ids': input_ids, 'attention_mask': attention_mask})
        text_embedding_batch = encoder_output['sentence_embedding']
        # Interaction features: both embeddings, their absolute difference and
        # their element-wise product, concatenated along the feature axis.
        feature_parts = [
            metric_embedding_batch,
            text_embedding_batch,
            (metric_embedding_batch - text_embedding_batch).abs(),
            metric_embedding_batch * text_embedding_batch,
        ]
        return self.prediction_head(torch.cat(feature_parts, dim=1))


Training and Evaluation Functions:

In [None]:
def train_fn(model: nn.Module, data_loader: DataLoader, optim_engine: optim.Optimizer, loss_fn, device: torch.device) -> float:
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Called as ``model(metric_embeddings, input_ids, attention_mask)``.
        data_loader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``metric_embedding`` and ``label``.
        optim_engine: Optimizer stepped once per batch.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar loss.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        optim_engine.zero_grad()
        input_ids, attention_mask, metric_embeddings, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'metric_embedding', 'label')
        )
        batch_loss = loss_fn(model(metric_embeddings, input_ids, attention_mask), labels)
        batch_loss.backward()
        optim_engine.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses, 0.0) / len(data_loader)

def eval_fn(model: nn.Module, data_loader: DataLoader, loss_fn, device: torch.device) -> tuple[float, float]:
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Args:
        model: Called as ``model(metric_embeddings, input_ids, attention_mask)``.
        data_loader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``metric_embedding``, ``label`` and ``score``.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar loss.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, rmse)``: the summed batch losses divided by
        ``len(data_loader)``, and the root-mean-squared error between the
        expected scores derived from the logits and the ``score`` values.
    """
    model.eval()
    loss_total = 0.0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, metric_embeddings, labels, scores = (
                batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'metric_embedding', 'label', 'score')
            )
            logits = model(metric_embeddings, input_ids, attention_mask)
            loss_total += loss_fn(logits, labels).item()

            # The loss uses the class labels; the RMSE uses the continuous expected score.
            predictions.extend(compute_expected_score_from_logits(logits).cpu().numpy())
            targets.extend(scores.cpu().numpy())

    rmse = np.sqrt(mean_squared_error(targets, predictions))
    return (loss_total / len(data_loader), rmse)


K-Fold Cross-Validation Training Loop

In [None]:
print('\n--- Starting Model Training ---')

loss_fn = nn.CrossEntropyLoss(weight=class_weight_tensor)
cross_validation_splitter = StratifiedGroupKFold(n_splits=TRAIN_CONFIG['N_SPLITS'], shuffle=True, random_state=TRAIN_CONFIG['SEED'])

sample_indices = training_dataframe.index
score_label_series = training_dataframe['score_label']
# Folds are stratified on the score label and grouped by metric name.
metric_group_labels = training_dataframe['metric_name']
fold_rmse_scores = []
fold_model_paths = []

for fold_index, (train_indices, validation_indices) in enumerate(cross_validation_splitter.split(sample_indices, score_label_series, metric_group_labels)):
    fold_model_path = f'model_fold_{fold_index + 1}.pth'

    # Resume support: a fold is skipped only when its checkpoint is already on
    # disk, instead of relying on a hard-coded fold count that made a fresh run
    # skip every fold. Delete the file to force a fold to be retrained (a
    # checkpoint left behind by an interrupted run also counts as "trained").
    if os.path.exists(fold_model_path):
        print(f'\n--- Skipping Fold {fold_index + 1} (Already Trained) ---')
        fold_model_paths.append(fold_model_path)
        continue

    print(f"\n--- Fold {fold_index + 1}/{TRAIN_CONFIG['N_SPLITS']} ---")

    train_fold_dataframe = training_dataframe.iloc[train_indices]
    validation_fold_dataframe = training_dataframe.iloc[validation_indices]
    train_dataset = TrainingMetricScoringDataset(train_fold_dataframe, text_tokenizer, TRAIN_CONFIG['MAX_LENGTH'])
    validation_dataset = TrainingMetricScoringDataset(validation_fold_dataframe, text_tokenizer, TRAIN_CONFIG['MAX_LENGTH'])
    # drop_last=True on the training loader discards a trailing partial batch.
    training_loader = DataLoader(train_dataset, batch_size=TRAIN_CONFIG['BATCH_SIZE'], shuffle=True, num_workers=2, drop_last=True)
    validation_data_loader = DataLoader(validation_dataset, batch_size=TRAIN_CONFIG['BATCH_SIZE'], shuffle=False, num_workers=2)

    print(f'Train samples: {len(train_fold_dataframe)}, Val samples: {len(validation_fold_dataframe)}')

    metric_model = MetricScoringModel(TRAIN_CONFIG['MODEL_NAME']).to(TRAIN_CONFIG['DEVICE'])
    # Separate learning rates: a small one for the text encoder, a larger one for the head.
    optim_engine = optim.AdamW([{'params': metric_model.text_encoder.parameters(), 'lr': TRAIN_CONFIG['LR_ENCODER']}, {'params': metric_model.prediction_head.parameters(), 'lr': TRAIN_CONFIG['LR_HEAD']}])
    best_fold_rmse = float('inf')

    for epoch_idx in range(TRAIN_CONFIG['EPOCHS']):

        training_loss = train_fn(metric_model, training_loader, optim_engine, loss_fn, TRAIN_CONFIG['DEVICE'])
        validation_loss, validation_rmse = eval_fn(metric_model, validation_data_loader, loss_fn, TRAIN_CONFIG['DEVICE'])
        print(f'Epoch {epoch_idx + 1}: Train Loss={training_loss:.4f} | Val Loss={validation_loss:.4f} | Val RMSE={validation_rmse:.4f}')
        # Keep only the weights of the epoch with the lowest validation RMSE.
        if validation_rmse < best_fold_rmse:
            best_fold_rmse = validation_rmse
            torch.save(metric_model.state_dict(), fold_model_path)
            print(f'  ^ New best model saved to {fold_model_path}')

    fold_rmse_scores.append(best_fold_rmse)
    fold_model_paths.append(fold_model_path)
    print(f'Best RMSE for Fold {fold_index + 1}: {best_fold_rmse:.4f}')
    # The optimizer references the model parameters, so it is released together
    # with the model; otherwise the old weights stay alive into the next fold.
    del metric_model, optim_engine, training_loader, validation_data_loader, train_dataset, validation_dataset
    gc.collect()

    # DEVICE is a torch.device, which never compares equal to the string 'cuda';
    # its .type attribute is the value to test.
    if TRAIN_CONFIG['DEVICE'].type == 'cuda':
        torch.cuda.empty_cache()

print('\n--- Training Complete ---')
print(f'RMSE for new folds: {fold_rmse_scores}')
print(f'All model paths for ensemble: {fold_model_paths}')


Test Data:

In [None]:
print('\n--- Starting Inference on Test Data ---')

# Candidate checkpoints model_fold_1.pth through model_fold_10.pth.
fold_model_paths = ['model_fold_{}.pth'.format(fold_number) for fold_number in range(1, 11)]

# Inference settings; note that the device is fixed to the CPU here.
INFERENCE_CONFIG = {
    'MODEL_NAME': 'sentence-transformers/all-mpnet-base-v2',
    'MAX_LENGTH': 512,
    'BATCH_SIZE': 32,
    'DEVICE': torch.device('cpu'),
}

test_dataframe = pd.read_json('test_data.json')

# Without an ID column, number the rows 1..N.
if 'ID' not in test_dataframe.columns:
    row_count = len(test_dataframe)
    test_dataframe['ID'] = range(1, row_count + 1)


Test Data Preprocessing:

In [None]:
print('Preprocessing test data...')

# A missing system prompt becomes the literal 'None' before the string cast.
test_dataframe['system_prompt'] = test_dataframe['system_prompt'].fillna('None')
for text_column in ('user_prompt', 'system_prompt', 'response'):
    test_dataframe[text_column] = test_dataframe[text_column].astype(str)

# Same text template as the training data.
test_dataframe['full_text'] = (
    'User: ' + test_dataframe['user_prompt']
    + ' | System: ' + test_dataframe['system_prompt']
    + ' | Response: ' + test_dataframe['response']
)

# Metric names without an entry in the lookup map to NaN.
test_dataframe['metric_embedding'] = test_dataframe['metric_name'].map(metric_name_to_embedding)


Dataset, DataLoader, and Model Preparation

In [None]:
# Test dataset and an unshuffled loader using twice the configured batch size.
test_dataset = InferenceMetricScoringDataset(
    data_frame=test_dataframe,
    tokenizer=text_tokenizer,
    max_length=INFERENCE_CONFIG['MAX_LENGTH'],
)
test_data_loader = DataLoader(
    test_dataset,
    batch_size=2 * INFERENCE_CONFIG['BATCH_SIZE'],
    shuffle=False,
    num_workers=2,
)

# One probability array per fold checkpoint is collected here.
all_fold_probability_arrays = []

# A single model instance; each fold's weights are loaded into it in turn.
inference_model = MetricScoringModel(INFERENCE_CONFIG['MODEL_NAME'])
inference_model = inference_model.to(INFERENCE_CONFIG['DEVICE'])


Ensemble Inference and Submission Generation

In [None]:
# Start from an empty list so that re-running this cell does not add the same
# fold predictions to the ensemble a second time.
all_fold_probability_arrays = []

for fold_model_path in fold_model_paths:

    print(f'Loading model: {fold_model_path}')
    try:
        fold_state_dict = torch.load(fold_model_path, map_location=INFERENCE_CONFIG['DEVICE'])
    except FileNotFoundError:
        print(f'Warning: Model file {fold_model_path} not found. Skipping this fold.')
        continue

    # strict=False tolerates key mismatches; report them instead of hiding them,
    # because a mismatched checkpoint leaves part of the model unloaded.
    load_result = inference_model.load_state_dict(fold_state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(
            f'Warning: {fold_model_path} does not match the model exactly '
            f'(missing keys: {len(load_result.missing_keys)}, '
            f'unexpected keys: {len(load_result.unexpected_keys)}).'
        )

    inference_model.eval()
    fold_probability_batches = []

    # No gradients are needed for prediction; disabling them avoids building
    # an autograd graph for every batch.
    with torch.no_grad():
        for batch in tqdm(test_data_loader, desc=f'Predicting with {fold_model_path}'):

            batch_input_ids = batch['input_ids'].to(INFERENCE_CONFIG['DEVICE'])
            batch_attention_mask = batch['attention_mask'].to(INFERENCE_CONFIG['DEVICE'])
            batch_metric_embeddings = batch['metric_embedding'].to(INFERENCE_CONFIG['DEVICE'])
            logits = inference_model(batch_metric_embeddings, batch_input_ids, batch_attention_mask)
            batch_probabilities = torch.softmax(logits, dim=1)
            fold_probability_batches.append(batch_probabilities.cpu().numpy())
    all_fold_probability_arrays.append(np.concatenate(fold_probability_batches))

if not all_fold_probability_arrays:
    print('Error: No models were loaded for inference. Cannot create the file.')

else:
    print('Ensemble predictions complete.')
    # Average the class probabilities over the folds, then take the expected
    # score over the class values 0..10.
    averaged_class_probabilities = np.mean(all_fold_probability_arrays, axis=0)
    score_value_array = np.arange(0, 11)
    predicted_score_values = (averaged_class_probabilities * score_value_array).sum(axis=1)
    submission_dataframe = pd.DataFrame({'ID': test_dataframe['ID'], 'score': predicted_score_values})
    submission_dataframe['score'] = submission_dataframe['score'].clip(0, 10)
    submission_dataframe.to_csv('FINAL_RUN.csv', index=False)

    print('\n--- FINAL_RUN.csv created successfully! ---')
    print(submission_dataframe.head())
