In [1]:
import pandas as pd
import os
from pathlib import Path

import torch
from torch.nn import MSELoss
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset

from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from transformers import Trainer, TrainingArguments

### Load Data

In [2]:
local_dir = Path(os.getcwd())
kaggle_dir = Path('/kaggle/input')

notebook_dir = local_dir

In [3]:
train_df = pd.read_csv(notebook_dir / 'commonlit-evaluate-student-summaries' / 'summaries_train.csv')
train_prompts_df = pd.read_csv(notebook_dir / 'commonlit-evaluate-student-summaries' / 'prompts_train.csv')
train_df = train_df.merge(train_prompts_df, on='prompt_id', how='inner')

## Training Prediction Model (DeBERTa)

In [4]:
def split_data(df, prompt_title):
    """Split data into training and validation sets based on prompt_title"""
    training_df = df[df['prompt_title'] != prompt_title]
    validation_df = df[df['prompt_title'] == prompt_title]
    return training_df, validation_df

def prepare_data(df, tokenizer, batch_size, shuffle):
    """Prepare data into DataLoader for training and validation"""
    responses = df['text'].to_list()
    content = df['content'].to_list()
    wording = df['wording'].to_list()
    
    encodings = tokenizer(responses, truncation=True, padding=True, max_length=512, return_tensors='pt')
    content_tensor = torch.tensor(content).unsqueeze(-1).float()
    wording_tensor = torch.tensor(wording).unsqueeze(-1).float()
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], content_tensor, wording_tensor)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    
    return loader

def compute_metrics(eval_predictions):
    mse_loss_fn = MSELoss()
    logits, labels = eval_predictions
    content_logits, wording_logits = logits[:, 0], logits[:, 1]
    content_labels, wording_labels = labels[:, 0], labels[:, 1]
    
    content_mse = mse_loss_fn(content_logits, content_labels).item()
    wording_mse = mse_loss_fn(wording_logits, wording_labels).item()
    
    avg_mse = (content_mse + wording_mse) / 2
    
    return {'content_mse': content_mse, 'wording_mse': wording_mse, 'avg_mse': avg_mse}

def train_transformer(model_path, training_loader, validation_loader, device, lr=1e-5, weight_decay=0.01, epochs=25):
    """Train transformer model using HuggingFace's Trainer"""
    
    print(f'Loading model from {model_path}...')
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
    model.to(device)

    training_args = TrainingArguments(
        per_device_train_batch_size=training_loader.batch_size,
        per_device_eval_batch_size=validation_loader.batch_size,
        evaluation_strategy='epoch',
        learning_rate=lr,
        weight_decay=weight_decay,
        num_train_epochs=epochs,
        output_dir='./training_output',
        logging_dir='./training_logs',
        logging_steps=1,
        save_strategy='epoch',
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model='avg_mse',
        greater_is_better=False,
        push_to_hub=False,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=training_loader.dataset,
        eval_dataset=validation_loader.dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    metrics = trainer.evaluate()
    print(f'Training complete. Final validation MSE: {metrics["avg_mse"]}')

    return model

### Train RoBERTa Model

In [10]:
model_path = notebook_dir / 'deberta-grader'
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

if not os.path.exists(model_path):
    temp_model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base', num_labels=2)
    temp_model.save_pretrained(model_path)

deberta_model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
deberta_tozenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base', model_max_length=1024)

deberta_config = AutoConfig.from_pretrained(model_path)
deberta_config.max_position_embeddings = 1024
deberta_config.hidden_dropout_prob = 0.2
deberta_config.attention_probs_dropout_prob = 0.2

In [11]:
training_df, validation_df = split_data(train_df, 'The Third Wave')

training_loader = prepare_data(
    training_df, 
    deberta_tozenizer, 
    batch_size=8, 
    shuffle=True
)

validation_loader = prepare_data(
    validation_df, 
    deberta_tozenizer, 
    batch_size=8, 
    shuffle=False
)

In [12]:
deberta_model = train_transformer(model_path, training_loader, validation_loader, device, lr=1e-5, epochs=100)

Loading model from c:\Users\c.gendron1\Git\commonlit-evaluate-student-summaries-kaggle\deberta-grader...


  0%|          | 0/75800 [00:00<?, ?it/s]

TypeError: vars() argument must have __dict__ attribute