In [6]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os
from pathlib import Path

import textstat
import nltk
from collections import Counter
from language_tool_python import LanguageTool

import spacy
import torch
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor

### Load Data

In [7]:
local_dir = Path(os.getcwd())
kaggle_dir = Path('/kaggle/input')

notebook_dir = local_dir

In [8]:
train_df = pd.read_csv(notebook_dir / 'commonlit-evaluate-student-summaries' / 'summaries_train.csv')
train_prompts_df = pd.read_csv(notebook_dir / 'commonlit-evaluate-student-summaries' / 'prompts_train.csv')

## Training Conent Model (RoBERTa)

In [9]:
def summarize_text(prompt, text, model, tokenizer, max_length=128):
    """Summarize long text with prompt using T5 model"""
    input_text = 'summarize: ' + prompt + text
    input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)
    summary_ids = model.generate(input_ids, max_length=max_length, length_penalty=5.0, num_beams=5, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def split_data(df, prompt_title):
    """Split data into training and validation sets based on prompt_title"""
    training_df = df[df['prompt_title'] != prompt_title]
    validation_df = df[train_df['prompt_title'] == prompt_title]
    return training_df, validation_df

def prepare_data(df, tokenizer, batch_size, shuffle, target):
    """Prepare data into DataLoader for training and validation"""
    prompts = df['prompt_text_summary'].to_list()
    responses = df['text'].to_list()
    scores = df[target].to_list()
    
    encodings = tokenizer(prompts, responses, truncation=True, padding=True, max_length=512, return_tensors='pt')
    scores_tensor = torch.tensor(scores).unsqueeze(-1).float()
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], scores_tensor)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    
    return loader

def compute_model_loss(model, data_loader, device):
    """Compute MSE loss of model on data_loader"""
    model.eval()  # set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    
    mse = total_loss / len(data_loader)
    
    model.train()  # set the model back to training mode
    return mse

def train_roberta(model_path, training_loader, validation_loader, device, lr=3e-5, epochs=100):
    """Train RoBERTa model with automatic saving of best model and early stopping"""
    print(f'Loading RoBERTa model from {model_path}...')
    model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=1)
    model.to(device)
    model.train()
    optimizer = AdamW(model.parameters(), lr=lr)

    initial_val_loss = compute_model_loss(model, validation_loader, device)

    print('Initial validation loss:', initial_val_loss, '\n')
    print('Training model...')
    
    epochs_without_improvement = 0
    no_improvement_since_reset = 0

    for epoch in range(epochs):
        total_loss = 0
        model.train()
        
        for batch in training_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        train_loss = total_loss / len(training_loader)
        val_loss = compute_model_loss(model, validation_loader, device)
        
        print(f'Epoch {epoch + 1}/{epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f}')
        
        if val_loss < initial_val_loss:
            epochs_without_improvement = 0
            no_improvement_since_reset = 0
            initial_val_loss = val_loss
            print('Improvement in validation loss. Saving model.')
            model.save_pretrained(model_path)
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement == 5:
            no_improvement_since_reset += 1

            if no_improvement_since_reset == 2:
                print('No more improvement in validation loss. Stopping training.')
                break
            else:
                print('No improvement in validation loss for 5 epochs.')
                print('Resetting model to last saved state.')
                no_improvement_since_reset += 1
                model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=1)
                model.to(device)

    print('Training complete. Final validation MSE:', initial_val_loss, '\n')
    return model
    

### T5 for Prompt Summarization

In [10]:
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length = 512)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
train_prompts_df['prompt_text_summary'] = np.zeros(len(train_prompts_df), dtype=object)
for i, row in train_prompts_df.iterrows():
    prompt_summary = summarize_text(row['prompt_question'], row['prompt_text'], t5_model, t5_tokenizer)
    train_prompts_df.loc[i, 'prompt_text_summary'] = prompt_summary

### Train RoBERTa Model

In [None]:
model_path = notebook_dir / 'roberta_content_scorer'
device = torch.device('mps') if torch.mps.is_available() else torch.device('cpu')

roberta_content_model = RobertaForSequenceClassification.from_pretrained(model_path, num_labels=1)
roberta_tozenizer = RobertaTokenizer.from_pretrained('roberta-base', model_max_length = 512)

In [None]:
train_df = train_df.merge(train_prompts_df, on='prompt_id', how='inner')

In [None]:
training_df, validation_df = split_data(train_df, 'On Tragedy')

training_loader = prepare_data(
    training_df, 
    roberta_tozenizer, 
    batch_size=8, 
    shuffle=True, 
    target='content'
)

validation_loader = prepare_data(
    validation_df, 
    roberta_tozenizer, 
    batch_size=8, 
    shuffle=False, 
    target='content'
)

In [None]:
roberta_content_model = train_roberta(model_path, training_loader, validation_loader, device, lr=3e-5, epochs=100)