In [33]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import torch
from torch import nn
from transformers import get_cosine_schedule_with_warmup
import time
from torch.optim import AdamW

# Config

In [23]:
learning_rate = 5e-6
n_epochs = 2
gradient_acc = 8
batch_size = 4
max_len = 512
seed = 8824
weight_decay = 1e-4
warmup_rate = 0.1
overall_weight = 0.75
rdrop_weight = 0.1

model_save_path = "saved_models/best_mmregressor.pth"

In [24]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
device = torch.device("cpu")

# Dataloader

In [25]:
input_ids, attention_masks, labels = [], [], []

# Load and tokenize the dataset
for idx, row in pd.read_csv("data/full_dataset.csv").iterrows():
    text1 = str(row['text1']) if pd.notna(row['text1']) else ""
    text2 = str(row['text2']) if pd.notna(row['text2']) else ""

    encode_dict = tokenizer(
        text1,
        text2,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        add_special_tokens=True
    )

    input_ids.append(encode_dict['input_ids'])
    attention_masks.append(encode_dict['attention_mask'])
    labels.append([
        float(x) for x in [
            row['geography'], row['entities'], row['time'],
            row['narrative'], row['overall'], row['style'], row['tone']
        ]
    ])

# Sanity check
print("Input IDs:", input_ids[0])
print("Attention Mask:", attention_masks[0])
print("Labels:", labels[0])

# Split the data before converting to tensors
train_inputs, test_inputs, train_masks, test_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=seed
)

# Convert to tensors
train_inputs = torch.tensor(train_inputs)
test_inputs = torch.tensor(test_inputs)
train_masks = torch.tensor(train_masks)
test_masks = torch.tensor(test_masks)
train_labels = torch.tensor(train_labels, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)

# Create Datasets
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)


Input IDs: [0, 162607, 85717, 15639, 48802, 4, 601, 5, 28563, 5, 292, 62, 92610, 297, 104, 79099, 33, 27941, 509, 34784, 297, 23, 10, 2356, 42552, 26, 7, 10013, 11192, 7514, 150631, 450, 152388, 15700, 2926, 1419, 23, 10542, 118623, 4, 35206, 2804, 5, 581, 27998, 74918, 2822, 39395, 64227, 33233, 127067, 99091, 5337, 61340, 16503, 23, 2076, 350, 4293, 47064, 4, 581, 34419, 113771, 5, 1311, 1459, 3316, 112, 335, 104158, 634, 1486, 4, 2789, 4, 111, 17686, 104588, 4, 118623, 4, 509, 25534, 71, 678, 14614, 568, 678, 47219, 4, 18738, 13, 214, 14614, 568, 4, 136, 8035, 10, 63323, 3674, 3445, 23, 139355, 111, 10, 11476, 56859, 136, 10, 116131, 4935, 1295, 87338, 4, 2076, 350, 4293, 47064, 53257, 32920, 17065, 1814, 161644, 2804, 5, 161644, 2804, 10, 8, 7077, 53, 32603, 1916, 11782, 7, 1672, 10, 6, 115656, 9393, 27941, 81887, 297, 47, 3249, 10, 83629, 7279, 4, 1284, 70, 92610, 26, 7, 80939, 13986, 71, 1257, 136, 5962, 10, 2258, 4, 3129, 7068, 91, 173964, 10, 56050, 68823, 5, 581, 27941, 111, 7

# Model

In [26]:
class MMRegressor(nn.Module):

    def __init__(self):

        super(MMRegressor, self).__init__()
        self.config = AutoConfig.from_pretrained("FacebookAI/xlm-roberta-base")
        self.reg_model = AutoModel.from_pretrained("FacebookAI/xlm-roberta-base")

        self.fc1 = nn.Linear(self.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, 7)
        self.activation = nn.GELU()

    def forward(self, input_ids, attention_mask):

        output = self.reg_model(input_ids, attention_mask)[1]
        logits = self.fc2(self.activation(self.fc1(output)))

        return logits

# Train & Evaluate

In [30]:
def train_and_evaluate(model, train_loader, val_loader, device):
    model.to(device)
    model.train()

    criterion = nn.MSELoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    total_steps = len(train_loader) * n_epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_rate * total_steps),
        num_training_steps=total_steps
    )

    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = 0.0

        print(f"—————————————————————— Epoch {epoch+1} ——————————————————————")

        for step, batch in enumerate(train_loader, 1):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(input_ids, attention_mask)  # Assume model returns [batch_size, 7]
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            # Print progress every 10 batches
            if step % 10 == 0 or step == len(train_loader):
                print(f"Epoch {epoch+1} [{step}/{len(train_loader)}] "
                      f"Loss: {loss.item():.4f} "
                      f"Progress: {100 * step / len(train_loader):.1f}%")

        avg_train_loss = train_loss / len(train_loader)
        elapsed = time.time() - start_time
        minutes = int(elapsed // 60)
        seconds = int(elapsed % 60)
        print(f"[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f} | Time: {minutes}m {seconds}s")

        # Evaluate
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [x.to(device) for x in batch]
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                all_preds.extend(outputs.cpu().numpy())
                all_targets.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        mae = mean_absolute_error(all_targets, all_preds)
        r2 = r2_score(all_targets, all_preds)

        print(f"[Epoch {epoch+1}] Val Loss (MSE): {avg_val_loss:.4f}")
        print(f"[Epoch {epoch+1}] Val MAE: {mae:.4f}")
        print(f"[Epoch {epoch+1}] Val R²: {r2:.4f}")
        model.train()

    # Save final model
    torch.save(model.state_dict(), model_save_path)


In [31]:
model = MMRegressor()
train_and_evaluate(model, train_loader, test_loader, device)

—————————————————————— Epoch 1 ——————————————————————
Epoch 1 [10/547] Loss: 6.3783 Progress: 1.8%
Epoch 1 [20/547] Loss: 4.0789 Progress: 3.7%
Epoch 1 [30/547] Loss: 8.6415 Progress: 5.5%
Epoch 1 [40/547] Loss: 6.9946 Progress: 7.3%
Epoch 1 [50/547] Loss: 7.5339 Progress: 9.1%
Epoch 1 [60/547] Loss: 6.6879 Progress: 11.0%
Epoch 1 [70/547] Loss: 5.3838 Progress: 12.8%
Epoch 1 [80/547] Loss: 6.1982 Progress: 14.6%
Epoch 1 [90/547] Loss: 6.5797 Progress: 16.5%
Epoch 1 [100/547] Loss: 1.6912 Progress: 18.3%
Epoch 1 [110/547] Loss: 7.8363 Progress: 20.1%
Epoch 1 [120/547] Loss: 5.9835 Progress: 21.9%
Epoch 1 [130/547] Loss: 2.8989 Progress: 23.8%
Epoch 1 [140/547] Loss: 2.0771 Progress: 25.6%
Epoch 1 [150/547] Loss: 3.8994 Progress: 27.4%
Epoch 1 [160/547] Loss: 2.1883 Progress: 29.3%
Epoch 1 [170/547] Loss: 2.6699 Progress: 31.1%
Epoch 1 [180/547] Loss: 3.1015 Progress: 32.9%
Epoch 1 [190/547] Loss: 3.6091 Progress: 34.7%
Epoch 1 [200/547] Loss: 3.6366 Progress: 36.6%
Epoch 1 [210/547] Lo

RuntimeError: Parent directory saved_models does not exist.