In [1]:
!pip install loralib evaluate -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86

In [2]:
import torch
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import (
    RobertaModel, 
    RobertaTokenizer, 
    RobertaConfig,
    get_linear_schedule_with_warmup,
    get_scheduler,
    DataCollatorWithPadding,
)

from torch.optim import AdamW
from datasets import load_dataset
import loralib as lora
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm
from sklearn import metrics
import os
import random
import wandb
from torch.amp import autocast, GradScaler

import string
import re

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)


# Seed every RNG once, up front, before any data shuffling or weight initialisation.
set_seed(42)

# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

2025-04-18 21:09:56.520108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745010596.915563      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745010597.034714      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda


In [3]:
# Configuration
# Hyperparameters and run settings read by the cells below.
config = dict(
    # Checkpoint name passed to `from_pretrained`.
    model_name="roberta-large",

    # LoRA: rank and alpha passed to `lora.Linear`, plus one switch per
    # attention projection that may receive an adapter.
    lora_rank=10,
    lora_alpha=20,
    lora_query=True,
    lora_key=False,
    lora_value=True,
    lora_output=False,

    # NOTE(review): `max_length` and `max_steps` are not read by any cell
    # visible in this notebook.
    max_length=512,
    batch_size=16,
    learning_rate=2e-4,
    weight_decay=0.01,
    epochs=5,
    warmup_steps=500,
    max_steps=10,
    max_grad_norm=1.0,
    output_dir="./lora_roberta_ag_news",
    log_wandb=False,
)

# Initialize wandb if needed
if config["log_wandb"]:
    wandb.init(project="lora-roberta-ag-news", config=config)

In [4]:
class RobertaForSequenceClassification(nn.Module):
    """Sequence classifier: an encoder followed by a single linear layer.

    The final hidden state of the first token is fed to `classifier`, which
    produces `num_labels` logits per sequence.
    """

    def __init__(self, roberta_model, num_labels, id2label=None):
        """
        Args:
            roberta_model: Encoder module. It must expose `config.hidden_size` and,
                when called, return an object with a `last_hidden_state` attribute.
            num_labels: Number of output classes.
            id2label: Optional mapping from class index to label name. When omitted,
                names of the form "LABEL_<i>" are generated.
        """
        super().__init__()
        self.roberta = roberta_model
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

        # Fall back to generic label names when the caller supplies no mapping.
        self.id2label = (
            id2label
            if id2label is not None
            else {i: f"LABEL_{i}" for i in range(num_labels)}
        )

        # Inverse lookup: label name -> class index.
        self.label2id = {name: index for index, name in self.id2label.items()}

        # Mirror both mappings onto the encoder's config when it has one.
        if hasattr(self.roberta, 'config'):
            encoder_config = self.roberta.config
            encoder_config.id2label = self.id2label
            encoder_config.label2id = self.label2id

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Return `{"logits": ...}`, plus a cross-entropy `"loss"` when `labels` is given."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0, :]  # CLS token
        logits = self.classifier(first_token_state)

        if labels is None:
            return {"logits": logits}

        num_classes = self.classifier.out_features
        loss = CrossEntropyLoss()(logits.view(-1, num_classes), labels.view(-1))
        return {"loss": loss, "logits": logits}

In [5]:
# Take the tokenizer checkpoint from config["model_name"] so it always matches the
# encoder that is loaded from the same setting later on (it used to be hard-coded
# to 'roberta-base' while the encoder was 'roberta-large').
base_model = config["model_name"]

dataset = load_dataset('ag_news', split='train')

tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Clean a batch of raw texts and tokenize them.

    Cleaning steps, in order:
      1. remove HTML tags (`<...>`),
      2. remove URLs (`http` followed by non-whitespace),
      3. remove every `string.punctuation` character except ``.,'-$%?!``,
      4. collapse whitespace runs to a single space and strip the ends.

    Tags and URLs must be removed *before* punctuation: once `<` and `>` have been
    stripped the tag pattern can no longer match and tag names leak into the text.

    Args:
        examples: Batch mapping whose "text" entry is an iterable of strings.

    Returns:
        The output of the module-level `tokenizer` called on the cleaned texts
        with `truncation=True`.
    """
    # Punctuation to keep
    punctuation_to_keep = ".,'-$%?!"
    unwanted_punctuation = ''.join(
        char for char in string.punctuation if char not in punctuation_to_keep
    )
    # Deletion table built once per batch instead of filtering character by character.
    strip_table = str.maketrans('', '', unwanted_punctuation)

    processed_texts = []
    for text in examples["text"]:
        # Remove HTML tags and URLs while their delimiters are still present.
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        # Remove unwanted punctuation
        text = text.translate(strip_table)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        processed_texts.append(text)

    # Tokenize the processed texts
    tokenized = tokenizer(processed_texts, truncation=True)
    return tokenized

# Clean + tokenize in batches, dropping the raw "text" column, then rename
# "label" to "labels" (the keyword the model's forward() accepts).
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=["text"],
).rename_column("label", "labels")

# Hold out 640 examples for evaluation; the seed is fixed for the split.
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']

label_feature = train_dataset.features['labels']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Class index -> class name; used when the classifier is built.
id2label = dict(enumerate(class_names))

# Collator that pads the examples of a batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors="pt")

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
def compute_metrics(preds, labels):
    """Compute accuracy and weighted precision / recall / F1.

    `zero_division=0` is passed to all three weighted scores (it used to be passed
    to F1 only), so precision and recall are handled the same way as F1 when a
    class has no predicted or no true samples.

    Args:
        preds: Predicted class ids.
        labels: Ground-truth class ids, aligned with `preds`.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    weighted = {"average": "weighted", "zero_division": 0}

    precision = metrics.precision_score(labels, preds, **weighted)
    recall = metrics.recall_score(labels, preds, **weighted)
    f1 = metrics.f1_score(labels, preds, **weighted)
    accuracy = metrics.accuracy_score(labels, preds)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [7]:
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained(config["model_name"])

# Load the pre-trained RoBERTa model
base_model = RobertaModel.from_pretrained(config["model_name"])


def _to_lora_linear(original):
    """Build a `lora.Linear` with the shape of `original` and copy its weights.

    Rank and alpha come from `config`, so every adapted projection uses the same
    LoRA settings (the key / output branches used to omit `lora_alpha` and would
    have fallen back to the library default).
    """
    has_bias = original.bias is not None
    replacement = lora.Linear(
        original.in_features,
        original.out_features,
        r=config["lora_rank"],
        lora_alpha=config["lora_alpha"],
        bias=has_bias,
    )
    # Copy the pre-trained weights
    replacement.weight.data.copy_(original.weight.data)
    if has_bias:
        replacement.bias.data.copy_(original.bias.data)
    return replacement


# Swap the selected attention projections of every encoder layer for LoRA layers.
for layer in base_model.encoder.layer:
    self_attention = layer.attention.self

    if config["lora_query"]:
        self_attention.query = _to_lora_linear(self_attention.query)

    if config["lora_key"]:
        self_attention.key = _to_lora_linear(self_attention.key)

    if config["lora_value"]:
        self_attention.value = _to_lora_linear(self_attention.value)

    if config["lora_output"]:
        layer.attention.output.dense = _to_lora_linear(layer.attention.output.dense)

# Mark only LoRA parameters as trainable.
# This is called on the encoder before the classification head exists, so the head
# created below keeps its default requires_grad=True.
lora.mark_only_lora_as_trainable(base_model)

# Create the classification model; the class count comes from the dataset
# metadata (`num_labels`) instead of a hard-coded 4.
model = RobertaForSequenceClassification(base_model, id2label=id2label, num_labels=num_labels)

# Move model to device
model.to(device)
model = nn.DataParallel(model)

# Count trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 987140


In [8]:
# Create PyTorch datasets
train_dataset = train_dataset.with_format("torch")
eval_dataset = eval_dataset.with_format("torch")

# Options shared by the training and evaluation loaders.
loader_options = dict(
    batch_size=config["batch_size"],
    pin_memory=True,
    num_workers=2,
    persistent_workers=True,
    collate_fn=data_collator,
)

# Create data loaders: random order for training, sequential order for evaluation.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, **loader_options)

eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, **loader_options)

In [9]:
# Optimizer and scheduler
# Only parameters that still require gradients are handed to the optimizer.
params_to_optimize = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(
    params_to_optimize,
    lr=config["learning_rate"],
    weight_decay=config["weight_decay"],
)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * config["epochs"]
print(total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config["warmup_steps"],
    num_training_steps=total_steps,
)

# Gradient scaler used together with autocast in the training loop.
scaler = GradScaler("cuda")

37300


In [10]:
# Training loop
# Each epoch: one mixed-precision pass over the training data, then a pass over the
# held-out split to report loss and metrics.
print("Starting training...")
for epoch in range(config["epochs"]):
    model.train()
    total_train_loss = 0
    train_pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Train]")

    for batch in train_pbar:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad(set_to_none=True)

        with autocast("cuda"):
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"]
            )

        loss = outputs["loss"]
        # The model is wrapped in nn.DataParallel, so the loss may come back with
        # more than one element; reduce it to a scalar before backward().
        if isinstance(loss, torch.Tensor) and loss.numel() > 1:
            loss = loss.mean()

        scaler.scale(loss).backward()

        # Unscale first so that clipping is applied to the real gradient values.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config["max_grad_norm"])

        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and then
        # lowers its scale in update(). Only advance the LR schedule when the
        # optimizer actually stepped, i.e. when the scale did not decrease.
        scale_before_step = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before_step:
            scheduler.step()

        # Read the loss back once and reuse it for the running total and the bar.
        loss_value = loss.item()
        total_train_loss += loss_value
        train_pbar.set_postfix({"loss": loss_value})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    total_eval_loss = 0

    eval_pbar = tqdm(eval_dataloader, desc=f"Epoch {epoch+1}/{config['epochs']} [Eval]")

    with torch.no_grad():
        for batch in eval_pbar:
            batch = {k: v.to(device) for k, v in batch.items()}

            with autocast("cuda"):
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"]
                )

            loss = outputs["loss"]
            if isinstance(loss, torch.Tensor) and loss.numel() > 1:
                loss = loss.mean()
            total_eval_loss += loss.item()

            logits = outputs["logits"]
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    # Mean of the per-batch losses (not weighted by batch size).
    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    metrics_results = compute_metrics(all_preds, all_labels)

    print(f"Validation Loss: {avg_eval_loss}")
    print(f"Accuracy: {metrics_results['accuracy']:.4f}")

    if config["log_wandb"]:
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss,
            **metrics_results
        })

Starting training...


Epoch 1/5 [Train]:   0%|          | 0/7460 [00:00<?, ?it/s]



Average training loss: 0.2502724840907485


Epoch 1/5 [Eval]:   0%|          | 0/40 [00:00<?, ?it/s]

Validation Loss: 0.22240130845457315
Accuracy: 0.9391


Epoch 2/5 [Train]:   0%|          | 0/7460 [00:00<?, ?it/s]



Average training loss: 0.17260967170864583


Epoch 2/5 [Eval]:   0%|          | 0/40 [00:00<?, ?it/s]

Validation Loss: 0.21033989768475295
Accuracy: 0.9422


Epoch 3/5 [Train]:   0%|          | 0/7460 [00:00<?, ?it/s]



Average training loss: 0.1512277090157964


Epoch 3/5 [Eval]:   0%|          | 0/40 [00:00<?, ?it/s]

Validation Loss: 0.18667175639420747
Accuracy: 0.9437


Epoch 4/5 [Train]:   0%|          | 0/7460 [00:00<?, ?it/s]



Average training loss: 0.13340217681856323


Epoch 4/5 [Eval]:   0%|          | 0/40 [00:00<?, ?it/s]

Validation Loss: 0.19593724198639392
Accuracy: 0.9453


Epoch 5/5 [Train]:   0%|          | 0/7460 [00:00<?, ?it/s]



Average training loss: 0.11749999355075347


Epoch 5/5 [Eval]:   0%|          | 0/40 [00:00<?, ?it/s]

Validation Loss: 0.18928632829338313
Accuracy: 0.9406


In [11]:
# Save the fine-tuned model
# exist_ok=True replaces the separate exists() check and is safe to re-run.
os.makedirs(config["output_dir"], exist_ok=True)

# nn.DataParallel keeps the real model under `.module`, and its state-dict keys
# carry a "module." prefix. Save from the underlying model so the LoRA keys use
# the same (unprefixed) naming as the classifier weights saved below and can be
# loaded into a model that is not wrapped.
unwrapped_model = model.module if isinstance(model, nn.DataParallel) else model

# Save LoRA weights separately
torch.save(lora.lora_state_dict(unwrapped_model), os.path.join(config["output_dir"], "lora_weights.pt"))

# Save the classifier
torch.save(unwrapped_model.classifier.state_dict(), os.path.join(config["output_dir"], "classifier.pt"))
print(f"Model saved to {config['output_dir']}")

# Final evaluation
# Reuses the predictions collected during the last epoch's evaluation pass; no
# additional inference is run here.
final_metrics = compute_metrics(all_preds, all_labels)
print("\nFinal Evaluation Results:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1']:.4f}")

if config["log_wandb"]:
    wandb.finish()

Model saved to ./lora_roberta_ag_news

Final Evaluation Results:
Accuracy: 0.9406
Precision: 0.9409
Recall: 0.9406
F1 Score: 0.9406


# Testing

In [12]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None, num_workers=2):
    """
    Run inference over a dataset and optionally score the predictions.

    Args:
        inference_model: The model to evaluate. Called as `inference_model(**batch)`;
                         its output must provide `["logits"]`.
        dataset: The dataset to run inference on.
        labelled (bool): If True, every batch must contain a "labels" entry and
                         accuracy is computed. If False, only predictions are returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.
        num_workers (int): Number of DataLoader worker processes. Persistent workers
                           are requested only when this is greater than zero.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=num_workers > 0,
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    # The metric was previously never created, so labelled=True raised NameError.
    metric = evaluate.load("accuracy") if labelled else None
    all_predictions = []

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            with autocast("cuda"):
                outputs = inference_model(**batch)
        predictions = outputs["logits"].argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        return metric.compute(), all_predictions
    return all_predictions

In [13]:
import pandas as pd

# NOTE(security): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code; only load pickles that come from a trusted source.
# The loaded object is used through `.map(..., batched=..., remove_columns=...)`,
# so it is presumably a Hugging Face `datasets.Dataset` with a "text" column;
# TODO confirm.
unlabelled_dataset = pd.read_pickle("/kaggle/input/deep-learning-spring-2025-project-2/test_unlabelled.pkl")
# Apply the same `preprocess` function that was used for the training data and
# drop the raw "text" column.
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [14]:
# Run inference and save predictions
# The dataset is unlabelled, so evaluate_model returns only the predicted class ids.
preds = evaluate_model(
    model,
    test_dataset,
    labelled=False,
    batch_size=1,
    data_collator=data_collator,
)

# One row per test example: running index as ID, predicted class id as Label.
predicted_labels = preds.numpy()
df_output = pd.DataFrame({'ID': range(len(preds)), 'Label': predicted_labels})

df_output.to_csv("/kaggle/working/inference_output.csv", index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 8000/8000 [05:08<00:00, 25.93it/s]


Inference complete. Predictions saved to inference_output.csv
