# Starter Notebook

Install and import required libraries

In [1]:
!pip install datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install transformers --upgrade

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Down

In [2]:
import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import evaluate

2025-04-18 21:52:41.104245: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745013161.320016      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745013161.381253      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load Tokenizer and Preprocess Data

To clean the dataset, we filter out very short headlines (less than 10 tokens) and extremely long ones (more than 512 tokens), which might cause instability during training.

In [None]:
base_model = "roberta-base"
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)
dataset = dataset.filter(lambda x: 10 < len(x["text"].split()) <= 512)

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

num_labels = dataset.features['label'].num_classes
id2label = {i: label for i, label in enumerate(dataset.features['label'].names)}
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/119985 [00:00<?, ? examples/s]

We load a pre-trained RoBERTa model for sequence classification with 4 output classes.

Then, we apply LoRA (Low-Rank Adaptation) to freeze the model and inject lightweight trainable adapters into the attention mechanism:

We adapt only query, key, and value projection matrices.

r=6 controls the rank of the low-rank matrix.

alpha=16 scales the adapter output.

dropout=0.1 adds regularization.

This greatly reduces the number of trainable parameters, staying under the 1M limit.

In [None]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model, num_labels=num_labels, id2label=id2label
)

peft_config = LoraConfig(
    r=6, 
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value", "key"],  
    task_type="SEQ_CLS"
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 925,444 || all params: 125,574,152 || trainable%: 0.7370


##  Training Configuration
We split our tokenized dataset into training and validation sets, using 640 samples for validation.

We define training hyperparameters using TrainingArguments:

4 epochs

Batch sizes: 16 (train) and 64 (eval)

Cosine learning rate decay with warmup

Best model checkpoint is saved based on accuracy

We also define a simple compute_metrics function that calculates accuracy.

In [8]:
split = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split['train']
eval_dataset = split['test']


In [9]:
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, preds)}

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    save_total_limit=2,
    report_to="none"
)


trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


### Start Training

In [10]:
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,0.2698,0.252297,0.9125
2,0.2505,0.221683,0.923438
3,0.2268,0.209723,0.928125
4,0.2234,0.205723,0.926562




TrainOutput(global_step=14920, training_loss=0.2992374527550255, metrics={'train_runtime': 12850.4997, 'train_samples_per_second': 37.149, 'train_steps_per_second': 1.161, 'total_flos': 8.418353115112325e+16, 'train_loss': 0.2992374527550255, 'epoch': 4.0})

## Evaluate Finetuned Model


In [None]:
print("Final Evaluation:")
trainer.evaluate()

trainable = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {trainable}")


📊 Final Evaluation:




✅ Total Trainable Parameters: 925444


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [12]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [13]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [None]:
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

100%|██████████| 80/80 [00:11<00:00,  6.98it/s]

Evaluation Metric: {'accuracy': 0.928125}





### Run Inference on unlabelled dataset

In [None]:
unlabelled_dataset = pd.read_pickle("/kaggle/input/test-unlabelled-pkl/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy() 
})
df_output.to_csv("/kaggle/working/inference_output2.csv", index=False)
print("Inference complete. Predictions saved to inference_output.csv")

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

100%|██████████| 1000/1000 [01:51<00:00,  8.93it/s]

Inference complete. Predictions saved to inference_output.csv





#### Observations and Lessons
Here are some insights from our experiments:

r=6 and alpha=16 performed better than larger ranks which caused overfitting.

Filtering out very short or very long inputs improved stability.

The warmup ratio and cosine scheduler helped avoid early overfitting.

Applying LoRA to feedforward layers violated the parameter budget.

