# Starter Notebook

Install and import required libraries

In [21]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3



In [22]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

## Load Tokenizer and Preprocess Data

In [23]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [24]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [25]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

In [26]:
#add dropout modification
model = RobertaForSequenceClassification.from_pretrained(base_model, id2label=id2label)
model.config.attention_probs_dropout_prob = 0.1
model.config.hidden_dropout_prob = 0.1

# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=1280, seed=42)
train_val_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

val_split = train_val_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = val_split['train']
validation_dataset = val_split['test']


print(f"number of train set: {len(train_dataset)}")
print(f"number of validation set: {len(validation_dataset)}")
print(f"number of test set: {len(eval_dataset)}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


number of train set: 106848
number of validation set: 11872
number of test set: 1280


In [27]:
# PEFT Config
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias = 'none',
    target_modules = ['query','value','key','out'],
    task_type="SEQ_CLS",
)

peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [28]:
#show LoRA trainable parameters
num_trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print("LoRA model parameter:", num_trainable_params)
assert num_trainable_params <= 1e6, f"warning: parameter exceeds 1e6. Now we have {num_trainable_params}"

LoRA model parameter: 814852


## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [29]:
#show LoRA trainable parameters percentage
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 814,852 || all params: 125,463,560 || trainable%: 0.6495


## Training Setup

In [30]:
!pip install scikit-learn



In [31]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [32]:
#Give dynamic progress of Trainer
from transformers import TrainerCallback

class EpochProgressCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        print(f"\n==> start No. {state.epoch + 1:.0f}  Epoch ...")
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"==> Complete No. {state.epoch:.0f} Epoch.\n")

In [33]:
from transformers import TrainerCallback

class StepEvaluationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # check accuracy at each evaluation.
        if metrics is not None and "eval_accuracy" in metrics:
            print(f"\n[Step {state.global_step}] Eval Accuracy: {metrics['eval_accuracy']:.4f}")

### Start Training

In [34]:
# Setup Training args
#.csv file output position
output_dir = "output"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=400, #Each 2400 steps we evaluate once.
    save_strategy='steps',
    save_steps=400, #Each 2400 steps we save once.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_steps=50,
    learning_rate=1e-5,
    num_train_epochs=5,
    # max_steps=1200,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0
)

def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=validation_dataset,
          data_collator=data_collator,
          callbacks=[EpochProgressCallback(), StepEvaluationCallback()]
      )

In [35]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [36]:
result = peft_lora_finetuning_trainer.train()


==> start No. 1  Epoch ...


Step,Training Loss,Validation Loss,Accuracy
400,1.2348,1.164917,0.872389
800,0.3719,0.343956,0.890162
1200,0.3018,0.3191,0.894794
1600,0.3271,0.310068,0.900017
2000,0.2997,0.302057,0.899933
2400,0.2514,0.301696,0.902628
2800,0.2638,0.304193,0.902123
3200,0.3031,0.288918,0.905323
3600,0.29,0.290628,0.906503
4000,0.3007,0.287978,0.906334



[Step 400] Eval Accuracy: 0.8724

[Step 800] Eval Accuracy: 0.8902

[Step 1200] Eval Accuracy: 0.8948

[Step 1600] Eval Accuracy: 0.9000

[Step 2000] Eval Accuracy: 0.8999

[Step 2400] Eval Accuracy: 0.9026

[Step 2800] Eval Accuracy: 0.9021

[Step 3200] Eval Accuracy: 0.9053

[Step 3600] Eval Accuracy: 0.9065

[Step 4000] Eval Accuracy: 0.9063

[Step 4400] Eval Accuracy: 0.9067

[Step 4800] Eval Accuracy: 0.9083

[Step 5200] Eval Accuracy: 0.9087

[Step 5600] Eval Accuracy: 0.9097

[Step 6000] Eval Accuracy: 0.9092

[Step 6400] Eval Accuracy: 0.9115
==> Complete No. 1 Epoch.


==> start No. 2  Epoch ...

[Step 6800] Eval Accuracy: 0.9101

[Step 7200] Eval Accuracy: 0.9105

[Step 7600] Eval Accuracy: 0.9116

[Step 8000] Eval Accuracy: 0.9131

[Step 8400] Eval Accuracy: 0.9138

[Step 8800] Eval Accuracy: 0.9132

[Step 9200] Eval Accuracy: 0.9143

[Step 9600] Eval Accuracy: 0.9132

[Step 10000] Eval Accuracy: 0.9137

[Step 10400] Eval Accuracy: 0.9150

[Step 10800] Eval Accuracy: 0.9145

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [37]:
# def classify(model, tokenizer, text):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
#     output = model(**inputs)

#     prediction = output.logits.argmax(dim=-1).item()

#     print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
#     return id2label[prediction]

In [38]:
# classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [39]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Final Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [40]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 160/160 [00:05<00:00, 27.61it/s]

Final Evaluation Metric: {'accuracy': 0.91640625}





### Run Inference on unlabelled dataset

In [41]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("../../data/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [42]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"test_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:26<00:00, 37.61it/s]


Inference complete. Predictions saved to inference_output.csv
