# Starter Notebook

Install and import required libraries

In [2]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3



In [3]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle




In [4]:
import torch

# 检查CUDA是否可用
if torch.cuda.is_available():
    print("CUDA已启用！")
    print("GPU数量:", torch.cuda.device_count())
    print("当前使用的设备:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("未检测到CUDA。")

CUDA已启用！
GPU数量: 1
当前使用的设备: NVIDIA GeForce RTX 4070 SUPER


## Load Tokenizer and Preprocess Data

In [5]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [6]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [7]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

In [8]:
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=1280, seed=42)
train_val_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

val_split = train_val_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = val_split['train']
validation_dataset = val_split['test']


print(f"number of train set: {len(train_dataset)}")
print(f"number of validation set: {len(validation_dataset)}")
print(f"number of test set: {len(eval_dataset)}")

number of train set: 106848
number of validation set: 11872
number of test set: 1280


In [9]:
# PEFT Config
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.05,
    bias = 'none',
    target_modules = ['query','value'],
    task_type="SEQ_CLS",
)

peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [10]:
#show LoRA trainable parameters
num_trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print("LoRA model parameter:", num_trainable_params)
assert num_trainable_params <= 1e6, f"warning: parameter exceeds 1e6. Now we have {num_trainable_params}"

LoRA model parameter: 741124


## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [11]:
#show LoRA trainable parameters percentage
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 741,124 || all params: 125,389,832 || trainable%: 0.5911


## Training Setup

In [12]:
!pip install scikit-learn



In [13]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [14]:
#Give dynamic progress of Trainer
from transformers import TrainerCallback

class EpochProgressCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        print(f"\n==> start No. {state.epoch + 1:.0f}  Epoch ...")
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"==> Complete No. {state.epoch:.0f} Epoch.\n")

In [15]:
from transformers import TrainerCallback

class StepEvaluationCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        # check accuracy at each evaluation.
        if metrics is not None and "eval_accuracy" in metrics:
            print(f"\n[Step {state.global_step}] Eval Accuracy: {metrics['eval_accuracy']:.4f}")

### Start Training

In [16]:
# Setup Training args
#.csv file output position
output_dir = "code/RoBERTa_AGNEWS/output"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=2400, #Each 2400 steps we evaluate once.
    save_strategy='steps',
    save_steps=2400, #Each 2400 steps we save once.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_steps=50,
    learning_rate=2e-5,
    num_train_epochs=3,
    # max_steps=1200,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch"
)

def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=validation_dataset,
          data_collator=data_collator,
          callbacks=[EpochProgressCallback(), StepEvaluationCallback()]
      )

In [17]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
result = peft_lora_finetuning_trainer.train()


==> start No. 1  Epoch ...


Step,Training Loss,Validation Loss,Accuracy
2400,0.2419,0.289838,0.908103
4800,0.2463,0.271236,0.912989
7200,0.2228,0.25733,0.916442
9600,0.2582,0.251427,0.918379
12000,0.2252,0.244414,0.920654
14400,0.1998,0.241629,0.921664
16800,0.2874,0.239734,0.920991
19200,0.272,0.240042,0.921917



[Step 2400] Eval Accuracy: 0.9081

[Step 4800] Eval Accuracy: 0.9130
==> Complete No. 1 Epoch.


==> start No. 2  Epoch ...

[Step 7200] Eval Accuracy: 0.9164

[Step 9600] Eval Accuracy: 0.9184

[Step 12000] Eval Accuracy: 0.9207
==> Complete No. 2 Epoch.


==> start No. 3  Epoch ...

[Step 14400] Eval Accuracy: 0.9217

[Step 16800] Eval Accuracy: 0.9210

[Step 19200] Eval Accuracy: 0.9219
==> Complete No. 3 Epoch.



## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [19]:
# def classify(model, tokenizer, text):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
#     output = model(**inputs)

#     prediction = output.logits.argmax(dim=-1).item()

#     print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
#     return id2label[prediction]

In [20]:
# classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
# classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [21]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Final Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [22]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 160/160 [00:06<00:00, 26.58it/s]

Final Evaluation Metric: {'accuracy': 0.91796875}





### Run Inference on unlabelled dataset

In [25]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("../../data/test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 8000
})

In [26]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"test_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:26<00:00, 37.22it/s]

Inference complete. Predictions saved to inference_output.csv



