# Starter Notebook

Install and import required libraries

In [None]:
%pip install transformers datasets evaluate accelerate peft trl bitsandbytes
%pip install nvidia-ml-py3

In [None]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

## Load Tokenizer and Preprocess Data

In [None]:
base_model = 'roberta-base'

# Training split of AG News and the tokenizer that matches the base checkpoint.
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    """Tokenize a batch of examples taken from a dataset with a ``text`` column.

    Sequences are truncated but deliberately NOT padded here: padding inside a
    batched ``map`` pads every row to the longest row of the whole map batch,
    which inflates the stored dataset and every training step. Padding is left
    to the ``DataCollatorWithPadding`` that every consumer in this notebook
    passes alongside the dataset.

    Args:
        examples: Mapping with a ``text`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(examples['text'], truncation=True)

# Replace the raw text by token ids and use the column name the Trainer expects.
tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [None]:
# Read the number of classes and their names from the dataset schema
label_feature = dataset.features['label']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map each class id to its name; the classifier config needs this.
id2label = dict(enumerate(class_names))

# Collator used to batch tokenized rows into PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


## Load Pre-trained Model
Set up the configuration for the pre-trained model and download it from Hugging Face.

In [None]:
# Pass both directions of the label mapping so the saved config is
# self-consistent; with only id2label the config keeps a label2id that does
# not match these labels.
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)
model

## Anything from here on can be modified

In [None]:
# Carve a fixed 640-example evaluation set out of the training data;
# the seed keeps the split reproducible across runs.
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset, eval_dataset = split_datasets['train'], split_datasets['test']

## Data Cleaning and Augmentation

In [1]:

from datasets import load_dataset
import re
import nlpaug.augmenter.word as naw
from transformers import MarianMTModel, MarianTokenizer

def clean_and_augment(dataset, augment_size=500):
    """Clean a text-classification dataset and enlarge it with augmented copies.

    Steps:
      1. Drop exact duplicates (compared after stripping whitespace) and texts
         that are too short, too long, or mostly non-ASCII.
      2. Append one WordNet synonym-replaced copy of every remaining example.
      3. Append English -> German -> English back-translations of the first
         ``augment_size`` examples, keeping only those whose text changed.

    Args:
        dataset: ``datasets.Dataset`` with a string ``text`` column and a
            ``label`` column.
        augment_size: Maximum number of examples to back-translate. Values
            <= 0 skip back-translation (and the translation-model download).

    Returns:
        A new ``datasets.Dataset`` holding the cleaned originals followed by
        the augmented examples, with the same features as the cleaned input.
    """
    # Local imports keep this cell runnable on its own.
    import torch
    from datasets import Dataset, concatenate_datasets

    def is_valid_text(text, min_len=5, max_len=512):
        word_count = len(text.split())
        if word_count < min_len or word_count > max_len:
            return False
        # Count non-ASCII *characters*; counting regex runs (as before) compared
        # a number of runs against a number of characters and almost never fired.
        non_ascii_chars = sum(1 for ch in text if ord(ch) > 0x7F)
        return non_ascii_chars <= 0.3 * len(text)

    seen_texts = set()

    def filter_dataset(example):
        text = example['text'].strip()
        if text in seen_texts or not is_valid_text(text):
            return False
        seen_texts.add(text)
        return True

    dataset = dataset.filter(filter_dataset)
    features = dataset.features

    syn_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.1)

    def synonym_augment(example):
        augmented = syn_aug.augment(example['text'])
        # Newer nlpaug releases return a list even for a single input string;
        # unwrap it so the column stays a plain string column.
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else example['text']
        return {'text': augmented, 'label': example['label']}

    # Passing the features keeps the schema (e.g. a ClassLabel label) identical,
    # which concatenate_datasets requires.
    eda_augmented_dataset = dataset.map(synonym_augment, features=features)
    # Dataset has no .concatenate method; concatenation is a module-level helper.
    dataset = concatenate_datasets([dataset, eda_augmented_dataset])

    num_to_translate = max(0, min(len(dataset), augment_size))
    if num_to_translate == 0:
        return dataset

    # Run on the GPU when there is one instead of hard-requiring CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    en_de_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
    de_en_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-de-en')
    en_de_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de').to(device).eval()
    de_en_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-de-en').to(device).eval()

    def back_translate(text):
        inputs = en_de_tokenizer(text, return_tensors='pt', truncation=True).to(device)
        translated = en_de_model.generate(**inputs, max_length=512)
        de_text = en_de_tokenizer.decode(translated[0], skip_special_tokens=True)

        inputs_back = de_en_tokenizer(de_text, return_tensors='pt', truncation=True).to(device)
        translated_back = de_en_model.generate(**inputs_back, max_length=512)
        return de_en_tokenizer.decode(translated_back[0], skip_special_tokens=True)

    # The first rows of the concatenated dataset are the cleaned originals.
    augmented_rows = []
    for example in dataset.select(range(num_to_translate)):
        augmented_text = back_translate(example['text'])
        if augmented_text != example['text']:
            augmented_rows.append({**example, 'text': augmented_text})

    if augmented_rows:
        # Dataset has no .add_items method; build a dataset with the same
        # schema and concatenate it instead.
        columns = {name: [row[name] for row in augmented_rows] for name in dataset.column_names}
        back_translated = Dataset.from_dict(columns, features=dataset.features)
        dataset = concatenate_datasets([dataset, back_translated])

    return dataset

# Augment ONLY the training portion, then tokenize it so the augmented data is
# what the trainer actually sees. Previously the augmented result was stored in
# a re-bound `dataset` (a DatasetDict) that no later cell tokenized or trained on.
# The split uses the same size and seed as the earlier tokenized split, and the
# evaluation rows are never augmented.
raw_train = load_dataset('ag_news', split='train')
raw_split = raw_train.train_test_split(test_size=640, seed=42)
augmented_train = clean_and_augment(raw_split['train'], augment_size=500)

train_dataset = (
    augmented_train
    .map(preprocess, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)
eval_dataset = (
    raw_split['test']
    .map(preprocess, batched=True, remove_columns=["text"])
    .rename_column("label", "labels")
)


ModuleNotFoundError: No module named 'nlpaug'

## Setup LoRA Config
Set up the PEFT (LoRA) config and wrap the base model with it for fine-tuning.

In [None]:
# LoRA adapter settings for sequence classification
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=['query', 'value'],  # modules that receive adapters
    r=8,                                # adapter rank
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
)

In [None]:
# Wrap the base classifier with the adapters described by peft_config
peft_model = get_peft_model(model=model, peft_config=peft_config)

# Show the wrapped architecture as the cell output
peft_model

In [None]:
# List the name of every parameter that will receive gradient updates
print("Trainable parameters:")
trainable_names = [
    param_name
    for param_name, tensor in peft_model.named_parameters()
    if tensor.requires_grad
]
for param_name in trainable_names:
    print(param_name)

In [None]:
# Print the trainable-parameter summary of the PEFT-wrapped model
print('PEFT Model')
peft_model.print_trainable_parameters()

## Training Setup

### Layer-wise Learning Rate Decay

In [None]:
import re

from torch.optim import AdamW

def create_optimizer_with_llrd(model, base_lr=5e-5, layer_decay=0.9):
    """Build an AdamW optimizer with layer-wise learning-rate decay (LLRD).

    Every trainable parameter is assigned to exactly one parameter group,
    chosen from its name:

    * ``...encoder.layer.<i>...``  -> ``base_lr * layer_decay ** (num_layers - i - 1)``
      (the top encoder layer trains at ``base_lr``, lower layers more slowly);
    * ``...embeddings...``         -> ``base_lr * layer_decay ** num_layers``;
    * anything else (classifier head, adapters outside the encoder) -> ``base_lr``.

    Classifying by name in a single pass matters for PEFT models: LoRA weights
    live inside the encoder layers, so collecting "layer parameters" and
    "lora parameters" separately puts the same tensors in two groups, which
    AdamW rejects.

    Args:
        model: ``torch.nn.Module`` whose parameter names follow the
            ``encoder.layer.<i>`` / ``embeddings`` convention (plain or
            PEFT-wrapped).
        base_lr: Learning rate of the top encoder layer and of the head.
        layer_decay: Multiplicative decay applied per layer going down.

    Returns:
        A ``torch.optim.AdamW`` instance with one parameter group per bucket.

    Raises:
        ValueError: If the model has no trainable parameters.
    """
    layer_pattern = re.compile(r'(?:^|\.)encoder\.layer\.(\d+)\.')
    named_params = list(model.named_parameters())

    # Encoder depth is derived from ALL parameter names (frozen ones included),
    # so the decay schedule does not depend on which weights are trainable.
    num_layers = 0
    for name, _ in named_params:
        match = layer_pattern.search(name)
        if match:
            num_layers = max(num_layers, int(match.group(1)) + 1)

    # bucket label -> parameter group; dict order keeps groups deterministic.
    buckets = {}
    for name, param in named_params:
        if not param.requires_grad:
            continue

        match = layer_pattern.search(name)
        if match:
            # Transformer encoder layer i (covers LoRA adapters inside it too).
            layer_idx = int(match.group(1))
            bucket = f"layer_{layer_idx}"
            lr = base_lr * (layer_decay ** (num_layers - layer_idx - 1))
        elif '.embeddings.' in f'.{name}':
            # Embeddings sit below the first encoder layer.
            bucket = "embeddings"
            lr = base_lr * (layer_decay ** num_layers)
        else:
            # Classifier head and anything else outside the encoder stack.
            bucket = "head"
            lr = base_lr

        buckets.setdefault(bucket, {"params": [], "lr": lr})["params"].append(param)

    if not buckets:
        raise ValueError("model has no trainable parameters to optimize")

    return AdamW(list(buckets.values()), lr=base_lr)


In [None]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute accuracy for a prediction object during evaluation.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference class ids).

    Returns:
        Dict with a single ``'accuracy'`` entry.
    """
    # The predicted class is the index of the highest score on the last axis.
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Setup Training args
output_dir = "results"
training_args = TrainingArguments(
    output_dir=output_dir,
    # The string "none" disables logging integrations; in transformers 4.x a
    # bare None falls back to reporting to every installed integration.
    report_to="none",
    eval_strategy='steps',
    # Explicit evaluation interval (previously inherited from logging_steps);
    # it must line up with save_steps for load_best_model_at_end.
    eval_steps=1000,
    logging_steps=1000,
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=8,  # keep at most 8 checkpoints on disk
    # max_steps=1200,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    fp16=True,
    load_best_model_at_end=True,             # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",        # metric used to rank checkpoints (could be f1, etc.)
    greater_is_better=True,                  # higher accuracy is better
)

class LLRDTrainer(Trainer):
    """Trainer whose optimizer uses layer-wise learning-rate decay."""

    def create_optimizer(self):
        """Create the LLRD optimizer once and return it.

        The base learning rate is read from this trainer's own arguments
        (``self.args``) rather than from a notebook global, so the class
        behaves correctly with any ``TrainingArguments`` it is given. An
        optimizer that already exists (e.g. one passed to the constructor) is
        kept instead of being silently replaced.

        Returns:
            The optimizer stored on ``self.optimizer``.
        """
        if self.optimizer is None:
            self.optimizer = create_optimizer_with_llrd(
                self.model, base_lr=self.args.learning_rate
            )
        return self.optimizer


def get_trainer(model):
    """Prepare ``model`` for training and wrap it in an ``LLRDTrainer``.

    The model is modified in place: gradient checkpointing is switched on and
    its inputs are set to require gradients. The trainer uses the notebook's
    ``training_args``, datasets, collator and metric function, and stops early
    after 3 evaluations without improvement.

    Args:
        model: The model to fine-tune.

    Returns:
        A configured ``LLRDTrainer``.
    """
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )
    return LLRDTrainer(**trainer_kwargs)


### Start Training

In [None]:
# Build the trainer around the LoRA-wrapped model
peft_lora_finetuning_trainer = get_trainer(model=peft_model)

# Run fine-tuning and keep the returned training summary
result = peft_lora_finetuning_trainer.train()

In [None]:
# Load the LoRA adapter weights of the best checkpoint found during training.
# The path comes from the trainer state instead of a hard-coded step number,
# which need not exist for the configured run length.
best_checkpoint = peft_lora_finetuning_trainer.state.best_model_checkpoint
print(f"Best checkpoint: {best_checkpoint}")

if best_checkpoint is not None:
    # Load onto a freshly created base model: `model` was modified in place by
    # get_peft_model and already carries adapter layers.
    clean_base = RobertaForSequenceClassification.from_pretrained(
        base_model,
        id2label=id2label,
    )
    peft_model = PeftModel.from_pretrained(clean_base, best_checkpoint)
# Otherwise keep the in-memory peft_model produced by training.

inference_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(inference_device)
peft_model.eval()


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Use the following function to run inference on custom inputs.

In [None]:
def classify(model, tokenizer, text):
    """Classify a single text, print the result and return the label name.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Callable tokenizer returning a batch with a ``.to(device)``
            method.
        text: The text to classify.

    Returns:
        The label name looked up in the notebook-level ``id2label`` mapping.

    Note:
        The model is switched to evaluation mode as a side effect.
    """
    # Put the inputs on the device the model already lives on; sending them
    # to CUDA unconditionally fails whenever the model is still on the CPU.
    device = next(model.parameters()).device
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)

    # Disable dropout and gradient tracking so the prediction is
    # deterministic and no autograd graph is built.
    model.eval()
    with torch.no_grad():
        output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()
    label = id2label[prediction]

    print(f'\n Class: {prediction}, Label: {label}, Text: {text}')
    return label

In [None]:
# Two sample headlines. The backslash in the second one is part of the text,
# so it is escaped; an unescaped "\b" would be read as a backspace character.
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

### Run Inference on eval_dataset

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Run batched inference with a model and optionally score it.

    Args:
        inference_model: Model to run; it is moved to the GPU when one is
                         available (CPU otherwise) and put in eval mode.
        dataset: Dataset to iterate over with a ``DataLoader``.
        labelled (bool): When True, each batch must carry a "labels" entry and
                         accuracy is computed; when False only predictions are
                         produced.
        batch_size (int): Number of examples per inference batch.
        data_collator: ``collate_fn`` for the ``DataLoader``; None selects the
                       loader's default collation.

    Returns:
        ``(metrics, predictions)`` when ``labelled`` is True, otherwise just
        ``predictions`` (a 1-D tensor of predicted class ids on the CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

    inference_model.to(device)
    inference_model.eval()

    # The accuracy metric is only needed when references are available.
    metric = evaluate.load('accuracy') if labelled else None
    per_batch_predictions = []

    for batch in tqdm(loader):
        on_device = {key: tensor.to(device) for key, tensor in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**on_device)

        predicted_cpu = outputs.logits.argmax(dim=-1).cpu()
        per_batch_predictions.append(predicted_cpu)

        if labelled:
            # References are read from the "labels" entry of the batch.
            metric.add_batch(
                predictions=predicted_cpu.numpy(),
                references=on_device["labels"].cpu().numpy()
            )

    # Stitch the per-batch results into a single tensor.
    all_predictions = torch.cat(per_batch_predictions, dim=0)

    if not labelled:
        return all_predictions

    eval_metric = metric.compute()
    print("Evaluation Metric:", eval_metric)
    return eval_metric, all_predictions

In [None]:
# Score the fine-tuned model on the held-out evaluation split;
# the metric is printed inside evaluate_model, so both return values are discarded.
_, _ = evaluate_model(
    peft_model,
    eval_dataset,
    labelled=True,
    batch_size=8,
    data_collator=data_collator,
)

In [None]:
# Show the class of the model used for evaluation and its trainable-parameter summary
print(type(peft_model))
peft_model.print_trainable_parameters()

### Run Inference on unlabelled dataset

In [None]:
# Load the unlabelled test data from a pickle file.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load "test_unlabelled.pkl" when it comes from a trusted source.
# The loaded object is presumably a Hugging Face Dataset with a "text" column
# (it is used through .map below and "text" is removed) - confirm against the
# file you were given.
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")

# Tokenize with the same preprocessing as the training data, dropping the raw text.
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

In [None]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})

# The output directory only exists if training wrote to it in this environment,
# so create it before writing.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "inference_output.csv")
df_output.to_csv(output_path, index=False)
# Report the path that was actually written.
print(f"Inference complete. Predictions saved to {output_path}")