# Starter Notebook

Install and import required libraries

In [1]:
%pip install transformers datasets evaluate accelerate peft trl bitsandbytes
%pip install nvidia-ml-py3
%pip install nlpaug

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## Load Tokenizer and Preprocess Data

In [3]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [5]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

In [6]:
# Split the original training set
#split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
#train_dataset = split_datasets['train']
#eval_dataset = split_datasets['test']

## Data Cleaning and Augmentation

In [7]:
# -------- 数据增强（静音 + 预热版） --------
import nltk
import logging
import contextlib
import os
import sys
from datasets import concatenate_datasets
import nlpaug.augmenter.word as naw

# ✅ 禁用 nltk 输出日志
nltk_logger = logging.getLogger('nltk')
nltk_logger.setLevel(logging.CRITICAL)

# ✅ 重定向 stdout + stderr
@contextlib.contextmanager
def suppress_output():
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        try:
            sys.stdout = devnull
            sys.stderr = devnull
            yield
        finally:
            sys.stdout = old_stdout
            sys.stderr = old_stderr

# ✅ 静音下载 + 预热 nltk 所需模块
with suppress_output():
    nltk.download('averaged_perceptron_tagger_eng')
    nltk.download('wordnet')
    nltk.pos_tag(["warmup"])  # 触发 pos_tag 加载
    from nltk.corpus import wordnet
    _ = wordnet.synsets("dog")  # 触发 wordnet 加载

# ✅ 初始化增强器
syn_aug = naw.SynonymAug(aug_src='wordnet', aug_max=4)

# ✅ 定义增强函数（完全静音 + 修复 augment 返回 list 的问题）
def augment_text(example):
    try:
        with suppress_output():
            result = syn_aug.augment(example['text'])
            augmented_text = result[0] if isinstance(result, list) else result  # ✅ 只取第一个增强版本
        return {'text': augmented_text}
    except:
        return {'text': example['text']}

# ✅ 执行增强（单线程 + 禁用缓存）
augmented_dataset = dataset.map(
    augment_text,
    num_proc=1,
    load_from_cache_file=False
)

# ✅ 合并增强数据与原始数据
combined_dataset = concatenate_datasets([dataset, augmented_dataset])
# -------- 数据增强结束 --------

# -------- Tokenize 合并后数据 --------
tokenized_dataset = combined_dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# -------- 重新划分训练集 --------
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']

# -------- 验证集（使用原始数据） --------
raw_eval_dataset = dataset.train_test_split(test_size=640, seed=42)['test']
eval_dataset = raw_eval_dataset.map(preprocess, batched=True, remove_columns=["text"])
eval_dataset = eval_dataset.rename_column("label", "labels")


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/zyc/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/zyc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Map: 100%|██████████| 120000/120000 [01:44<00:00, 1143.50 examples/s]
Map: 100%|██████████| 240000/240000 [00:52<00:00, 4558.28 examples/s]


## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [8]:
# PEFT Config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias = 'none',
    target_modules = ['query', 'value'],
    task_type="SEQ_CLS",
)

In [9]:
peft_model = get_peft_model(model, peft_config)

peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [10]:
print("Trainable parameters:")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(name)

Trainable parameters:
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.sel

In [11]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


## Training Setup

### Layer-wise Learning Rate Decay

In [12]:
from torch.optim import AdamW

def create_optimizer_with_llrd(model, base_lr=5e-5, layer_decay=0.9):
    grouped_parameters = []
    seen_params = set()  # ✅ 用于去重

    num_layers = len(model.base_model.roberta.encoder.layer)

    # Transformer 每一层
    for layer_idx in range(num_layers):
        layer = model.base_model.roberta.encoder.layer[layer_idx]
        layer_lr = base_lr * (layer_decay ** (num_layers - layer_idx - 1))
        params = [p for n, p in layer.named_parameters() if p.requires_grad and p not in seen_params]
        seen_params.update(params)
        if params:
            grouped_parameters.append({"params": params, "lr": layer_lr})

    # Embeddings
    embed_lr = base_lr * (layer_decay ** num_layers)
    embed_params = [p for n, p in model.base_model.roberta.embeddings.named_parameters()
                    if p.requires_grad and p not in seen_params]
    seen_params.update(embed_params)
    if embed_params:
        grouped_parameters.append({"params": embed_params, "lr": embed_lr})

    # Classifier
    cls_params = [p for n, p in model.base_model.classifier.named_parameters()
                  if p.requires_grad and p not in seen_params]
    seen_params.update(cls_params)
    if cls_params:
        grouped_parameters.append({"params": cls_params, "lr": base_lr})

    # LoRA adapter（识别名称中包含 "lora" 的参数）
    for name, param in model.named_parameters():
        if param.requires_grad and 'lora' in name and param not in seen_params:
            if 'encoder.layer.' in name:
                layer_num = int(name.split('encoder.layer.')[1].split('.')[0])
                lora_lr = base_lr * (layer_decay ** (num_layers - layer_num - 1))
            else:
                lora_lr = base_lr
            grouped_parameters.append({'params': [param], 'lr': lora_lr})
            seen_params.add(param)

    return AdamW(grouped_parameters, lr=base_lr)



In [13]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [17]:
from torch.utils.data import DataLoader
# Setup Training args
output_dir = "results_new"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    logging_steps=1000,
    learning_rate=5e-5,
    num_train_epochs=10,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=10,  # ✅ 保留最近8个checkpoint
    # max_steps=1200,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    fp16=True,
    load_best_model_at_end=True,             # ✅ 自动加载验证集效果最好的模型
    metric_for_best_model="accuracy",        # ✅ 哪个metric作为标准（你可以改成 f1 等）
    greater_is_better=True,                  # ✅ accuracy 越高越好
)

class LLRDTrainer(Trainer):
    def create_optimizer(self):
        self.optimizer = create_optimizer_with_llrd(self.model, base_lr=training_args.learning_rate)
        return self.optimizer

    def get_train_dataloader(self):
        return DataLoader(
            train_dataset,
            shuffle=True,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=data_collator,
            # num_workers=0  # ✅ 关键：禁用多线程 dataloader，避免死锁
        )

    def get_eval_dataloader(self, eval_dataset=None):
        return DataLoader(
            eval_dataset or self.eval_dataset,
            batch_size=self.args.per_device_eval_batch_size,
            collate_fn=data_collator,
            # num_workers=0  # ✅ 一样禁用
        )



def get_trainer(model):
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    return LLRDTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,   # ✅ 保留原方式
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=7)]
    )

### Start Training

In [18]:
print("🧪 train_dataset[0] keys:", train_dataset[0].keys())
print("🧪 input_ids:", train_dataset[0]['input_ids'][:10])
print("🧪 labels:", train_dataset[0]['labels'])

🧪 train_dataset[0] keys: dict_keys(['labels', 'input_ids', 'attention_mask'])
🧪 input_ids: [0, 20556, 13716, 826, 7351, 2459, 7858, 16838, 1309, 18335]
🧪 labels: 2


In [19]:
from transformers import logging
logging.set_verbosity_info()
logging.disable_default_handler()

print("🔥 开始调用 get_trainer...")
peft_lora_finetuning_trainer = get_trainer(peft_model)
print("✅ get_trainer 完成，开始训练...")

result = peft_lora_finetuning_trainer.train()


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


🔥 开始调用 get_trainer...
✅ get_trainer 完成，开始训练...


Step,Training Loss,Validation Loss,Accuracy
1000,0.3107,0.289674,0.901563
2000,0.2598,0.275776,0.909375
3000,0.2519,0.26769,0.910937
4000,0.2461,0.245061,0.9125
5000,0.2365,0.229014,0.921875
6000,0.2219,0.220642,0.925
7000,0.2179,0.21573,0.928125
8000,0.2148,0.214141,0.928125
9000,0.2089,0.208453,0.928125
10000,0.2117,0.212019,0.928125


In [None]:

# ✅ 原生训练 Loop 替代 HuggingFace Trainer
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from peft import PeftModel
import shutil

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)
#peft_model = torch.compile(peft_model)

# ✅ 构建 dataloader（使用已有的 train_dataset 和 data_collator）
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator, num_workers=0)
eval_loader = DataLoader(eval_dataset, batch_size=64, collate_fn=data_collator, num_workers=0)

# ✅ 优化器
optimizer = torch.optim.AdamW(peft_model.parameters(), lr=5e-5)

# ✅ 设置训练轮数
num_epochs = 10
best_acc = 0.0  # ✅ 初始最佳验证准确率

for epoch in range(num_epochs):
    print(f"\n Epoch {epoch+1}/{num_epochs}")
    peft_model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=" Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = peft_model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"✅ Avg train loss: {avg_train_loss:.4f}")

    # ✅ 验证阶段
    peft_model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(eval_loader, desc=" Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = peft_model(**batch)
            preds = outputs.logits.argmax(dim=-1).cpu()
            labels = batch["labels"].cpu()
            all_preds.extend(preds)
            all_labels.extend(labels)

    acc = accuracy_score(all_labels, all_preds)
    print(f" Eval Accuracy: {acc:.4f}")

    # ✅ 保存最佳模型（兼容 HuggingFace 的加载方式）
    if acc > best_acc:
        best_acc = acc
        save_path = "results/checkpoint-best"
        peft_model.save_pretrained(save_path)
        print(f" Saved new best model to {save_path} with acc={acc:.4f}")
        shutil.copytree(save_path, "/kaggle/working/checkpoint-best", dirs_exist_ok=True)
        print(" Copied to /kaggle/working/checkpoint-best for persistence")



In [19]:
ls results/checkpoint-best

ls: cannot access 'results/checkpoint-best': No such file or directory


In [36]:
# 加载 LoRA adapter 的权重
peft_model = PeftModel.from_pretrained(model, "results_new/checkpoint-18000")
peft_model.to('cuda')




PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [37]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [38]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [39]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [40]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 80/80 [00:01<00:00, 43.41it/s]

Evaluation Metric: {'accuracy': 0.940625}





In [41]:
print(type(peft_model))
peft_model.print_trainable_parameters()

<class 'peft.peft_model.PeftModelForSequenceClassification'>
trainable params: 593,668 || all params: 125,537,288 || trainable%: 0.4729


### Run Inference on unlabelled dataset

In [42]:
#Load your unlabelled data
# import panda as pd
# with open("test_unlabelled.pkl", "rb") as f:
#     unlabelled_dataset = pickle.load(f)

# print(type(unlabelled_dataset))
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")

test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:02<00:00, 3188.71 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [43]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:22<00:00, 45.32it/s]


Inference complete. Predictions saved to inference_output.csv
