# Starter Notebook

Install and import required libraries

In [1]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import json
import os
import shutil
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm





## Load Tokenizer and Preprocess Data

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"./results/{timestamp}"
os.makedirs(output_dir, exist_ok=True)
# -----------------------------
# 3. Load and preprocess AGNEWS dataset
# -----------------------------
dataset = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Using device: cuda


In [4]:
# -----------------------------
# 4. Load RoBERTa model with LoRA adapters
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
model.to(device)
model.print_trainable_parameters()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 888,580 || all params: 125,537,288 || trainable%: 0.7078


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [5]:
# -----------------------------
# 5. Define training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir=f"{output_dir}/logs",
    logging_steps=100,
    report_to="tensorboard",
    save_total_limit=1,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

if os.path.exists("./results"):
    shutil.rmtree("./results")
os.makedirs("./results")

## Anything from here on can be modified

In [6]:
# -----------------------------
# 6. Train the model
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train(resume_from_checkpoint=False)
best_model_path = f"{output_dir}/best_model"
trainer.save_model(best_model_path)
print(f"Best model saved to {best_model_path}")

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2231,0.194306,0.935263
2,0.18,0.187799,0.942368
3,0.1659,0.178437,0.945395
4,0.1376,0.179631,0.945789
5,0.1264,0.178965,0.947105


Best model saved to ./results/2025-04-20_21-17-48/best_model


In [7]:
log_history = trainer.state.log_history

if not log_history:
    print("skip ploting")
else:
    train_loss = []
    eval_loss = []
    eval_accuracy = []
    steps = []
    epochs = []

    for log in log_history:
        if "loss" in log and "step" in log:
            train_loss.append(log["loss"])
            steps.append(log["step"])
        if "eval_loss" in log:
            eval_loss.append(log["eval_loss"])
            eval_accuracy.append(log.get("eval_accuracy", None))
            epochs.append(log.get("epoch", len(epochs) + 1))

    if not train_loss or not eval_loss:
        print("too less data to praint")
    else:
        
        eval_loss_extended = []
        steps_per_epoch = len(train_loss) // len(eval_loss) if len(eval_loss) > 0 else 1
        for i in range(len(eval_loss)):
            eval_loss_extended.extend([eval_loss[i]] * steps_per_epoch)
        eval_loss_extended = eval_loss_extended[:len(train_loss)]

        plt.figure(figsize=(10, 6))
        plt.plot(steps, train_loss, label="training loss", color="blue")
        plt.plot(steps, eval_loss_extended, label="val loss", color="orange", linestyle="--")
        plt.xlabel("training step")
        plt.ylabel("loss")
        plt.title("taining adn val loss")
        plt.legend()
        plt.grid(True)
        plt.savefig(f"{output_dir}/loss_curve.png")
        plt.close()
        print(f"loss plot saved to {output_dir}/loss_curve.png")

        
        plt.figure(figsize=(10, 6))
        plt.plot(epochs, eval_loss, label="val loss", color="orange", marker="o")
        plt.xlabel("epoch")
        plt.ylabel("loss")
        plt.title("loss in each epoch")
        plt.legend()
        plt.grid(True)
        plt.savefig(f"{output_dir}/epoch_loss_curve.png")
        plt.close()
        print(f"plot saved to {output_dir}/epoch_loss_curve.png")

    if not eval_accuracy or all(acc is None for acc in eval_accuracy):
        print("skip ploting")
    else:
        
        plt.figure(figsize=(10, 6))
        plt.plot(epochs, eval_accuracy, label="val acc", color="green", marker="o")
        plt.xlabel("epoch")
        plt.ylabel("acc")
        plt.title("every acc in each epoch")
        plt.legend()
        plt.grid(True)
        plt.savefig(f"{output_dir}/accuracy_curve.png")
        plt.close()
        print(f"plot saved to {output_dir}/accuracy_curve.png")


with open(f"{output_dir}/training_log.json", "w") as f:
    json.dump(log_history, f)
print(f"training saved to {output_dir}/training_log.json")

loss plot saved to ./results/2025-04-20_21-17-48/loss_curve.png
plot saved to ./results/2025-04-20_21-17-48/epoch_loss_curve.png
plot saved to ./results/2025-04-20_21-17-48/accuracy_curve.png
training saved to ./results/2025-04-20_21-17-48/training_log.json


## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [8]:
# -----------------------------
# 7. Evaluate the model
# -----------------------------
eval_results = trainer.evaluate()
print("Final Evaluation Accuracy:", eval_results["eval_accuracy"])

Final Evaluation Accuracy: 0.9453947368421053


In [9]:
# -----------------------------
# 8. Check trainable parameter count
# -----------------------------
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 888580


In [10]:
# print("Trainable parameters:")
# for name, param in peft_model.named_parameters():
#     if param.requires_grad:
#         print(name)
with open(f"{output_dir}/eval_results.json", "w") as f:
    json.dump(eval_results, f)
print(f"accessment saved to {output_dir}/eval_results.json")

accessment saved to ./results/2025-04-20_21-17-48/eval_results.json


In [11]:
from datasets import Dataset
from torch.utils.data import DataLoader


if not os.path.exists(best_model_path):
    print(f"no best model found in {best_model_path}")
else:
    # 加载最佳模型
    print(f"load best model from {best_model_path}")
    base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
    best_model = PeftModel.from_pretrained(base_model, best_model_path)
    best_model.to(device)

    
    test_data_path = "./test_unlabelled.pkl"
    print(f"check testing data: {test_data_path}")
    print(f"wether exist: {os.path.exists(test_data_path)}")
    if not os.path.exists(test_data_path):
        print(f"no tset data in {test_data_path}")
        parent_dir = os.path.dirname(test_data_path)
        if os.path.exists(parent_dir):
            print(f"dict: {os.listdir(parent_dir)}")
        else:
            print(f" {parent_dir}not exist")
    else:
        try:
            with open(test_data_path, "rb") as f:
                test_dataset = pickle.load(f)
            print(f"sucessfully load test data, number of samples: {len(test_dataset['text'])}")

            test_dataset = Dataset.from_dict({"text": test_dataset["text"]})

            def preprocess_function(examples):
                return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

            tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
            tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

            test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

            
            best_model.eval()
            all_predictions = []

            with torch.no_grad():
                for batch in test_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = best_model(**batch)
                    preds = torch.argmax(outputs.logits, dim=-1)
                    all_predictions.extend(preds.cpu().numpy())

            if all_predictions:
                df = pd.DataFrame({
                    "ID": list(range(len(all_predictions))),
                    "label": all_predictions
                })
                df.to_csv(f"{output_dir}/submission.csv", index=False)
                print(f"✅ prediction saved to{output_dir}/submission.csv")
            else:
                print("no prediction")
        except Exception as e:
            print(f"error: {str(e)}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


load best model from ./results/2025-04-20_21-17-48/best_model
check testing data: ./test_unlabelled.pkl
wether exist: True
sucessfully load test data, number of samples: 8000


Map: 100%|███████████████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 21145.51 examples/s]


✅ prediction saved to./results/2025-04-20_21-17-48/submission.csv
