In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score
import numpy as np
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
import pickle

In [2]:
# Test GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
# Load the AG News dataset
dataset = load_dataset("ag_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
# Tokenizer（Use RoBERTa）
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
def tokenize_function(example):
  return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [8]:
# Load the RoBERTa model and add the LoRA adapter
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
lora_config = LoraConfig(
    r=8,
    #r=16,
    lora_alpha=16,
    #lora_alpha=32,
    target_modules=["query", "value"],
    #target_modules=["value"],
    lora_dropout=0.1,
    bias="all",
    task_type=TaskType.SEQ_CLS )


model = get_peft_model(model, lora_config)
model.to(device)
model.print_trainable_parameters()

from peft import get_peft_model_state_dict

lora_params = sum(p.numel() for p in get_peft_model_state_dict(model).values())
print(f"✅ LoRA adapter trainable parameters: {lora_params:,}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 990,724 || all params: 125,537,288 || trainable%: 0.7892
✅ LoRA adapter trainable parameters: 992,268


In [None]:
# Load the LoRA adapter Training parameter Settings
training_args = TrainingArguments(
    output_dir="./results",
    eval_steps=500,
    save_strategy="no",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    #
    lr_scheduler_type="cosine",  
    warmup_steps=500
)

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  preds = np.argmax(logits, axis=1)
  return {"accuracy": accuracy_score(labels, preds)}

In [None]:
# Train the model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, )

trainer.train()

  trainer = Trainer(


Step,Training Loss
500,0.6657
1000,0.2911
1500,0.2626
2000,0.2526
2500,0.2505
3000,0.2571
3500,0.2232
4000,0.2241
4500,0.2202
5000,0.213


Step,Training Loss
500,0.6657
1000,0.2911
1500,0.2626
2000,0.2526
2500,0.2505
3000,0.2571
3500,0.2232
4000,0.2241
4500,0.2202
5000,0.213


TrainOutput(global_step=22500, training_loss=0.20037767808702256, metrics={'train_runtime': 5844.1609, 'train_samples_per_second': 61.6, 'train_steps_per_second': 3.85, 'total_flos': 2.392609480704e+16, 'train_loss': 0.20037767808702256, 'epoch': 3.0})

In [None]:
# evaluate the model
eval_results = trainer.evaluate()
print("Final Evaluation Accuracy:", eval_results["eval_accuracy"])

Final Evaluation Accuracy: 0.9460526315789474


In [None]:
# load the test dataset
with open("/content/test_unlabelled.pkl", "rb") as f: test_dataset = pickle.load(f)
# Convert to the HuggingFace Dataset format
test_dataset = Dataset.from_dict({"text": test_dataset["text"]})
# Tokenize test set
def preprocess_function(examples):
  return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Batch prediction
from torch.utils.data import DataLoader
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)
model.eval()
all_predictions = []

with torch.no_grad():
  for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    preds = torch.argmax(outputs.logits, dim=-1)
    all_predictions.extend(preds.cpu().numpy())


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
# Save as submission.csv
df = pd.DataFrame({
  "ID": list(range(len(all_predictions))),
  "label": all_predictions })
df.to_csv("submission.csv", index=False)
print("Save Successfully：submission.csv")

Save Successfully：submission.csv
