# Initializer

In [1]:
!pip install transformers datasets peft evaluate torch wandb

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [2]:
# The only thing you need to config!
# Loadoad dataset (config the path to the dataset folder)
from google.colab import drive
drive.mount('/content/drive')
# Define dataset path
dataset_path = "/content/drive/MyDrive/NLP/"  # Change this to your dataset folder

Mounted at /content/drive


In [3]:
import torch, wandb
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from peft import get_peft_model, LoraConfig
import evaluate


# login huggingface
from huggingface_hub import login
YOUR_TOKEN = "your_huggingface_token"
login(token=YOUR_TOKEN)

# login wandb
wandb.login(key="your_wandb_key")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


HTTPError: Invalid user token.

# Training Processing

In [None]:
# 1 load dataset
dataset = load_dataset("gxb912/large-twitter-tweets-sentiment")
train_size = len(dataset['train']) // 50
test_size = len(dataset['test']) // 50
dataset['train'] = dataset['train'].select(range(train_size))
dataset['test'] = dataset['test'].select(range(test_size))

# look the data structure
print(dataset['train'].features)
print(dataset['train'][0])

# print the number of labels
unique_labels = set(dataset['train']['sentiment'])
NUM_LABELS = len(unique_labels)
print(f"Number of unique labels: {NUM_LABELS}, Labels: {unique_labels}")

In [None]:
# 2: load tokeniser and model

# login huggingface
from huggingface_hub import login
YOUR_TOKEN = "your_huggingface_token"
login(token=YOUR_TOKEN)

# change model here
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct')
tokenizer.pad_token = tokenizer.eos_token
NUM_LABELS = 2  # 2 classes
model = AutoModelForSequenceClassification.from_pretrained('meta-llama/Llama-3.2-3B-Instruct', num_labels=NUM_LABELS) # Initialization for classification model
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# LoRA
#
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)

In [None]:
# data pre processing
def preprocess_function(examples):
    result = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=64)
    result['labels'] = examples['sentiment']  #
    return result

encoded_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'sentiment'])

In [None]:
# # 步骤5: 设置训练参数
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
#     gradient_accumulation_steps=4,
#     fp16=True,
#     label_names=['labels'],  # 显式指定标签字段名
#     report_to="wandb",  # 修改为 wandb（原来是 tensorboard）
#     run_name="my_model_finetune",  # 可选：为这次训练命名
# )

# early stoping
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=4,
    fp16=True,
    label_names=['labels'],
    report_to="wandb",
    run_name="my_model_finetune",
    # Early Stopping
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_strategy = "epoch"
)

In [None]:
# evaluation
accuracy_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    logits = torch.from_numpy(logits)
    predictions = torch.argmax(logits, dim=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    return {
        'accuracy': accuracy['accuracy'],
        'f1': f1['f1']
    }

In [None]:
# train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
)


wandb.init(
    project="my_finetune_project",
    config=training_args.__dict__,
    name="experiment_1"
)


trainer.train()


wandb.finish()

In [None]:

import wandb


wandb.init(project="your_project_name")

eval_results = trainer.evaluate()
print("Final evaluation results on test set:")
print(eval_results)


In [None]:
# free gpu
del trainer
del model
torch.cuda.empty_cache()


print("Memory allocated after release:", torch.cuda.memory_allocated() / 1e9, "GB")
print("Memory reserved after release:", torch.cuda.memory_reserved() / 1e9, "GB")