<a href="https://colab.research.google.com/github/AlexXPZhu/XMUM-FYP-Code/blob/main/FYP_MobileBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 安装依赖
!pip install transformers datasets torch scikit-learn accelerate -q

In [None]:
# Mount Google Drive so the dataset stored there can be read from
# the /content/drive mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Check that the runtime is actually running on a GPU.
!nvidia-smi

Wed Dec  3 17:49:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   58C    P0             26W /   70W |    2084MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Check whether PyTorch is connected to CUDA and report the device in use.
import torch

_backend = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {torch.device(_backend)}")

Using device: cuda


In [None]:
# Load the MobileBERT tokenizer and the checkpoint with its pre-training heads.
# NOTE(review): `model` is rebound to an AutoModelForSequenceClassification
# instance in a later cell, so this pre-training load appears to be unused
# afterwards — confirm before removing.
from transformers import AutoTokenizer, AutoModelForPreTraining

tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
model = AutoModelForPreTraining.from_pretrained("google/mobilebert-uncased")



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer

# 1. Load the merged dataset from Google Drive.
file_path = '/content/drive/MyDrive/FYP/merged_data.csv'
df = pd.read_csv(file_path)

# 2. Split into train / validation / test (70% / 15% / 15%).
#    Both splits are stratified on "Label" so each subset keeps the class
#    proportions, and random_state makes the split repeatable.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["Label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["Label"])

# 3. Convert to Hugging Face Datasets.
#    preserve_index=False keeps the shuffled pandas index from being carried
#    into the dataset as a stray "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# ================= Key correction steps =================

# 4. Rename the target column from "Label" to "labels".
#    The datasets are rebuilt from `df` every time this cell runs, so the
#    "Label" column is always present at this point and the rename is safe
#    to re-run.
train_dataset = train_dataset.rename_column("Label", "labels")
val_dataset = val_dataset.rename_column("Label", "labels")
test_dataset = test_dataset.rename_column("Label", "labels")

# 5. Load the tokenizer that will generate input_ids and attention_mask.
#    (MobileBERT checkpoint; replace the name if another model is used.)
model_checkpoint = "google/mobilebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize one batch of rows using the text in the "Sentence" column.

    An explicit ``max_length`` is required here: with this checkpoint the
    tokenizer reports no predefined maximum length, so ``padding="max_length"``
    and ``truncation=True`` on their own fall back to no padding and no
    truncation, leaving sequences of different lengths.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            must contain the "Sentence" column.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to 128 tokens.
    """
    return tokenizer(
        examples["Sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,  # same cap as the tokenization used for training later in the notebook
    )

# Tokenize each split in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# 6. Final step: choose the output format.
#    The tokenizer columns only exist after the map() calls above.
#    "token_type_ids" is kept only when the tokenizer actually produced it.
columns_to_keep = ["input_ids", "attention_mask", "labels"]
columns_to_keep += [
    name for name in ("token_type_ids",) if name in tokenized_train.column_names
]

# set_format works in place, so the three datasets can be handled in a loop.
for _tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    _tokenized_split.set_format("torch", columns=columns_to_keep)

print("✅ 数据处理完成！可以开始训练了。")
print("现在的列名:", tokenized_train.column_names)

Map:   0%|          | 0/39827 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/8535 [00:00<?, ? examples/s]

Map:   0%|          | 0/8535 [00:00<?, ? examples/s]

✅ 数据处理完成！可以开始训练了。
现在的列名: ['Sentence', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
!pip install evaluate



In [None]:
import wandb
import os
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification, # 注意这里换成了分类模型
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate

In [None]:
# ----------------------------------------------------------------
# 1. Setup (assumes the data-preparation cell has already run, so `df`
#    and the train / val / test datasets exist)
# ----------------------------------------------------------------

# Number of classes = number of distinct values in the "Label" column.
# dropna=False counts a missing value as its own entry, like len(unique()).
num_labels = df["Label"].nunique(dropna=False)

# Tokenizer for the MobileBERT checkpoint.
model_checkpoint = "google/mobilebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preprocessing function
def tokenize_function(examples):
    """Tokenize a batch of "Sentence" texts to a fixed length of 128 tokens.

    Shorter inputs are padded up to 128 tokens and longer inputs are cut off
    at 128 tokens.

    NOTE(review): the original comment described 128 as MobileBERT's upper
    limit; the published model config appears to allow 512 positions, so 128
    looks like a choice made for this notebook — confirm.

    Args:
        examples: Batch mapping from ``Dataset.map(batched=True)`` containing
            the "Sentence" column.

    Returns:
        The tokenizer output for the batch.
    """
    encode_options = {
        "padding": "max_length",  # pad short sequences
        "truncation": True,       # cut sequences that are too long
        "max_length": 128,        # explicit length cap
    }
    return tokenizer(examples["Sentence"], **encode_options)

# Tokenize the train / validation / test datasets (in that order).
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# ----------------------------------------------------------------
# 2. Load the model (sequence classification)
# ----------------------------------------------------------------
# Fine-tune MobileBERT with a classification head rather than the
# pre-training heads; the head size follows the number of labels.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# ----------------------------------------------------------------
# 3. Evaluation metric
# ----------------------------------------------------------------
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the (predictions, labels) pair given by the Trainer.

    Args:
        eval_pred: Two-item sequence: the per-class scores and the reference
            labels. The scores are assumed to be 2-D (examples x classes),
            since the arg-max is taken along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)


Map:   0%|          | 0/39827 [00:00<?, ? examples/s]

Map:   0%|          | 0/8535 [00:00<?, ? examples/s]

Map:   0%|          | 0/8535 [00:00<?, ? examples/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ----------------------------------------------------------------
# 4. Training configuration (including Weights & Biases logging)
# ----------------------------------------------------------------
# Start the wandb run that the Trainer will report to.
wandb.init(project="mobilebert-finetuning", name="run-1")

training_args = TrainingArguments(
    output_dir="./results-2025_12_3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",        # evaluate at the end of every epoch
    save_strategy="epoch",        # save a checkpoint at the end of every epoch
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    # Without an explicit metric the "best" checkpoint is chosen by validation
    # loss. This notebook reports accuracy, and the two can disagree (lowest
    # loss and highest accuracy may fall on different epochs), so select on
    # the accuracy returned by compute_metrics instead.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="wandb",            # send metrics to wandb
    logging_steps=50,             # log every 50 steps
)


0,1
train/epoch,▁▃▆█
train/global_step,▁▃▆█
train/grad_norm,█▁▁▁
train/learning_rate,█▆▃▁
train/loss,█▁▁▁

0,1
train/epoch,0.08032
train/global_step,200.0
train/grad_norm,0.02489
train/learning_rate,2e-05
train/loss,0.6057


In [None]:
# ----------------------------------------------------------------
# 5. Build the Trainer and start training
# ----------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emits a warning on current releases and is scheduled for removal.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("开始训练...")
trainer.train()

  trainer = Trainer(


开始训练...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0001,0.017238,0.997774
2,0.0006,0.061509,0.99836
3,0.0,0.046089,0.998711


TrainOutput(global_step=7470, training_loss=1799.28196317354, metrics={'train_runtime': 1574.723, 'train_samples_per_second': 75.874, 'train_steps_per_second': 4.744, 'total_flos': 1873119198205440.0, 'train_loss': 1799.28196317354, 'epoch': 3.0})

In [None]:
import os

# 1. Use a dedicated export directory rather than the training output_dir,
#    which may also contain checkpoint folders.
save_directory = "./final_model_output"

# 2. Save the model held by the trainer.
print(f"正在保存模型到 {save_directory} ...")
trainer.save_model(save_directory)

# 3. Key step: save the tokenizer files into the same directory.
print(f"正在保存 Tokenizer 到 {save_directory} ...")
tokenizer.save_pretrained(save_directory)

# 4. (Optional) Upload the directory to WandB as a model artifact, so the
#    artifact holds both the model and the tokenizer. Skipped when no wandb
#    run is active.
import wandb

if wandb.run is not None:
    artifact = wandb.Artifact(
        name="mobilebert_finetuned", type="model", description="Fine-tuned MobileBERT with tokenizer"
    )
    artifact.add_dir(save_directory)
    wandb.log_artifact(artifact)
    print("✅ 模型和 Tokenizer 已上传到 WandB")

正在保存模型到 ./final_model_output ...
正在保存 Tokenizer 到 ./final_model_output ...


[34m[1mwandb[0m: Adding directory to artifact (final_model_output)... Done. 0.4s


✅ 模型和 Tokenizer 已上传到 WandB
