<a href="https://colab.research.google.com/github/AlvinScrp/LLMs-from-scratch-CN/blob/main/finetuning-for-classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 进行文本分类的微调

## huggingface gpt2
使用[huggingface gpt2](https://huggingface.co/openai-community/gpt2)

为了熟悉 Hugging Face GPT-2 的用法，先写一段示例代码：加载分词器和模型，查看模型配置以及输出张量的形状。

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
# Load the pretrained GPT-2 tokenizer and the GPT2Model backbone.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Encode one sample sentence as PyTorch tensors and run a single forward pass.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

# Print, in order: the model config, a separator line, the shape of the final
# hidden states, and the shape of the token-embedding matrix.
for item in (
    model.config,
    "*" * 10,
    output.last_hidden_state.shape,
    model.wte.weight.shape,
):
    print(item)

### 自定义文本生成 Model

In [None]:
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model
class MyGPT2LMHeadModel(nn.Module):
    """GPT-2 backbone followed by a bias-free linear head that outputs vocabulary logits.

    The head's weight is the same tensor as the backbone's token-embedding
    matrix (``wte``), so the two are tied.
    """

    def __init__(self):
        super().__init__()
        backbone = GPT2Model.from_pretrained('gpt2')
        self.gpt2 = backbone

        # Map hidden states (hidden_size) to one logit per vocabulary entry.
        head = nn.Linear(
            backbone.config.hidden_size,
            backbone.config.vocab_size,
            bias=False,
        )
        # Tie the output projection to the input embedding matrix.
        head.weight = backbone.wte.weight
        self.lm_head = head

    def forward(self, in_idx):
        """Return logits for every position of the token-id tensor ``in_idx``."""
        hidden_states = self.gpt2(in_idx).last_hidden_state
        return self.lm_head(hidden_states)

def generate(model, idx, max_new_tokens, context_size, temperature=1.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` with up to ``max_new_tokens`` new tokens.

    Args:
        model: Callable mapping a ``(batch, n_tokens)`` token-id tensor to
            logits of shape ``(batch, n_tokens, vocab_size)``.
        idx: ``(batch, n_tokens)`` tensor holding the prompt token ids.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects greedy decoding (argmax) instead of sampling.
        top_k: If given, sampling is restricted to the ``top_k`` highest logits.
        eos_id: If given, generation stops (without appending the token) as
            soon as every sequence in the batch produces ``eos_id`` in the
            same step. For a batch of one this is simply "stop at EOS".

    Returns:
        The input ``idx`` with the generated token ids concatenated along dim 1.
    """
    greedy = temperature == 0

    for _ in range(max_new_tokens):
        # Crop the context so it never exceeds what the model supports.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)

        # Only the prediction for the last position is needed.
        logits = logits[:, -1, :]
        if not greedy:
            logits = logits / temperature

        if top_k is not None:
            top_values, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            # Everything below the k-th largest logit gets zero probability.
            logits = logits.masked_fill(logits < top_values[:, [-1]], float("-inf"))

        if greedy:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
        else:
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)

        # Reduce with .all() so the check is well defined for batch sizes > 1,
        # and skip it entirely when no EOS id was supplied.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx




In [None]:
# Prompt that the model will continue.
prompt_text = "In a cozy little cottage, lived a fluffy cat named Whiskers. One sunny morning,"

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用的设备: {device}")

# Build the custom LM-head model and move it to the selected device.
myGPT2LMHeadModel = MyGPT2LMHeadModel().to(device)

# Encode the prompt as a tensor of token ids on the same device as the model.
prompt_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(device)

# Sample up to 50 new tokens with top-k filtering and a temperature of 0.9.
sampling_options = {
    "max_new_tokens": 50,
    "context_size": 1024,
    "top_k": 50,
    "temperature": 0.9,
}
token_ids = generate(model=myGPT2LMHeadModel, idx=prompt_ids, **sampling_options)

# Decode the first (only) sequence back into text.
print("Output text:\n", tokenizer.decode(token_ids[0], skip_special_tokens=True))

使用的设备: cuda
Output text:
 In a cozy little cottage, lived a fluffy cat named Whiskers. One sunny morning, he wandered the hallways in her bright red coat and she let him out. Then he moved to play with her and brought her toys, and we spent the next two years in a very kind house in the northern suburbs and around Toronto's Oak Park


### 使用transformers的GPT2LMHeadModel生成文本

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用的设备: {device}")

# 1. Load the tokenizer and the model with a language-modeling head,
#    then move the model to the selected device.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
modelLMHead = GPT2LMHeadModel.from_pretrained('gpt2')
modelLMHead.to(device)

# 2. Encode the prompt (`prompt_text` is defined in the earlier generation cell).
#    Calling the tokenizer directly, rather than `tokenizer.encode`, also
#    returns the attention mask that `generate()` asks for.
encoded_prompt = tokenizer(prompt_text, return_tensors='pt').to(device)
input_ids = encoded_prompt["input_ids"]

# 3. Generate a continuation with model.generate().
print("\n正在生成故事...")
output_sequences = modelLMHead.generate(
    input_ids=input_ids,
    # Passing the mask and an explicit pad token id avoids the
    # "attention mask and the pad token id were not set" warnings.
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,           # maximum total length, prompt included
    num_return_sequences=1,  # number of different continuations to return
    no_repeat_ngram_size=2,  # forbid repeating any 2-gram
    do_sample=True,          # sample instead of always taking the most likely token
    temperature=0.9,         # lower values make the output more conservative
    top_k=50,                # sample only among the 50 most likely tokens
    top_p=0.95,              # nucleus sampling: keep tokens covering 95% of the probability mass
)

# 4. Decode the generated token ids back into a human-readable string.
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

# 5. Print the result.
print("\n--- 生成的故事 ---")
print(generated_text)


使用的设备: cuda


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



正在生成故事...

--- 生成的故事 ---
In a cozy little cottage, lived a fluffy cat named Whiskers. One sunny morning, the cat's owner woke the next morning to find him asleep on the couch. Whiskey then had a heart attack and died. A little before 4


## finetuning-for-classification

- 如果您具有机器学习的背景，可能已经熟悉分类微调。举个例子，分类微调类似于训练卷积网络来对手写数字进行分类的过程
- 在分类微调中，模型可以输出特定的分类标签（例如，“spam”和“not spam”）
- 分类微调模型只能预测它在训练期间所熟知的类别标签（例如，“垃圾邮件”或“非垃圾邮件”），而指令微调模型通常可以执行更广泛的任务
- 我们可以将分类微调模型视为高度专业化的模型；在实践中，开发专业化的模型通常比开发在许多不同任务上表现良好的通用模型要容易得多

### 准备数据集
我们使用由垃圾邮件和非垃圾邮件组成的数据集来对 LLM 进行分类微调

In [None]:
import os
import torch
import pandas as pd
import urllib.request
import zipfile
from tqdm import tqdm
from pathlib import Path
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer
from datasets import Dataset, DatasetDict

# === 1. Global configuration ===
# Download location of the dataset archive and where it is stored locally.
URL = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
ZIP_PATH = "sms_spam_collection.zip"
DATA_DIR = Path("sms_spam_collection")
DATA_FILE = DATA_DIR.joinpath("SMSSpamCollection.tsv")

# DataLoader settings.
BATCH_SIZE, NUM_WORKERS = 8, 2

# Seed used for sampling and shuffling the DataFrame.
RANDOM_STATE = 123


# === 2. 数据准备 ===
def prepare_data():
    """Download (if missing), load, balance, shuffle and split the SMS spam data.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames containing the
        first 70%, the next 10% and the remaining 20% of the shuffled,
        class-balanced rows. ``Label`` is encoded as 0 (ham) / 1 (spam).
    """
    if DATA_FILE.exists():
        print("✅ Dataset already exists.")
    else:
        print("⬇️ Downloading and extracting dataset...")
        with urllib.request.urlopen(URL) as response, open(ZIP_PATH, "wb") as zip_file:
            zip_file.write(response.read())
        with zipfile.ZipFile(ZIP_PATH, "r") as archive:
            archive.extractall(DATA_DIR)
        # The extracted file has no extension; rename it to the expected .tsv path.
        os.rename(DATA_DIR / "SMSSpamCollection", DATA_FILE)

    frame = pd.read_csv(DATA_FILE, sep="\t", names=["Label", "Text"])
    print(f"Loaded {len(frame)} samples")
    print(frame["Label"].value_counts())

    # Balance the classes: downsample "ham" to the number of "spam" rows.
    spam_count = frame.Label.value_counts()["spam"]
    ham_rows = frame[frame["Label"] == "ham"].sample(n=spam_count, random_state=RANDOM_STATE)
    spam_rows = frame[frame["Label"] == "spam"]
    balanced = pd.concat([ham_rows, spam_rows])
    balanced["Label"] = balanced["Label"].map({"ham": 0, "spam": 1})

    # Shuffle all rows and renumber the index from zero.
    shuffled = balanced.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

    total = len(shuffled)
    train_end = int(0.7 * total)
    val_end = int(0.8 * total)
    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]


# === 3. Tokenizer & Dataset 使用 Hugging Face `datasets` 库 ===
def build_datasets(train_df, val_df, test_df):
    """Wrap the three DataFrames in a ``DatasetDict`` and tokenize them with GPT-2.

    Every text is padded or truncated to the token length of the longest
    training text. The ``Text`` column is dropped and ``Label`` is renamed to
    ``labels``.

    Returns:
        The tokenized ``DatasetDict`` (splits ``train``/``validation``/``test``)
        formatted as torch tensors with columns ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    frames = {"train": train_df, "validation": val_df, "test": test_df}
    dataset_dict = DatasetDict(
        {split: Dataset.from_pandas(frame) for split, frame in frames.items()}
    )

    print("\n🔢 Calculating max sequence length (may take a few seconds)...")
    token_counts = [
        len(tokenizer.encode(text)) for text in tqdm(dataset_dict["train"]["Text"])
    ]
    max_len = max(token_counts)
    print(f"Max length: {max_len}")

    def encode_batch(batch):
        # Pad/truncate every text in the batch to exactly max_len tokens.
        return tokenizer(
            batch["Text"],
            truncation=True,
            padding="max_length",
            max_length=max_len,
        )

    print("\n✂️ Tokenizing datasets...")
    tokenized = dataset_dict.map(
        encode_batch, batched=True, remove_columns=["Text"]
    ).rename_column("Label", "labels")
    tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return tokenized


# === 4. DataLoader ===
def create_loaders(datasets):
    """Return a ``{split: DataLoader}`` dict; only the ``"train"`` split is shuffled."""
    loaders = {}
    for split, ds in datasets.items():
        loaders[split] = DataLoader(
            ds,
            batch_size=BATCH_SIZE,
            shuffle=split == "train",
            num_workers=NUM_WORKERS,
        )
    return loaders


# === Main flow ===
print("🏗 Preparing data...")
train_df, val_df, test_df = prepare_data()
datasets = build_datasets(train_df, val_df, test_df)
loaders = create_loaders(datasets)

train_loader = loaders["train"]
val_loader = loaders["validation"]
test_loader = loaders["test"]

print("\n✅ Data pipeline ready!")
for name, loader in loaders.items():
    batch_count = len(loader)
    print(f"{name:>10}: {batch_count} batches")

# === Example output ===
print("\n📦 Example batch from train_loader:")
batch = next(iter(train_loader))
for k, v in batch.items():
    print(f"{k:>15}:", v.shape)

# Decode the first sample of the batch back into text.
print("\n📝 Decoded sample text:")
sample_ids = batch["input_ids"][0]
decode_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
decoded = decode_tokenizer.decode(sample_ids, skip_special_tokens=True)
print(decoded)
print("Label:", batch["labels"][0].item())


🏗 Preparing data...
✅ Dataset already exists.
Loaded 5572 samples
Label
ham     4825
spam     747
Name: count, dtype: int64

🔢 Calculating max sequence length (may take a few seconds)...


100%|██████████| 1045/1045 [00:00<00:00, 2533.13it/s]


Max length: 120

✂️ Tokenizing datasets...


Map:   0%|          | 0/1045 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]


✅ Data pipeline ready!
     train: 131 batches
validation: 19 batches
      test: 38 batches

📦 Example batch from train_loader:
         labels: torch.Size([8])
      input_ids: torch.Size([8, 120])
 attention_mask: torch.Size([8, 120])

📝 Decoded sample text:
Loan for any purpose £500 - £75,000. Homeowners + Tenants welcome. Have you been previously refused? We can still help. Call Free 0800 1956669 or text back 'help'
Label: 1


In [None]:
# Show the shape of the token-id tensor in the example batch.
batch["input_ids"].shape

## 模型与训练

In [None]:
import torch
from torch import nn
from transformers import GPT2Tokenizer, GPT2Model
class GPT2ClassificationModel(nn.Module):
    """GPT-2 backbone with a linear classification head.

    The head is applied to the hidden state of one token per sequence: the
    last token marked as real (``1``) in ``attention_mask``. Pooling the final
    position unconditionally would classify from a padding slot whenever the
    input is right-padded.

    Args:
        num_labels: Number of output classes.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        config = self.gpt2.config
        self.classifier = nn.Linear(config.hidden_size, num_labels, bias=True)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: ``(batch, seq_len)`` tensor of token ids.
            attention_mask: Optional ``(batch, seq_len)`` tensor with 1 for
                real tokens and 0 for padding. When omitted, the final
                position of every sequence is used.
        """
        gpt2_out = self.gpt2(input_ids, attention_mask=attention_mask)
        hidden = gpt2_out.last_hidden_state

        if attention_mask is None:
            pooled = hidden[:, -1, :]
        else:
            # Multiplying positions by the mask zeroes out padding slots, so
            # the argmax is the index of the last real token. This works for
            # both left- and right-padded batches.
            positions = torch.arange(hidden.size(1), device=hidden.device)
            mask = attention_mask.to(device=hidden.device, dtype=positions.dtype)
            last_real = (positions * mask).argmax(dim=-1)
            rows = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[rows, last_real]

        return self.classifier(pooled)

def _evaluate(model, data_loader, loss_fn, device):
    """Return ``(average_loss, accuracy)`` of ``model`` over ``data_loader``.

    Switches the model to eval mode and runs without gradient tracking.
    Both values are ``nan`` when the loader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"])
            total_loss += loss_fn(logits, batch["labels"]).item()

            predictions = torch.argmax(logits, dim=-1)
            total_correct += (predictions == batch["labels"]).sum().item()
            total_samples += len(batch["labels"])
            num_batches += 1

    if num_batches == 0 or total_samples == 0:
        return float("nan"), float("nan")
    return total_loss / num_batches, total_correct / total_samples


def train(model, train_loader, val_loader, optimizer, loss_fn, lr_scheduler, device, progress_bar,
          num_epochs=None):
    """Fine-tune ``model`` and report validation loss/accuracy after every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        train_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Iterable of batches in the same format, used for evaluation.
        optimizer: Optimizer stepped once per training batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        lr_scheduler: Scheduler stepped once per training batch.
        device: Device every batch tensor is moved to.
        progress_bar: Object whose ``update(1)`` is called after each batch.
        num_epochs: Number of passes over ``train_loader``. When omitted, the
            module-level ``num_epochs`` variable is used so that existing
            callers keep working.

    Raises:
        NameError: If ``num_epochs`` is neither passed nor defined globally.
    """
    if num_epochs is None:
        try:
            num_epochs = globals()["num_epochs"]
        except KeyError:
            raise NameError(
                "name 'num_epochs' is not defined; pass num_epochs=... to train()"
            ) from None

    for epoch in range(num_epochs):
        # Evaluation leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"])
            loss = loss_fn(outputs, batch["labels"])
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

        # Evaluate once per epoch, matching the "Epoch i/N" header below.
        avg_val_loss, accuracy = _evaluate(model, val_loader, loss_fn, device)

        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        print(f"Validation Loss: {avg_val_loss:.4f} | Validation Accuracy: {accuracy:.4f}")

from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm

# 1. Set up the device, model, loss function and optimizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2ClassificationModel(num_labels=2)
model.to(device)

# --- Optional: freeze the GPT-2 parameters, or train only some layers ---
# for param in model.gpt2.parameters():
#     param.requires_grad = False
# ------------------------------------

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per batch, so it needs the total step count.
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# 2. Run the training loop, with one progress-bar tick per training step.
progress_bar = tqdm(range(num_training_steps))
train(
    model,
    train_loader,
    val_loader,
    optimizer,
    loss_fn,
    lr_scheduler,
    device,
    progress_bar,
)


  0%|          | 0/393 [00:00<?, ?it/s]


--- Epoch 3/3 ---
Validation Loss: 0.0640 | Validation Accuracy: 0.9867


## 验证


## 测试