# 微调
到目前为止，我们总是自己定义一个模型并从头开始训练，但是这种做法比较耗费时间，而且只适用于数据集较大的情况。因此我们可以采用一种更快速的方式：在别人训练好的模型的基础上微调参数，让该模型快速适配我们的数据集。

In [None]:
import torch
from torch import nn
import pandas as pd
from torch.nn.functional import pad
from torch.utils.data import Dataset
from transformers import GPT2Model, AutoTokenizer  # 我们使用预训练的gpt2模型

# 使用tokenizer的LyricsDataset

In [None]:
class LyricsDataset(Dataset):
    """Next-token-prediction dataset over the cells of a lyrics CSV file.

    Every CSV cell is one sample. A sample is tokenized once, wrapped as
    ``<bos> tokens <eos>`` and returned as the shifted pair
    ``(sequence[:-1], sequence[1:])`` so that ``tgt[i]`` is the token that
    follows ``src[i]``.

    Args:
        file_path: CSV file to read.
        separator: Stored on the instance; not used by this class itself.
        nrows: Number of rows to read; a negative value reads the whole file.
        batch_size: Stored on the instance; not used by this class itself.
        tokenizer_path: Location handed to ``AutoTokenizer.from_pretrained``.
            Defaults to the local snapshot of
            uer/gpt2-chinese-cluecorpussmall used by this notebook.
    """

    def __init__(
            self,
            file_path: str,
            separator: str = "，",
            nrows: int = 300,
            batch_size: int = 32,
            tokenizer_path: str = r'D:\cache\models--uer--gpt2-chinese-cluecorpussmall\snapshots\c2c0249d8a2731f269414cc3b22dff021f8e07a3',
    ):
        super().__init__()
        self.separator = separator
        self.nrows = nrows
        self.batch_size = batch_size
        # Tokenizer of the pretrained uer/gpt2-chinese-cluecorpussmall model.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        # Register the three custom special tokens.
        self.bos_token = '<bos>'
        self.eos_token = '<eos>'
        self.pad_token = '<pad>'
        special_tokens = {'bos_token': self.bos_token,
                          'eos_token': self.eos_token,
                          'pad_token': self.pad_token}
        self.num_added = self.tokenizer.add_special_tokens(special_tokens)
        # len(tokenizer) counts the base vocabulary plus every added token,
        # so it stays correct even if the tokenizer already carried added
        # tokens before the call above.
        self.vocab_size = len(self.tokenizer)

        self.bos_token_id = self.tokenizer.bos_token_id
        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

        self.data = self.read_file(file_path)

    def read_file(self, file_path):
        """Read the CSV and return all of its cells as one flat array."""
        # nrows=None makes pandas read the whole file.
        row_limit = self.nrows if self.nrows >= 0 else None
        df = pd.read_csv(file_path, nrows=row_limit, encoding='utf-8')
        return df.values.reshape(-1)

    def __getitem__(self, index):
        """Return the ``(src, tgt)`` id tensors for sample ``index``.

        No padding happens here; ``collate_fn`` pads each batch to a common
        length.
        """
        token_ids = list(
            self.tokenizer(self.data[index], add_special_tokens=False)['input_ids']
        )
        sequence = [self.bos_token_id, *token_ids, self.eos_token_id]
        # Shift at token level: the model sees <bos>, t0, ..., t_{n-1} and has
        # to predict t0, ..., t_{n-1}, <eos>. Slicing the raw string instead
        # would never teach "first token after <bos>" or "<eos> after the last
        # token", and would misalign whenever a character is not exactly one
        # token.
        return torch.tensor(sequence[:-1]), torch.tensor(sequence[1:])

    def __len__(self):
        return len(self.data)

    def collate_fn(self, batch):
        """Right-pad every sequence of a batch to the batch's longest length.

        Intended to be passed as the ``collate_fn`` argument of a DataLoader.
        Returns two stacked tensors ``(batch_src, batch_tgt)``.
        """
        batch_max_length = max(max(len(src), len(tgt)) for src, tgt in batch)
        pad_idx = self.pad_token_id
        batch_src = [pad(src, (0, batch_max_length - len(src)), value=pad_idx)
                     for src, _ in batch]
        batch_tgt = [pad(tgt, (0, batch_max_length - len(tgt)), value=pad_idx)
                     for _, tgt in batch]
        return torch.stack(batch_src), torch.stack(batch_tgt)

# 增加自定义语言模型头

In [None]:
class GPT2Lyrics(nn.Module):
    """Frozen pretrained GPT-2 backbone with a trainable custom LM head.

    The token embedding matrix is extended by ``num_added_tokens`` rows for
    the newly added special tokens. Only those new rows and ``custom_head``
    receive non-zero gradients; everything else in the backbone is frozen.

    Args:
        vocab_size: Size of the extended vocabulary (output size of the head).
        num_added_tokens: Number of rows appended to the embedding matrix.
        pretrained_path: Location handed to ``GPT2Model.from_pretrained``.
            Defaults to the local snapshot used by this notebook; the hub id
            uer/gpt2-chinese-cluecorpussmall can be passed instead.
    """

    def __init__(
            self,
            vocab_size,
            num_added_tokens,
            pretrained_path=r"D:\cache\models--uer--gpt2-chinese-cluecorpussmall\snapshots\c2c0249d8a2731f269414cc3b22dff021f8e07a3",
    ):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_path)

        # Extend the embedding matrix with rows for the added tokens. The new
        # rows are drawn from a normal distribution matching the mean/std of
        # the pretrained embeddings.
        original_embedding = self.gpt2.wte.weight.detach()
        num_pretrained, hidden_size = original_embedding.shape
        new_embeddings = torch.normal(mean=original_embedding.mean().item(),
                                      std=original_embedding.std().item(),
                                      size=(num_added_tokens, hidden_size),
                                      dtype=original_embedding.dtype,
                                      device=original_embedding.device)
        extended_embedding = torch.cat([original_embedding, new_embeddings], dim=0)
        self.gpt2.wte.weight = nn.Parameter(extended_embedding)
        self.gpt2.config.vocab_size = vocab_size

        # Freeze the backbone. requires_grad is a per-tensor flag, so setting
        # it on a slice such as weight[-k:] only touches a temporary view and
        # trains nothing. Instead the whole embedding matrix stays trainable
        # and a gradient hook zeroes the rows of the pretrained tokens, so
        # only the appended rows receive a non-zero gradient.
        # NOTE: the pretrained rows stay untouched only while the optimizer
        # applies no weight decay to this parameter.
        for name, param in self.gpt2.named_parameters():
            param.requires_grad = name == 'wte.weight' and num_added_tokens > 0
        if num_added_tokens > 0:
            self.gpt2.wte.weight.register_hook(
                lambda grad: GPT2Lyrics._zero_pretrained_rows(grad, num_pretrained)
            )

        # Custom head mapping the backbone's hidden states to the extended
        # vocabulary. Layer layout is kept as-is so saved state_dicts still load.
        self.custom_head = nn.Sequential(nn.Linear(hidden_size, hidden_size * 4),
                                         nn.Dropout(0.1),
                                         nn.Linear(hidden_size * 4, vocab_size))

    @staticmethod
    def _zero_pretrained_rows(grad, num_pretrained):
        """Return a copy of ``grad`` with its first ``num_pretrained`` rows zeroed."""
        masked = grad.clone()
        masked[:num_pretrained] = 0
        return masked

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            input_ids: Token ids, moved to the backbone's device.
            attention_mask: Optional padding mask forwarded to the backbone.
        """
        input_ids = input_ids.to(self.gpt2.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.gpt2.device)
        # Last-layer hidden states of the backbone.
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        # Project onto the extended vocabulary.
        return self.custom_head(last_hidden_state)

# 训练

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split

device = "cuda"
batch_size = 12

# Dataset and model; nrows=-1 makes the dataset read the whole CSV.
dataset = LyricsDataset("../data/generate/lyrics.csv", nrows=-1, batch_size=batch_size)
vocab_size = dataset.vocab_size
model = GPT2Lyrics(vocab_size, dataset.num_added)

# Padded target positions are excluded from the loss.
padding_idx = dataset.pad_token_id
loss_fn = nn.CrossEntropyLoss(ignore_index=padding_idx)

# Fixed learning rate; no scheduler is created here.
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# 90% / 10% train / test split.
train_dataset, test_dataset = random_split(dataset, [0.9, 0.1])
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=dataset.collate_fn,
    pin_memory=True,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=dataset.collate_fn,
    pin_memory=True,
)

In [None]:
def save_checkpoint(epoch, model, optimizer, scheduler, loss, path):
    """Serialize the training state to ``path`` with ``torch.save``.

    Args:
        epoch: Epoch number stored under ``"epoch"``.
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        scheduler: LR scheduler whose ``state_dict()`` is stored, or ``None``
            when training runs without one (``None`` is stored in that case).
        loss: Value stored under ``"loss"``.
        path: Destination file.
    """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        # The scheduler is optional, mirroring load_checkpoint.
        "scheduler_state": scheduler.state_dict() if scheduler is not None else None,
        "scheduler_type": type(scheduler).__name__,
        "loss": loss,
    }
    torch.save(checkpoint, path)


def load_checkpoint(model, optimizer, scheduler, path):
    """Restore training state from a checkpoint file.

    Args:
        model: Module to load weights into, or ``None`` to skip.
        optimizer: Optimizer to restore, or ``None`` to skip.
        scheduler: LR scheduler to restore, or ``None`` to skip.
        path: Checkpoint file, or ``None`` when there is nothing to resume.

    Returns:
        ``(epoch, loss)`` as stored in the checkpoint, or ``(0, inf)`` when
        ``path`` is ``None``.
    """
    if path is None:
        print("未发现检查点")
        return 0, float("inf")

    # Deserialize onto the CPU so a checkpoint written on a GPU machine also
    # loads on a CPU-only one; load_state_dict copies the values onto
    # whatever device the live parameters are on.
    checkpoint = torch.load(path, map_location="cpu")
    # Explicit None checks: truthiness of a module can be False for an empty
    # container that defines __len__.
    if model is not None:
        model.load_state_dict(checkpoint["model_state_dict"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        print(f"从{checkpoint['epoch']}开始训练")
    # The stored scheduler state may be absent or None.
    scheduler_state = checkpoint.get("scheduler_state")
    if scheduler is not None and scheduler_state is not None:
        scheduler.load_state_dict(scheduler_state)
    return checkpoint["epoch"], checkpoint["loss"]


def evaluate(model, test_loader, device):
    """Return the mean per-batch loss of ``model`` over ``test_loader``.

    The model is switched to eval mode and moved to ``device``. The loss is
    computed with the module-level ``loss_fn``, and the logits are flattened
    using the module-level ``vocab_size``. No attention mask is passed to the
    model here.
    """
    model.eval().to(device)
    batch_losses = []
    with torch.no_grad():
        for src, tgt in test_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            logits = model(src)
            batch_loss = loss_fn(logits.reshape(-1, vocab_size), tgt.reshape(-1))
            batch_losses.append(batch_loss.item())
    # Average over the number of batches, not the number of tokens.
    return sum(batch_losses) / len(test_loader)


# Start from a very large "best" validation loss so the first epoch always
# counts as an improvement.
best_val_loss = 1e10
epochs = 500

scaler = torch.amp.GradScaler(device)
for epoch in range(epochs):
    model.train().to(device)
    # Progress bar over the batches of this epoch.
    pbar = tqdm(
        train_loader,
        desc=f"[epoch {epoch + 1}/{epochs}] epoch progress",
        leave=False,
    )

    total_loss = 0
    for src, tgt in pbar:
        # Padding mask in the transformers convention: 1 marks a real token,
        # 0 marks padding, hence the `!=` comparison.
        padding_mask = (src != padding_idx).long().to(device)
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        with torch.amp.autocast(device):
            # No causal mask is passed; GPT-2 applies its own internally.
            pred = model(src, padding_mask)
            loss = loss_fn(pred.reshape(-1, vocab_size), tgt.reshape(-1))
        # Show the current batch loss on the progress bar and accumulate it.
        pbar.set_postfix(loss=f"{loss.item():.6f}")
        total_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Evaluate on the held-out split and keep the best weights so far.
    cur_val_loss = evaluate(model, test_loader, device)
    if cur_val_loss < best_val_loss:
        torch.save(model.state_dict(), "best_lyrics_gpt2_model.pth")
        best_val_loss = cur_val_loss
    # Additionally snapshot the weights every second epoch.
    if (epoch + 1) % 2 == 0:
        torch.save(model.state_dict(), f"lyrics_gpt2_{epoch + 1}_model.pth")

    avg_loss = total_loss / len(train_loader)
    print(
        f"epoch {epoch + 1}: avg_loss: {avg_loss:.6f} ,perplexity: {torch.exp(torch.tensor(avg_loss)).item():.6f},val_loss: {cur_val_loss:.6f}"
    )

# 加载训练好的模型

In [None]:
batch_size = 24
dataset = LyricsDataset("../data/generate/lyrics.csv", nrows=-1, batch_size=batch_size)
vocab_size = dataset.vocab_size

model = GPT2Lyrics(vocab_size, dataset.num_added)
path = '../model/微调/best_lyrics_gpt2_model.pth'  # replace with the path of your saved model file
# The model lives on the CPU at this point, so deserialize the weights there
# as well; without map_location a checkpoint saved from a CUDA model cannot be
# loaded on a machine without a GPU.
model.load_state_dict(torch.load(path, map_location="cpu"))

# 预测

In [None]:
def predict(
        text: str,
        model: nn.Module,
        max_length: int,
        separator: str,
        device: str,
        tokenizer,
        decode,
        temperature: float = 0.75,
):
    """Generate lyrics, using each ``separator``-delimited piece of ``text``
    as the opening of one line.

    For every piece, the piece is appended to everything generated so far,
    the model samples up to ``max_length`` further tokens (stopping early at
    ``<eos>``), and a "，" is appended before the next piece.

    Args:
        text: Line openings joined by ``separator``.
        model: Callable as ``model(input_ids, attention_mask)`` returning
            logits of shape ``(batch, seq_len, vocab)``.
        max_length: Maximum number of tokens sampled per piece.
        separator: Delimiter used to split ``text``.
        device: Device the model and inputs are moved to.
        tokenizer: Callable as ``tokenizer(str, add_special_tokens=False)``
            returning a mapping with an ``"input_ids"`` sequence.
        decode: Maps a single token id to its string.
        temperature: Softmax temperature; must be positive.

    Returns:
        The generated text, with doubled "，，" collapsed to a single "，".

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    model.eval().to(device)

    def encode(piece):
        # Tokenize the piece as a whole instead of character by character, so
        # characters that map to zero or several tokens do not break encoding.
        return list(tokenizer(piece, add_special_tokens=False)["input_ids"])

    def sample_continuation(prefix_ids):
        """Sample up to max_length new token ids after prefix_ids."""
        input_ids = torch.tensor([prefix_ids], device=device)
        sampled = []
        with torch.no_grad():
            for _ in range(max_length):
                # Generation has no <pad> tokens, so no attention mask is passed.
                logits = model(input_ids, None)[:, -1, :] / temperature
                next_id = torch.distributions.Categorical(logits=logits).sample()
                token_id = next_id.item()
                if decode(token_id) == "<eos>":
                    break
                sampled.append(token_id)
                input_ids = torch.cat((input_ids, next_id.unsqueeze(0)), dim=-1)
        return sampled

    bos_ids = encode("<bos>")
    comma_ids = encode("，")
    # The running context is kept as token ids and never contains <bos>; it
    # is prepended exactly once per model call. This avoids stacking one extra
    # <bos> per piece and removes the need for str.strip("<bos>"), which
    # strips the character set {<, b, o, s, >} and could eat real lyric
    # characters.
    lyric_ids = []
    for piece in text.split(separator):
        lyric_ids += encode(piece)
        lyric_ids += sample_continuation(bos_ids + lyric_ids)
        lyric_ids += comma_ids

    return "".join(decode(token_id) for token_id in lyric_ids).replace("，，", "，")


# Two line openings separated by "/"; each one seeds one generated line.
text = "玫瑰/晚风"
generated_lyrics = predict(
    text=text,
    model=model,
    max_length=100,
    separator="/",
    device="cuda",
    tokenizer=dataset.tokenizer,
    decode=dataset.tokenizer.decode,
    temperature=0.95,
)

generated_lyrics