In [1]:
import torch
from torch import nn
import pandas as pd
from torch.nn.functional import pad
from torch.utils.data import Dataset
from transformers import GPT2Model, AutoTokenizer  # 我们使用预训练的gpt2模型


class LyricsDataset(Dataset):
    """Lyrics corpus exposed as a map-style dataset of ``(src, tgt)`` token-id tensors.

    Every cell of the CSV file is treated as one lyric string. The tokenizer of the
    pretrained ``uer/gpt2-chinese-cluecorpussmall`` checkpoint is extended with three
    custom special tokens (``<bos>``, ``<eos>``, ``<pad>``).

    Args:
        file_path: CSV file containing the lyrics.
        separator: Stored on the instance; not used by the dataset itself.
        nrows: Number of CSV rows to read; a negative value reads the whole file.
        batch_size: Stored on the instance; not used by the dataset itself.
        tokenizer_path: Directory or hub id handed to ``AutoTokenizer.from_pretrained``.
            ``None`` (default) falls back to ``DEFAULT_TOKENIZER_PATH`` so existing
            callers keep their behaviour.
    """

    # Local snapshot used when the caller does not supply ``tokenizer_path``.
    DEFAULT_TOKENIZER_PATH = r'D:\cache\models--uer--gpt2-chinese-cluecorpussmall\snapshots\c2c0249d8a2731f269414cc3b22dff021f8e07a3'

    def __init__(
            self,
            file_path: str,
            separator: str = "，",
            nrows: int = 300,
            batch_size: int = 32,
            tokenizer_path=None,
    ):
        super().__init__()
        self.separator = separator
        self.nrows = nrows
        self.batch_size = batch_size
        # The tokenizer location is a parameter so the notebook is not tied to one machine.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path or self.DEFAULT_TOKENIZER_PATH)
        # Register the three custom special tokens.
        self.bos_token = '<bos>'
        self.eos_token = '<eos>'
        self.pad_token = '<pad>'
        special_tokens = {'bos_token': self.bos_token,
                          'eos_token': self.eos_token,
                          'pad_token': self.pad_token}
        self.num_added = self.tokenizer.add_special_tokens(special_tokens)
        # Vocabulary size after the extension (base vocabulary + newly added tokens).
        self.vocab_size = self.tokenizer.vocab_size + self.num_added

        self.bos_token_id = self.tokenizer.bos_token_id
        self.eos_token_id = self.tokenizer.eos_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

        self.data = self.read_file(file_path)

    def read_file(self, file_path):
        """Read the CSV and return all of its cells as a flat 1-D array of raw values."""
        if self.nrows >= 0:
            df = pd.read_csv(file_path, nrows=self.nrows, encoding='utf-8')
        else:
            df = pd.read_csv(file_path, encoding='utf-8')
        return df.values.reshape(-1)

    def _encode(self, text):
        """Tokenize ``text`` without automatic special tokens and return a flat id list."""
        encoded = self.tokenizer(text, return_tensors='pt', add_special_tokens=False)
        return encoded['input_ids'].reshape(-1).tolist()

    def __getitem__(self, index):
        """Return the unpadded ``(src, tgt)`` id tensors for sample ``index``.

        Padding is deliberately left to ``collate_fn`` so that each batch is only
        padded to its own longest sample.
        """
        sample = self.data[index]
        # src drops the last character, tgt drops the first one; both are then
        # wrapped in <bos> ... <eos>.
        # NOTE(review): with this layout <bos> lines up with <bos> and <eos> with <eos>,
        # so the last real character is never paired with <eos> as its target.
        # The training loop is not visible here — confirm this alignment is intended.
        src = [self.bos_token_id] + self._encode(sample[:-1]) + [self.eos_token_id]
        tgt = [self.bos_token_id] + self._encode(sample[1:]) + [self.eos_token_id]

        return torch.tensor(src), torch.tensor(tgt)

    def __len__(self):
        """Number of lyric strings loaded from the CSV."""
        return len(self.data)

    def collate_fn(self, batch):
        """Right-pad every sample with ``<pad>`` to the longest sequence in ``batch``.

        Intended to be passed as the ``collate_fn`` argument of a ``DataLoader``.
        Returns two stacked tensors ``(batch_src, batch_tgt)``.
        """
        batch_max_length = max(max(len(src), len(tgt)) for src, tgt in batch)
        pad_idx = self.pad_token_id
        batch_src = [pad(src, (0, batch_max_length - len(src)), value=pad_idx) for src, _ in batch]
        batch_tgt = [pad(tgt, (0, batch_max_length - len(tgt)), value=pad_idx) for _, tgt in batch]
        return torch.stack(batch_src), torch.stack(batch_tgt)


class GPT2Lyrics(nn.Module):
    """Frozen pretrained GPT-2 backbone with an extended embedding table and a trainable head.

    The token-embedding matrix of the backbone is enlarged by ``num_added_tokens`` rows
    (for the custom ``<bos>``/``<eos>``/``<pad>`` tokens). All backbone parameters are
    frozen except those new rows, and ``custom_head`` maps the final hidden states to
    logits over ``vocab_size`` tokens.

    Args:
        vocab_size: Size of the extended vocabulary (output width of ``custom_head``).
        num_added_tokens: Number of embedding rows appended to the pretrained matrix.
        model_path: Directory or hub id handed to ``GPT2Model.from_pretrained``.
            ``None`` (default) falls back to ``DEFAULT_MODEL_PATH``.
    """

    # Local snapshot used when the caller does not supply ``model_path``; the hub id
    # uer/gpt2-chinese-cluecorpussmall can be passed instead.
    DEFAULT_MODEL_PATH = r"D:\cache\models--uer--gpt2-chinese-cluecorpussmall\snapshots\c2c0249d8a2731f269414cc3b22dff021f8e07a3"

    def __init__(self, vocab_size, num_added_tokens, model_path=None):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(model_path or self.DEFAULT_MODEL_PATH)

        # Detach first so the new Parameter is built from plain data, not from a tensor
        # that carries autograd history.
        pretrained_rows = self.gpt2.wte.weight.detach()
        # Read the hidden width from the checkpoint instead of hard-coding 768.
        num_pretrained, hidden_size = pretrained_rows.shape
        # New rows are drawn from a normal distribution matching the mean/std of the
        # pretrained embedding matrix.
        new_rows = torch.normal(mean=pretrained_rows.mean().item(),
                                std=pretrained_rows.std().item(),
                                size=(num_added_tokens, hidden_size),
                                device=pretrained_rows.device)
        self.gpt2.wte.weight = nn.Parameter(torch.cat([pretrained_rows, new_rows], dim=0))
        # Keep the embedding/config metadata consistent with the enlarged matrix.
        self.gpt2.wte.num_embeddings = num_pretrained + num_added_tokens
        self.gpt2.config.vocab_size = vocab_size

        # Freeze the whole backbone ...
        for param in self.gpt2.parameters():
            param.requires_grad = False
        # ... except the appended embedding rows. ``requires_grad`` is a per-tensor flag,
        # so setting it on a slice (``param[-n:].requires_grad = True``) only touches a
        # temporary tensor and trains nothing. Instead the full matrix is made trainable
        # and the gradient of the pretrained rows is zeroed by a hook.
        # Caveats: an optimizer with weight decay still shrinks the pretrained rows, and
        # the hook is neither part of the state_dict nor carried over by copies.
        if num_added_tokens > 0:
            embedding = self.gpt2.wte.weight
            embedding.requires_grad = True

            def _keep_new_rows_only(grad):
                masked = grad.clone()
                masked[:num_pretrained] = 0
                return masked

            embedding.register_hook(_keep_new_rows_only)

        # Trainable projection from the backbone's hidden states to the extended vocabulary.
        self.custom_head = nn.Sequential(nn.Linear(hidden_size, hidden_size * 4),
                                         nn.Dropout(0.1),
                                         nn.Linear(hidden_size * 4, vocab_size))

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, seq_len, vocab_size)`` for ``input_ids``.

        Inputs are moved to the backbone's device before the forward pass.
        """
        input_ids = input_ids.to(self.gpt2.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.gpt2.device)
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden)
        return self.custom_head(last_hidden_state)


batch_size = 24
dataset = LyricsDataset("../data/generate/lyrics.csv", nrows=-1, batch_size=batch_size)
vocab_size = dataset.vocab_size

model = GPT2Lyrics(vocab_size, dataset.num_added)
path = '../model/best_lyrics_gpt2_model.pth'  # replace with the path of your saved checkpoint
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a machine
# without CUDA; the predict functions move the model to the target device later.
model.load_state_dict(torch.load(path, map_location="cpu"))



<All keys matched successfully>

# 贪心搜索

In [5]:
def predict(
        text: str,
        model: nn.Module,
        max_length: int,
        separator: str,
        device: str,
        tokenizer,
        decode,
        tempreture: float = 0.75,
):
    """Generate lyrics with greedy (argmax) decoding.

    ``text`` is split on ``separator``; each piece opens a new line of lyrics. The
    model continues the running context after every piece and a "，" is appended
    once the continuation stops.

    Args:
        text: Line openers joined by ``separator``.
        model: Called as ``model(input_ids, None)``; must return logits indexed as
            ``[:, -1, :]``.
        max_length: Maximum number of tokens generated per line opener.
        separator: Delimiter between line openers in ``text``.
        device: Device the model and the input ids are placed on.
        tokenizer: Called as ``tokenizer(piece, return_tensors="pt",
            add_special_tokens=False)``; must return a mapping with ``"input_ids"``.
        decode: Maps one token id (int) to its string form.
        tempreture: Logit divisor (name kept for backward compatibility). For a
            positive value it does not change the argmax.

    Returns:
        The generated text without the leading ``<bos>`` marker.

    Raises:
        ValueError: If ``tempreture`` is not positive.
    """
    if tempreture <= 0:
        raise ValueError("tempreture must be positive")
    model.eval().to(device)

    def encode(piece):
        # A piece may map to zero, one or several ids; flatten instead of ``.item()``.
        encoded = tokenizer(piece, return_tensors="pt", add_special_tokens=False)
        return encoded["input_ids"].reshape(-1).tolist()

    def extend_greedily(prefix_ids):
        """Append argmax tokens to ``prefix_ids`` until ``<eos>`` or ``max_length`` steps."""
        token_ids = list(prefix_ids)
        window = torch.tensor(token_ids, device=device).unsqueeze(0)
        with torch.no_grad():
            for _ in range(max_length):
                # No <pad> during autoregressive generation, hence no attention mask.
                logits = model(window, None)[:, -1, :] / tempreture
                next_id = logits.argmax(dim=-1)
                if decode(next_id.item()) == "<eos>":
                    break
                token_ids.append(next_id.item())
                window = torch.cat((window, next_id.unsqueeze(0)), dim=-1)
        return token_ids

    bos_ids = encode("<bos>")
    comma_ids = encode("，")
    # The context is carried across line openers as token ids. Decoding to strings and
    # re-tokenizing each round would prepend one more <bos> per round and would break
    # on tokens that do not survive the round trip.
    context_ids = list(bos_ids)
    for opener in text.split(separator):
        for char in opener:
            context_ids.extend(encode(char))
        context_ids = extend_greedily(context_ids)
        context_ids.extend(comma_ids)

    # Drop the <bos> marker by position; ``str.strip("<bos>")`` strips a character set
    # and would also eat leading/trailing '<', 'b', 'o', 's', '>' of the lyrics.
    pieces = [decode(token_id) for token_id in context_ids[len(bos_ids):]]
    return "".join(pieces).replace("，，", "，")


text = "玫瑰/晚风"
# Use the GPU when one is present, otherwise fall back to the CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
generated_lyrics = predict(
    text,
    model,
    100,
    "/",
    device,
    dataset.tokenizer,
    dataset.tokenizer.decode,
    tempreture=0.95,
)

generated_lyrics  # greedy decoding: the generated content repeats itself

'玫瑰花瓣，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，晚风吹拂我们的爱情，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种美丽的传说，我们的爱情是一种，'

# 概率采样

In [2]:
def predict(
        text: str,
        model: nn.Module,
        max_length: int,
        separator: str,
        device: str,
        tokenizer,
        decode,
        tempreture: float = 0.75,
):
    """Generate lyrics by sampling each next token from the temperature-scaled distribution.

    ``text`` is split on ``separator``; each piece opens a new line of lyrics. The
    model continues the running context after every piece and a "，" is appended
    once the continuation stops.

    Args:
        text: Line openers joined by ``separator``.
        model: Called as ``model(input_ids, None)``; must return logits indexed as
            ``[:, -1, :]``.
        max_length: Maximum number of tokens generated per line opener.
        separator: Delimiter between line openers in ``text``.
        device: Device the model and the input ids are placed on.
        tokenizer: Called as ``tokenizer(piece, return_tensors="pt",
            add_special_tokens=False)``; must return a mapping with ``"input_ids"``.
        decode: Maps one token id (int) to its string form.
        tempreture: Logit divisor (name kept for backward compatibility); smaller
            values sharpen the distribution, larger values flatten it.

    Returns:
        The generated text without the leading ``<bos>`` marker.

    Raises:
        ValueError: If ``tempreture`` is not positive (the scaled logits would be
            infinite or NaN).
    """
    if tempreture <= 0:
        raise ValueError("tempreture must be positive")
    model.eval().to(device)

    def encode(piece):
        # A piece may map to zero, one or several ids; flatten instead of ``.item()``.
        encoded = tokenizer(piece, return_tensors="pt", add_special_tokens=False)
        return encoded["input_ids"].reshape(-1).tolist()

    def extend_by_sampling(prefix_ids):
        """Append sampled tokens to ``prefix_ids`` until ``<eos>`` or ``max_length`` steps."""
        token_ids = list(prefix_ids)
        window = torch.tensor(token_ids, device=device).unsqueeze(0)
        with torch.no_grad():
            for _ in range(max_length):
                # No <pad> during autoregressive generation, hence no attention mask.
                logits = model(window, None)[:, -1, :] / tempreture
                # Building the distribution from logits avoids a separate softmax.
                next_id = torch.distributions.Categorical(logits=logits).sample()
                if decode(next_id.item()) == "<eos>":
                    break
                token_ids.append(next_id.item())
                window = torch.cat((window, next_id.unsqueeze(0)), dim=-1)
        return token_ids

    bos_ids = encode("<bos>")
    comma_ids = encode("，")
    # The context is carried across line openers as token ids. Decoding to strings and
    # re-tokenizing each round would prepend one more <bos> per round and would break
    # on tokens that do not survive the round trip.
    context_ids = list(bos_ids)
    for opener in text.split(separator):
        for char in opener:
            context_ids.extend(encode(char))
        context_ids = extend_by_sampling(context_ids)
        context_ids.extend(comma_ids)

    # Drop the <bos> marker by position; ``str.strip("<bos>")`` strips a character set
    # and would also eat leading/trailing '<', 'b', 'o', 's', '>' of the lyrics.
    pieces = [decode(token_id) for token_id in context_ids[len(bos_ids):]]
    return "".join(pieces).replace("，，", "，")


text = "玫瑰/晚风"
# Use the GPU when one is present, otherwise fall back to the CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sampling is stochastic: fix the seed so re-running the cell reproduces its output.
torch.manual_seed(42)
generated_lyrics = predict(
    text,
    model,
    100,
    "/",
    device,
    dataset.tokenizer,
    dataset.tokenizer.decode,
    tempreture=0.95,
)

generated_lyrics

'玫瑰以为我变成什么美丽的花，其实她只是想和我并肩走，只为什么要我想你，还一个人走，不想再去想你，只想和我在一起，期间她对我说过一句话，只要你到我身边，就会比别人好又何必日日夜夜说，只有一夜夜等于一生，再说，晚风里和我一起的作者，忠贞不牺，一生为爱苦苦追，只为离开故土，爱是一种力量，守候玫瑰和他，今生与你，天天涯海角相逢，天天转月看你，不悔做同一个梦，不因为我爱你到尽头，不会相见，不在乎那些闷闷的年代，心随身，'

# 束搜索

In [42]:
def beam_search(model, initial_tensor, k, decode, max_length, device):
    """Expand ``initial_tensor`` with beam search and return every hypothesis found.

    Args:
        model: Called as ``model(input_ids, None)``; must return logits indexed as
            ``[:, -1, :]``.
        initial_tensor: Prompt token ids with a leading batch dimension of one
            (the code reads ``seq[0, -1]`` and concatenates along the last dim).
        k: Beam width: number of continuations tried per beam and number of beams
            kept after each step.
        decode: Maps one token id (int) to its string form; used to detect ``<eos>``.
        max_length: Maximum number of expansion steps.
        device: Device on which the initial score tensor is created.

    Returns:
        A list of ``(token_ids, score)`` pairs sorted by descending score, where
        ``token_ids`` is a flat list of ints (prompt included, a trailing ``<eos>``
        is kept for finished hypotheses) and ``score`` is a 0-dim tensor holding the
        summed log-probability of the generated tokens.
    """
    # Each beam is (sequence, cumulative log-probability, generated length).
    live = [(initial_tensor, torch.tensor(0.0, device=device), 0)]
    completed = []

    with torch.no_grad():
        for _ in range(max_length):
            # Beams whose last token is <eos> are finished and leave the search.
            open_beams = []
            for seq, score, length in live:
                if length > 0 and decode(seq[0, -1].item()) == '<eos>':
                    completed.append((seq, score))
                else:
                    open_beams.append((seq, score, length))
            # Re-assigning ``live`` here keeps finished beams from being added to
            # ``completed`` a second time after the loop.
            live = open_beams
            if not live:
                break

            # All open beams have the same length, so one batched forward pass
            # replaces one pass per beam.
            batch = torch.cat([seq for seq, _, _ in live], dim=0)
            # log_softmax stays finite where log(softmax(x)) underflows to -inf.
            log_probs = torch.log_softmax(model(batch, None)[:, -1, :], dim=-1)
            width = min(k, log_probs.shape[-1])  # topk raises if k exceeds the vocabulary
            top_scores, top_ids = log_probs.topk(width, dim=-1)

            expanded = []
            for row, (seq, score, length) in enumerate(live):
                for j in range(width):
                    token = top_ids[row, j].reshape(1, 1)
                    expanded.append((torch.cat([seq, token], dim=-1),
                                     score + top_scores[row, j],
                                     length + 1))
            if not expanded:
                break

            # Sort on Python floats rather than comparing device tensors pairwise.
            expanded.sort(key=lambda beam: beam[1].item(), reverse=True)
            live = expanded[:k]

    # Beams that never produced <eos> are returned as well.
    completed.extend((seq, score) for seq, score, _ in live)
    completed.sort(key=lambda item: item[1].item(), reverse=True)
    return [(seq.reshape(-1).tolist(), score) for seq, score in completed]


def predict(
        text: str,
        model: nn.Module,
        max_length: int,
        separator: str,
        device: str,
        tokenizer,
        decode,
        k: int = 5
):
    """Generate lyrics with beam search.

    ``text`` is split on ``separator``; each piece opens a new line of lyrics. For
    every piece the best-scoring hypothesis of ``beam_search`` becomes the new
    context and a "，" is appended to it.

    Args:
        text: Line openers joined by ``separator``.
        model: Forwarded to ``beam_search``.
        max_length: Maximum number of expansion steps per line opener.
        separator: Delimiter between line openers in ``text``.
        device: Device the model and the input ids are placed on.
        tokenizer: Called as ``tokenizer(piece, return_tensors="pt",
            add_special_tokens=False)``; must return a mapping with ``"input_ids"``.
        decode: Maps one token id (int) to its string form.
        k: Beam width.

    Returns:
        The generated text without the ``<bos>`` / ``<eos>`` markers.
    """
    model.eval().to(device)

    def encode(piece):
        # A piece may map to zero, one or several ids; flatten instead of ``.item()``.
        encoded = tokenizer(piece, return_tensors="pt", add_special_tokens=False)
        return encoded["input_ids"].reshape(-1).tolist()

    bos_ids = encode("<bos>")
    comma_ids = encode("，")
    # The context is carried across line openers as token ids. Decoding to strings and
    # re-tokenizing each round would prepend one more <bos> per round and would break
    # on tokens that do not survive the round trip.
    context_ids = list(bos_ids)
    with torch.no_grad():
        for opener in text.split(separator):
            for char in opener:
                context_ids.extend(encode(char))
            # No <pad> during autoregressive generation, so no attention mask is needed.
            prompt = torch.tensor(context_ids, device=device).unsqueeze(0)
            hypotheses = beam_search(model, prompt, k, decode, max_length, device)
            best_ids, _ = max(hypotheses, key=lambda item: item[1].item())
            best_ids = list(best_ids)
            # A finished hypothesis still ends with <eos>; remove it so the marker
            # neither shows up in the output nor pollutes the next round's context.
            while len(best_ids) > len(context_ids) and decode(best_ids[-1]) == "<eos>":
                best_ids.pop()
            context_ids = best_ids + comma_ids

    # Drop the <bos> marker by position; ``str.strip("<bos>")`` strips a character set
    # and would also eat leading/trailing '<', 'b', 'o', 's', '>' of the lyrics.
    pieces = [decode(token_id) for token_id in context_ids[len(bos_ids):]]
    return "".join(pieces).replace("，，", "，")


text = "玫瑰/晚风"
# Use the GPU when one is present, otherwise fall back to the CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
generated_lyrics = predict(
    text,
    model,
    50,
    "/",
    device,
    dataset.tokenizer,
    dataset.tokenizer.decode,
    k=20
)

generated_lyrics

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class


KeyboardInterrupt

