In [1]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from Utility import *
from typing import List, Union, Tuple
from collections import defaultdict
from MarkovChain import MarkovChain

In [2]:

# 1. 数据预处理相关函数
lines = read_time_machine()
tokens = tokenize(lines)
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [2]:
# 2. BERT模型加载与配置
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # 加载BERT对应的词表等
bert_model = BertModel.from_pretrained('bert-base-uncased')  # 加载预训练BERT模型
bert_model.eval()  # 设置为评估模式，因为只是用它提取特征，不训练



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
BEGIN = "___BEGIN__"
END = "___END__"

In [17]:
# 示例数据，这里简单模拟了一些单词组成的序列，实际应用中可以从文件读取文本并进行分词等处理来获取更丰富的数据
data = [["the", "cat", "runs", "quickly"],[ "the", "dog", "walks", "slowly"],["the", "bird", "flies", "high"]]
# 创建马尔可夫链模型实例，设置阶数为2
markov_chain = MarkovChain(7, data)

# 使用模型生成一个长度为5的新序列，不指定起始状态（将随机选择起始状态）
generated_sequence = markov_chain.generate(6)
print(generated_sequence)

['___BEGIN__', '___BEGIN__', '___BEGIN__', '___BEGIN__', 'the', 'bird', 'flies']


In [21]:
def rerank_texts_using_bert(texts: List[str]) -> List[str]:
    """
    使用BERT模型对输入的文本进行打分，根据得分重新排序
    """
    scores = []
    with torch.no_grad():
        for text in texts:
            tokenized_text = tokenizer.tokenize(text)
            input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
            outputs = bert_model(torch.tensor([input_ids]))
            # 可以使用最后一层隐藏状态的平均值等方式来获取文本的特征表示
            text_feature = outputs.last_hidden_state.squeeze(0).mean(dim=0).numpy()
            scores.append(text_feature)

    sorted_indices = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)
    return [texts[i] for i in sorted_indices]


def generate_and_rerank(length: int, markov_chain: MarkovChain) -> str:
    """
    先使用马尔科夫链生成文本，再用BERT模型润色
    """
    generated_texts = []
    for _ in range(5):  # 生成多个文本，从中选优
        text = markov_chain.generate(length)
        generated_texts.append(' '.join(text))

    reranked_text = rerank_texts_using_bert(generated_texts)[0]
    return reranked_text


# 假设的训练数据
train_data = [["I", "love", "reading", "books"], ["She", "likes", "to", "play", "football"], ["They", "enjoy", "watching", "movies"]]
markov_chain = MarkovChain(2, train_data)

# 生成并润色文本
generated_text = generate_and_rerank(2, markov_chain)
print(generated_text)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()