In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 加载预训练的 GPT-2 模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained("D:\\DDA4210\\gpt")
model = GPT2LMHeadModel.from_pretrained("D:\\DDA4210\\gpt")

# 定义函数来计算困惑度并替换单词
def calculate_perplexity_and_replace(text):
    # 分词
    tokenized_text = tokenizer.tokenize(text)
    # 在句子开头添加起始符号
    tokenized_text = ['<|endoftext|>'] + tokenized_text
    # 将单词转换为索引
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    # 转换为张量
    input_ids = torch.tensor([input_ids])

    # 使用模型预测下一个单词的概率分布
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        logits = outputs.logits

    # 计算困惑度
    perplexities = []
    for i, word in enumerate(tokenized_text):
        word_logits = logits[0, i]  # 获取当前单词的概率分布
        word_index = tokenizer.convert_tokens_to_ids(word)
        word_probability = torch.softmax(word_logits, dim=-1)[word_index].item()  # 获取当前单词的概率
        perplexity = 1 / word_probability  # 计算困惑度
        perplexities.append(perplexity)

    # 找出困惑度过高的单词
    high_perplexity_words = [tokenized_text[i] for i, perplexity in enumerate(perplexities) if perplexity > 1]

    # 对困惑度过高的单词进行替换
    for word in high_perplexity_words:
        # 替换成其他单词或者通过某种方法重新生成
        pass

    return tokenized_text

# 示例文本
text = "The quick brown fox jumps over the lazy dog."
# 计算困惑度并替换单词
processed_text = calculate_perplexity_and_replace(text)
print(processed_text)




['<|endoftext|>', 'The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']


In [3]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

# 查找单词的同义词
synonyms = []
for syn in wordnet.synsets("happy"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print("Synonyms:", synonyms)

# 查找单词的反义词
antonyms = []
for syn in wordnet.synsets("happy"):
    for lemma in syn.lemmas():
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())
print("Antonyms:", antonyms)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Charon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Synonyms: ['happy', 'felicitous', 'happy', 'glad', 'happy', 'happy', 'well-chosen']
Antonyms: ['unhappy']


In [4]:
import torch
import torch.nn.functional as F

def calculate_word_perplexity(model, tokenizer, text):
    """
    计算文本中每个单词的困惑度

    Args:
        model: 训练好的语言模型
        tokenizer: 分词器
        text: 输入的文本字符串

    Returns:
        word_perplexities: 包含每个单词困惑度的列表
    """
    # 分词
    tokens = tokenizer.encode(text)
    # print(tokens)
    # 添加特殊标记和转换为张量
    # input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor(tokens).unsqueeze(0)  # 添加批次维度
    # print(input_ids)
    # 初始化空列表用于存储每个单词的困惑度
    word_perplexities = []
    
    # 使用模型生成每个单词的概率分布并计算困惑度
    with torch.no_grad():
        for i in range(len(tokens)):
            # 获取当前位置之前的文本
            context = input_ids[:i+1]
            # 生成下一个词的概率分布
            logits = model(context).logits
            print(logits)
            # 获取当前位置的实际下一个词的索引
            target_id = input_ids[0, i+1]
            print(target_id)
            # 计算困惑度
            perplexity = calculate_perplexity(logits, target_id)
            # 将困惑度添加到列表中
            word_perplexities.append(perplexity.item())
    
    return word_perplexities

def calculate_perplexity(logits, target_id):
    """
    计算单个词的困惑度

    Args:
        logits: 模型生成的logits张量，形状为(batch_size, vocab_size)
        target_id: 实际的下一个词的索引

    Returns:
        perplexity: 单个词的困惑度，标量
    """
    # 选择实际下一个词的logits
    target_logits = logits[0, target_id]
    # 计算交叉熵损失
    loss = F.cross_entropy(target_logits.unsqueeze(0), torch.tensor([target_id]))
    # 计算指数损失
    perplexity = torch.exp(loss)
    
    return perplexity.item()

# 示例用法
text = "This is an example sentence."
# 模型和分词器应根据实际情况替换为你的模型和分词器
tokenizer = GPT2Tokenizer.from_pretrained("D:\\DDA4210\\gpt")
model = GPT2LMHeadModel.from_pretrained("D:\\DDA4210\\gpt")
word_perplexities = calculate_word_perplexity(model, tokenizer, text)
print("Word Perplexities:", word_perplexities)


tensor([[[ -35.8890,  -35.2049,  -39.1336,  ...,  -42.4869,  -41.8197,
           -36.0383],
         [-107.7291, -108.0175, -113.2968,  ..., -116.4646, -115.7443,
          -110.8654],
         [-100.5390,  -99.8514, -103.7539,  ..., -105.0177, -107.3317,
          -102.0780],
         [ -71.9370,  -72.7245,  -76.2084,  ...,  -82.9281,  -81.7860,
           -73.6416],
         [-104.6989, -105.5694, -111.0104,  ..., -116.2477, -115.0036,
          -106.4377],
         [-127.4615, -125.7557, -125.7914,  ..., -133.5118, -134.6318,
          -119.2069]]])
tensor(318)


IndexError: index 318 is out of bounds for dimension 1 with size 6

In [4]:
import math


tokd_inputs = tokenizer.encode("This is an example sentence.", return_tensors="pt", add_special_tokens=True, truncation=True).to('cpu')
# tokd_inputs = tokenizer.convert_tokens_to_ids(tokd_inputs)
tokd_labels = tokd_inputs.clone().detach()
outputs = model(input_ids=tokd_inputs, labels=tokd_labels)
loss = outputs.loss
perplexity = math.exp(loss)
perplexity

69.94070154811348

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import math



# 定义文本
text = "This is an example sentence."

# 使用分词器对文本进行分词，并添加特殊标记
input_ids = tokenizer.encode(text, return_tensors="pt")

# 使用模型预测下一个词的概率分布
with torch.no_grad():
    outputs = model(input_ids)
    logits = outputs.logits

# 计算困惑度
loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), input_ids.view(-1))
perplexity = math.exp(loss)/len(input_ids[0])

print("Perplexity:", perplexity)


Perplexity: 973.3511934110394
