In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from webencodings import labels

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

input_text = "Transformer are the"
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)
iters = []
n_step3 = 8
choices_per_step = 5

with torch.no_grad():
    for _ in range(n_step3):
        iter = dict()
        iter["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        # Select logits of the first batch and the last token and apply softmax
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        # store tokens with highest probs
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
            iter[f"Choice {choice_idx+1}"] = token_choice
            
            # append predicted next token to input
            input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)
        iters.append(iter)

df = pd.DataFrame(iters)
df.to_html("/Users/admin/Developer/PyProjects/DeepLearing/output/output.html")

#use generate
output = model.generate(input_ids, do_sample=False, max_new_tokens=n_step3)
print(tokenizer.decode(output[0]))
print("-"*40)
#reproduce unicorn stories
max_length = 128
input_text = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))


  from .autonotebook import tqdm as notebook_tqdm
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformer are the most most most most most powerful powerful powerful powerful powerful weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon
----------------------------------------
Transformer are the most most most most most powerful powerful powerful powerful powerful weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon we

## Greedy Search vs Beam Search Decoding vs Stochastic Beam Search
### Greedy Search （贪心搜索）
* 每一步都选择概率最高的一个单词作为下一个token，只跟踪一条最优路径，不考虑其他选项。
* 公式：$ Y_{t+1} = \arg \max_{ w}P(w \mid Y_{1:t}) $ <br><br>
* 计算快实现简单，但容易卡在局部最优，因为他不考虑未来选项，且缺乏多样性，相同输入总是相同输出。<br>

### Beam Search Decoding（集束搜索）
* 维护 k 条可能的路径（beam size = k），在每一步选择 k 个最可能的序列继续扩展，而不是只选一个最优路径。
* 最终返回概率最高的完整句子。
* 公式：近似最大化序列概率 $ Y^* = \arg \max_{ y}P(Y \mid X) $
1. 设 beam_size = k（比如 k=3）
2. 在每一步，计算所有扩展路径的概率，保留 k 个最优的序列。
3. 直到句子结束符 <EOS>，选择最高概率的完整序列。
<br>
比贪心搜索更优，避免局部最优解。可调控 k 值，在质量和计算成本之间取得平衡。但 计算开销较大，k 越大，计算量越高；仍然可能丢失全局最优解，如果 k 过小，搜索空间不够大。

### Stochastic Beam Search（随机集束搜索）

* 传统 Beam Search 只保留最高概率的路径，而 Stochastic Beam Search 会随机选择 k 个候选路径。
* 引入随机性，避免所有生成的文本都完全相同，提升文本多样性
* 过程
1. 采样候选路径，而不是每次都选 top-k 的最优路径。
2. 在 beam_size 内，使用温度系数调整概率：
$ P(w) \propto P(w)^{1/T} $ <br>
    T > 1 → 选择更随机的词 <br>
    T < 1 → 选择更确定的词 <br>

生成更多样化的文本，适用于对话、故事生成等任务。不会死板地选取最优解，更像人类写作风格。<br>
质量不一定最优，可能选到概率较低的词。需要调节超参数，如 T 和 k，以保证质量。<br>

In [4]:
import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
    """
    single token log probability from logits
    :param logits: 
    :param labels: 
    :return: 
    """
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def sequence_logprobs(model, labels, input_len = 0):
    """
    sequence log probability
    :param model: 
    :param labels: 
    :param input_len: 
    :return: 
    """
    with torch.no_grad():
        output = model(labels)
        log_probs = log_probs_from_logits(output.logits[:, :-1, :], labels[:, 1:])
        seq_log_probs = torch.sum(log_probs[:, input_len: ])
    return seq_log_probs

logp = sequence_logprobs(model, output_greedy, input_len=len(input_ids[0]))
print("-/"*20)
print(tokenizer.decode(output_greedy[0]))
print(f"Sequence log probs: {logp:.2f}")

print("-bs-"*20)
print('beam search:\n')
# beam search
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5,
 do_sample=False)
logp = sequence_logprobs(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\nlog-prob: {logp:.2f}")

# beam search with ngram
print("-bs ng-"*20)
print("\n")
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5,
 do_sample=False, no_repeat_ngram_size=3)
logp = sequence_logprobs(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f"\nlog-prob: {logp:.2f}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/-/
Transformer are the most most most most most powerful powerful powerful powerful powerful weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon
Sequence log probs: -2.31
-bs--bs--bs--bs--bs--bs--bs--bs--bs--bs--bs--

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformer are the most most most most most powerful powerful powerful powerful powerful weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon

log-prob: -2.31
-bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs ng--bs 

> <strong> N-gram Penalty（N-gram 惩罚）</strong>  : N-gram Penalty（N-gram 惩罚）是一种 解码策略，用于 防止重复生成相同的 N-gram 片段，尤其在 文本生成任务（如机器翻译、文本摘要、对话系统）中，能有效减少 重复问题。<br>
> N-gram 指的是由 N 个连续 token 组成的短语
> 重复模式 发生的原因: <br>
>   a. 语言模型学习到了某些短语的高概率，导致它 倾向于重复生成。 <br>
>   b. 在 Beam Search 或 Greedy Search 过程中，较高概率的 token 会不断被选中，形成重复。<br>
> How it works? <br>
>   1. 记录 已经生成的 N-gram 片段 <br>
>   2. 当解码过程中尝试生成下一个 token 时: 如果这个 token 形成了已存在的 N-gram，则 禁止生成（分数设为极低）。这样就可以 避免完全相同的重复短语。<br>

## Sampling Methods

Reduces repetitions.

### 随机采样
最简单的采样方法是在每个时间步随机从模型输出的整个词汇表的概率分布中进行采样。 <br>

$P(y_t = w_i | y_{<t}, \mathbf{x}) = \text{softmax}(z_{t,i}) = \frac{\exp(z_{t,i})}{\sum_{j=1}^{|V|} \exp(z_{t,j})} $ <br>

V 表示词表基数，可以通过增加一个温度参数 T在softmax操作之前缩放logits，以此控制输出结果的多样性。<br>

$P(y_t = w_i | y_{<t}, \mathbf{x}) = \frac{\exp(z_{t,i} / T)}{\sum_{j=1}^{|V|} \exp(z_{t,j} / T)}$ <br><br>

调整T来控制概率分布的形状。T<= 1, 分布在原点附近变得尖锐，稀有的token被抑制；当温度 T≫1 时，分布变得平坦，每个token变得同样可能。 <br><br>
<img src="./imgs/temperature_contrl_logits_scale.png" width="400"/> <br>

* T 越小，概率分布越陡峭，输出更确定（接近贪心解）。
* T 越大，概率分布越平缓，采样更随机（提升多样性但可能导致错误）。
* 通常T 设为 0.7 ~ 1.0 之间，以平衡确定性与多样性

### Top-K and Nucleus Sampling

核心思想是限制参与采样的相关token数量。Top-k 取K个最高概率的token。避免选择低概率token作为下一个输出token。相比于Top-k的固定阶段方式，Top-p采用动态截断，设定一个阈值，比如95%，
将所有token按概率值降序，以此高->低取，并逐个累加概率和直至达到阈值95%。

In [15]:
print("-top-k-"*20)
print("\n")
output_topk = model.generate(input_ids, max_length=max_length,
 do_sample=True, top_k=50)
logp = sequence_logprobs(model, output_topk, input_len=len(input_ids[0]))
print(tokenizer.decode(output_topk[0]))
print(f"\nlog-prob: {logp:.2f}")

print("-top-p-"*20)
print("\n")
output_topp = model.generate(input_ids, max_length=max_length,
 do_sample=True, top_p=0.6)
logp = sequence_logprobs(model, output_topp, input_len=len(input_ids[0]))
print(tokenizer.decode(output_topp[0]))
print(f"\nlog-prob: {logp:.2f}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k--top-k-




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformer are the most most most most most powerful powerful powerful powerful powerful weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon weapon

log-prob: -2.31
-top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top-p--top

In [None]:
## Text Summarization

### Datasets

In [24]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")
print(f"Features:{dataset['train'].column_names}")
sample = dataset['train'][1]
print(f"""
Article (excerpt of 500 characters, total length:
{len(sample["article"])}):
""")
print(sample["article"][:500])
print(f'\nSummary (length: {len(sample["highlights"])}):')
print(sample["highlights"])

Features:['article', 'highlights', 'id']

Article (excerpt of 500 characters, total length:
4051):

Editor's note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most s

Summary (length: 281):
Mentally ill inmates in Miami are housed on the "forgotten floor"
Judge Steven Leifman says most are there as a result of "avoidable felonies"
While CNN tours facility, patient shouts: "I am the son of the president"
Leifman says the system is unjust and he's fighting for change .


处理缩写中的标点符号，如：U.S. or U.N.

In [15]:
import nltk
from nltk.tokenize import sent_tokenize
print(nltk.data.path)

import sys
import nltk
print(f"Python Version: {sys.version}")
print(f"NLTK Version: {nltk.__version__}")

nltk.download('punkt_tab', download_dir='/Users/admin/nltk_data')
string = "铁木真.阿济格. The U.N. is an organization."
sent_tokenize(string)

['/Users/admin/nltk_data', '/Users/admin/miniconda3/envs/DL_learning/nltk_data', '/Users/admin/miniconda3/envs/DL_learning/share/nltk_data', '/Users/admin/miniconda3/envs/DL_learning/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/admin/nltk_data/', '/Users/admin/']
Python Version: 3.10.0 | packaged by conda-forge | (default, Nov 20 2021, 02:43:39) [Clang 11.1.0 ]
NLTK Version: 3.9.1


[nltk_data] Downloading package punkt_tab to /Users/admin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['铁木真.阿济格.', 'The U.N. is an organization.']

### Summarization Baseline
比较常用的一个方式是直接取文章的前三个句子。以下是用NLTK实现baseline的例子：

In [None]:
from nltk.tokenize import sent_tokenize
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")
sample_text = dataset["train"][1]["article"][:2000]
summaries = {}

## baseline
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])
summaries["baseline"] = three_sentence_summary(sample_text)

## 使用GPT2做摘要任务
from transformers import pipeline, set_seed

set_seed(42)
pipe = pipeline("text-generation", model="gpt2")
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_output = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
summaries["gpt2"] = "\n".join(sent_tokenize(pipe_output[0]["generated_text"][len(gpt2_query):]))
print(summaries)

#use t5: 通过将所有任务形式化为文本到文本的任务来创建一个通用的transformer 架构; T5检查点是在无监督数据（用于重建遮蔽词）和多个任务的监督数据（包括摘要）的混合数据上进行训练的。
# pipe = pipeline("summarization", model="t5-small")
# pipe_out = pipe(sample_text, num_beams=2, max_length=50)
# summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))
# print(summaries)

#use bart: encoder->decoder架构，被训练用于重构受损的输入，结合了gpt2与bert的预训练方案

## Evaluation Metrics
生成式语言模型能力评估常用的两种方式：BLEU ROUGE

### BLEU （Bilingual Evaluation Understudy）
关注词汇或者n-gram，而不再只是关注生成文本中多少token与参考文本的token对齐。BLEU是一种基于精确度的度量标准，在比较两段文本时，统计生成的文本有多少词是在参考文本里面，再除以参考文本长度，从而得出精确度。<br>
缺陷：如果生成文本中有重复词汇，且这个词汇又在参考文本中出现，那么上述方案计算出的精确度就不置信 <br>
解决：统计生成文本中的词汇出现次数时，以参考文本中该词汇出现次数作为统计上限。<br><br>
a. **n-gram精确度计算**<br>
$P_n=\sum\frac{候选文本中匹配的_{n-gram}}{候选文本中所有的_{n-gram}}$ <br><br>

b. **BP（Brevity Penalty) 长度惩罚**<br>
如生成的句子比参考文本短很多，会导致BLEU过高，引入惩罚项即可：<br>
$BP=\begin{cases}
1, & \text{if } c > r\\
e^{1-r/c}, & \text{if } c \leq r
\end{cases}
$
* c=生成文本长度
* r=参考文本长度

c.**BLEU 计算公式**
$BLEU=BP.exp \left( \sum^{N}_{n=1}{w_n logP_n} \right)$
### ROUGE

计算生成文本和参考文本之间的 n-gram 重叠情况。主要关注 召回率（Recall），即参考文本中的内容有多少被生成文本包含。

常见变体：
1. ROUGE-N（n-gram召回率）
$ROUGE-N=\frac {\sum{匹配的_{n-gram}数}}{\sum{参考文本的_{n-gram}总数}}$

2. ROUGE-L 最长公共子序列(LCS)
计算生成文本与参考文本之间的最长公共子序列（LCS），避免 n-gram 过于严格。
$ROUGE-L=\frac{LCS长度}{参考文本长度}$
> 
> 参考："The cat is on the mat." 
> 
> 生成："The cat is on mat."
> 
> LCS = "The cat is on mat."（匹配长度 5）
> 
> $ROUGE-L = \frac{5}{6}$ 

3. ROUGE-W（加权最长公共子序列）
ROUGE-L 改进版本，给较长的LCS更高权重

4. ROUGE-S（跳跃bi-gram统计）
统计允许跳跃的二元组匹配，计算非连续的短语重叠率

d.**BLEU与ROUGE区别** 
<br><br>
<img src="./imgs/bleu_vs_rouge.png" width="400"/>

In [13]:
# 使用nltk库计算BLEU score
from nltk.translate.bleu_score import sentence_bleu
reference = [["the", "cat", "is", "on", "the", "mat"]]
candidate = ["the", "cat", "is", "on", "mat"]

#BLUE-1 ~ 4
score = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))
print(f"BLEU score: {score:.4f}")

#使用evaluate
import evaluate
import pandas as pd
import numpy as np
bleu_metric = evaluate.load("sacrebleu")

bleu_metric.add(prediction="the the the the the the", reference=["the cat is on the mat"])
results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in
results["precisions"]]
df = pd.DataFrame.from_dict(results, orient="index", columns=["Value"])
print(df)
print("-"*30)
bleu_metric.add(prediction="the cat is on mat", reference=["the cat is on the mat"])
results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
results["precisions"] = [np.round(p, 2) for p in
results["precisions"]]
df = pd.DataFrame.from_dict(results, orient="index", columns=["Value"])
print(df)

# 使用rouge-score库计算rouge召回率
from rouge_score import rouge_scorer
from datasets import load_dataset
reference = "The cat is on the mat."
candidate = "The cat is on mat."

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(reference, candidate)
print(f"ROUGE-1 score: {scores['rouge1'].fmeasure:.4f}")
print(f"ROUGE-2 score: {scores['rouge2'].fmeasure:.4f}")
print(f"ROUGE-L score: {scores['rougeL'].fmeasure:.4f}")

# 使用evaluate计算rouge
rouge_metric = evaluate.load("rouge")
dataset = load_dataset("cnn_dailymail", "3.0.0")

reference = dataset["train"][1]["highlights"]
records = []
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
for model_name in summaries:
 rouge_metric.add(prediction=summaries[model_name],
reference=reference)
 score = rouge_metric.compute()
 rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in
rouge_names)
 records.append(rouge_dict)
df = pd.DataFrame.from_records(records, index=summaries.keys())
print(df)

BLEU score: 0.5789
                             Value
score                          0.0
counts                [2, 0, 0, 0]
totals                [6, 5, 4, 3]
precisions  [33.33, 0.0, 0.0, 0.0]
bp                             1.0
sys_len                          6
ref_len                          6
------------------------------
                                 Value
score                        57.893007
counts                    [5, 3, 2, 1]
totals                    [5, 4, 3, 2]
precisions  [100.0, 75.0, 66.67, 50.0]
bp                            0.818731
sys_len                              5
ref_len                              6
ROUGE-1 score: 0.9091
ROUGE-2 score: 0.6667
ROUGE-L score: 0.9091


NameError: name 'summaries' is not defined

## Train own summarization model

`see python/train_own_summarization_model.py`