# [主要的NLP任务](https://huggingface.co/learn/nlp-course/zh-CN/chapter7)
- Token 分类
- 掩码语言建模（如 BERT）
- 文本摘要
- 翻译
- 因果语言建模预训练（如 GPT-2）
- 问答

## Token 分类
目标是对文本中的每个 token（词或子词）进行分类。它是序列标注（Sequence Labeling）的核心任务，广泛应用于信息抽取、语法分析和语义理解等领域。常见的任务类型有：
- 命名实体识别（NER）：识别文本中的人名、地名、组织名等实体
- 词性标注（POS）：标注每个词的语法类别（名词、动词等）
- 分块（Chunking）：划分短语边界（如名词短语、动词短语）

In [None]:
from datasets import load_dataset

# "lhoestq/conll2003" is used because other copies of the dataset may fail to
# download.
raw_datasets = load_dataset("lhoestq/conll2003", trust_remote_code=True)
print(raw_datasets)

# Show the words and the NER tag ids of the first training example.
first_example = raw_datasets["train"][0]
for column in ("tokens", "ner_tags"):
    print(first_example[column])

# Inspect the NER feature and the list of label names it defines.
train_features = raw_datasets["train"].features
ner_feature = train_features["ner_tags"]
print(ner_feature)
label_names = ner_feature.feature.names
print(label_names)

In [None]:
# Print the first training sentence with every label name aligned under its word.
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
word_cells = []
label_cells = []
for word, label in zip(words, labels):
    full_label = label_names[label]
    # Each column is as wide as the longer of the two strings, plus one space.
    max_length = max(len(word), len(full_label))
    word_cells.append(word.ljust(max_length) + " ")
    label_cells.append(full_label.ljust(max_length) + " ")
line1 = "".join(word_cells)
line2 = "".join(label_cells)

print(line1)
print(line2)

In [None]:
# 创建 tokenizer 对象
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# is_split_into_words=True declares that the input is already a list of words;
# the tokenizer then only splits each word further.
first_sentence_words = raw_datasets["train"][0]["tokens"]
inputs = tokenizer(first_sentence_words, is_split_into_words=True)
print(inputs)
subword_tokens = inputs.tokens()
print(subword_tokens)
# Compare the number of input words with the number of tokens produced.
print("input tokens size:%d" % len(first_sentence_words))
print("after tokenized size:%d" % len(subword_tokens))

In [None]:

def align_labels_with_tokens(labels, word_ids):
    """
    Expand word-level NER labels into one label per tokenizer token.

    args:
        labels (List[int]): label id of each word in the sentence
        word_ids (List[Optional[int]]): for each token, the index of the word
            it belongs to, or None for a special token
    return:
        new_labels (List[int]): one label per token. Special tokens get -100,
            the first token of a word gets the word's label, and later tokens
            of the same word get the word's label with odd ids bumped by one
            (B-XXX -> I-XXX, assuming B-/I- ids are paired as odd/odd+1).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: marked with -100.
            aligned.append(-100)
        elif word_id != previous_word:
            # First token of a new word keeps the word's label.
            aligned.append(labels[word_id])
        else:
            # Continuation token of the same word: turn B-XXX into I-XXX.
            label = labels[word_id]
            aligned.append(label + 1 if label % 2 == 1 else label)
        previous_word = word_id
    return aligned

# Try the alignment on the first training example tokenized above.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(word_ids)
print(len(word_ids))
print(labels)
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(aligned_labels)

In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and attach token-level NER labels.

    Reads the "tokens" and "ner_tags" columns of the batch, tokenizes with
    truncation, and stores the aligned labels under "labels" in the returned
    tokenizer output.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(index) gives the token -> word mapping of the index-th sentence.
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(index))
        for index, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs
# Tokenize and align every split, dropping the original columns.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels, batched=True, remove_columns=original_columns
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Step 1: batch collation.
# DataCollatorForTokenClassification is a data collator that also pads.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
first_two_examples = [tokenized_datasets["train"][i] for i in range(2)]
batch = data_collator(first_two_examples)
print(batch["labels"])
# Compare the collated labels with the labels stored in the dataset.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

# Step 2: evaluation metric.
import evaluate
metric = evaluate.load("seqeval")
labels = [label_names[i] for i in raw_datasets["train"][0]["ner_tags"]]
print(labels)
# Fake a prediction by changing the label at index 2, then try the metric.
predictions = labels.copy()
predictions[2] = "O"
print(metric.compute(predictions=[predictions], references=[labels]))
import numpy as np
#  自定义 compute_metrics() 函数，返回所需要的指标
def compute_metrics(eval_preds):
    """
    Compute overall precision, recall, F1 and accuracy for token classification.

    eval_preds is a (logits, labels) pair. The predicted class is the argmax
    over the last axis of the logits. Positions whose label is -100 are
    ignored, and the remaining ids are mapped to names via label_names.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference label names, with ignored (-100) positions removed.
    true_labels = []
    for label_row in labels:
        true_labels.append([label_names[l] for l in label_row if l != -100])

    # Predicted label names at the same kept positions.
    true_predictions = []
    for predicted_row, label_row in zip(predictions, labels):
        true_predictions.append(
            [label_names[p] for p, l in zip(predicted_row, label_row) if l != -100]
        )

    scores = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: scores[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# Step 3: define the model.
# Ids are kept as integers: with string keys, label2id would map every label
# name to "0", "1", ... instead of an integer class id.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)
print("num_labels=%d" % model.config.num_labels)

# Step 4: fine-tune the model.
from transformers import TrainingArguments, Trainer
# Training configuration. `eval_strategy` is the argument name used by the
# other TrainingArguments in this notebook.
args = TrainingArguments(
    "bert-finetuned-ner",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

In [None]:
# 自定义训练循环
from tqdm.auto import tqdm
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
from transformers import get_scheduler

# Build the DataLoaders; only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

# Re-create the model from the base checkpoint so this run does not continue
# from the model trained in the previous cell.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Hand the model, optimizer and dataloaders to the Accelerator, which
# simplifies distributed training and hardware acceleration.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = 3
# Measured after prepare(), i.e. on the dataloader returned by the Accelerator.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
# Linear learning-rate schedule without warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Prefix of the per-epoch output directories written by the training loop.
model_name = "bert-finetuned-ner-accelerate"

# 计算 metric
def postprocess(predictions, labels, id_to_name=None):
    """
    Convert prediction/label id tensors into lists of label names.

    Positions whose reference label is -100 (ignored tokens) are dropped from
    both outputs.

    args:
        predictions (torch.Tensor): predicted class ids, one row per sequence
        labels (torch.Tensor): reference ids aligned with predictions, with
            -100 marking positions to ignore
        id_to_name (Sequence[str], optional): label names indexed by class id;
            defaults to the module-level label_names
    return:
        (true_predictions, true_labels): two lists of lists of label names
    """
    if id_to_name is None:
        id_to_name = label_names
    predictions = predictions.detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()
    # Drop ignored positions (label == -100) and map ids to label names.
    true_labels = [[id_to_name[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id_to_name[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # Predictions come first: callers unpack
    # `true_predictions, true_labels = postprocess(...)`. Returning the labels
    # first would swap predictions and references in the metric.
    return true_predictions, true_labels

# Per-epoch average losses.
train_losses = []
eval_losses = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    total_train_loss = 0.0
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        total_train_loss += loss.item()  # accumulate the training loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Average training loss over the batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    # Evaluation
    model.eval()
    total_eval_loss = 0.0
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
            loss = outputs.loss
            total_eval_loss += loss.item()  # accumulate the validation loss
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        # Predictions and labels must be padded to a common length before
        # gather() can be called.
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    # Average validation loss over the batches of this epoch.
    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    eval_losses.append(avg_eval_loss)
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            "train_loss": avg_train_loss,
            "eval_loss": avg_eval_loss,
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        },
    )
    # Save the model of this epoch.
    output_dir = f"../model/{model_name}-epoch{epoch}"
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save
    )
    # Also save the training state next to the model.
    accelerator.save(
        {"epoch": epoch, "optimizer_state": optimizer.state_dict(), "lr_scheduler_state": lr_scheduler.state_dict()},
        f"{output_dir}/training_state.pt"
    )
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        print(f"\nModel saved to {output_dir}")

# """
# 保存最终的模型
# """
# # 全部训练完成后保存最终模型
# output_dir = f"{model_name}-final"
# accelerator.wait_for_everyone()
# unwrapped_model = accelerator.unwrap_model(model)
# unwrapped_model.save_pretrained(
#     output_dir,
#     is_main_process=accelerator.is_main_process,
#     save_function=accelerator.save
# )

In [None]:
# Evaluate the fine-tuned NER model on the test split.
# A fresh Accelerator is created for this evaluation run.
accelerator = Accelerator()

# 1. Load the model saved after the last training epoch (epoch index 2).
model = AutoModelForTokenClassification.from_pretrained(
    "../model/bert-finetuned-ner-accelerate-epoch2",
    id2label=id2label,
    label2id=label2id,
)

# 2. Build the test DataLoader.
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    collate_fn=data_collator,
    batch_size=8,
)

# 3. Prepare the model and the DataLoader with the Accelerator.
model, test_dataloader = accelerator.prepare(model, test_dataloader)

# 4. Predict on the test set and compute the metrics.
model.eval()
# Accumulators for the decoded predictions and reference labels of all batches.
all_predictions = []
all_labels = []
# Iterate over the test set.
progress_test = tqdm(test_dataloader, desc="Testing")
for batch in progress_test:
    with torch.no_grad():
        outputs = model(**batch)
    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

    # Predictions and labels must be padded before gather() can be called.
    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
    predictions_gathered = accelerator.gather(predictions)
    labels_gathered = accelerator.gather(labels)
    # postprocess drops the -100 positions and maps ids to label names.
    true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
    # Keep the decoded sequences of this batch.
    all_predictions.extend(true_predictions)
    all_labels.extend(true_labels)
    metric.add_batch(predictions=true_predictions, references=true_labels)
results = metric.compute()
# Report only the overall scores.
print(
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

# 5. Save the predictions to a local file (all_labels is not written out).
import json
with open("test_predictions.json", "w") as f:
    json.dump(all_predictions, f)


In [None]:
# 通过 token_classifier pipeline 测试模型
from transformers import pipeline

# Load both the model and the tokenizer from the fine-tuned directory (the
# training loop saves the tokenizer there too). This avoids depending on the
# current value of `model_checkpoint`, which other cells reassign.
finetuned_checkpoint = "../model/bert-finetuned-ner-accelerate-epoch2"
model = AutoModelForTokenClassification.from_pretrained(finetuned_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(finetuned_checkpoint)
token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

## 微调掩码语言模型（masked language model）


In [None]:
from transformers import AutoModelForMaskedLM

from transformers import AutoTokenizer
from datasets import load_dataset
import torch

# Per the original note, DistilBERT's training corpus is mainly Wikipedia.
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# num_parameters() returns the parameter count; report it in millions.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

# Probe the pretrained model with a masked sentence.
text = "This is a great [MASK]."
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Locate the [MASK] position and take its logits.
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Keep the five candidates with the highest logits for [MASK].
top_5 = torch.topk(mask_token_logits, 5, dim=1)
top_5_tokens = top_5.indices[0].tolist()
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

# Fine-tuning DistilBERT on the IMDB movie-review dataset is expected to make
# its predictions more specific to movie reviews.
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)

# Look at a few random training samples.
shuffled_train = imdb_dataset["train"].shuffle(seed=42)
sample = shuffled_train.select(range(3))
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")

In [None]:
# 预处理数据
def tokenize_function(examples):
    """
    Tokenize the "text" column of a batch.

    When the tokenizer is a fast tokenizer, a "word_ids" entry is added that
    holds the token -> word mapping of every example in the batch.
    """
    result = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return result
    batch_length = len(result["input_ids"])
    result["word_ids"] = [result.word_ids(i) for i in range(batch_length)]
    return result

# batched=True makes tokenize_function receive a batch of examples per call.
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
print(tokenized_datasets)

# Slicing yields, for every feature, a list of lists.
tokenized_samples = tokenized_datasets["train"][:3]
# Print the number of tokens of each review.
for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")
# Concatenate the three reviews, feature by feature.
concatenated_examples = {}
for k in tokenized_samples.keys():
    concatenated_examples[k] = sum(tokenized_samples[k], [])
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

# Split the concatenated reviews into chunks of chunk_size tokens.
# The last chunk is usually shorter than chunk_size. Two common strategies:
# 1. drop the last chunk when it is shorter than chunk_size;
# 2. pad the last chunk until it is chunk_size long.
chunk_size = 128
chunk_starts = range(0, total_length, chunk_size)
chunks = {}
for k, t in concatenated_examples.items():
    chunks[k] = [t[i : i + chunk_size] for i in chunk_starts]
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

def group_texts(examples, block_size=None):
    """
    Concatenate every column of a batch and cut it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example lists
            (e.g. "input_ids", "attention_mask", "word_ids").
        block_size (int, optional): chunk length. Defaults to the module-level
            chunk_size, so existing `map(group_texts, batched=True)` calls are
            unaffected.

    Returns:
        dict: the same columns, each split into chunks of block_size items,
        plus a "labels" column holding a copy of the "input_ids" chunks. A
        trailing remainder shorter than block_size is dropped.
    """
    from itertools import chain

    if block_size is None:
        block_size = chunk_size
    # Flatten each column in linear time (sum(lists, []) is quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns have the same flattened length; measure the first one.
    total_length = len(concatenated_examples[next(iter(concatenated_examples))])
    # Drop the last chunk if it is shorter than block_size.
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each chunk so "labels" does not alias the "input_ids" lists.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result
# Apply the concatenate-and-chunk step to every split, one batch at a time.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
)
lm_datasets

In [None]:
# 使用 Trainer API 微调 DistilBERT

# Random masking.
# DataCollatorForLanguageModeling implements it; mlm_probability is the
# fraction of tokens to mask.
# A drawback of random masking: with the Trainer, evaluation results differ
# slightly from run to run, even though the same data collator is used for
# the training and test sets.
import collections
import numpy as np
from transformers import DataCollatorForLanguageModeling
from transformers import default_data_collator

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
samples = [lm_datasets["train"][i] for i in range(2)]
# The "word_ids" column is removed before the samples reach this collator.
for sample in samples:
    _ = sample.pop("word_ids")
masked_batch = data_collator(samples)
for chunk in masked_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

# Whole word masking masks all tokens of a word at once. It needs a
# custom data collator.
# The collator uses the previously computed word ids to map each word to its
# tokens, randomly picks the words to mask, and masks the inputs accordingly.
# Every label except those of masked tokens is set to -100.
wwm_probability = 0.2
def whole_word_masking_data_collator(features):
    """
    Collate features while masking whole words at random.

    For each feature (modified in place): "word_ids" is removed, each word is
    selected with probability wwm_probability, every token of a selected word
    is replaced by the mask token id in "input_ids", and "labels" is rebuilt
    so that only masked positions keep their label (all others are -100).
    The features are then batched with default_data_collator.
    """
    for feature in features:
        word_ids = feature.pop("word_ids")

        # token_groups[n] lists the token positions of the n-th word seen.
        token_groups = []
        previous_word = None
        for position, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            if word_id != previous_word:
                previous_word = word_id
                token_groups.append([])
            token_groups[-1].append(position)

        # One Bernoulli draw per word decides whether it is masked.
        mask = np.random.binomial(1, wwm_probability, (len(token_groups),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for group_index in np.where(mask)[0]:
            for position in token_groups[group_index.item()]:
                new_labels[position] = labels[position]
                input_ids[position] = tokenizer.mask_token_id
        feature["labels"] = new_labels
    return default_data_collator(features)

# Try the whole-word-masking collator on the first two chunks.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)
for chunk in batch["input_ids"]:
    decoded_chunk = tokenizer.decode(chunk)
    print(f"\n'>>> {decoded_chunk}'")

In [None]:
# Train on a small subset to stay within the available hardware.
train_size = 10_000
test_size = int(0.1 * train_size)
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
print(downsampled_dataset)

# Trainer configuration.
from transformers import TrainingArguments
batch_size = 64
# Log the training loss once per epoch.
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]
# Tip: mixed-precision training may need to be disabled on Mac machines.
training_args = TrainingArguments(
    output_dir=f"../model/{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,    # FP16 disabled
    bf16=False,    # BF16 disabled
    use_cpu=True,  # run on CPU even if a GPU exists; replaces deprecated no_cuda=True
    logging_steps=logging_steps,
)
# Force the device: CPU here, or "cuda"/"mps".
device = torch.device("cpu")
model = model.to(device)
# Create the Trainer.
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
# Evaluation metric: perplexity, the exponential of the evaluation loss.
# It is reported before and after training.
import math
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
trainer.train()
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# 使用 Accelerate 微调 DistilBERT
"""
DataCollatorForLanguageModeling 在每次评估时也会进行随机遮罩，因此我们在每次训练运行
中都会看到困惑度得分有些波动。
因此在整个测试集上 仅进行一次 遮罩，以确保我们的评估指标是一致的。
"""
def insert_random_mask(batch):
    """
    Apply data_collator once to a batch and return its output as new columns.

    The column-oriented batch is turned into a list of per-example dicts,
    passed through data_collator, and every resulting entry is returned as a
    NumPy array under the key "masked_" + original key.
    """
    column_names = list(batch)
    features = [dict(zip(column_names, row)) for row in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked_*" column for every column of the collated batch.
    masked_columns = {}
    for k, v in masked_inputs.items():
        masked_columns["masked_" + k] = v.numpy()
    return masked_columns
# Drop the "word_ids" column from every split of the downsampled dataset.
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Mask the test split once up front; the original columns are replaced by the
# "masked_*" columns returned by insert_random_mask.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names used as model inputs.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

# Set up the DataLoaders.
from torch.utils.data import DataLoader
from transformers import default_data_collator
batch_size = 64
# Training batches go through data_collator.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Evaluation batches are already masked, so default_data_collator is used.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

# Reload the pretrained model.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Optimizer
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

# Wrap all components with the Accelerator.
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Learning-rate scheduler: linear schedule without warmup, sized from the
# prepared training dataloader.
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Prefix of the per-epoch output directories written by the training loop.
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"

# 训练和评估的循环
from tqdm.auto import tqdm
import torch
import math
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Evaluation
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        # Repeat the batch loss batch_size times before gathering, so the
        # concatenated tensor has one entry per (possibly padded) example.
        losses.append(accelerator.gather(loss.repeat(batch_size)))
    losses = torch.cat(losses)
    # Keep only as many entries as there are evaluation examples.
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
    # Save the model of this epoch.
    output_dir = f"../model/{model_name}-epoch{epoch}"
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save
    )
    # Also save the training state next to the model.
    accelerator.save(
        {"epoch": epoch, "optimizer_state": optimizer.state_dict(), "lr_scheduler_state": lr_scheduler.state_dict()},
        f"{output_dir}/training_state.pt"
    )
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        print(f"\nModel saved to {output_dir}")


In [None]:
# 使用我们微调的模型
from transformers import pipeline
# Load the model and tokenizer saved after the epoch with index 1.
model_checkpoint = "../model/distilbert-base-uncased-finetuned-imdb-accelerate-epoch1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
mask_filler = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Same probe sentence as used with the pretrained model.
text = "This is a great [MASK]."
preds = mask_filler(text)
for pred in preds:
    filled_sentence = pred["sequence"]
    print(f">>> {filled_sentence}")


## 翻译


In [None]:
# 加载 KDE4 数据集
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoTokenizer

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
print(raw_datasets)
# Split off 10% of the train split and expose it as "validation".
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
print(split_datasets)
split_datasets["validation"] = split_datasets.pop("test")
print(split_datasets["train"][1]["translation"])

# Try the pretrained translation model.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
print(translator("Default to expanded threads"))

# Load the tokenizer of the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
sentence_pair = split_datasets["train"][1]["translation"]
en_sentence = sentence_pair["en"]
fr_sentence = sentence_pair["fr"]
inputs = tokenizer(en_sentence, text_target=fr_sentence)
# The output holds the input ids of the English sentence; the ids of the
# French sentence are stored in the labels field.
print(inputs)

# Preprocess the dataset.
# Maximum number of tokens kept for inputs and targets.
max_length = 128


def preprocess_function(examples):
    """
    Tokenize a batch of translation pairs.

    English sentences are the inputs and French sentences the targets; both
    are truncated to max_length tokens.
    """
    pairs = examples["translation"]
    inputs = [pair["en"] for pair in pairs]
    targets = [pair["fr"] for pair in pairs]
    return tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )


# Tokenize every split, dropping the original columns.
original_columns = split_datasets["train"].column_names
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

# Fine-tune the model with the Trainer API.
from transformers import AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
import evaluate

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Try the data collator on two examples and compare with the stored labels.
collator_samples = [tokenized_datasets["train"][i] for i in range(1, 3)]
batch = data_collator(collator_samples)
print(batch.keys())
print(batch["labels"])
print(batch["decoder_input_ids"])
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

# Evaluation metric.
metric = evaluate.load("sacrebleu")
# Try the metric: first with a close paraphrase, then with a truncated output.
reference_sentence = (
    "This plugin allows you to automatically translate web pages between several languages."
)
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [[reference_sentence]]
print(metric.compute(predictions=predictions, references=references))
predictions = ["This plugin"]
references = [[reference_sentence]]
print(metric.compute(predictions=predictions, references=references))
# 需要清理标签中的所有 -100 token，为了方便输出的结果直接用来计算评估指标
import numpy as np
def compute_metrics(eval_preds):
    """
    Compute the BLEU score for a batch of generated translations.

    Args:
        eval_preds: (preds, labels) pair of token-id arrays; preds may be a
            tuple whose first item holds the generated ids.

    Returns:
        dict: {"bleu": <sacrebleu score>}.
    """
    preds, labels = eval_preds
    # If the model returned more than the predictions, keep the first item.
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 cannot be decoded, so replace it with the pad token id. Predictions
    # are cleaned as well as labels, because padded generated sequences may
    # also contain -100.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Simple post-processing: strip whitespace and wrap each reference in a
    # list, the format passed to metric.compute below.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# Training configuration. `eval_strategy` is the argument name used by the
# other TrainingArguments in this notebook.
from transformers import Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    "../model/marian-finetuned-kde4-en-to-fr",
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

# Create the Trainer.
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Check the model's BLEU score before training.
print(trainer.evaluate(max_length=max_length))

# Train.
trainer.train()

# Evaluate again after training.
print(trainer.evaluate(max_length=max_length))

In [None]:
# 自定义训练循环
from torch.utils.data import DataLoader

# Make the datasets return torch tensors.
tokenized_datasets.set_format("torch")

# DataLoaders; only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

# Re-create the model from the base checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Optimizer. torch.optim.AdamW is used, as in the other training loops of
# this notebook, instead of the deprecated transformers.AdamW.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

# Wrap all components with the Accelerator.
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# Learning-rate scheduler: linear schedule without warmup, sized from the
# prepared training dataloader.
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Prefix of the per-epoch output directories written by the training loop.
model_name = "marian-finetuned-kde4-en-to-fr-accelerate"

# postprocess() keeps the evaluation loop simple: it takes predictions and
# labels and converts them to the lists of strings passed to the metric.
def postprocess(predictions, labels):
    """
    Decode generated ids and label ids into stripped strings.

    Returns (decoded_preds, decoded_labels): a list of prediction strings and
    a list of one-element lists holding the reference strings.
    """
    prediction_ids = predictions.cpu().numpy()
    label_ids = labels.cpu().numpy()
    # -100 cannot be decoded, so swap it for the pad token id in the labels.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    prediction_texts = tokenizer.batch_decode(prediction_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Simple post-processing: strip surrounding whitespace.
    decoded_preds = [text.strip() for text in prediction_texts]
    decoded_labels = [[text.strip()] for text in label_texts]
    return decoded_preds, decoded_labels
  
import torch
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        # generate() is called on the unwrapped model.
        generator = accelerator.unwrap_model(model)
        with torch.no_grad():
            generated_tokens = generator.generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]
        # Predictions and labels must be padded before gather() can be called.
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)
        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)
    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save the model of this epoch.
    output_dir = f"../model/{model_name}-epoch{epoch}"
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )
    # Also save the training state next to the model.
    training_state = {
        "epoch": epoch,
        "optimizer_state": optimizer.state_dict(),
        "lr_scheduler_state": lr_scheduler.state_dict(),
    }
    accelerator.save(training_state, f"{output_dir}/training_state.pt")
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        print(f"\nModel saved to {output_dir}")

In [None]:
# 使用微调后的模型
from transformers import pipeline
# 将其替换成你自己的 checkpoint
# Directory written by the Accelerate training loop for the first epoch
# ("<model_name>-epoch<N>" with model_name
# "marian-finetuned-kde4-en-to-fr-accelerate"). Replace it with your own
# checkpoint if you saved elsewhere.
model_checkpoint = "../model/marian-finetuned-kde4-en-to-fr-accelerate-epoch0"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
translator = pipeline("translation", model=model, tokenizer=tokenizer)
print(translator("Default to expanded threads"))
print(translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
))

## 提取文本摘要

In [None]:
import os

from datasets import load_dataset

# Load the Multilingual Amazon Reviews Corpus. The dataset is no longer
# available on the Hugging Face Hub, so the former Hub-based loading is kept
# below for reference only:
# spanish_dataset = load_dataset("mteb/amazon_reviews_multi", "es", trust_remote_code=True)
# english_dataset = load_dataset("mteb/amazon_reviews_multi", "en", trust_remote_code=True)
# Instead, download the CSV files from Kaggle
# (https://www.kaggle.com/datasets/mexwell/amazon-reviews-multi/data) and load
# them from local disk.
# "~" is expanded explicitly: it is a shell convention, and a literal "~/..."
# string is not an absolute path, so it may otherwise be resolved relative to
# the current working directory.
data_path = os.path.expanduser("~/.cache/huggingface/datasets/amazon_reviews_multi/")
dataset = load_dataset('csv', data_files={
    'train': os.path.join(data_path, 'train.csv'),
    'validation': os.path.join(data_path, 'validation.csv'),
    'test': os.path.join(data_path, 'test.csv'),
})
# Keep only the reviews written in the requested language.
def filter_language(example, language):
    """Return True when the example's "language" field equals ``language``."""
    is_requested_language = example["language"] == language
    return is_requested_language
# Build one filtered copy of the dataset per language and inspect both.
spanish_dataset = dataset.filter(lambda row: filter_language(row, "es"))
english_dataset = dataset.filter(lambda row: filter_language(row, "en"))
print(spanish_dataset, english_dataset, sep="\n")

# Print a few randomly chosen examples to get a feel for the data.
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and body of ``num_samples`` shuffled training examples.

    Args:
        dataset: mapping with a "train" split that supports
            ``shuffle(seed=...)`` and ``select(...)``.
        num_samples: how many examples to print.
        seed: seed passed to ``shuffle`` so the selection is reproducible.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked_rows = shuffled_train.select(range(num_samples))
    for row in picked_rows:
        title = row["review_title"]
        print(f"\n'>> Title: {title}'")
        body = row["review_body"]
        print(f"'>> Review: {body}'")
# Preview a few English reviews (title and body) from the training split.
show_samples(english_dataset)

# Keep only book reviews so that training takes less time.
def filter_books(example):
    """Return True when the product category is "book" or "digital_ebook_purchase"."""
    return example["product_category"] in ("book", "digital_ebook_purchase")
# Apply the book filter to both languages, then preview the English result.
english_books = english_dataset.filter(filter_books)
spanish_books = spanish_dataset.filter(filter_books)
show_samples(english_books)

# Combine the English and Spanish reviews into a single DatasetDict.
from datasets import concatenate_datasets, DatasetDict

books_dataset = DatasetDict()
for split, english_split in english_books.items():
    merged_split = concatenate_datasets([english_split, spanish_books[split]])
    # Shuffle with a fixed seed so the two concatenated languages are mixed.
    books_dataset[split] = merged_split.shuffle(seed=42)
show_samples(books_dataset)

# Drop examples whose title has fewer than three words, so the model can
# learn to generate more meaningful summaries.
books_dataset = books_dataset.filter(
    lambda example: len(example["review_title"].split()) >= 3
)

# Preprocessing: load the tokenizer and try it on a sample sentence.
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
inputs = tokenizer("I loved reading the Hunger Games!")
print(inputs)
sample_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
print(sample_tokens)

# Maximum token lengths for the review (input) and the title (target).
max_input_length, max_target_length = 512, 30
def preprocess_function(examples):
    """Tokenize a batch: review bodies become inputs, review titles become labels.

    Args:
        examples: batch with "review_body" and "review_title" entries.

    Returns:
        The tokenized review bodies (truncated to ``max_input_length``) with an
        extra "labels" entry holding the input ids of the tokenized titles
        (truncated to ``max_target_length``).
    """
    model_inputs = tokenizer(
        examples["review_body"], max_length=max_input_length, truncation=True
    )
    title_encoding = tokenizer(
        examples["review_title"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = title_encoding["input_ids"]
    return model_inputs
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

# Evaluation metric.
import evaluate

rouge_score = evaluate.load("rouge")

# Sanity-check the metric on a hand-written prediction/reference pair.
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"
scores = rouge_score.compute(
    predictions=[generated_summary],
    references=[reference_summary],
)
print(scores)

# Build a strong baseline.
# A common reference baseline for summarization is to take the first three
# sentences of the text as the summary, usually called the lead-3 baseline.
import nltk

for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize
def three_sentence_summary(text):
    """Return the first three sentences of ``text``, joined by newlines."""
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


# Show the lead-3 "summary" of one training review.
print(three_sentence_summary(books_dataset["train"][1]["review_body"]))
# Extract the lead-3 "summaries" from a dataset and score them against the titles.
def evaluate_baseline(dataset, metric):
    """Score lead-3 summaries of the review bodies against the review titles.

    Args:
        dataset: split exposing "review_body" and "review_title" columns.
        metric: object whose ``compute(predictions=..., references=...)`` is called.

    Returns:
        Whatever ``metric.compute`` returns.
    """
    summaries = []
    for text in dataset["review_body"]:
        summaries.append(three_sentence_summary(text))
    references = dataset["review_title"]
    return metric.compute(predictions=summaries, references=references)

# Compute the baseline ROUGE scores on the validation set, as percentages.
import pandas as pd

score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict(
    (metric_name, round(score[metric_name] * 100, 2)) for metric_name in rouge_names
)
print(rouge_dict)

In [None]:
# Fine-tune mT5 with the Trainer API.
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Set up the training arguments.
from transformers import Seq2SeqTrainingArguments
batch_size = 4
num_train_epochs = 8
# Gradients are accumulated over this many batches before each parameter update.
gradient_accumulation_steps = 64
# Log the training loss roughly once per epoch. `logging_steps` is counted in
# optimizer update steps, so the number of batches per epoch must be divided by
# the accumulation factor; without that the interval spans ~64 epochs and the
# loss is never logged. Clamp to 1 so that small datasets still log.
logging_steps = max(
    1,
    len(tokenized_datasets["train"]) // (batch_size * gradient_accumulation_steps),
)
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"../model/{model_name}-finetuned-amazon-en-es",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

# Evaluation metric for the Trainer.
# A summarization model cannot pass its outputs straight to
# rouge_score.compute(): the predictions and the reference summaries must be
# decoded to text first.
import numpy as np
def compute_metrics(eval_pred):
    """Decode predictions and labels and return ROUGE scores as percentages.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        Dict mapping each ROUGE metric name to its score multiplied by 100 and
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # -100 is not a valid token id and cannot be decoded.
    # NOTE(review): some transformers versions pad gathered predictions with
    # -100 as well; replacing it here is a no-op when that does not happen.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode the generated summaries and the reference summaries to text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after every sentence.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute the ROUGE scores.
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # The metric loaded with evaluate.load("rouge") returns plain floats (the
    # baseline computation in this file already uses them that way), so there
    # is no `.mid.fmeasure` to unpack; just scale to percentages.
    return {key: round(value * 100, 4) for key, value in result.items()}

# Set up the data collator.
# mT5 is an encoder-decoder Transformer, so one detail matters when batching:
# during decoding the labels have to be shifted right by one position. This
# ensures the decoder only sees the previous reference tokens, not the token
# currently being predicted or any later one, so the model cannot simply copy
# the labels.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Drop the original dataset columns so that only the tokenizer outputs remain.
raw_column_names = books_dataset["train"].column_names
tokenized_datasets = tokenized_datasets.remove_columns(raw_column_names)

# Collate two training examples to inspect the resulting batch.
features = [tokenized_datasets["train"][0], tokenized_datasets["train"][1]]
print(data_collator(features))

# Build the Trainer from the model, arguments, datasets, collator and metric.
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training, then evaluate the resulting model.
trainer.train()
trainer.evaluate()


In [None]:
# Fine-tune mT5 with Accelerate.
tokenized_datasets.set_format("torch")
from torch.utils.data import DataLoader

# Both loaders share the same collator and batch size; only the training
# loader shuffles.
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Instantiate a fresh model first. The optimizer must be built from the
# parameters of the model that is actually going to be trained; creating it
# before re-instantiating the model would leave it bound to the previous model
# object's parameters, and the new model would never be updated.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Optimizer over the fresh model's parameters.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

# Wrap every component with an Accelerator object.
from accelerate import Accelerator

accelerator = Accelerator()
(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader,
) = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

# Learning-rate scheduler: linear schedule without warmup over all training
# steps. The step count is taken from the prepared dataloader's length.
from transformers import get_scheduler

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

# Post-processing helper.
# Splits each summary into sentences separated by newlines, which is the input
# format the ROUGE metric expects.
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of lists with the reformatted strings.
    """
    def _one_sentence_per_line(text):
        # ROUGE needs a newline after every sentence.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

# Name used for the per-epoch checkpoint directories. It matches the checkpoint
# path loaded by the inference cell of this notebook
# ("../model/mt5-small-finetuned-amazon-en-es-epoch0"); the previous value was
# a leftover from the question-answering example.
model_name = "mt5-small-finetuned-amazon-en-es"

# Training and evaluation loop.
from tqdm.auto import tqdm
import torch
import numpy as np
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            # Predictions and labels are padded to a common length across
            # processes before they are gathered.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )
            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()
            # Replace -100 in the labels because it cannot be decoded.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )
            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)
    # Compute the ROUGE scores accumulated over the evaluation batches.
    result = rouge_score.compute()
    # The metric loaded with evaluate.load("rouge") returns plain floats, so
    # there is no `.mid.fmeasure` to unpack; scale to percentages and round.
    result = {key: round(value * 100, 4) for key, value in result.items()}
    print(f"Epoch {epoch}:", result)
    # Save the model for this epoch.
    output_dir = f"../model/{model_name}-epoch{epoch}"
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        output_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save
    )
    # Also save the training state (epoch, optimizer and scheduler state).
    accelerator.save(
        {"epoch": epoch, "optimizer_state": optimizer.state_dict(), "lr_scheduler_state": lr_scheduler.state_dict()},
        f"{output_dir}/training_state.pt"
    )
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        print(f"\nModel saved to {output_dir}")


In [None]:
# Use the fine-tuned summarization model.
from transformers import pipeline

# Replace this path with your own checkpoint.
model_checkpoint = "../model/mt5-small-finetuned-amazon-en-es-epoch0"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
def print_summary(idx):
    """Print the review, its title and the generated summary for test example ``idx``."""
    test_row = books_dataset["test"][idx]
    review = test_row["review_body"]
    title = test_row["review_title"]
    generated = summarizer(review)
    summary = generated[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")
# Summarize a couple of test examples to get a feel for the summary quality.
for sample_idx in (100, 0):
    print_summary(sample_idx)

## 从头开始训练因果语言模型
使用 Python 代码的一个数据集，来实现一行代码的补全，而不是直接生成完整的函数或类。

In [None]:
def any_keyword_in_string(string, keywords):
    """Return True if at least one of ``keywords`` occurs in ``string``."""
    return any(keyword in string for keyword in keywords)
# Check the keyword filter on one line that should not match and one that should.
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1, example_2 = "import numpy as np", "import pandas as pd"
keyword_hits = [
    any_keyword_in_string(snippet, filters) for snippet in (example_1, example_2)
]
print(*keyword_hits)