# 文本相似度实例

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

## Step2 加载数据集

In [2]:
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [3]:
dataset[0]

{'sentence1': '找一部小时候的动画片', 'sentence2': '求一部小时候的动画片。谢了', 'label': '1'}

## Step3 划分数据集

In [4]:
datasets = dataset.train_test_split(test_size=0.2)#分割数据集，20%作为测试集，80%为训练集
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

## Step4 数据集预处理

In [5]:
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_function(examples):
    #[CLS]找一部小时候的动画片（sentence1）[SEP]求一部小时候的动画片。谢了（sentence2）[SEP]

    #设置编码后的最大序列长度为128个token，如果文本超过这个长度会被截断
    tokenized_examples = tokenizer(examples["sentence1"], examples["sentence2"], max_length=128, truncation=True)

    #遍历 examples["label"] 中的每个标签,将字符串标签转换为浮点数并赋值
    tokenized_examples["labels"] = [float(label) for label in examples["label"]]
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
'''
process_function之后，train、test里面的结构，有原本的'sentence1'、'sentence2'、'label'又有新生成的'input_ids'、'token_type_ids'、'attention_mask'、'labels'

.map(function,...)：对于datasets的所有元素（所有example），应用.map()括号内的内容，process_function、batched、remove_columns，所以这里之所以只移除train的列，其实train和test的列名都一样，所以也是移除test的列

input_ids:输入ID序列：将文本中的每个词/子词转换成的数字ID
        [CLS] (101)：序列开头，用于分类任务 [SEP] (102)：分隔符，用于分隔两个句子
token_type_ids:句子类型ID：用于区分输入中的两个不同句子
        0：表示属于第一个句子     1：表示属于第二个句子
attention_mask:告诉模型哪些位置是真实token，哪些是填充的padding
        1：表示这是真实的token，需要被模型关注  0：表示这是填充的padding，应该被忽略

'''

tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [6]:
print(tokenized_datasets["train"][0])

{'input_ids': [101, 6820, 3300, 5855, 3172, 5838, 2209, 2548, 1469, 6205, 4408, 4280, 4638, 2533, 1759, 2128, 1062, 4265, 1952, 1538, 1937, 8024, 809, 1350, 976, 6814, 4377, 714, 762, 4294, 5945, 5801, 4638, 7355, 1092, 1039, 2358, 4638, 510, 1305, 4946, 4638, 2215, 1164, 6205, 3172, 2357, 3306, 511, 102, 5855, 3172, 5838, 2209, 2548, 8024, 6205, 4408, 4280, 4638, 2533, 1759, 2128, 1062, 4265, 1952, 1538, 1937, 8024, 6820, 3300, 1775, 7949, 3172, 4638, 2215, 1164, 6205, 3172, 2357, 3306, 8024, 5314, 4377, 714, 762, 4294, 7440, 5855, 2496, 7355, 1092, 1039, 2358, 4638, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

## Step5 创建模型

In [7]:
from transformers import BertForSequenceClassification 
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=1)
#num_labels=1 表示使用该模型最后会输出一个维度为1的线性层
#回归式二分类或相似度评分

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step6 创建评估函数

In [14]:
import evaluate

acc_metric = evaluate.load("./metric_accuracy.py")
#F1也是分类问题中的评估指标，其同时考虑了精确率和召回率，使得两者都尽可能高
f1_metirc = evaluate.load("./metric_f1.py")

Downloading builder script: 0.00B [00:00, ?B/s]

In [15]:
def eval_metric(eval_predict):
    #将eval_predict这个元组拆包成两个变量：predictions和labels
    predictions, labels = eval_predict
    #遍历预测结果中的每一个值，当前预测值是否大于0.5
    #并将布尔值转换为整数True → 1，False → 0 得到一个predictions的
    predictions = [int(p > 0.5) for p in predictions]
    #将训练时使用的浮点数标签转换回评估时需要的整数标签
    labels = [int(l) for l in labels]
    # predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels)
    #将f1典添加到acc字典里面
    acc.update(f1)
    return acc

'''
F1分数主要适用于以下情况：
①二分类问题，特别是类别不平衡时
②多分类问题中需要平衡精确率和召回率
③存在假阳性和假阴性代价不同的场景

二分类问题是指预测结果只有两种可能类别的问题
例如：判断邮件是否为垃圾邮件（是/否）

'''

## Step7 创建TrainingArguments

In [16]:
train_args = TrainingArguments(output_dir="./cross_model",      # 输出文件夹
                               per_device_train_batch_size=32,  # 训练时的batch_size
                               per_device_eval_batch_size=32,   # 验证时的batch_size
                               logging_steps=10,       # log 打印的频率
                               eval_strategy="epoch",  # 评估策略
                               save_strategy="epoch",  # 保存策略
                               save_total_limit=3,     # 最大保存数
                               learning_rate=2e-5,     # 学习率
                               weight_decay=0.01,      # weight_decay
                               metric_for_best_model="f1", # 设定评估指标
                               load_best_model_at_end=True)# 训练完成后加载最优模型
train_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp

## Step8 创建Trainer

In [17]:
from transformers import DataCollatorWithPadding
trainer = Trainer(model=model, 
                  args=train_args, 
                  tokenizer=tokenizer,
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

## Step9 模型训练

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss


  predictions = [int(p > 0.5) for p in predictions]


AttributeError: 'float' object has no attribute 'size'

## Step10 模型评估

In [None]:
trainer.evaluate(tokenized_datasets["test"])

## Step11 模型预测

In [None]:
from transformers import pipeline, TextClassificationPipeline

In [None]:
model.config.id2label = {0: "不相似", 1: "相似"}

In [None]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

In [None]:
result = pipe({"text": "我喜欢北京", "text_pair": "天气怎样"}, function_to_apply="none")
result["label"] = "相似" if result["score"] > 0.5 else "不相似"
result