# 文本相似度

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
# Pin this process to GPU 8 when the machine actually has that device.
# On hosts with fewer GPUs (or no CUDA at all) keep PyTorch's default device
# instead of crashing before anything else in the notebook can run.
if torch.cuda.is_available() and torch.cuda.device_count() > 8:
    torch.cuda.set_device(8)

## 加载数据集

In [5]:
# Load the sentence-pair data from a local JSON file as a single "train" split.
dataset = load_dataset("json", data_files="./train_pair_1w.json", split="train")
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [7]:
dataset[1]

{'sentence1': '我不可能是一个有鉴赏能力的行家，小姐我把我的时间都花在书写上；象这样豪华的舞会，我还是头一次见到。',
 'sentence2': '蜡烛没熄就好了，夜黑得瘆人，情绪压抑。',
 'label': '0'}

##  划分数据集

In [8]:
# Hold out 20% of the examples as the test split (no seed is passed here).
datasets = dataset.train_test_split(test_size=0.2)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

##  数据集预处理

In [9]:
import torch


# Load the tokenizer from the local chinese-macbert-base directory.
tokenizer = AutoTokenizer.from_pretrained("/home/wangtuo/workspace/Homework/Huggingface/models--hfl--chinese-macbert-base")
def process_function(examples):
    """Tokenize a batch of sentence pairs for the dual-encoder model.

    ``examples`` is a batch with ``sentence1``, ``sentence2`` and ``label``
    columns. Every tokenizer output field is returned as one two-element list
    per pair (first sentence, second sentence). ``labels`` holds ``1`` when the
    original label equals 1 and ``-1`` otherwise.
    """
    triples = list(zip(examples["sentence1"], examples["sentence2"], examples["label"]))

    # Interleave the texts as s1, s2, s1, s2, ... so that one tokenizer call
    # handles the whole batch.
    flat_texts = [text for first, second, _ in triples for text in (first, second)]
    targets = [1 if int(raw_label) == 1 else -1 for _, _, raw_label in triples]

    # Produces input_ids, attention_mask and token_type_ids, each padded to 128.
    encoded = tokenizer(flat_texts, max_length=128, truncation=True, padding="max_length")

    # Regroup consecutive rows so that each example carries both sentences.
    paired = {}
    for field, rows in encoded.items():
        paired[field] = [rows[start: start + 2] for start in range(0, len(rows), 2)]
    paired["labels"] = targets
    return paired

# Tokenize both splits batch-wise; the original text/label columns are dropped
# so only the fields returned by process_function remain.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [18]:
print(tokenized_datasets["train"][0])

{'input_ids': [[101, 754, 6825, 4960, 4197, 3926, 7008, 6814, 3341, 8024, 2190, 5632, 2346, 6432, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2769, 4685, 928, 8024, 2769, 812, 1377, 809, 679, 3198, 935, 678, 6716, 1343, 4692, 3862, 6495, 8024, 6820, 1377, 809, 679, 3198, 1762, 2340, 5668, 2772, 1381, 5668, 6448, 6389, 3300, 6637, 4638, 752, 4289, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Step5 创建模型

In [12]:
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss
import torch
class DualModel(BertPreTrainedModel):
    """Dual-encoder sentence-similarity model with one shared BERT encoder.

    Both sentences of a pair are encoded by the same ``BertModel``. Their
    pooled outputs are compared with cosine similarity, and when labels are
    given the training loss is ``CosineEmbeddingLoss`` with a margin of 0.3.
    """

    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # A single encoder is shared by both sentences of a pair.
        self.bert = BertModel(config)
        self.post_init()

    def _pooled_output(self, sentence_inputs, encoder_kwargs):
        """Encode one sentence of every pair and return element 1 of the encoder output."""
        sentence_ids, sentence_mask, sentence_types = sentence_inputs
        encoded = self.bert(
            sentence_ids,
            attention_mask=sentence_mask,
            token_type_ids=sentence_types,
            **encoder_kwargs,
        )
        return encoded[1]    # [batch, hidden]

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Encode both sentences of each pair and score their similarity.

        ``input_ids``, ``attention_mask`` and ``token_type_ids`` are indexed
        with ``[:, 0]`` and ``[:, 1]`` to pick the first and second sentence,
        so all three must be provided. The remaining encoder arguments are
        forwarded unchanged to both encoder calls.

        Returns ``(cos,)`` when ``labels`` is ``None`` and ``(loss, cos)``
        otherwise.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Step 1: split the inputs into the two sentences of each pair.
        first_sentence, second_sentence = [
            (input_ids[:, slot], attention_mask[:, slot], token_type_ids[:, slot])
            for slot in (0, 1)
        ]

        # Step 2: encode each sentence with the shared encoder.
        encoder_kwargs = dict(
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        first_vec = self._pooled_output(first_sentence, encoder_kwargs)
        second_vec = self._pooled_output(second_sentence, encoder_kwargs)

        # Step 3: cosine similarity between the two pooled vectors.
        similarity = CosineSimilarity()(first_vec, second_vec)    # [batch, ]

        # Step 4: the loss is only computed when labels are supplied.
        if labels is None:
            return (similarity,)
        pair_loss = CosineEmbeddingLoss(0.3)(first_vec, second_vec, labels)
        return (pair_loss, similarity)
    
# Initialise the dual encoder from the local chinese-macbert-base directory.
model = DualModel.from_pretrained("/home/wangtuo/workspace/Homework/Huggingface/models--hfl--chinese-macbert-base")

  _torch_pytree._register_pytree_node(


In [3]:
print(model)

DualModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

## Step6 创建评估函数

In [13]:
import evaluate

# Load the accuracy and F1 metric scripts from local directories.
# NOTE: `f1_metirc` is misspelled, but eval_metric refers to it by this exact
# name, so it is left unchanged.
acc_metric = evaluate.load("./metrics/accuracy")
f1_metirc = evaluate.load("./metrics/f1")

In [14]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 from similarity scores and pair labels.

    ``eval_predict`` unpacks into ``(predictions, labels)``. A score above 0.7
    counts as a positive prediction, and a label above 0 counts as a positive
    reference. Returns the accuracy result dict extended with the F1 result.
    """
    similarity_threshold = 0.7
    scores, gold = eval_predict

    binary_predictions = []
    for score in scores:
        binary_predictions.append(int(score > similarity_threshold))

    binary_references = []
    for target in gold:
        binary_references.append(int(target > 0))

    results = acc_metric.compute(predictions=binary_predictions, references=binary_references)
    results.update(f1_metirc.compute(predictions=binary_predictions, references=binary_references))
    return results

## 训练

In [25]:
train_args = TrainingArguments(output_dir="./dual_model2",      # output directory
                               num_train_epochs = 5,            # number of training epochs
                               per_device_train_batch_size=32,  # per-device batch size for training
                               per_device_eval_batch_size=32,  # per-device batch size for evaluation
                               logging_steps=10,                # logging frequency, in steps
                               evaluation_strategy="epoch",     # evaluation strategy
                               save_strategy="epoch",           # checkpoint saving strategy
                               save_total_limit=5,              # maximum number of checkpoints kept
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to select the best model
                               load_best_model_at_end=True)     # load the best model when training finishes

In [26]:
# Wire the model, training arguments, tokenized splits and metric function
# into a Trainer.
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  compute_metrics=eval_metric)

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1879,0.213186,0.7485,0.693853
2,0.1764,0.203245,0.757,0.697761
3,0.1594,0.201321,0.7675,0.723049
4,0.1542,0.197839,0.7725,0.72103
5,0.1478,0.196796,0.763,0.710269


Checkpoint destination directory ./dual_model2/checkpoint-28 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./dual_model2/checkpoint-56 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./dual_model2/checkpoint-84 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=140, training_loss=0.1646506999220167, metrics={'train_runtime': 237.9703, 'train_samples_per_second': 168.088, 'train_steps_per_second': 0.588, 'total_flos': 5262126612480000.0, 'train_loss': 0.1646506999220167, 'epoch': 5.0})

##  模型评估

In [None]:
trainer.evaluate(tokenized_datasets["test"])



{'eval_loss': 0.20947732031345367,
 'eval_accuracy': 0.7435,
 'eval_f1': 0.7103331451157539,
 'eval_runtime': 6.1191,
 'eval_samples_per_second': 326.844,
 'eval_steps_per_second': 1.144,
 'epoch': 3.0}

##  模型预测

In [15]:
class SentenceSimilarityPipeline:
    """Score the similarity of two sentences with the encoder of a dual model.

    The two sentences are tokenized as one two-row batch and passed through
    ``model.bert``. The cosine similarity of the two resulting vectors is
    returned as a Python float.
    """

    def __init__(self, model, tokenizer) -> None:
        # Only the shared encoder is needed at inference time.
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, senA, senB):
        """Tokenize both sentences together and return PyTorch tensors."""
        return self.tokenizer([senA, senB], max_length=128, truncation=True, return_tensors="pt", padding=True)

    def predict(self, inputs):
        """Encode the tokenized pair and return element 1 of the encoder output.

        Inference runs in eval mode and without gradient tracking, so dropout
        cannot perturb the score and no autograd graph is built. The encoder's
        previous train/eval mode is restored afterwards.
        """
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                return self.model(**inputs)[1]  # [2, hidden]
        finally:
            # Leave the shared encoder in the mode the caller had it in.
            self.model.train(was_training)

    def postprocess(self, logits):
        """Return the cosine similarity between row 0 and row 1 of ``logits``."""
        # Indexing with None keeps a leading batch axis of size 1 on each row.
        cos = CosineSimilarity()(logits[None, 0, :], logits[None, 1, :]).squeeze().cpu().item()
        return cos

    def __call__(self, senA, senB, return_vector=False):
        """Return the similarity of ``senA`` and ``senB``.

        With ``return_vector=True`` the result is ``(similarity, vectors)``,
        where ``vectors`` is the tensor produced by ``predict``.
        """
        inputs = self.preprocess(senA, senB)
        logits = self.predict(inputs)
        result = self.postprocess(logits)
        if return_vector:
            return result, logits
        return result

In [16]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [17]:
pipe("我喜欢北京", "我喜欢北京", return_vector=True)

(0.9999999403953552,
 tensor([[ 0.9085, -0.2581,  1.0000,  ..., -0.9495, -0.9952, -0.9592],
         [ 0.9085, -0.2581,  1.0000,  ..., -0.9495, -0.9952, -0.9592]],
        grad_fn=<TanhBackward0>))

In [19]:
# Load the trained checkpoint here; the tokenizer still comes from the base
# model directory.
model = DualModel.from_pretrained("/home/wangtuo/workspace/Homework/src/text_classfication/dual_model2/checkpoint-112")
tokenizer = AutoTokenizer.from_pretrained("/home/wangtuo/workspace/Homework/Huggingface/models--hfl--chinese-macbert-base")

In [20]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [22]:
pipe("找一部小时候的动画片","求一部小时候的动画片。谢了")

0.9093652963638306