## 文本相似度text_similarity----dual-model向量匹配
交互策略效率低，假设一个待匹配文本和1000000个候选文本，需推理实时1000000万次

## Step1 导入相关包

In [1]:
import os

# 设置可见的 GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

## Step2 加载数据集

In [2]:
dataset = load_dataset("json", data_files="train_pair_1w.json", split="train")
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [3]:
dataset[0]

{'sentence1': '找一部小时候的动画片', 'sentence2': '求一部小时候的动画片。谢了', 'label': '1'}

## Step3 划分数据集


In [4]:
datasets = dataset.train_test_split(test_size=0.2)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

## Step4 数据集预处理

In [5]:
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

examples = datasets["train"][:10]
sentences = []
labels = []
for sen1, sen2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
    sentences.append(sen1)
    sentences.append(sen2)
    labels.append(1 if int(label) else -1)
tokenized_examples = tokenizer(sentences, max_length=128, truncation=True, padding="max_length")
tokenized_examples["labels"] = labels
tokenized_examples

tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/343 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

{'input_ids': [[101, 3297, 1400, 8024, 2769, 2792, 5543, 2990, 897, 4638, 3297, 3300, 1213, 4638, 6395, 2945, 8024, 2218, 3221, 131, 1762, 671, 3613, 1280, 1399, 4638, 3122, 1140, 809, 1400, 8024, 2400, 3766, 3300, 671, 702, 6395, 782, 1377, 809, 1415, 2137, 2769, 4638, 6411, 2141, 1469, 2769, 1092, 782, 4495, 3833, 4638, 5283, 3815, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2644, 6821, 2798, 3123, 749, 2552, 8024, 5445, 684, 2218, 3634, 1352, 2797, 2831, 857, 6929, 1352, 4788, 6154, 2094, 679, 3123, 8024, 3146, 3146, 2897, 749, 671, 3226, 1915, 8024, 1166, 782, 6206, 1932, 738, 1932, 679, 6624, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [6]:
tokenized_examples = {k: [v[i: i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}
tokenized_examples

{'input_ids': [[[101,
    3297,
    1400,
    8024,
    2769,
    2792,
    5543,
    2990,
    897,
    4638,
    3297,
    3300,
    1213,
    4638,
    6395,
    2945,
    8024,
    2218,
    3221,
    131,
    1762,
    671,
    3613,
    1280,
    1399,
    4638,
    3122,
    1140,
    809,
    1400,
    8024,
    2400,
    3766,
    3300,
    671,
    702,
    6395,
    782,
    1377,
    809,
    1415,
    2137,
    2769,
    4638,
    6411,
    2141,
    1469,
    2769,
    1092,
    782,
    4495,
    3833,
    4638,
    5283,
    3815,
    511,
    102,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
  

In [7]:
sentences

['最后，我所能提供的最有力的证据，就是:在一次匿名的攻击以后，并没有一个证人可以否定我的诚实和我军人生活的纯洁。',
 '您这才放了心，而且就此双手抓住那双破袜子不放，整整拿了一昼夜，别人要夺也夺不走。',
 '花呗能和卡里的钱一起用吗',
 '银行卡可以和花呗一起支付',
 '两只穿着条纹毛衣的狗在雪草上摔跤。',
 '动物在雪中玩耍。',
 '从来没有过吗？',
 '然而，有时间恢复一下体力也许并非无益。',
 '这鹦鹉是什么品种？',
 '这只是什么品种鹦鹉，公还是母？',
 '你会看到热熔炉的玻璃混合物(批次)熔化和吹制，你会看到切割是用手和眼睛，以及抛光或雕刻。',
 '内部调查与玻璃工作是不适用于您在这里。',
 '冰箱异味如何去除？',
 '如何去除冰箱异味',
 '微风吹来，在这里听不见声息，因为没有一枝冬青，没有一棵常绿树，可以发出婆娑之声。',
 '哪怕吹起一丝微风，这儿也不会发出一点声音；因为没有一棵冬青、没有一株常青树可以沙沙作响，光秃秃的山楂树和榛树丛静得就像铺在小路中间的碎白石一样。',
 '二港湾的钟又响了事实是这样，吉里雅特经过了一路平安的航行以后，在天全黑下来的时候，到了圣桑普森，当时已经是将近十点钟，而不是九点钟左右。',
 '与海水在吉尔德奥尔姆于尔岩礁边无限上涨的同时，那黑暗中无限的平静在吉利亚特深邃的目光中升了起来。',
 '例如，她表示，机构和专业制度对报销或法律、隐私和保密问题的关切影响到ED患者是否接受筛查和干预。',
 '她知道，机构和专业系统对接受筛查和干预的ED患者感到关切。']

In [8]:
import torch

tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

def process_function(examples):
    sentences = []
    labels = []
    for sen1, sen2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
        sentences.append(sen1)
        sentences.append(sen2)
        labels.append(1 if int(label) else -1)
    tokenized_examples = tokenizer(sentences, max_length=128, truncation=True, padding="max_length")
    tokenized_examples = {k: [v[i: i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}
    tokenized_examples["labels"] = labels
    return tokenized_examples

tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [9]:
print(tokenized_datasets["train"][:2])

{'input_ids': [[[101, 3297, 1400, 8024, 2769, 2792, 5543, 2990, 897, 4638, 3297, 3300, 1213, 4638, 6395, 2945, 8024, 2218, 3221, 131, 1762, 671, 3613, 1280, 1399, 4638, 3122, 1140, 809, 1400, 8024, 2400, 3766, 3300, 671, 702, 6395, 782, 1377, 809, 1415, 2137, 2769, 4638, 6411, 2141, 1469, 2769, 1092, 782, 4495, 3833, 4638, 5283, 3815, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2644, 6821, 2798, 3123, 749, 2552, 8024, 5445, 684, 2218, 3634, 1352, 2797, 2831, 857, 6929, 1352, 4788, 6154, 2094, 679, 3123, 8024, 3146, 3146, 2897, 749, 671, 3226, 1915, 8024, 1166, 782, 6206, 1932, 738, 1932, 679, 6624, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
import numpy as np
input_ids = np.array([
    [[101, 100, 1914, 720, 1367, 2597, 1408, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2644, 6820, 6381, 2533, 1408, 8024, 6929, 1921, 800, 1914, 720, 1367, 2597, 8024, 6760, 6716, 2218, 6624, 749, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [[101, 671, 702, 4959, 758, 7582, 1063, 5682, 3340, 5292, 3688, 6132, 4638, 4511, 782, 1762, 6125, 677, 3123, 7509, 727, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 697, 3340, 7032, 7824, 7744, 4708, 671, 1920, 1831, 1759, 6486, 3799, 7744, 5632, 6121, 6756, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
])

In [11]:
input_ids

array([[[ 101,  100, 1914,  720, 1367, 2597, 1408, 8043,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 2644, 6820, 6381, 2533, 1408, 8024, 6929, 1921,  800,
         1914,  720, 1367, 2597, 8024, 6760, 

In [12]:
sen1_input_ids, sen2_input_ids = input_ids[:, 0], input_ids[:, 1]

In [13]:
sen1_input_ids, sen2_input_ids

(array([[ 101,  100, 1914,  720, 1367, 2597, 1408, 8043,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0],
        [ 101,  671,  702, 4959,  758, 7582, 1063, 5682, 3340, 5292, 3688,
         6132, 4638, 4511,  782, 1762, 6125,  67

In [14]:
input_ids2 = np.array([[[  101,  2769,  1599,  3614,  1266,   776,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0],
         [  101,   151,  8993,  8815,  8169, 12128,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0]]])
input_ids2

array([[[  101,  2769,  1599,  3614,  1266,   776,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

In [15]:
sen1_input_ids2, sen2_input_ids2 = input_ids2[:, 0], input_ids2[:, 1]

In [16]:
sen1_input_ids2, sen2_input_ids2

(array([[ 101, 2769, 1599, 3614, 1266,  776,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0]]),
 array([[  101,   151,  8993,  8815,  8169, 12128,   102,     0,     0,
             0,     0,     0,     0,     0,     0

## Step5 创建模型

In [17]:
from transformers import BertPreTrainedModel, BertModel, BertConfig
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss

class DualModel(BertPreTrainedModel):
    def __init__(self, config, dim=0, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config)
        self.dim = dim
        self.cosine_similarity = CosineSimilarity(dim=dim)  # 初始化时指定dim
        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def from_pretrained(cls, config_name, dim=1, *inputs, **kwargs):
        config = BertConfig.from_pretrained(config_name)
        model = cls(config, dim, *inputs, **kwargs)
        model.bert = BertModel.from_pretrained(config_name)
        return model
    
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # print(f"forward开始：dim={self.dim}")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step1 分别获取 sen1 和 sen2 的输入
        sen1_input_ids, sen2_input_ids = input_ids[:, 0], input_ids[:, 1]
        sen1_attention_mask, sen2_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        sen1_token_type_ids, sen2_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]
        # print(f"Step1结束")

        # Step2 分别获取 sen1 和 sen2 的向量表示
        sen1_outputs = self.bert(
            sen1_input_ids,
            attention_mask=sen1_attention_mask,
            token_type_ids=sen1_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sen1_pooled_output = sen1_outputs[1]  # [batch, hidden(768)]

        sen2_outputs = self.bert(
            sen2_input_ids,
            attention_mask=sen2_attention_mask,
            token_type_ids=sen2_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sen2_pooled_output = sen2_outputs[1]  # [batch, hidden(768)]

        # print(f"Step2结束")

        # Step3 计算相似度
        cos = self.cosine_similarity(sen1_pooled_output, sen2_pooled_output)  # [batch,]
        # print(f"Step3结束")

        # Step4 计算loss
        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(margin=0.3)
            loss = loss_fct(sen1_pooled_output, sen2_pooled_output, labels)
        
        output = (cos,)
        return ((loss,) + output) if loss is not None else output

# # 加载预训练模型的配置
# config = BertConfig.from_pretrained("hfl/chinese-macbert-base")

# # 实例化DualModel类，并指定dim参数
# model = DualModel(config, dim=0)
model = DualModel.from_pretrained("hfl/chinese-macbert-base", dim=1)

pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

## Step6 创建评估函数

In [18]:
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [19]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = [1 if p > 0.7 else 0 for p in predictions]
    labels = [0 if label == -1 else 1 for label in labels]
    # predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

## Step7 创建TrainArguments

In [20]:
train_args = TrainingArguments(output_dir="./dual_model",      # 输出文件夹
                               per_device_train_batch_size=64,  # 训练时的batch_size
                               per_device_eval_batch_size=64,  # 验证时的batch_size
                               num_train_epochs=3,
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",     # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=2,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.01,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True      # 训练完成后加载最优模型
                               )     

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Step8 创建Trainer

In [21]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"],
                  compute_metrics=eval_metric)

## Step9 模型训练

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2281,0.206763,0.7445,0.723035
2,0.1822,0.186928,0.789,0.751765
3,0.1454,0.181199,0.7985,0.761679


TrainOutput(global_step=375, training_loss=0.20454465770721436, metrics={'train_runtime': 254.064, 'train_samples_per_second': 94.464, 'train_steps_per_second': 1.476, 'total_flos': 3157275967488000.0, 'train_loss': 0.20454465770721436, 'epoch': 3.0})

## Step10 模型评估

In [23]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.1811991035938263,
 'eval_accuracy': 0.7985,
 'eval_f1': 0.7616794795978711,
 'eval_runtime': 7.0633,
 'eval_samples_per_second': 283.153,
 'eval_steps_per_second': 4.53,
 'epoch': 3.0}

In [24]:
trainer.evaluate(tokenized_datasets["train"])

{'eval_loss': 0.12269095331430435,
 'eval_accuracy': 0.884625,
 'eval_f1': 0.8582399017048072,
 'eval_runtime': 28.0097,
 'eval_samples_per_second': 285.616,
 'eval_steps_per_second': 4.463,
 'epoch': 3.0}

## Step11 模型预测

第一种方法：我想的
真j8难啊，不过对维度的理解上了一个台阶

In [194]:
from transformers import pipeline, AudioClassificationPipeline

class SentenceSimilarityPipeline1:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
        
    def preprocess(self, sen1, sen2):
        return tokenizer([sen1, sen2], truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    def predict(self, inputs):
        # print(inputs)
        for key in inputs:
            inputs[key] = inputs[key].unsqueeze(0)  # 在第0维添加None维度
        
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        # print("Type of input:", type(inputs))
        # print("Content of input:", inputs)
        result = self.model(**inputs)  # [2, 768]
        return result

    # def postprocess(self, logits):
    #     cos = CosineSimilarity()(logits[None, 0, :], logits[None, 1, :]).cpu().item()  # logits[0, :]变成[768]了，再加None成[1, 768]
    #     return cos

    def __call__(self, sen1, sen2, return_vector=False):
        inputs = self.preprocess(sen1, sen2)
        print(sen1, sen2)
        result = self.predict(inputs)
        print(inputs)
        print(result)
        return result
        # logits = self.predict(inputs)
        # result = self.postprocess(logits)
        # if return_vector:
        #      return result, logits
        # else:
        #      return result

In [195]:
pipe = SentenceSimilarityPipeline1(model, tokenizer)

In [196]:
pipe("我喜欢北京", "I like beijing")

我喜欢北京 I like beijing
{'input_ids': tensor([[[  101,  2769,  1599,  3614,  1266,   776,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
     

(tensor([0.9725], grad_fn=<SumBackward1>),)

第二种方法：up的

In [197]:
from transformers import pipeline

class SentenceSimilarityPipeline:

    def __init__(self, model, tokenizer):
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device
        
    def preprocess(self, sen1, sen2):
            return tokenizer([sen1, sen2], truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    def predict(self, inputs):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        return self.model(**inputs)[1]  # [2, 768]

    def postprocess(self, logits):
        # 以下两种都可以，主要是CosineSimilarity默认dim=1在1维度计算
        # cos = CosineSimilarity()(logits[None, 0, :], logits[None, 1, :]).cpu().item()
        cos = CosineSimilarity(dim=0)(logits[0, :], logits[1, :]).item()
        return cos

    def __call__(self, sen1, sen2, return_vector=False):
        inputs = self.preprocess(sen1, sen2)
        logits = self.predict(inputs)
        result = self.postprocess(logits)
        if return_vector:
             return result, logits
        else:
             return result

In [198]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [199]:
pipe("我喜欢北京", "北京是我喜欢的地方", return_vector=True)

(0.9907099008560181,
 tensor([[ 0.9085, -0.2581,  1.0000,  ..., -0.9495, -0.9952, -0.9592],
         [ 0.9059,  0.0197,  1.0000,  ..., -0.9535, -0.9946, -0.9790]],
        grad_fn=<TanhBackward0>))

## 训练过程可视化
1、终端进入abc的conda环境和checkpoints目录，执行tensorboard --logdir=runs --host=0.0.0.0 --port=8418

2、vscode中ctrl+shift+p，搜索TensorBoard