（1）数据集构建

In [2]:
import os
import json
import torch

# 加载词表
def load_vocab(vocab_path='/kaggle/input/afqmc-datset-zip1/afqmc-datset/vocab.txt'):
    """Load the vocabulary file into a token -> index mapping.

    Args:
        vocab_path: Path of a text file with one token per line. Defaults to
            the Kaggle dataset location used by this notebook.

    Returns:
        dict mapping each stripped line to its zero-based line number.
    """
    word_dict = {}
    # The vocabulary contains Chinese characters, so decode explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with open(vocab_path, encoding='utf-8') as f:
        for idx, item in enumerate(f):
            word_dict[item.strip()] = idx

    return word_dict

# 加载数据
def load_dataset(data_path, is_test):
    """Read a JSON-lines file of sentence pairs.

    Args:
        data_path: Path of a file holding one JSON object per line with the
            keys "sentence1" and "sentence2" (and "label" unless `is_test`).
        is_test: When true, labels are not read and 2-tuples are returned.

    Returns:
        list of `(text_a, text_b)` tuples when `is_test` is true, otherwise
        `(text_a, text_b, label)` tuples with the label as stored in the file.
    """
    examples = []
    # Explicit UTF-8: the sentences are Chinese text.
    with open(data_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            text_a = record["sentence1"]
            text_b = record["sentence2"]
            if is_test:
                examples.append((text_a, text_b,))
            else:
                label = record["label"]
                examples.append((text_a, text_b, label))
    return examples

def load_afqmc_data(path):
    """Load the train, dev and test splits found under `path`.

    The files `train.json`, `dev.json` and `test.json` are read in that order
    with `load_dataset`; only the test split is loaded without labels.

    Returns:
        `(train_data, dev_data, test_data)`.
    """
    # (file name, is_test) for each split, in loading order.
    splits = (('train.json', False), ('dev.json', False), ('test.json', True))
    split_paths = [(os.path.join(path, file_name), is_test) for file_name, is_test in splits]

    train_data, dev_data, test_data = (
        load_dataset(split_path, is_test) for split_path, is_test in split_paths
    )
    return train_data, dev_data, test_data

# 字符转id
def words2id(example, word_dict):
    """Convert one sentence pair into token ids and segment ids.

    Each sentence is split into single characters; characters missing from
    `word_dict` map to the '[UNK]' id. The sequence layout is
    `[CLS] text_a [SEP] text_b [SEP]`.

    Args:
        example: `(text_a, text_b, label)` or, for unlabeled data such as the
            test split, `(text_a, text_b)`.
        word_dict: Mapping from token to id; must contain '[CLS]' and '[SEP]'
            (and '[UNK]' if any character is out of vocabulary).

    Returns:
        `(input_ids, segment_id, label)` where `segment_id` is 0 for
        `[CLS] text_a [SEP]` and 1 for `text_b [SEP]`, and `label` is
        `int(label)` or `None` when the example carries no label.
    """
    cls_id = word_dict['[CLS]']
    sep_id = word_dict['[SEP]']

    text_a, text_b = example[0], example[1]
    # Unlabeled examples are 2-tuples; report their label as None.
    label = int(example[2]) if len(example) > 2 else None

    def encode(text):
        # Iterating a str yields single characters, which is the tokenisation used here.
        return [word_dict[ch] if ch in word_dict else word_dict['[UNK]'] for ch in text]

    input_a = encode(text_a)
    input_b = encode(text_b)
    input_ids = [cls_id] + input_a + [sep_id] + input_b + [sep_id]
    segment_id = [0] * (len(input_a) + 2) + [1] * (len(input_b) + 1)
    return input_ids, segment_id, label

# Dataloader中的collate_fn函数
def collate_fn(batch_data, pad_val=0, max_seq_len=512):
    """Truncate, pad and stack a list of encoded examples into tensors.

    Args:
        batch_data: Iterable of `(input_id, segment_id, label)` where the
            first two items are lists of ints.
        pad_val: Value appended to both id lists to reach the batch width.
        max_seq_len: Sequences longer than this are cut to this length.

    Returns:
        `((input_ids, segment_ids), labels)` as tensors; the two id tensors
        have shape `(batch, width)` where `width` is the longest sequence in
        the batch after truncation.
    """
    input_ids, segment_ids, labels = [], [], []
    max_len = 0
    for input_id, segment_id, label in batch_data:
        # Truncate over-long sequences.
        input_id = input_id[:max_seq_len]
        segment_id = segment_id[:max_seq_len]
        input_ids.append(input_id)
        segment_ids.append(segment_id)
        labels.append(label)
        # Track the longest *truncated* sequence so the batch never grows
        # wider than max_seq_len.
        max_len = max(max_len, len(input_id))
    # Right-pad every sequence to the batch width.
    for i in range(len(labels)):
        input_ids[i] = input_ids[i] + [pad_val] * (max_len - len(input_ids[i]))
        segment_ids[i] = segment_ids[i] + [pad_val] * (max_len - len(segment_ids[i]))
    return (torch.as_tensor(input_ids), torch.as_tensor(segment_ids)), torch.as_tensor(labels)


# Usage example: load the vocabulary and the three AFQMC splits.
# These module-level names (vocab, train_data, dev_data, test_data) are reused by later cells.
vocab = load_vocab()
train_data, dev_data, test_data = load_afqmc_data('/kaggle/input/afqmc-datset-zip1/afqmc-datset/AFQMC/')

# Convert the first training sentence pair into ids and show the result.
example = train_data[0]
input_ids, segment_ids, label = words2id(example, vocab)
print("Input IDs:", input_ids)
print("Segment IDs:", segment_ids)
print("Label:", label)

# Build a mini-batch from the first two training examples and pad it to a common length.
batch_data = [words2id(example, vocab) for example in train_data[:2]]
batch_input, batch_label = collate_fn(batch_data)
print("Batch Input:", batch_input)
print("Batch Label:", batch_label)


Input IDs: [1, 3802, 2975, 1051, 4947, 43, 852, 201, 699, 48, 22, 806, 33, 254, 399, 49, 89, 1114, 2, 1051, 4947, 9, 254, 399, 45, 195, 201, 89, 1114, 2]
Segment IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Label: 0
Batch Input: (tensor([[   1, 3802, 2975, 1051, 4947,   43,  852,  201,  699,   48,   22,  806,
           33,  254,  399,   49,   89, 1114,    2, 1051, 4947,    9,  254,  399,
           45,  195,  201,   89, 1114,    2],
        [   1, 3802, 2975,  283, 4947,  178,   75, 1147,  450,    7,  218,    2,
         3802, 2975,  283, 4947, 1147,  450,   40,   13,   10,  614,  356,    2,
            0,    0,    0,    0,    0,    0]]), tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0]]))
Batch Label: tensor([0, 0])


（2）实现输入编码、分段编码和位置编码，并组装为嵌入层，打印该层的输入输出

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding

class TransformerEmbedding(Layer):
    """Sum of token, position and segment embeddings.

    Args:
        vocab_size: Number of rows in the token embedding table.
        emb_size: Width of every embedding vector.
        max_position_embeddings: Number of rows in the position table.
    """

    def __init__(self, vocab_size, emb_size, max_position_embeddings=512):
        super().__init__()

        self.word_embedding = Embedding(vocab_size, emb_size)
        self.position_embedding = Embedding(max_position_embeddings, emb_size)
        # Two rows only: segment id 0 (sentence A) and 1 (sentence B).
        self.segment_embedding = Embedding(2, emb_size)

    def call(self, inputs):
        """Embed `[input_ids, segment_ids]` and return the summed embeddings.

        Position ids are `0 .. L-1`, where `L` is the size of axis 1 of
        `input_ids`; the position vectors are broadcast over the batch axis.
        """
        token_ids, type_ids = inputs

        # Token embedding
        token_vectors = self.word_embedding(token_ids)

        # Segment embedding
        type_vectors = self.segment_embedding(type_ids)

        # Position embedding
        seq_len = tf.shape(token_ids)[1]
        positions = tf.range(seq_len, dtype=tf.int32)
        position_vectors = self.position_embedding(positions)

        # The final representation is the element-wise sum of the three parts.
        return token_vectors + position_vectors + type_vectors

# Example configuration
vocab_size = 10000  # assumed vocabulary size of 10000 (NOTE(review): not derived from the loaded vocab — confirm it covers every id)
emb_size = 300  # assumed embedding width of 300

# Create a TransformerEmbedding instance
embedding_layer = TransformerEmbedding(vocab_size, emb_size)

# Build a small hand-written mini-batch
input_ids = tf.constant([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])  # sample token ids
segment_ids = tf.constant([[0, 0, 0, 1, 1], [0, 1, 1, 0, 1]])  # sample segment ids

# Run the batch through the embedding layer
embeddings = embedding_layer([input_ids, segment_ids])

# Print the inputs and the resulting embeddings
print("Input IDs:", input_ids.numpy())
print("Segment IDs:", segment_ids.numpy())
print("Output Embeddings:", embeddings.numpy())


Input IDs: [[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
Segment IDs: [[0 0 0 1 1]
 [0 1 1 0 1]]
Output Embeddings: [[[-2.92048492e-02  2.50686556e-02 -1.63089242e-02 ...  6.18023798e-03
   -4.84248511e-02 -6.03079572e-02]
  [-8.10497999e-03  2.34684870e-02 -2.29900144e-02 ... -6.87457100e-02
    2.10056081e-02  5.60190529e-05]
  [ 7.74686784e-03 -3.25232334e-02 -6.60799146e-02 ... -1.33945048e-03
   -4.31753881e-02 -4.71767075e-02]
  [ 4.96294163e-02  2.52648033e-02  7.52567202e-02 ...  3.51153202e-02
   -5.60104027e-02 -2.70643458e-03]
  [ 3.93309407e-02 -1.60002857e-02 -6.06644750e-02 ...  3.81373316e-02
    2.13111080e-02  2.35103257e-02]]

 [[ 8.45070928e-04  3.54717523e-02 -1.05034046e-01 ... -4.78850007e-02
   -7.80276358e-02 -7.64850751e-02]
  [ 2.38598958e-02  1.41861662e-02 -6.61173090e-02 ...  4.35464680e-02
    6.63404986e-02  5.85090593e-02]
  [-1.54073574e-02 -7.41142035e-02  2.34907418e-02 ...  6.66059405e-02
    7.69123062e-03 -1.29668247e-02]
  [ 1.35094039e-02  2.72904448e-02 

（3）实现多头自注意力层和add&norm层

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout

class MultiHeadAttention(Layer):
    """Scaled dot-product attention split over several heads.

    Args:
        emb_size: Width of the input and output features; must be divisible
            by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout rate applied to the attention probabilities.
    """

    def __init__(self, emb_size, num_heads, dropout=0.1):
        super().__init__()

        self.emb_size = emb_size
        self.num_heads = num_heads
        self.head_dim = emb_size // num_heads

        assert self.head_dim * num_heads == emb_size, "Embedding size needs to be divisible by heads"

        self.query_linear = Dense(emb_size)
        self.key_linear = Dense(emb_size)
        self.value_linear = Dense(emb_size)

        self.fc_out = Dense(emb_size)

        self.dropout = Dropout(dropout)

    def _split_heads(self, tensor, batch_size, length):
        """Reshape (batch, length, emb) to (batch, heads, length, head_dim)."""
        tensor = tf.reshape(tensor, (batch_size, length, self.num_heads, self.head_dim))
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Attend from `query` over `key`/`value`.

        Args:
            inputs: `[query, key, value, mask]`. `mask` may be `None`;
                otherwise it is expanded to (batch, 1, 1, key_len) and
                positions where it equals 0 are excluded from attention.

        Returns:
            Tensor with the query's batch and length and `emb_size` features.
        """
        query, key, value, mask = inputs
        batch_size = tf.shape(query)[0]
        query_len = tf.shape(query)[1]
        # Key and value carry their own length; using the query length for
        # them only works when all three sequences are equally long.
        key_len = tf.shape(key)[1]
        value_len = tf.shape(value)[1]

        # Linear projections
        Q = self.query_linear(query)
        K = self.key_linear(key)
        V = self.value_linear(value)

        # Split into heads
        Q = self._split_heads(Q, batch_size, query_len)
        K = self._split_heads(K, batch_size, key_len)
        V = self._split_heads(V, batch_size, value_len)

        # Scaled dot-product scores: (batch, heads, query_len, key_len)
        scale = tf.math.sqrt(tf.cast(self.head_dim, dtype=tf.float32))
        energy = tf.matmul(Q, tf.transpose(K, perm=[0, 1, 3, 2])) / scale

        # Masking: masked positions get a very large negative score so that
        # softmax assigns them (numerically) zero probability.
        if mask is not None:
            mask = tf.expand_dims(tf.expand_dims(mask, axis=1), axis=2)  # broadcast over heads and query positions
            energy = tf.where(mask == 0, tf.constant(-1e20, dtype=tf.float32), energy)

        # Attention probabilities and weighted sum of the values
        attention = tf.nn.softmax(energy, axis=-1)
        x = tf.matmul(self.dropout(attention), V)

        # Merge heads back: (batch, query_len, emb_size)
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        x = tf.reshape(x, (batch_size, query_len, self.emb_size))

        # Output projection mixing the heads
        x = self.fc_out(x)

        return x

class AddNorm(Layer):
    """Residual connection followed by layer normalisation (post-norm).

    The sub-layer output is first passed through a Dense layer of width
    `emb_size`, then dropout, then added to the input and normalised.

    Args:
        emb_size: Output width of the internal projection.
        mlp_units: Accepted but not read anywhere in this class.
        dropout: Dropout rate applied to the projected sub-layer output.
    """

    def __init__(self, emb_size, mlp_units, dropout=0.1):
        super().__init__()

        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_layer = Dropout(dropout)

        # Projects the sub-layer output (e.g. an MLP of a different width) back to emb_size.
        self.mlp_linear = Dense(emb_size)

    def call(self, inputs, training=None):
        """Apply `norm(x + dropout(linear(sublayer_output)))`.

        Args:
            inputs: `[x, sublayer, mask]`. `sublayer` is either a callable,
                invoked as `sublayer([x, x, x, mask])`, or an already
                computed output that is used as-is.
            training: Forwarded to the dropout layer.
        """
        x, sublayer, mask = inputs

        # Either run the sub-layer here or take its precomputed output.
        if callable(sublayer):
            branch = sublayer([x, x, x, mask])
        else:
            branch = sublayer

        projected = self.mlp_linear(branch)

        # Residual sum, then layer normalisation.
        summed = x + self.dropout_layer(projected, training=training)
        return self.norm(summed)




# Example configuration
emb_size = 300
num_heads = 6
dropout = 0.1

# Create a MultiHeadAttention instance
attention_layer = MultiHeadAttention(emb_size, num_heads, dropout)

# Create random example inputs: batch of 2, sequence length 10; the all-ones mask keeps every position
query = tf.random.normal((2, 10, emb_size))
key = tf.random.normal((2, 10, emb_size))
value = tf.random.normal((2, 10, emb_size))
mask = tf.ones((2, 10))

# Run the multi-head attention layer
output = attention_layer([query, key, value, mask])

# Print the input and output shapes
print("Query shape:", query.shape)
print("Output shape:", output.shape)


Query shape: (2, 10, 300)
Output shape: (2, 10, 300)


（4）搭建一个transformer编码器，利用嵌入层、transformer编码器和合适的分类器构建完成语义匹配模型

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D

# 构建Transformer模型
def build_transformer_model(vocab_size, emb_size, num_heads, num_transformer_blocks, mlp_units, dropout, max_position_embeddings=512):
    """Build the sentence-pair matching model with the Keras functional API.

    Architecture: embedding -> `num_transformer_blocks` x (self-attention,
    AddNorm, ReLU Dense of width `mlp_units`, AddNorm) -> average pooling over
    the sequence axis -> single sigmoid unit.

    Args:
        vocab_size: Size of the token embedding table.
        emb_size: Hidden width of the encoder.
        num_heads: Attention heads per block.
        num_transformer_blocks: Number of encoder blocks.
        mlp_units: Width of the feed-forward hidden layer.
        dropout: Dropout rate for the attention and AddNorm layers.
        max_position_embeddings: Size of the position embedding table.

    Returns:
        A `tf.keras.Model` taking `[input_ids, segment_ids]` and producing one
        sigmoid probability per example.
    """
    # Input layers (variable sequence length)
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    segment_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

    # Embedding layer
    embeddings = TransformerEmbedding(vocab_size, emb_size, max_position_embeddings)([input_ids, segment_ids])

    # Transformer encoder
    # NOTE(review): no padding mask is passed (mask is None) and the pooling
    # below averages over every position, so padded positions take part in
    # both attention and pooling.
    for _ in range(num_transformer_blocks):
        attention_output = MultiHeadAttention(emb_size, num_heads, dropout)([embeddings, embeddings, embeddings, None])
        # Add & norm. AddNorm's signature is (emb_size, mlp_units, dropout):
        # pass all three so the dropout rate is not bound to mlp_units.
        attention_output = AddNorm(emb_size, mlp_units, dropout)([embeddings, attention_output, None])
        # Feed forward (AddNorm projects the result back to emb_size)
        mlp_output = Dense(mlp_units, activation="relu")(attention_output)
        # Add & norm
        embeddings = AddNorm(emb_size, mlp_units, dropout)([attention_output, mlp_output, None])

    # Pooling over the sequence axis
    pooled = GlobalAveragePooling1D()(embeddings)

    # Binary classifier
    outputs = Dense(1, activation="sigmoid")(pooled)

    # Assemble the model
    model = tf.keras.Model(inputs=[input_ids, segment_ids], outputs=outputs)

    return model

# Build the semantic matching model (vocab_size and emb_size come from the earlier example cell)
semantic_matching_model = build_transformer_model(vocab_size, emb_size, num_heads=6, num_transformer_blocks=4, mlp_units=512, dropout=0.1)

# Print the model structure
semantic_matching_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 transformer_embedding_1 (T  (None, None, 300)            3154200   ['input_1[0][0]',             
 ransformerEmbedding)                                                'input_2[0][0]']             
                                                                                                  
 multi_head_attention_1 (Mu  (None, None, 300)            361200    ['transformer_embedding_1[

（5）训练模型，在验证集上计算准确率，并保存在验证集上准确率最高的模型，使用tensorboard等可视化插件，展示训练过程中的精度变化和损失变化

In [8]:
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
import numpy as np

# TensorBoard callback: logs go to ./logs, with weight histograms every epoch
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

# Build a fresh model
semantic_matching_model = build_transformer_model(vocab_size, emb_size, num_heads=6, num_transformer_blocks=4, mlp_units=512, dropout=0.1)

# Compile for binary classification
semantic_matching_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Load the dataset splits
train_data, dev_data, test_data = load_afqmc_data('/kaggle/input/afqmc-datset-zip1/afqmc-datset/AFQMC/')

# Encode the whole training set and pad it as a single batch
train_inputs, train_labels = collate_fn([words2id(example, vocab) for example in train_data])

# Convert the torch tensors to numpy arrays for Keras
train_inputs = (np.array(train_inputs[0]), np.array(train_inputs[1]))
train_labels = np.array(train_labels)

# Prepare the validation data the same way (padded independently of the training set)
dev_inputs, dev_labels = collate_fn([words2id(example, vocab) for example in dev_data])

# Convert the torch tensors to numpy arrays for Keras
dev_inputs = (np.array(dev_inputs[0]), np.array(dev_inputs[1]))
dev_labels = np.array(dev_labels)

# Checkpoint callback: keep only the weights with the best validation accuracy
model_checkpoint = ModelCheckpoint(
    './logs/best_model.h5',  # checkpoint file path
    save_best_only=True,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    verbose=1
)

# Early stopping: stop after 3 epochs without a validation-accuracy improvement
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    mode='max',
    verbose=1
)

# Train the model
history = semantic_matching_model.fit(
    train_inputs,
    train_labels,
    epochs=10,
    batch_size=32,
    validation_data=(dev_inputs, dev_labels),
    callbacks=[tensorboard_callback, model_checkpoint, early_stopping]
)

# Evaluate on the validation set.
# NOTE(review): this evaluates the in-memory model as left by fit(); the best
# checkpoint is only written to disk and is not reloaded here.
eval_loss, eval_accuracy = semantic_matching_model.evaluate(dev_inputs, dev_labels)
print(f"Evaluation Loss: {eval_loss}, Evaluation Accuracy: {eval_accuracy}")


Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.68999, saving model to ./logs/best_model.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.68999
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.68999
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.68999
Epoch 4: early stopping
Evaluation Loss: 0.6240836381912231, Evaluation Accuracy: 0.689990758895874


（6）加载保存的模型，在测试集上随机选取50条数据进行语义匹配测试，展示模型的预测结果

In [9]:
import random
import tensorflow as tf
import numpy as np

# Randomly pick 50 test samples
random.seed(42)  # fixed seed so the same samples are drawn on every run
selected_samples = random.sample(test_data, 50)

# Ids of the special tokens
cls_id = vocab['[CLS]']
sep_id = vocab['[SEP]']
unk_id = vocab['[UNK]']

# Rebuild the architecture and load the best checkpoint saved during training
model_path = '/kaggle/working/logs/best_model.h5'  # checkpoint file path
loaded_model = build_transformer_model(vocab_size, emb_size, num_heads=6, num_transformer_blocks=4, mlp_units=512, dropout=0.1)
loaded_model.load_weights(model_path)

# Run every selected sample through the model
for example in selected_samples:
    text_a = example[0]
    text_b = example[1]

    # Character-level conversion to ids (unknown characters map to [UNK])
    input_ids_a = [vocab.get(item, unk_id) for item in text_a]
    input_ids_b = [vocab.get(item, unk_id) for item in text_b]

    input_ids = [cls_id] + input_ids_a + [sep_id] + input_ids_b + [sep_id]
    segment_ids = [0] * (len(input_ids_a) + 2) + [1] * (len(input_ids_b) + 1)

    # Wrap as a batch of one
    input_ids = tf.convert_to_tensor([input_ids])
    segment_ids = tf.convert_to_tensor([segment_ids])
    inputs = [input_ids, segment_ids]

    # The model ends in a single sigmoid unit, so the output has shape (1, 1)
    # and holds the probability of the positive class.
    probs = loaded_model.predict(inputs)

    # Threshold the probability. Taking argmax over an axis of size 1 would
    # always return 0 regardless of the model output.
    label_id = int(probs[0][0] > 0.5)

    # Print the prediction
    print(f"文本A: {text_a}")
    print(f"文本B: {text_b}")
    print(f"预测的label标签: {label_id}")
    print("=" * 50)


文本A: 花呗叫绑定银行卡是怎么回事
文本B: 蚂蚁花呗老是提示绑定银行卡是什么原因
预测的label标签: 0
文本A: 花呗如何关
文本B: 如何确定我是否关闭花呗成功
预测的label标签: 0
文本A: 借呗额度怎么越来越少
文本B: 我借呗额度怎么突然降低了？什么情况
预测的label标签: 0
文本A: 有蚂蚁借呗都开通不了
文本B: 我蚂蚁借呗为何不通过
预测的label标签: 0
文本A: 为什么花呗我有记录
文本B: 我的花呗记录有疑问
预测的label标签: 0
文本A: 蚂蚁借呗怎么认证不了
文本B: 我已实名认证但我还不能蚂蚁借呗
预测的label标签: 0
文本A: 申请的商家收款码。怎么开通花呗信用卡
文本B: 我的店铺怎么开通花呗付款
预测的label标签: 0
文本A: 蚂蚁借呗分期还款可以更改吗
文本B: 可以查看朋友的借呗还款日吗
预测的label标签: 0
文本A: 借呗借***个月，利息是多少
文本B: 蚂蚁借呗的一个月的利率
预测的label标签: 0
文本A: 经开通了花呗功能
文本B: 我能申请开通蚂蚁花呗吗
预测的label标签: 0
文本A: 花呗***号还款我忘了
文本B: 我双十一用花呗付款了，我想还款可以么
预测的label标签: 0
文本A: 为什么花呗不能在实体店里支付
文本B: 为什么我这上面没有花呗支付
预测的label标签: 0
文本A: 我花呗逾期***多天会怎么
文本B: 花呗如果逾期***天未还款
预测的label标签: 0
文本A: 蚂蚁借呗还可以分期嘛
文本B: 我用的是蚂蚁借呗，能不能分期
预测的label标签: 0
文本A: 我提前两天还了借呗怎么还是显示我逾期
文本B: 蚂蚁借呗超一天还款算是逾期吗
预测的label标签: 0
文本A: 怎么申请开通借呗
文本B: 我的借呗开通不了
预测的label标签: 0
文本A: 我的支付宝花呗为什么淘宝用不了
文本B: 为什么我的淘宝和拼多多不能用花呗
预测的label标签: 0
文本A: 支付宝花呗最迟还款几号
文本B: 我的花呗是几号还款日
预测的label标签: 0
文本A: 如何查看花呗详单
文本B: 花呗怎么查看分期降单
预测的label标签: 0
文本A: 借呗还款日怎么规定的
文本B: 怎么样更改借呗还款日
预测的label标签:

（7）输入一条样本提取多头注意力权重，对注意力机制的计算结果进行可视化展示并分析

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this cell has never been executed (In [None]) and does not match
# the rest of the notebook as visible here; see the notes on individual lines.

# Choose a sample index
sample_index = 0
# NOTE(review): test_data is loaded with is_test=True, which yields
# (text_a, text_b) 2-tuples, so this 3-way unpack would fail.
text_a, text_b, true_label = test_data[sample_index]

# Convert true_label to integer
true_label = int(true_label)

# Prepare data
# NOTE(review): collate_fn expects already-encoded id lists (output of words2id),
# not raw text, and it returns ((input_ids, segment_ids), labels) — a 2-tuple,
# not three values.
input_ids, segment_ids, labels = collate_fn([(text_a, text_b, true_label)])
# NOTE(review): `device` is not defined in the visible cells — confirm where it comes from.
input_ids, segment_ids, labels = input_ids.to(device), segment_ids.to(device), labels.to(device)

# Get attention weights
# NOTE(review): `model` is not defined in the visible cells, and this is PyTorch-style
# inference while the models built above are Keras models. MultiHeadAttention.call
# returns only the output tensor, so no attention weights are exposed as written.
with torch.no_grad():
    _, attention_weights = model(input_ids, segment_ids, mask=None)

# Print the mean attention weight of every head for the first example
print("Attention Weights:")
for head in range(attention_weights.size(1)):
    print(f"Head {head + 1}: {attention_weights[0][head].mean().item()}")

# Visualize attention weights
sns.set(font_scale=1.2)
plt.figure(figsize=(12, 8))

# Combine text_a and text_b for visualization
# NOTE(review): `jieba` is not imported in the visible cells. words2id tokenises per
# character, so labels produced by jieba.cut may not line up with the matrix axes — verify.
combined_text = ["[CLS]"] + list(jieba.cut(" ".join(map(str, text_a)))) + ["[SEP]"] + list(jieba.cut(" ".join(map(str, text_b)))) + ["[SEP]"]

# Heat map of the first head's attention matrix for the first example
ax = sns.heatmap(
    attention_weights[0][0].cpu().numpy(),
    cmap="YlGnBu",
    xticklabels=combined_text,
    yticklabels=combined_text,
    annot=True,
    fmt=".2f"
)
ax.set_title("Attention Weight Visualization")
plt.show()


10.层规范化的位置有两种 prenorm 和 postnorm，查询资料了解二者区别并说明自己的模型中层规范化操作的位置是 prenorm 还是 postnorm ，然后尝试另一种层规范化操作，对比二者在具体训练中的区别并分析原因。

In [None]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Embedding followed by a stack of Transformer encoder blocks.

    Every block is self-attention plus a two-layer feed-forward network, each
    wrapped in a residual connection with layer normalisation. All sub-layers
    are created once in `__init__` so their weights are tracked and reused.

    Args:
        vocab_size: Size of the token embedding table.
        embed_size: Hidden width of the encoder.
        max_position_embeddings: Size of the position embedding table.
        num_heads: Attention heads per block.
        num_layers: Number of encoder blocks.
        mlp_units: Width of the feed-forward hidden layer.
        dropout_rate: Dropout rate for attention and residual branches.
        pre_norm: If False (default) use post-norm, `norm(x + sublayer(x))`.
            If True use pre-norm, `x + sublayer(norm(x))`, with one extra
            normalisation after the last block.
    """

    def __init__(self, vocab_size, embed_size, max_position_embeddings, num_heads, num_layers, mlp_units, dropout_rate, pre_norm=False):
        super().__init__()

        self.mlp_units = mlp_units
        self.dropout_rate = dropout_rate
        self.pre_norm = pre_norm

        self.embedding_layer = TransformerEmbedding(vocab_size, embed_size, max_position_embeddings)

        keras_layers = tf.keras.layers
        self.attention_layers = [MultiHeadAttention(embed_size, num_heads, dropout_rate) for _ in range(num_layers)]
        self.attention_norms = [keras_layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        # Feed-forward network: expand to mlp_units, then project back to embed_size.
        self.ffn_expand_layers = [keras_layers.Dense(mlp_units, activation="relu") for _ in range(num_layers)]
        self.ffn_project_layers = [keras_layers.Dense(embed_size) for _ in range(num_layers)]
        self.ffn_norms = [keras_layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dropout_layer = keras_layers.Dropout(dropout_rate)
        # With pre-norm the residual stream itself is never normalised inside
        # the blocks, so normalise it once at the end.
        self.final_norm = keras_layers.LayerNormalization(epsilon=1e-6) if pre_norm else None

    def _residual(self, x, sublayer, norm, training):
        """Wrap `sublayer` in a residual connection using the configured norm placement."""
        if self.pre_norm:
            return x + self.dropout_layer(sublayer(norm(x)), training=training)
        return norm(x + self.dropout_layer(sublayer(x), training=training))

    def call(self, inputs, training=None):
        """Encode `[input_ids, segment_ids]` into per-token hidden states.

        No attention mask is applied (the mask slot passed to the attention
        layers is None).
        """
        input_ids, segment_ids = inputs

        # Embedding layer
        hidden = self.embedding_layer([input_ids, segment_ids])

        # Transformer encoder blocks
        blocks = zip(
            self.attention_layers,
            self.attention_norms,
            self.ffn_expand_layers,
            self.ffn_project_layers,
            self.ffn_norms,
        )
        for attention, attention_norm, ffn_expand, ffn_project, ffn_norm in blocks:
            # Self-attention sub-layer
            hidden = self._residual(
                hidden,
                lambda h, layer=attention: layer([h, h, h, None]),
                attention_norm,
                training,
            )
            # Feed-forward sub-layer
            hidden = self._residual(
                hidden,
                lambda h, expand=ffn_expand, project=ffn_project: project(expand(h)),
                ffn_norm,
                training,
            )

        if self.final_norm is not None:
            hidden = self.final_norm(hidden)

        return hidden

# 修改 AddNorm 层的 call 方法
class AddNorm(tf.keras.layers.Layer):
    """Residual connection followed by layer normalisation (post-norm).

    Args:
        emb_size: Output width of the internal projection applied to the
            sub-layer output before the residual sum.
        dropout_rate: Dropout rate applied to the projected sub-layer output.
    """

    def __init__(self, emb_size, dropout_rate):
        super().__init__()

        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_layer = tf.keras.layers.Dropout(dropout_rate)
        # call() relies on this projection, so it has to be created here.
        self.mlp_linear = tf.keras.layers.Dense(emb_size)

    def call(self, inputs, training=None):
        """Apply `norm(x + dropout(linear(sublayer_output)))`.

        Args:
            inputs: `[x, sublayer, mask]`. `sublayer` is either a callable,
                invoked as `sublayer(x)`, or an already computed output that
                is used as-is. `mask` is unpacked but not used.
            training: Forwarded to the dropout layer.
        """
        x, sublayer, mask = inputs

        # Residual branch input
        residual = x

        # Accept both a callable sub-layer and a precomputed tensor.
        sublayer_output = sublayer(x) if callable(sublayer) else sublayer

        # Dropout is only active when training is true.
        mlp_output = self.mlp_linear(sublayer_output)
        x = residual + self.dropout_layer(mlp_output, training=training)

        # Layer normalisation
        x = self.norm(x)
        return x

# 创建语义匹配模型
class SemanticMatchingModel(tf.keras.Model):
    """Sentence-pair classifier: Transformer encoder, average pooling, sigmoid.

    Args:
        vocab_size: Size of the token embedding table.
        embed_size: Hidden width of the encoder.
        max_position_embeddings: Size of the position embedding table.
        num_heads: Attention heads per block.
        num_layers: Number of encoder blocks.
        mlp_units: Width of the feed-forward hidden layer.
        dropout_rate: Dropout rate used inside the encoder.
    """

    def __init__(self, vocab_size, embed_size, max_position_embeddings, num_heads, num_layers, mlp_units, dropout_rate):
        super().__init__()

        # TransformerEncoder embeds the ids itself, so no separate embedding
        # layer is created here; a second one would only add weights whose
        # output is never used.
        self.transformer_encoder = TransformerEncoder(vocab_size, embed_size, max_position_embeddings, num_heads, num_layers, mlp_units, dropout_rate)
        self.pooling_layer = tf.keras.layers.GlobalAveragePooling1D()
        self.output_layer = Dense(1, activation='sigmoid')

    def call(self, inputs, training=None):
        """Return one sigmoid probability per example for `[input_ids, segment_ids]`."""
        input_ids, segment_ids = inputs

        # Forward pass: encode, average over the sequence axis, classify.
        transformer_output = self.transformer_encoder([input_ids, segment_ids], training=training)
        pooled = self.pooling_layer(transformer_output)
        outputs = self.output_layer(pooled)

        return outputs

# Create the model; unlike the earlier cells, vocab_size is taken from the loaded vocabulary here
vocab_size = len(vocab)
embed_size = 300
max_position_embeddings = 512
num_heads = 6
num_layers = 4
mlp_units = 512
dropout_rate = 0.1

# Instantiate the subclassed model (this rebinds the name used for the functional model above)
semantic_matching_model = SemanticMatchingModel(
    vocab_size, embed_size, max_position_embeddings, num_heads, num_layers, mlp_units, dropout_rate
)
