# 导入已预处理数据

In [1]:
import pandas as pd

train_total = pd.read_csv('./data/train_cleaned.csv')

test_total = pd.read_csv('./data/test_cleaned.csv')

### 随机抽样缩小原数据集

In [2]:
train = train_total.sample(frac=0.05, random_state=42).reset_index(drop=True)
test = test_total.sample(frac=0.05, random_state=42).reset_index(drop=True)

# 文本向量化

### 加载数据集（字典化）

In [3]:
from paddle.io import Dataset

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        data_file = data_file.reset_index(drop=True)  
        Data = data_file.to_dict(orient='index')
        return Data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if idx not in self.data:
            raise KeyError(f"Key {idx} not found in dataset")
        return self.data[idx]

train_dict = AFQMC(train)
test_dict = AFQMC(test)

print(train_dict[0])
print(test_dict[0])

{'id': 275485, 'tid1': 153996, 'tid2': 154013, 'title1_en': 'longer white hair grows dont dye wash washing make hair look black shiny 3 days', 'title2_en': 'yaos exhusband ling soosus high profile comeback revealed reasons divorce', 'label': 'unrelated', 'label_encoded': 2}
{'id': 379653, 'tid1': 186512, 'tid2': 186513, 'title1_en': 'ship hit meteor rock crashed another world astronauts kept pets aliens', 'title2_en': 'alien ship crashed ancient vikings fell knees thought god coming', 'label_encoded': 2, 'Weight': 0.0625, 'Usage': 'Private'}


### 向量化处理

In [2]:
import paddle
from paddle.io import DataLoader
from paddlenlp.transformers import AutoTokenizer

checkpoint = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def collote_fn(batch_samples):
    try:
        batch_sentence_1, batch_sentence_2 = [], []
        batch_label = []
        batch_weights = []
        for sample in batch_samples:
            batch_sentence_1.append(sample['title1_en'])
            batch_sentence_2.append(sample['title2_en'])
            batch_label.append(int(sample['label_encoded']))
            if 'Weight' in sample:
                batch_weights.append(float(sample['Weight']))
            else:
                batch_weights.append(1.0)

        X_1 = tokenizer(
            batch_sentence_1, 
            padding=True, 
            truncation=True, 
            return_tensors="pd",
            return_token_type_ids=False 
        )
        
        X_2 = tokenizer(
            batch_sentence_2, 
            padding=True, 
            truncation=True, 
            return_tensors="pd",
            return_token_type_ids=False 
        )

        y = paddle.to_tensor(batch_label, dtype='int64')
        w = paddle.to_tensor(batch_weights, dtype='float32')

        if 'Weight' in sample:
            return X_1, X_2, y, w
        else:
            return X_1, X_2, y
    except Exception as e:
        print(f"Error in collote_fn: {e}")
        raise 

# 模型训练准备




### 定义模型

In [6]:
import paddle
import paddle.nn as nn

class SiameseNetwork(nn.Layer):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_classes, num_lstm_layers=2, dropout_rate=0.3):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_lstm_layers, direction='bidirectional')
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(hidden_dim * 4, hidden_dim * 2)
        self.bn1 = nn.BatchNorm1D(hidden_dim * 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim * 2, num_classes)

    def encode(self, src):
        embed = self.embedding(src)
        lstm_output, (hidden, _) = self.lstm(embed)
        hidden = self.dropout(lstm_output[:, -1, :])
        return hidden

    def forward(self, src1, src2):
        output1 = self.encode(src1)
        output2 = self.encode(src2)
        combined = paddle.concat([output1, output2], axis=-1)
        x = self.fc1(combined)
        x = self.bn1(x)
        x = self.relu(x)
        logits = self.fc2(x)
        return logits
    
# 固定的合适参数
input_dim = 30522
num_classes = 3 # 三个类别的分类

### 训练循环

In [7]:
import paddle.nn.functional as F
from tqdm.auto import tqdm

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_step_num = (epoch-1)*len(dataloader)
    
    model.train()
    for step, batch in enumerate(dataloader, start=1):
        input_ids1 = batch[0]['input_ids'].to(device)
        input_ids2 = batch[1]['input_ids'].to(device)
        labels = batch[2].to(device)

        # 将输入传递给模型
        outputs = model(input_ids1, input_ids2)
        loss = loss_fn(outputs, labels)

        optimizer.clear_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.numpy()
        progress_bar.set_description(f'loss: {total_loss / (finish_step_num + step):>7f}')
        progress_bar.update(1)

    return total_loss


### 测试循环

In [8]:
# 加权交叉熵损失函数
class WeightedCrossEntropyLoss(nn.Layer):
    def __init__(self):
        super(WeightedCrossEntropyLoss, self).__init__()

    def forward(self, logits, labels, weights):
        loss = paddle.nn.functional.cross_entropy(logits, labels, reduction='none')
        weighted_loss = loss * weights
        return paddle.mean(weighted_loss)

In [9]:
import numpy as np

def test_loop(dataloader, model, mode='Valid'):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    all_sample_weights = []

    with paddle.no_grad():
        for batch in dataloader:
            try:
                input_ids1 = batch[0]['input_ids'].to(device)
                input_ids2 = batch[1]['input_ids'].to(device)
                labels = batch[2].to(device)
                weights = batch[3].to(device)

                outputs = model(input_ids1, input_ids2)
                loss_fn = WeightedCrossEntropyLoss()
                loss = loss_fn(outputs, labels, weights)         

                total_loss += loss.numpy()

                all_predictions.append(outputs.argmax(axis=1).cpu().numpy())
                all_labels.append(labels.cpu().numpy())
                all_sample_weights.append(weights.cpu().numpy())
            except Exception as e:
                print("Error encountered:", e)
                print("Problematic batch:", batch)
                break 

    all_predictions = np.concatenate(all_predictions)
    all_labels = np.concatenate(all_labels)
    all_sample_weights = np.concatenate(all_sample_weights)

    avg_loss = total_loss / len(dataloader)

    return avg_loss, all_predictions, all_labels, all_sample_weights


### 加权准确率函数

In [10]:
def weighted_categorization_accuracy(predictions, labels, sample_weights):
    
    correct_predictions = (predictions == labels).astype(int)
    weighted_correct_predictions = correct_predictions * sample_weights
    
    weighted_accuracy = np.sum(weighted_correct_predictions) / np.sum(sample_weights)
    
    return weighted_accuracy

# 超参数调参

#### 设置调参数据集

In [None]:
train_opt = pd.read_csv('./data/train_cleaned.csv')
test_opt = pd.read_csv('./data/test_cleaned.csv')

train_opt = train_opt.sample(frac=0.01, random_state=42).reset_index(drop=True)
test_opt = test_opt.sample(frac=0.01, random_state=42).reset_index(drop=True)

In [None]:
train_dict_opt = AFQMC(train_opt)
test_dict_opt = AFQMC(test_opt)

#### 定义搜索空间

In [None]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    'embed_dim': [300, 128],
    'hidden_dim': [512],
    'num_lstm_layers': [2, 3],
    'dropout_rate': [0.1],
    'learning_rate': [1e-3, 1e-4, 1e-5],
    'batch_size': [4]
}

grid = ParameterGrid(param_grid)

#### 调参执行

In [None]:
from sklearn.model_selection import ParameterGrid
from paddle.optimizer import AdamW

train_dataloader_opt = DataLoader(train_dict_opt, batch_size=param_grid['batch_size'], collate_fn=collote_fn)
valid_dataloader_opt = DataLoader(test_dict_opt, batch_size=param_grid['batch_size'], shuffle=False, collate_fn=collote_fn)

best_params = None
best_val_loss = float('inf')
loss_fn = nn.CrossEntropyLoss()

total_loss = 0.
patience = 2
trigger_times = 0
best_acc = 0.
time = 0
para_training_steps = len(train_dataloader_opt)
warmup_steps = int(0.5 * para_training_steps)

for params in grid:
    time += 1
    print(f'Execution {time}/{len(grid)}')
    print(f"Training with params: {params}")

    model = SiameseNetwork(
        input_dim=input_dim,
        embed_dim=params['embed_dim'],
        hidden_dim=params['hidden_dim'],
        num_classes=num_classes,
        num_lstm_layers=params['num_lstm_layers'],
        dropout_rate=params['dropout_rate']
    )

#     lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
#     learning_rate=params['learning_rate'],
#     decay_steps=para_training_steps,
#     end_lr=0,
#     power=1.0, 
#     cycle=False 
# )

    lr_scheduler = paddle.optimizer.lr.LinearWarmup(
        learning_rate=params['learning_rate'], 
        warmup_steps=warmup_steps, 
        start_lr=0, 
        end_lr=params['learning_rate'], 
        verbose=False
    )

    optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=params['learning_rate'])

    train_loop(train_dataloader_opt, model, loss_fn, optimizer, lr_scheduler, 10, total_loss)

    val_loss, val_predictions, val_labels, val_sample_weights = test_loop(valid_dataloader_opt, model, mode='Valid')
    valid_acc = weighted_categorization_accuracy(val_predictions, val_labels, val_sample_weights)
    print(f"Weighted Categorization Accuracy: {(100*valid_acc):0.2f}%")
    print(f"Validation Loss: {val_loss:>7f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_params = params

print(f"Best Params: {best_params}")
print(f"Best Validation Loss: {best_val_loss}")


Execution 1/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 0.001, 'num_lstm_layers': 2}


loss: 0.102321: 100%|██████████| 1582/1582 [06:43<00:00,  3.92it/s]


Weighted Categorization Accuracy: 5.84%
Validation Loss: 0.081480
Execution 2/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 0.001, 'num_lstm_layers': 3}


loss: 0.132562: 100%|██████████| 1582/1582 [09:48<00:00,  2.69it/s]


Weighted Categorization Accuracy: 5.34%
Validation Loss: 0.077473
Execution 3/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 0.0001, 'num_lstm_layers': 2}


loss: 0.116332: 100%|██████████| 1582/1582 [05:53<00:00,  4.48it/s]


Weighted Categorization Accuracy: 13.78%
Validation Loss: 0.075580
Execution 4/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 0.0001, 'num_lstm_layers': 3}


loss: 0.122569: 100%|██████████| 1582/1582 [08:54<00:00,  2.96it/s]


Weighted Categorization Accuracy: 58.96%
Validation Loss: 0.066441
Execution 5/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 1e-05, 'num_lstm_layers': 2}


loss: 0.123968: 100%|██████████| 1582/1582 [06:07<00:00,  4.30it/s]


Weighted Categorization Accuracy: 26.84%
Validation Loss: 0.073065
Execution 6/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 300, 'hidden_dim': 512, 'learning_rate': 1e-05, 'num_lstm_layers': 3}


loss: 0.093652: 100%|██████████| 1582/1582 [08:54<00:00,  2.96it/s]


Weighted Categorization Accuracy: 58.96%
Validation Loss: 0.070570
Execution 7/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 0.001, 'num_lstm_layers': 2}


loss: 0.117943: 100%|██████████| 1582/1582 [05:19<00:00,  4.96it/s]


Weighted Categorization Accuracy: 58.13%
Validation Loss: 0.067897
Execution 8/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 0.001, 'num_lstm_layers': 3}


loss: 0.119483: 100%|██████████| 1582/1582 [08:14<00:00,  3.20it/s]


Weighted Categorization Accuracy: 5.34%
Validation Loss: 0.076535
Execution 9/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 0.0001, 'num_lstm_layers': 2}


loss: 0.128077: 100%|██████████| 1582/1582 [05:13<00:00,  5.05it/s]


Weighted Categorization Accuracy: 55.90%
Validation Loss: 0.070385
Execution 10/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 0.0001, 'num_lstm_layers': 3}


loss: 0.100883: 100%|██████████| 1582/1582 [08:27<00:00,  3.12it/s]


Weighted Categorization Accuracy: 58.96%
Validation Loss: 0.065413
Execution 11/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 1e-05, 'num_lstm_layers': 2}


loss: 0.116839: 100%|██████████| 1582/1582 [05:42<00:00,  4.62it/s]


Weighted Categorization Accuracy: 12.89%
Validation Loss: 0.074659
Execution 12/12
Training with params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 1e-05, 'num_lstm_layers': 3}


loss: 0.131855: 100%|██████████| 1582/1582 [08:19<00:00,  3.17it/s]


Weighted Categorization Accuracy: 35.70%
Validation Loss: 0.074092
Best Params: {'batch_size': 4, 'dropout_rate': 0.1, 'embed_dim': 128, 'hidden_dim': 512, 'learning_rate': 0.0001, 'num_lstm_layers': 3}
Best Validation Loss: 0.06541291740310343


# 训练模型

### 初始化

##### 超参数调参结果输入

In [47]:
best_params

{'batch_size': 4,
 'dropout_rate': 0.1,
 'embed_dim': 128,
 'hidden_dim': 512,
 'learning_rate': 0.0001,
 'num_lstm_layers': 3}

##### 向量化

In [None]:
train_dataloader = DataLoader(train_dict, batch_size=best_params['batch_size'], collate_fn=collote_fn)
valid_dataloader= DataLoader(test_dict, batch_size=best_params['batch_size'], shuffle=False, collate_fn=collote_fn)


batch_X1, batch_X2, batch_y = next(iter(train_dataloader))
print('batch_X1 shape:', {k: v.shape for k, v in batch_X1.items()})
print('batch_X2 shape:', {k: v.shape for k, v in batch_X2.items()})
print('batch_y shape:', batch_y.shape)

In [None]:
batch_X1_test, batch_X2_test, batch_y_test, batch_w = next(iter(valid_dataloader))
print('batch_X1_test shape:', {k: v.shape for k, v in batch_X1_test.items()})
print('batch_X2_test shape:', {k: v.shape for k, v in batch_X2_test.items()})
print('batch_y shape:', batch_y_test.shape)
print(batch_w.shape)

##### 模型初始化

In [41]:


model = SiameseNetwork(
        input_dim=input_dim,
        embed_dim=best_params['embed_dim'],
        hidden_dim=best_params['hidden_dim'],
        num_classes=num_classes,
        num_lstm_layers=best_params['num_lstm_layers'],
        dropout_rate=best_params['dropout_rate']
    )

print(model)

SiameseNetwork(
  (embedding): Embedding(30522, 128, sparse=False)
  (lstm): LSTM(128, 512, num_layers=3
    (0): BiRNN(
      (cell_fw): LSTMCell(128, 512)
      (cell_bw): LSTMCell(128, 512)
    )
    (1): BiRNN(
      (cell_fw): LSTMCell(1024, 512)
      (cell_bw): LSTMCell(1024, 512)
    )
    (2): BiRNN(
      (cell_fw): LSTMCell(1024, 512)
      (cell_bw): LSTMCell(1024, 512)
    )
  )
  (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)
  (fc1): Linear(in_features=2048, out_features=1024, dtype=float32)
  (bn1): BatchNorm1D(num_features=1024, momentum=0.9, epsilon=1e-05, data_format=NCL)
  (relu): ReLU()
  (fc2): Linear(in_features=1024, out_features=3, dtype=float32)
)


##### 输出结构

In [14]:
src1 = batch_X1['input_ids']  # 提取需要的 tensor
src2 = batch_X2['input_ids']  # 提取需要的 tensor

outputs = model(src1, src2)

print(outputs.shape)

[4, 3]




##### 训练步骤次数

In [15]:
import paddle
from paddle.optimizer import AdamW
import paddle.optimizer as optim
from paddle.optimizer.lr import LinearWarmup

optimizer = AdamW(learning_rate=3e-5, parameters=model.parameters())
epochs = 10
num_training_steps = epochs * len(train_dataloader)

lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
    learning_rate=3e-5,
    decay_steps=num_training_steps,
    end_lr=0,
    power=1.0, 
    cycle=False 
)

optimizer = optim.Adam(parameters=model.parameters(), learning_rate=lr_scheduler)

print(num_training_steps)

39550


##### 导入最佳模型权重

In [16]:
model.set_state_dict(paddle.load(f'./paddle_weights/epoch_1_valid_acc_62.9_paddle_weights.pdparams'))
model.eval()

### 训练执行

In [17]:
import paddle
import paddle.nn as nn

learning_rate = 1e-5
epoch_num = 10

loss_fn = nn.CrossEntropyLoss()
optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=learning_rate)

warmup_steps = int(0.05 * epoch_num * len(train_dataloader))

lr_scheduler = paddle.optimizer.lr.LinearWarmup(
    learning_rate=learning_rate, 
    warmup_steps=warmup_steps, 
    start_lr=0, 
    end_lr=learning_rate, 
    verbose=False
)

total_loss = 0.
patience = 2
trigger_times = 0
best_acc = 0.
best_loss = float("inf")

try:
    for t in range(epoch_num):
        print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
        total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)

        val_loss, val_predictions, val_labels, val_sample_weights = test_loop(valid_dataloader, model, mode='Valid')

        valid_acc = weighted_categorization_accuracy(val_predictions, val_labels, val_sample_weights)
        print(f"Weighted Categorization Accuracy: {(100*valid_acc):0.2f}%")
        print(f"Validation Loss: {val_loss:>7f}")

        if valid_acc > best_acc:
            best_acc = valid_acc
            print('saving new weights...\n')
            paddle.save(model.state_dict(), f'./paddle_weights/epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_paddle_weights.pdparams')
            model.set_state_dict(paddle.load(f'./paddle_weights/epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_paddle_weights.pdparams'))
            
        # 早停逻辑
        if val_loss < best_loss:
            best_loss = val_loss
            trigger_times = 0
        else:
            trigger_times += 1

        if trigger_times >= patience:
            print("Early stopping!")
            break

except Exception as e:
    print(f"Error encountered: {e}")
    paddle.save(model.state_dict(), './paddle_weights/paddle_weights_on_error.pdparams')

print("Done!")


Epoch 1/10
-------------------------------


loss: 0.649945: 100%|██████████| 3955/3955 [11:54<00:00,  5.53it/s]


Weighted Categorization Accuracy: 57.58%
Validation Loss: 0.058873
saving new weights...

Epoch 2/10
-------------------------------


loss: 0.632545: 100%|██████████| 3955/3955 [12:02<00:00,  5.48it/s]


Weighted Categorization Accuracy: 56.02%
Validation Loss: 0.057322
Epoch 3/10
-------------------------------


loss: 0.615677: 100%|██████████| 3955/3955 [12:04<00:00,  5.46it/s]


Weighted Categorization Accuracy: 53.16%
Validation Loss: 0.059807
Epoch 4/10
-------------------------------


loss: 0.598518: 100%|██████████| 3955/3955 [12:07<00:00,  5.44it/s]


Weighted Categorization Accuracy: 56.92%
Validation Loss: 0.059589
Early stopping!
Done!


### 加权准确率计算

In [None]:

val_loss, val_predictions, val_labels, val_sample_weights = test_loop(valid_dataloader, model, mode='Test')

valid_acc = weighted_categorization_accuracy(val_predictions, val_labels, val_sample_weights)

print(f"Weighted Categorization Accuracy: {(100*valid_acc):0.2f}%")