In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.metrics import mean_squared_error

### 1. 加载Hugging Face数据集

In [7]:
# 加载 STS-B 数据集
dataset = load_dataset("glue", "stsb")

Generating train split: 100%|██████████| 5749/5749 [00:00<00:00, 357638.40 examples/s]
Generating validation split: 100%|██████████| 1500/1500 [00:00<00:00, 696728.24 examples/s]
Generating test split: 100%|██████████| 1379/1379 [00:00<00:00, 766186.94 examples/s]


### 2. 加载Bert Tokenizer

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### 3. 定义数据集的预处理函数

In [9]:
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding="max_length", max_length=128)

### 4. 数据集预处理

In [10]:
# 对训练集和验证集进行tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 设置数据集的格式为 PyTorch tensors
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 5749/5749 [00:01<00:00, 3229.71 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 3172.39 examples/s]
Map: 100%|██████████| 1379/1379 [00:00<00:00, 3637.73 examples/s]


### 5. 准备DataLoader

In [11]:
# 创建训练集和验证集 DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True)
valid_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=16)

### 6. 模型构建

In [12]:
# 加载预训练的BERT模型，指定类别数为1，因为STS-B是回归任务
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# 使用AdamW优化器
optimizer = AdamW(model.parameters(), lr=2e-5)



### 7. 模型训练

In [14]:
# 定义训练过程
def train(model, dataloader, optimizer, num_epochs=3):
    model.train()
    loss_fn = torch.nn.MSELoss()  # 回归任务的损失函数使用均方误差
    
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label'].unsqueeze(1)  # 调整label形状
            
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}")


In [15]:
# 开始训练
train(model, train_dataloader, optimizer, num_epochs=3)


Epoch 1/3, Loss: 1.1832510522670217
Epoch 2/3, Loss: 0.4855095928741826
Epoch 3/3, Loss: 0.32641725952012673


### 7. 模型评估