In [1]:
!pip install transformers
import json
import numpy as np

# 读取训练数据 (Read training data)
with open('训练数据路径', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# 读取验证数据 (Read validation data)
with open('验证数据路径', 'r', encoding='utf-8') as f:
    validation_data = json.load(f)

# 将标签转换为0和1整数列表 (Convert labels to a list of integers 0 and 1)
def convert_labels(data):
    for item in data:
        labels = item['label']
        label_list = np.zeros(len(all_labels))  # 假设有all_labels个标签 (Assume there are all_labels labels)
        for label in labels:
            if label in all_labels:
                label_list[all_labels.index(label)] = 1
        item['label'] = label_list.tolist()

# 获取所有可能的标签 (Get all possible labels)
all_labels = []
for item in train_data + validation_data:
    all_labels.extend(item['label'])
all_labels = list(set(all_labels))

# 转换训练数据和验证数据的标签 (Convert labels of training and validation data)
convert_labels(train_data)
convert_labels(validation_data)
print("标签数量: " + str(len(all_labels)))  # (Number of labels)


标签数量: 9


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset

# 加载Bert预训练模型和分词器 (Load Bert pre-trained model and tokenizer)
model_name = 'bert-base-chinese'  # 或 'bert-base-uncased'（英文）(or 'bert-base-uncased' for English)
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(all_labels))

# 自定义数据集类 (Custom dataset class)
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        label = item['label']
        encoded_input = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoded_input['input_ids'].squeeze(),
            'attention_mask': encoded_input['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# 创建训练集和验证集的数据加载器 (Create data loaders for training and validation sets)
train_dataset = CustomDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=len(all_labels), shuffle=True)

validation_dataset = CustomDataset(validation_data)
validation_loader = DataLoader(validation_dataset, batch_size=len(all_labels), shuffle=False)

# 将模型移动到GPU（如果可用） (Move model to GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 定义训练函数 (Define training function)
def train(model, train_loader, validation_loader, device, num_epochs, threshold=0):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    # criterion = torch.nn.BCEWithLogitsLoss()

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        train_results = []
        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            for i in range(len(logits)):
                result = {
                    'logits': logits[i].detach().cpu().numpy().tolist(),
                    'labels': labels[i].detach().cpu().numpy().tolist()
                }
                train_results.append(result)

        # 验证模型 (Validate model)
        model.eval()
        val_loss = 0.0
        predictions = []

        with torch.no_grad():
            valid_results = []
            for batch in validation_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

                val_loss += loss.item()
                predictions.extend(logits.sigmoid().cpu().numpy())
                for i in range(len(logits)):
                    result = {
                        'logits': logits[i].detach().cpu().numpy().tolist(),
                        'labels': labels[i].detach().cpu().numpy().tolist()
                    }
                    valid_results.append(result)
        train_loss /= len(train_loader)
        val_loss /= len(validation_loader)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


        # 将结果保存到json文件 (Save results to JSON file)
        with open('训练集的结果保存路径', 'w', encoding='utf-8') as f:  # ('Path to save training set results')
            json.dump(train_results, f)

        with open('验证集的结果保存路径', 'w', encoding='utf-8') as f:  # ('Path to save validation set results')
            json.dump(valid_results, f)

        # # 根据阈值生成预测 (Generate predictions based on threshold)
        # thresholded_predictions = [[1 if prob >= threshold else 0 for prob in pred] for pred in predictions]
        # print(f'Thresholded Predictions: {thresholded_predictions}')




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:

# 开始训练 (start training)
train(model, train_loader, validation_loader, device, num_epochs=30, threshold=0.5)

Epoch 1/30, Train Loss: 0.3251, Val Loss: 0.2436
Epoch 2/30, Train Loss: 0.2215, Val Loss: 0.1845
Epoch 3/30, Train Loss: 0.1662, Val Loss: 0.1593
Epoch 4/30, Train Loss: 0.1249, Val Loss: 0.1553
Epoch 5/30, Train Loss: 0.1021, Val Loss: 0.1699
Epoch 6/30, Train Loss: 0.0886, Val Loss: 0.1648
Epoch 7/30, Train Loss: 0.0653, Val Loss: 0.1629
Epoch 8/30, Train Loss: 0.0559, Val Loss: 0.1774
Epoch 9/30, Train Loss: 0.0514, Val Loss: 0.1677
Epoch 10/30, Train Loss: 0.0469, Val Loss: 0.1749
Epoch 11/30, Train Loss: 0.0391, Val Loss: 0.1896
Epoch 12/30, Train Loss: 0.0339, Val Loss: 0.1841
Epoch 13/30, Train Loss: 0.0347, Val Loss: 0.1954
Epoch 14/30, Train Loss: 0.0269, Val Loss: 0.1848
Epoch 15/30, Train Loss: 0.0268, Val Loss: 0.1961
Epoch 16/30, Train Loss: 0.0258, Val Loss: 0.1959
Epoch 17/30, Train Loss: 0.0241, Val Loss: 0.2265
Epoch 18/30, Train Loss: 0.0245, Val Loss: 0.2115
Epoch 19/30, Train Loss: 0.0225, Val Loss: 0.2119
Epoch 20/30, Train Loss: 0.0250, Val Loss: 0.1976
Epoch 21/