## 数据集来自清华大学NLP-THUCNews中文语料数据集
http://thuctc.thunlp.org/sendMessage

In [1]:
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import BertTokenizer,BertConfig
from tqdm import tqdm

In [2]:
# 定义数据集类CNewsDataset
class CNewsDataset(Dataset):
    def __init__(self, filename):
        # 数据集初始化  通过手工查阅发现数据集信息标签如下
        self.labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
        self.labels_id = list(range(len(self.labels)))
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.input_ids = []
        self.token_type_ids = []
        self.attention_mask = []
        self.label_id = []
        self.load_data(filename)
    
    def load_data(self, filename):
        # 加载数据
        print('loading data from:', filename)
        with open(filename, 'r', encoding='utf-8') as rf:
            lines = rf.readlines()
        for line in tqdm(lines, ncols=100):
            label, text = line.strip().split('\t')
            label_id = self.labels.index(label)
            token = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=512)
            self.input_ids.append(np.array(token['input_ids']))
            self.token_type_ids.append(np.array(token['token_type_ids']))
            self.attention_mask.append(np.array(token['attention_mask']))
            self.label_id.append(label_id)

    def __getitem__(self, index):
        return self.input_ids[index], self.token_type_ids[index], self.attention_mask[index], self.label_id[index]

    def __len__(self):
        return len(self.input_ids)

## 建立模型

In [3]:
import torch
import torch.nn as nn
from transformers import BertModel

In [4]:
# 定义Bert分类器
class BertClassifier(nn.Module):
    def __init__(self,bert_config,num_labels):
        super().__init__()
        #定义BERT模型
        self.bert = BertModel(config = bert_config)
        #定义分类器 线性分类器
        self.classifier = nn.Linear(bert_config.hidden_size,num_labels)
        
    def forward(self, input_ids, attention_mask, token_type_ids):
        # BERT的输出
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # 取[CLS]位置的pooled output
        pooled = bert_output[1]
        # 分类
        logits = self.classifier(pooled)
        # 返回softmax后结果
        return torch.softmax(logits, dim=1)

In [5]:
# Bert+BiLSTM，用法与BertClassifier一样，可直接在train里面调用
class BertLstmClassifier(nn.Module):
    def __init__(self, bert_config, num_labels):
        super().__init__()
        self.bert = BertModel(config=bert_config)
        self.lstm = nn.LSTM(input_size=bert_config.hidden_size, hidden_size=bert_config.hidden_size, num_layers=2, batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(bert_config.hidden_size*2, num_labels)  # 双向LSTM 需要乘以2
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = outputs.last_hidden_state
        out, _ = self.lstm(last_hidden_state)
        logits = self.classifier(out[:, -1, :]) # 取最后时刻的输出
        return self.softmax(logits)

In [6]:
#定义标签
labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
bert_config = BertConfig.from_pretrained('bert-base-chinese') # 预训练模型

In [7]:
#定义模型
model = BertClassifier(bert_config,len(labels))

In [8]:
model

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## 模型训练

In [11]:
import os
import torch
import torch.nn as nn
from transformers import BertTokenizer, AdamW, BertConfig
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn import metrics

In [12]:
## 设置预训练超参数
batch_size = 4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 10   # 训练轮次
learning_rate = 5e-6   #学习率设置的比较低

In [13]:
device

'cuda'

In [14]:
#获取数据集
train_dataset = CNewsDataset('./cnews/cnews.train.txt')
valid_dataset = CNewsDataset('./cnews/cnews.val.txt')   # 验证集
test_dataset = CNewsDataset('./cnews/cnews.test.txt')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

loading data from: ./cnews/cnews.train.txt


100%|████████████████████████████████████████████████████████| 50000/50000 [04:54<00:00, 169.97it/s]


loading data from: ./cnews/cnews.val.txt


100%|██████████████████████████████████████████████████████████| 5000/5000 [00:27<00:00, 183.82it/s]


loading data from: ./cnews/cnews.test.txt


100%|████████████████████████████████████████████████████████| 10000/10000 [01:02<00:00, 161.16it/s]


In [24]:
train_dataset.labels

['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

In [25]:
#生成每次训练所需的Batch
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [27]:
#读取Transformers-BERT的配置文件
bert_config = BertConfig.from_pretrained('bert-base-chinese')
num_labels = len(train_dataset.labels)

In [31]:
bert_config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [29]:
#初始化模型
BERT = BertClassifier(bert_config,num_labels).to(device)

In [30]:
BERT

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [33]:
#定义优化器和损失函数
optimizer = AdamW(BERT.parameters(),lr=learning_rate)
criterion = nn.CrossEntropyLoss()  # 交叉熵损失

In [34]:
best_f1 = 0  #相当于阈值

In [35]:
import matplotlib.pyplot as plt

In [38]:
train_losses = []    # 训练集上的loss
train_accs = []      # 训练集上的准确率

valid_losses = []    # 验证集上的loss
valid_accs = []      # 验证集上的准确率

In [39]:
for epoch in range(1,epochs+1):
    losses = 0  #损失
    accuracy = 0  # 准确率
    BERT.train()   #训练
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    train_bar = tqdm(train_dataloader, ncols=100)
    for input_ids,token_type_ids,attention_mask,label_id in train_bar:
        #梯度清零
        BERT.zero_grad()
        train_bar.set_description('Epoch %i train' % epoch)
        
        #传入数据 调用 model.forward()
        output = BERT(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            token_type_ids=token_type_ids.to(device))
        
        #计算loss
        loss= criterion(output,label_id.to(device))
        losses += loss.item()
        
        pred_labels = torch.argmax(output,dim=1)  #预测的label
        acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels) #acc
        accuracy += acc
        
        loss.backward()
        optimizer.step()
        train_bar.set_postfix(loss = loss.item(),acc=acc)
    average_loss = losses / len(train_dataloader)
    average_acc = accuracy / len(train_dataloader)
    
    print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)
    
    # 保存训练集的loss和accuracy供后续可视化
    train_losses.append(average_loss)
    train_accs.append(average_acc)
    
    # 验证
    model.eval()
    losses = 0  # 损失
    pred_labels = []
    true_labels = []
    valid_bar = tqdm(valid_dataloader, ncols=100)
    for input_ids, token_type_ids, attention_mask, label_id in valid_bar:
        valid_bar.set_description('Epoch %i valid' % epoch)

        output = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            token_type_ids=token_type_ids.to(device),
        )

        loss = criterion(output, label_id.to(device))
        losses += loss.item()

        pred_label = torch.argmax(output, dim=1)  # 预测出的label
        acc = torch.sum(pred_label == label_id.to(device)).item() / len(pred_label)  # acc
        valid_bar.set_postfix(loss=loss.item(), acc=acc)

        pred_labels.extend(pred_label.cpu().numpy().tolist())
        true_labels.extend(label_id.numpy().tolist())

    average_loss = losses / len(valid_dataloader)
    print('\tLoss:', average_loss)
    # 保存验证集的loss供后续可视化
    valid_losses.append(average_loss)
    
    #分类报告
    report = metrics.classification_report(true_labels, pred_labels, labels=valid_dataset.labels_id,
                                               target_names=valid_dataset.labels)
    print('* Classification Report:')
    print(report)
    
    # f1 用来判断最优模型
    f1 = metrics.f1_score(true_labels, pred_labels, labels=valid_dataset.labels_id, average='micro')
    
    if not os.path.exists('models'):
        os.makedirs('models')
    
    #判断并保存验证集上表现最好的模型
    if f1 > best_f1:
        best_f1 = f1
        print("找到了更好的模型")
        torch.save(BERT.state_dict(),'models/best_model.pkl')

Epoch 1 train: 100%|██████████████████████| 12500/12500 [2:30:12<00:00,  1.39it/s, acc=1, loss=1.46]


	Train ACC: 0.73764 	Loss: 1.730507346458435


Epoch 1 valid:   0%|                                                       | 0/1250 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

## 加载预训练模型

In [None]:
#加载训练好的模型
model.load_state_dict