In [61]:
import pandas as pd 
import numpy as np 
import json, time 
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_cosine_schedule_with_warmup
from transformers import BertTokenizer, BertModel, BertConfig, get_cosine_schedule_with_warmup
from transformers import AdamW
import warnings
warnings.filterwarnings('ignore')


bert_path = "/root/yunzhi/retrieval/bert_model"    # 该文件夹下存放三个文件（'vocab.txt', 'pytorch_model.bin', 'config.json'）
tokenizer = BertTokenizer.from_pretrained(bert_path)   # 初始化分词器

In [62]:
from typing import Union
def data_processor(data_path: str)->Union[list, list, list, list]:
    """
    :param data: 原始数据
    :return: 返回处理后的数据, 包括input_ids, input_mask, token_type_id, label
    """
    # 输出预处理
    input_ids, input_masks, input_types = [], [], []
    label = []
    maxlen = 30

    with open('./news_title_dataset.csv', encoding='utf-8') as f:

        for i, line in tqdm(enumerate(f)):
            title, y = line.strip().split('\t')

            # encode_plus会输出一个字典，分别为'input_ids', 'token_type_ids', 'attention_mask'对应的编码
            # 根据参数会短则补齐，长则切断
            encoder_dict = tokenizer.encode_plus(
                text=title,
                max_length=maxlen,
                padding='max_length',
                truncation=True
            )
            input_ids.append(encoder_dict['input_ids'])
            input_masks.append(encoder_dict['attention_mask'])
            input_types.append(encoder_dict['token_type_ids'])
            label.append(int(y))

    input_ids, input_types, input_masks, label = np.array(input_ids), np.array(input_types), np.array(input_masks), np.array(label)
    print(input_ids.shape, input_types.shape, input_masks.shape, label.shape)
    return input_ids, input_types, input_masks, label

input_ids, input_types, input_masks, label = data_processor('./news_title_dataset.csv')

100000it [00:22, 4405.01it/s]

(100000, 30) (100000, 30) (100000, 30) (100000,)





In [63]:
# 切分训练集、验证集、测试集

# 随机打乱索引
idxes = np.arange(input_ids.shape[0])
np.random.seed(1234)
np.random.shuffle(idexs)
print(idxes.shape, idxes[:10])

input_ids_train, input_ids_valid, input_ids_test= input_ids[idexs[:80000]], input_ids[idexs[80000:90000]], input_ids[idexs[90000:]]
input_types_train, input_types_valid, input_types_test = input_types[idexs[:80000]], input_types[idexs[80000:90000]], input_types[idexs[90000:]]
input_masks_train, input_masks_valid, input_masks_test = input_masks[idexs[:80000]], input_masks[idexs[80000:90000]], input_masks[idexs[90000:]]
y_train, y_valid, y_test = label[idexs[:80000]], label[idexs[80000:90000]], label[idexs[90000:]]

print(input_ids_train.shape, y_train.shape, input_ids_valid.shape, y_valid.shape, 
      input_ids_test.shape, y_test.shape)

(100000,) [0 1 2 3 4 5 6 7 8 9]
(80000, 30) (80000,) (10000, 30) (10000,) (10000, 30) (10000,)


In [77]:
# 加载到pytorch的DataLoder中
BATCH_SIZE = 128

# 训练集
trian_dataset = TensorDataset(torch.LongTensor(input_ids_train),
                             torch.LongTensor(input_masks_train),  
                             torch.LongTensor(input_types_train), 
                             torch.LongTensor(y_train))
train_sampler = RandomSampler(trian_dataset)
train_dataloader = DataLoader(trian_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# 验证集
valid_dataset = TensorDataset(torch.LongTensor(input_ids_valid), 
                             torch.LongTensor(input_masks_valid), 
                             torch.LongTensor(input_types_valid), 
                             torch.LongTensor(y_valid))

valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=BATCH_SIZE)

# 测试集
test_dataset = TensorDataset(torch.LongTensor(input_ids_test), 
                            torch.LongTensor(input_masks_test), 
                            torch.LongTensor(input_types_test))
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [65]:
# 定义模型

class Bert_Model(nn.Module):
    def __init__(self,  bert_path, num_classes=10):
        super(Bert_Model, self).__init__()
        self.config = BertConfig.from_pretrained(bert_path) # 导入模型超参数
        self.bert = BertModel.from_pretrained(bert_path) # 导入预训练模型权重 
        self.fc = nn.Linear(self.config.hidden_size, num_classes) # 分类器

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        out_pool = outputs[1] # 池化后的输出 [bs, config.hidden_size]
        logit = self.fc(out_pool) # 分类器
        return logit


In [72]:
# 实例化bert模型

def get_parameter_number(model):
    # 打印模型参数数量，包括总训练的和不可训练的
    total_num = sum(p.numel() for p in model.parameters())
    trainable_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return 'Total parameters:{}, Trainable parameters:{}'.format(total_num, trainable_num)

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
EPOCHS = 5
model = Bert_Model(bert_path).to(DEVICE)
print(get_parameter_number(model))

Total parameters:102275338, Trainable parameters:102275338


In [67]:
# 定义优化器
# 学习率先线性warmup一个epoch，然后cosine式下降。
# 这里给个小提示，一定要加warmup（学习率从0慢慢升上去），如果把warmup去掉，可能收敛不了。
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=len(train_dataloader), 
                                            num_training_steps=len(train_dataloader)*EPOCHS)

In [76]:
# 定义训练函数和验证测试函数
def evaluate(model, data_loader, device):
    model.eval()
    val_true, val_pred = [], []
    with torch.no_grad():
        for idx, (ids, att, tpe, y) in (enumerate(data_loader)):
            y_pred = model(ids.to(device), att.to(device), tpe.to(device))
            y_pred = torch.argmax(y_pred, dim=1).detach().cpu().numpy().tolist()
            val_pred.extend(y_pred)
            val_true.extend(y.squeeze().cpu().numpy().tolist())
    
    return accuracy_score(val_true, val_pred) # 返回accuracy


# 测试集没有标签，需要预测提交
def predict(model, data_loader, device):
    model.eval()
    val_pred = []
    with torch.no_grad():
        for idx, (ids, att, tpe) in (enumerate(data_loader)):
            y_pred = model(ids.to(device), att.to(device), tpe.to(device))
            y_pred = torch.argmax(y_pred, dim=1).detach().cpu().numpy().tolist()
            val_pred.extend(y_pred)
    return val_pred


def trian_and_eval(model, train_loader, valid_loader, 
                  optimizer, scheduler, device, epoch):    
    best_acc = 0.0
    patience = 0
    criterion = nn.CrossEntropyLoss()
    for i in range(epoch):
        """训练模型"""
        start = time.time()
        model.train()
        print("***** Running training epoch {} *****".format(i+1))
        train_loss_sum = 0.0
        for idx, (ids, att, tpe, y) in (enumerate(train_loader)):
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            y_pred = model(ids, att, tpe)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step() # 学习率变化

            train_loss_sum += loss.item()
            if (idx + 1) % (len(train_loader)//5) ==0: # 只打印五次结果
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f}s".format(
                    i + 1, idx + 1, len(train_loader), train_loss_sum/(idx + 1), time.time() - start))
                
        """验证模型"""
        model.eval()
        acc = evaluate(model, valid_loader, device) # 验证模型的性能

        #保存最优模型
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), 'best_bert.pth')
        
        print("current acc is {:.4f}, best acc is {:.4f}".format(acc, best_acc))
        print("time costed = {}s \n".format(round(time.time() - start, 5)))


In [78]:
# 训练和验证模型
trian_and_eval(model, train_dataloader, valid_dataloader, optimizer, scheduler, DEVICE, EPOCHS)

***** Running training epoch 1 *****
Epoch 0001 | Step 0125/0625 | Loss 2.4711 | Time 16.5549s
Epoch 0001 | Step 0250/0625 | Loss 2.4701 | Time 33.2258s
Epoch 0001 | Step 0375/0625 | Loss 2.4698 | Time 50.0695s
Epoch 0001 | Step 0500/0625 | Loss 2.4695 | Time 66.9699s
Epoch 0001 | Step 0625/0625 | Loss 2.4682 | Time 83.9663s
current acc is 0.0772, best acc is 0.0772
time costed = 88.36554s 

***** Running training epoch 2 *****
Epoch 0002 | Step 0125/0625 | Loss 2.4673 | Time 17.0088s
Epoch 0002 | Step 0250/0625 | Loss 2.4657 | Time 34.0161s
Epoch 0002 | Step 0375/0625 | Loss 2.4663 | Time 51.0031s
Epoch 0002 | Step 0500/0625 | Loss 2.4680 | Time 68.1117s
Epoch 0002 | Step 0625/0625 | Loss 2.4690 | Time 85.1929s
current acc is 0.0772, best acc is 0.0772
time costed = 89.05188s 

***** Running training epoch 3 *****
Epoch 0003 | Step 0125/0625 | Loss 2.4696 | Time 17.0297s
Epoch 0003 | Step 0250/0625 | Loss 2.4666 | Time 34.0879s
Epoch 0003 | Step 0375/0625 | Loss 2.4702 | Time 51.0905s

In [80]:
# 加载最优权重对测试集进行测试
model.load_state_dict(torch.load('best_bert.pth'))
pred_test = predict(model, test_dataloader, DEVICE)
print("\n Test Accuracy = {} \n".format(accuracy_score(y_test, pred_test)))
print(classification_report(y_test, pred_test, digits=4))


 Test Accuracy = 0.0816 

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      1013
           1     0.1068    0.2268    0.1452      1036
           2     0.0631    0.3388    0.1063      1039
           3     0.0856    0.0737    0.0792      1031
           4     0.1150    0.1582    0.1332       967
           5     0.0000    0.0000    0.0000       997
           6     0.0000    0.0000    0.0000       985
           7     0.0000    0.0000    0.0000       967
           8     0.0000    0.0000    0.0000      1031
           9     0.0000    0.0000    0.0000       934

    accuracy                         0.0816     10000
   macro avg     0.0370    0.0798    0.0464     10000
weighted avg     0.0376    0.0816    0.0471     10000

