In [1]:
# -*- coding: utf-8 -*-

import torch
import os
import random
import numpy as np
import logging
from config import Config
from model import TorchModel, choose_optimizer
from evaluate import Evaluator
from loader import load_data
import torch
import torch.nn as nn
from transformers import BertModel
from torch.optim import Adam, SGD
from torchcrf import CRF
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Main training program: configure the run, prepare the output directory and
# load the training data.

# NOTE: `config` is the same object as the imported `Config` (no copy is made),
# so the overrides below are visible to every other user of `Config`.
config = Config
config["model_type"] = 'bert'
config["bert_path"] = '/mnt/workspace/.cache/modelscope/hub/langboat/mengzi-bert-base'
config["use_crf"] = True

print(config)
# makedirs(..., exist_ok=True) replaces the isdir()/mkdir() pair: it also creates
# missing parent directories and is a no-op when the directory already exists.
os.makedirs(config["model_path"], exist_ok=True)
# Load the training data
train_data = load_data(config["train_data_path"], config)

"""
建立网络模型结构
"""
class BertMidLayer(nn.Module):
    """BERT wrapper whose output is the sum of the last two hidden-state layers."""

    def __init__(self, config):
        """Load the pretrained BERT stored at ``config["bert_path"]``.

        The model is loaded with ``return_dict=False`` (tuple outputs) and its
        config is switched to also emit the per-layer hidden states.
        """
        super().__init__()
        bert = BertModel.from_pretrained(config["bert_path"], return_dict=False)
        bert.config.output_hidden_states = True
        self.bert = bert

    def forward(self, x):
        """Return the element-wise sum of the last two entries of the hidden-state output."""
        # Element 2 of the tuple output is the collection of hidden states.
        # Original author's note: (13, batch, len, hidden) - layer count presumably
        # depends on the checkpoint; confirm.
        hidden_states = self.bert(x)[2]
        second_last, last = hidden_states[-2], hidden_states[-1]
        return second_last + last
    
class TorchModel(nn.Module):
    """Sequence-labelling model: encoder + per-token linear classifier + optional CRF.

    The encoder is chosen by ``config["model_type"]``:
      * ``'bert'``           - a pretrained ``BertModel`` loaded from ``config["bert_path"]``
      * ``'bert_mid_layer'`` - ``BertMidLayer`` (sum of BERT's last two hidden layers)
      * anything else        - embedding followed by a bidirectional LSTM
    """

    def __init__(self, config):
        """Build all sub-modules from the values found in ``config``.

        Keys read: ``hidden_size``, ``vocab_size``, ``class_num``, ``num_layers``,
        ``use_crf``, ``model_type`` and, for the BERT variants, ``bert_path``.
        """
        super(TorchModel, self).__init__()
        hidden_size = config["hidden_size"]
        vocab_size = config["vocab_size"] + 1
        class_num = config["class_num"]
        num_layers = config["num_layers"]
        # The embedding / BiLSTM stack is created for every model type (as before),
        # even though forward() only uses it when no BERT encoder is selected.
        # Keeping it means the parameter names in state_dict() do not change.
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.layer = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True, num_layers=num_layers)
        # Default classifier input is 2 * hidden_size because the LSTM is bidirectional.
        self.classify = nn.Linear(hidden_size * 2, class_num)
        self.crf_layer = CRF(class_num, batch_first=True)
        self.use_crf = config["use_crf"]
        if config["model_type"] == 'bert':
            self.use_bert = True
            self.encoder = BertModel.from_pretrained(config["bert_path"], return_dict=False)
            # The classifier must match BERT's hidden size, not config["hidden_size"].
            self.classify = nn.Linear(self.encoder.config.hidden_size, class_num)
        elif config["model_type"] == "bert_mid_layer":
            self.use_bert = True
            self.encoder = BertMidLayer(config)
            self.classify = nn.Linear(self.encoder.bert.config.hidden_size, class_num)
        else:
            # The embedding / LSTM / classifier built above are already the right ones.
            self.use_bert = False
        # Cross-entropy loss for the non-CRF path; label -1 is ignored.
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    def forward(self, x, target=None):
        """Return the loss when ``target`` is given, otherwise the predictions.

        Args:
            x: input passed to the encoder (token ids for the embedding/BERT paths).
            target: optional label tensor; entries equal to -1 are treated as
                ignored positions (CRF mask / ``ignore_index``).

        Returns:
            With ``target``: the negated ``crf_layer`` output (``reduction="mean"``)
            when ``use_crf`` is set, else the cross-entropy loss.
            Without ``target``: ``crf_layer.decode(...)`` when ``use_crf`` is set,
            else the raw classifier output.
        """
        if self.use_bert:
            encoded = self.encoder(x)
            # BertModel(return_dict=False) returns a tuple whose first element is the
            # sequence output, while BertMidLayer returns a single tensor. Unpacking
            # with `x, _ = ...` broke the latter, so pick the tensor explicitly.
            x = encoded[0] if isinstance(encoded, (tuple, list)) else encoded
        else:
            x = self.embedding(x)
            x, _ = self.layer(x)
        predict = self.classify(x)

        if target is not None:
            if self.use_crf:
                mask = target.gt(-1)
                return - self.crf_layer(predict, target, mask, reduction="mean")
            # Flatten to (number, class_num) and (number) for cross-entropy.
            return self.loss(predict.view(-1, predict.shape[-1]), target.view(-1))
        if self.use_crf:
            return self.crf_layer.decode(predict)
        return predict
# Build the model
model = TorchModel(config)

# Move the model to the GPU when one is available
cuda_flag = torch.cuda.is_available()
if cuda_flag:
    logger.info("gpu可以使用，迁移模型至gpu")
    model = model.cuda()
# Build the optimizer
optimizer = choose_optimizer(config, model)
# Build the evaluation helper
evaluator = Evaluator(config, model, logger)
# Log the batch loss about every half epoch. max(1, ...) keeps the interval
# non-zero: with fewer than two batches the previous
# `index % int(len(train_data) / 2)` raised ZeroDivisionError.
log_interval = max(1, len(train_data) // 2)
# Training loop (epochs are numbered from 1)
for epoch in range(1, config["epoch"] + 1):
    model.train()
    logger.info("epoch %d begin" % epoch)
    train_loss = []
    for index, batch_data in enumerate(train_data):
        optimizer.zero_grad()
        if cuda_flag:
            batch_data = [d.cuda() for d in batch_data]
        # Adjust this unpacking if the loader yields more inputs/outputs.
        input_id, labels = batch_data
        loss = model(input_id, labels)
        loss.backward()
        optimizer.step()
        # Convert once to a Python float; used for both the running list and the log line.
        loss_value = loss.item()
        train_loss.append(loss_value)
        if index % log_interval == 0:
            logger.info("batch loss %f" % loss_value)
    logger.info("epoch average loss: %f" % np.mean(train_loss))
    evaluator.eval(epoch)
# Named after the configured (= last) epoch; does not rely on the loop variable,
# which would be undefined if config["epoch"] were 0.
model_path = os.path.join(config["model_path"], "epoch_%d.pth" % config["epoch"])
# Saving is commented out here; model_path stays defined for later use.
# torch.save(model.state_dict(), model_path)



{'model_path': 'model_output', 'schema_path': 'ner_data/schema.json', 'train_data_path': 'ner_data/train', 'valid_data_path': 'ner_data/test', 'vocab_path': 'chars.txt', 'max_length': 100, 'hidden_size': 256, 'num_layers': 2, 'epoch': 50, 'batch_size': 32, 'optimizer': 'adam', 'learning_rate': 0.0001, 'use_crf': True, 'class_num': 9, 'bert_path': '/mnt/workspace/.cache/modelscope/hub/langboat/mengzi-bert-base', 'model_type': 'bert'}


2024-08-13 22:21:51,348 - __main__ - INFO - gpu可以使用，迁移模型至gpu
2024-08-13 22:21:55,104 - __main__ - INFO - epoch 1 begin
2024-08-13 22:21:55,338 - __main__ - INFO - batch loss 98.997070
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
2024-08-13 22:22:00,552 - __main__ - INFO - batch loss 35.471371
2024-08-13 22:22:05,641 - __main__ - INFO - batch loss 59.361069
2024-08-13 22:22:05,642 - __main__ - INFO - epoch average loss: 31.886862
2024-08-13 22:22:05,643 - __main__ - INFO - 开始测试第1轮模型效果：
  score = torch.where(mask[i].unsqueeze(1), next_score, score)
2024-08-13 22:22:06,959 - __main__ - INFO - PERSON类实体，准确率：0.000000, 召回率: 0.000000, F1: 0.000000
2024-08-13 22:22:06,960 - __main__ - INFO - LOCATION类实体，准确率：0.000000, 召回率: 0.000000, F1: 0.000000
2024-08-13 22:22:06,960 - __main__ - INFO - TIME类实体，准确率：0.000000, 召回率: 0.000000, F1: 0.000000
202

In [2]:
# Write the model's state_dict to model_path (the path built at the end of the training cell).
torch.save(model.state_dict(), model_path)

In [27]:
# Put the model into evaluation mode before the inference cells below are run.
model.eval()
import json
import re
import os
import torch
import random
import jieba
import numpy as np
from torch.utils.data import Dataset, DataLoader
class data2vec:
    """Encode raw text into fixed-length id tensors using the vocabulary named in ``config``."""

    def __init__(self, config):
        """Load the vocabulary and schema referenced by ``config``.

        Keys read: ``vocab_path``, ``schema_path`` (and ``max_length`` later, when padding).
        Side effect: writes ``config["vocab_size"]``.
        """
        self.config = config
        self.vocab = load_vocab(config["vocab_path"])
        # Publishes the vocabulary size into the shared config dict.
        self.config["vocab_size"] = len(self.vocab)
        self.sentences = []
        # __len__ / __getitem__ read self.data, which nothing else in this class
        # assigns; start it empty so they no longer raise AttributeError.
        self.data = []
        self.schema = self.load_schema(config["schema_path"])
        # self.load()  # left disabled: no load() method is defined on this class

    def make_load(self, sentenece):
        """Encode one sentence (padded/truncated) and return it as a ``torch.LongTensor``."""
        input_ids = self.encode_sentence(sentenece)
        return torch.LongTensor(input_ids)

    def encode_sentence(self, text, padding=True):
        """Map ``text`` to a list of vocabulary ids.

        Tokens are jieba words when ``config["vocab_path"] == "words.txt"``,
        otherwise single characters. Tokens missing from the vocabulary map to
        the id of ``"[UNK]"``. When ``padding`` is true the result is padded or
        truncated to ``config["max_length"]``.
        """
        if self.config["vocab_path"] == "words.txt":
            tokens = jieba.cut(text)
        else:
            tokens = text
        input_id = [self.vocab.get(token, self.vocab["[UNK]"]) for token in tokens]
        if padding:
            input_id = self.padding(input_id)
        return input_id

    def padding(self, input_id, pad_token=0):
        """Truncate or right-pad ``input_id`` to ``config["max_length"]`` so sequences can be batched."""
        max_length = self.config["max_length"]
        input_id = input_id[:max_length]
        return input_id + [pad_token] * (max_length - len(input_id))

    def __len__(self):
        """Number of items currently held in ``self.data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the item of ``self.data`` at ``index``."""
        return self.data[index]

    def load_schema(self, path):
        """Read and return the JSON document stored at ``path`` (UTF-8)."""
        with open(path, encoding="utf8") as f:
            return json.load(f)

def load_vocab(vocab_path):
    """Load a character or word vocabulary file into a ``token -> id`` dict.

    The file is read as UTF-8 with one token per line (surrounding whitespace
    stripped). Ids start at 1 because 0 is reserved for the padding position.
    """
    with open(vocab_path, encoding="utf8") as vocab_file:
        return {line.strip(): line_no for line_no, line in enumerate(vocab_file, start=1)}
from collections import defaultdict
def decode(sentence, labels):
    """Extract entity substrings from ``sentence`` given its per-position label ids.

    The first ``len(sentence)`` labels are concatenated into one string and
    scanned with one regex per entity type; every match span is used to slice
    ``sentence``. This alignment relies on each label id being one character
    long when stringified.

    Returns:
        ``defaultdict(list)`` mapping entity type to the matched substrings;
        a type appears as a key only if at least one match was found.
    """
    # (entity type, pattern): one leading label id followed by one or more of a
    # second id - presumably the B-/I- tag ids from the schema; verify against it.
    entity_patterns = (
        ("LOCATION", "(04+)"),
        ("ORGANIZATION", "(15+)"),
        ("PERSON", "(26+)"),
        ("TIME", "(37+)"),
    )
    label_text = "".join(map(str, labels[:len(sentence)]))
    results = defaultdict(list)
    for entity_type, pattern in entity_patterns:
        for match in re.finditer(pattern, label_text):
            start, end = match.span()
            results[entity_type].append(sentence[start:end])
    return results

In [50]:
wordk = "邓小平指出同志邓小平在北京的协和医院，今天"
vec = data2vec(config).make_load(wordk)
# Use the device the model's parameters live on instead of hard-coding 'cuda',
# so this cell also runs on a CPU-only machine.
device = next(model.parameters()).device
# Inference only: no gradients are needed.
with torch.no_grad():
    # Add a batch dimension of 1; `sentence` holds the model output for that batch.
    sentence = model(vec.unsqueeze(dim=0).to(device))
print(sentence)
decode(wordk, sentence[0])

[[2, 6, 6, 8, 8, 8, 8, 2, 6, 6, 8, 0, 4, 8, 8, 8, 8, 8, 8, 3, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]]


defaultdict(list,
            {'LOCATION': ['北京'], 'PERSON': ['邓小平', '邓小平'], 'TIME': ['今天']})