In [1]:
from collections import defaultdict
from operator import itemgetter
from tqdm import tqdm
import numpy as np
import random
import torch 
import jieba
import json
import os
import pickle as pk

# Pick the training device: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    torch.cuda.set_device(0)

# Hyper-parameters shared by the data pipeline, the model and the training loop.
parameter_copy = dict(
    # Embedding width; also the input width of the model's final linear layer.
    d_model=768,
    # Hidden size of an RNN layer (not read by any code visible in this notebook).
    hid_dim=300,
    # Number of training epochs.
    epoch=20,
    # Number of sampled sequences per batch.
    batch_size=50,
    # Number of layers (not read by any code visible in this notebook).
    n_layers=2,
    # Dropout rate, to limit over-fitting.
    dropout=0.1,
    # Device (CPU or GPU) that tensors are moved to.
    device=device,
    # Learning rate.
    lr=0.001,
    # Optimizer momentum, mainly relevant to stochastic gradient descent.
    momentum=0.99,
    # Length of the window that sample() cuts out of the corpus.
    max_len=50,
)

def build_dataSet(parameter,data_path = '../../dataSet/tagging.txt'):
    """Load a token-per-line tagging corpus and store it, with its vocabularies, in ``parameter``.

    Every useful line of the file holds a token followed by its tag, separated by
    whitespace. Lines with fewer than two fields (for example blank lines) are skipped
    rather than raising ``IndexError``.

    Args:
        parameter: dict that is updated in place with the keys listed under Returns.
        data_path: path of the UTF-8 encoded corpus file.

    Returns:
        The same ``parameter`` dict, extended with:
        ``data_set`` (``{'input': tokens, 'label': tags}`` in file order),
        ``key2ind`` / ``ind2key`` (tag <-> index, in order of first appearance),
        ``word2ind`` / ``ind2word`` (token <-> index, ``<PAD>`` is 0 and ``<UNK>`` is 1),
        ``output_size`` (number of distinct tags) and ``word_size`` (vocabulary size).
    """
    data_set = {'input':[],'label':[]}
    key_table = defaultdict(int)
    vocab_table = defaultdict(int)
    # Register the special tokens first so they receive the lowest indices.
    vocab_table['<PAD>'] = 0
    vocab_table['<UNK>'] = 0
    # The context manager closes the file even if parsing fails.
    with open(data_path,'r',encoding = 'utf-8') as corpus:
        for line in corpus:
            fields = line.strip().split()
            if len(fields) < 2:
                # Blank or malformed line: there is no (token, tag) pair to record.
                continue
            token, tag = fields[0], fields[1]
            data_set['input'].append(token)
            data_set['label'].append(tag)
            vocab_table[token] += 1
            key_table[tag] += 1
    # Dicts keep insertion order, so indices follow order of first appearance.
    key2ind = {tag: index for index, tag in enumerate(key_table)}
    ind2key = {index: tag for tag, index in key2ind.items()}
    word2ind = {token: index for index, token in enumerate(vocab_table)}
    ind2word = {index: token for token, index in word2ind.items()}
    parameter['key2ind'] = key2ind
    parameter['ind2key'] = ind2key
    parameter['word2ind'] = word2ind
    parameter['ind2word'] = ind2word
    parameter['data_set'] = data_set
    parameter['output_size'] = len(key2ind)
    parameter['word_size'] = len(word2ind)
    return parameter

def sample(parameter):
    """Cut a random window of at most ``max_len`` tokens out of the corpus (data augmentation).

    The window edges are moved inwards so that no tagged entity is split: the first
    label must start with ``O``, ``B`` or ``S`` and the last one with ``O``, ``E`` or
    ``S``. Windows that shrink to fewer than two tokens are discarded and a new window
    is drawn.

    Args:
        parameter: dict holding ``data_set`` (``{'input': [...], 'label': [...]}``)
            and ``max_len`` (the window length before trimming).

    Returns:
        ``(tokens, labels)``: two equally long list slices of the corpus.

    Raises:
        ValueError: if the corpus holds fewer than two labelled tokens, in which case
            no window could ever be accepted.

    Note:
        The function keeps drawing until a window is accepted, so it does not return
        if the corpus contains no valid window at all.
    """
    data_set = parameter['data_set']
    labels = data_set['label']
    total = len(labels)
    if total < 2:
        raise ValueError('sample() needs at least two labelled tokens, got %d' % total)
    window = parameter['max_len']
    opening = ('O', 'B', 'S')
    closing = ('O', 'E', 'S')
    while True:
        # Clamp both ends so a corpus shorter than max_len is still usable.
        first = random.randint(0, max(0, total - window))
        last = min(first + window - 1, total - 1)
        # Test the bound BEFORE indexing: the start may walk past the final label.
        while first < total and labels[first][0] not in opening:
            first += 1
        while last > 0 and labels[last][0] not in closing:
            last -= 1
        if last > first and labels[first][0] in opening and labels[last][0] in closing:
            return data_set['input'][first:last + 1], labels[first:last + 1]


def batch_yield(parameter, samples_per_epoch=10000):
    """Generate padded training batches for ``parameter['epoch']`` epochs.

    Each epoch draws ``samples_per_epoch`` windows with ``sample()``, converts the
    tokens to ids with the module-level ``tokenizer`` and the labels to indices with
    ``parameter['key2ind']``, and groups them into batches of
    ``parameter['batch_size']``. Inside a batch, inputs and targets are right-padded
    with 0 up to the longest sequence of that batch.

    The last batch of an epoch is always yielded, even when it is smaller than
    ``batch_size``, and it is the one that carries the epoch index.

    Args:
        parameter: dict with ``epoch``, ``batch_size``, ``key2ind`` and whatever
            ``sample()`` and ``list2torch()`` read from it.
        samples_per_epoch: number of windows drawn per epoch.

    Yields:
        ``(inputs, targets, epoch, finished)``. ``inputs`` and ``targets`` are tensors
        built by ``list2torch``; ``epoch`` is ``None`` except on the last batch of an
        epoch; ``finished`` is ``False`` for real batches. After the final epoch one
        sentinel ``(None, None, None, True)`` is yielded.
    """
    key2ind = parameter['key2ind']
    batch_size = parameter['batch_size']
    for epoch in range(parameter['epoch']):
        inputs, targets = [], []
        for step in tqdm(range(samples_per_epoch)):
            tokens, labels = sample(parameter)
            inputs.append(list(tokenizer.convert_tokens_to_ids(tokens)))
            # One index per label, so targets always line up with inputs.
            targets.append([key2ind[tag] for tag in labels])
            is_last = step == samples_per_epoch - 1
            if len(inputs) >= batch_size or is_last:
                width = max(len(sequence) for sequence in inputs)
                padded_inputs = [sequence + [0] * (width - len(sequence)) for sequence in inputs]
                padded_targets = [sequence + [0] * (width - len(sequence)) for sequence in targets]
                # Only the closing batch of an epoch reports the epoch index.
                yield list2torch(padded_inputs), list2torch(padded_targets), (epoch if is_last else None), False
                inputs, targets = [], []
    yield None,None,None,True
            

def list2torch(ins):
    """Convert a nested list of numbers into a long tensor on the configured device.

    The target device is read from the module-level ``parameter['device']``.
    """
    as_array = np.array(ins)
    as_long = torch.from_numpy(as_array).long()
    return as_long.to(parameter['device'])

# build_dataSet fills and returns the dict it is given, so `parameter` and
# `parameter_copy` refer to the same object from here on.
parameter = build_dataSet(parameter_copy)
# Persist the configuration and vocabularies; the context manager makes sure the
# pickle is flushed and the file handle is closed.
with open('parameter.pkl','wb') as parameter_file:
    pk.dump(parameter,parameter_file)

In [2]:
# Draw two random windows and show their tokens (a_list) next to their labels (b_list).
a_list,b_list = map(list, zip(*(sample(parameter) for _ in range(2))))
print(a_list,'\n\n',b_list)

[['和', 'e', 's', 't', 'i', 'm', 'a', 't', 'o', 'r', '-', '再', '确', '定', '每', '棵', '树', '的', '基', '本', '信', '息', '，', 'm', 'a', 'x', '_', 'd', 'e', 'p', 't', 'h', '和', 'm', 'i', 'n', '_', 'c', 'h', 'i', 'l', 'd', '_', 'w'], ['最', '大', '熵', '-', '在', '解', '决', '二', '分', '类', '问', '题', '是', '等', '同', '的', '-', 'l', 'r', '和', 's', 'v', 'm', '-', '都', '可', '分', '类', '，', '都', '是', '判', '别', '式', '模', '型', '思', '路', '-', '通', '常', '都', '是', '用', '正', '则', '化', '进', '行', '规']] 

 [['O', 'B-机器学习', 'I-机器学习', 'I-机器学习', 'I-机器学习', 'I-机器学习', 'I-机器学习', 'I-机器学习', 'I-机器学习', 'E-机器学习', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'S-机器学习', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-机器学习', 'E-机器学习', 'O', 'B-机器学习', 'I-机器学习', 'E-机器学习', 'O', 'O', 'O', 'B-机器学习', 'E-机器学习', 'O', 'O', 'O', 'B-机器学习', 'I-机器学习', 'E-机器学习', 'O', 'O', 'O', 'O', 'O', 'O'

In [7]:
# Show both sampled windows: first the token lists, then the label lists,
# with a blank separator between the two groups.
for position, pair in enumerate((a_list, b_list)):
    if position:
        print('\n')
    print(pair[0],'\n')
    print(pair[1])

['以', '保', '持', '正', '态', '分', '布', '且', '方', '差', '相', '近', '：', 'n', 'p', '.', 'r', 'a', 'n', 'd', 'o', 'm', '.', 'r', 'a', 'n', 'd', '(', 'l', 'a', 'y', 'e', 'r', '[', 'n', '-', '1', ']', ',', 'l', 'a', 'y', 'e', 'r', '[', 'n', ']', ')', '*', 'n'] 

['A', 't', 't', 'e', 'n', 't', 'i', 'o', 'n', '只', '是', '重', '复', '了', 'h', '次', '的', 'A', 't', 't', 'e', 'n', 't', 'i', 'o', 'n', '，', '最', '后', '把', '结', '果', '进', '行', '拼', '接', 'A', 't', 't', 'e', 'n', 't', 'i', 'o', 'n', '模', '型', '怎', '么', '避']


['O', 'O', 'O', 'B-推荐', 'I-推荐', 'I-推荐', 'E-推荐', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-推荐', 'I-推荐', 'I-推荐', 'I-推荐', 'I-推荐', 'E-推荐', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-深度学习', 'I-深度学习', 'I-深度学习', 'I-深度学习', 'I-深度学习', 'I-深度学习', 'I-深度学习', 'I-深度学习', 'E-深度学习', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [3]:
from transformers import WEIGHTS_NAME, BertConfig,get_linear_schedule_with_warmup,AdamW, BertTokenizer
from transformers import BertModel,BertPreTrainedModel
from torch.nn import CrossEntropyLoss
import torch.nn as nn
import torch

import torch.nn.functional as F # pytorch 激活函数的类
from torch import nn,optim # 构建模型和优化器
from torchcrf import CRF

class bert_crf(BertPreTrainedModel):
    """BERT encoder with a linear tagging head and a CRF layer.

    ``forward`` returns only the per-token scores produced by the linear head; the
    CRF layer is exposed as ``self.crf`` and is applied by the caller.
    """

    def __init__(self, config,parameter):
        """Build the encoder, dropout, linear head and CRF.

        Args:
            config: BERT configuration; supplies ``num_labels`` and
                ``hidden_dropout_prob``.
            parameter: dict supplying ``d_model`` (input width of the linear head)
                and ``output_size`` (number of tags).
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        tag_count = parameter['output_size']
        self.fc = nn.Linear(parameter['d_model'], tag_count)
        # Weight initialisation runs before the CRF is attached, as in the original order.
        self.init_weights()
        self.crf = CRF(tag_count,batch_first=True)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,labels=None):
        """Return the linear-head scores for every token.

        ``labels`` is accepted but not used by this method.
        """
        encoded = self.bert(
            input_ids = input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 0 of the encoder output is fed through dropout and the linear head.
        return self.fc(self.dropout(encoded[0]))
    
# Aliases for the configuration and tokenizer classes used with the bert_crf model.
config_class = BertConfig
tokenizer_class = BertTokenizer
# Both are loaded from the local "prev_trained_model" directory.
tokenizer = tokenizer_class.from_pretrained("prev_trained_model")
config = config_class.from_pretrained("prev_trained_model")

In [9]:
# Smoke test: load the pretrained weights and run one forward pass on an all-zero
# batch of 100 sequences with 30 token ids each.
model = bert_crf.from_pretrained("prev_trained_model",config=config,parameter = parameter)
dummy_ids = torch.zeros((100,30), dtype=torch.long)
tmp = model(dummy_ids)

Some weights of the model checkpoint at prev_trained_model were not used when initializing bert_crf: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing bert_crf from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing bert_crf from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of bert_crf were not initialized from the model checkpoint at prev_trained_model and are newly initialized: ['crf.start_transitions', 'fc.weight

In [None]:
# Display every per-sequence slice of the smoke-test output together with its shape.
[[scores.shape,scores] for scores in tmp]

In [4]:
import os
import shutil
import pickle as pk
from torch.utils.tensorboard import SummaryWriter

# Seed both RNGs the run relies on: `random` drives sample(), torch drives the
# initialisation of the new layers and dropout.
random.seed(2019)
torch.manual_seed(2019)

# Build the model from the pretrained checkpoint and move it to the configured device.
model = bert_crf.from_pretrained("prev_trained_model",config=config,parameter = parameter).to(parameter['device'])

# Choose which weights are trained.
full_finetuning = True
if full_finetuning:
    # Train everything; biases and LayerNorm parameters are exempt from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    # Train only the linear tagging head.
    param_optimizer = list(model.fc.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

# Optimizer and learning-rate schedule: one epoch of warm-up, then linear decay.
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, correct_bias=False)
# 10000 is the number of samples batch_yield draws per epoch.
train_steps_per_epoch = 10000 // parameter['batch_size']
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=train_steps_per_epoch, num_training_steps=parameter['epoch'] * train_steps_per_epoch)

# Switch to training mode (enables dropout).
model.train()

# Batch generator; it ends with a (None, None, None, True) sentinel.
train_yield = batch_yield(parameter)

# NOTE(review): batch_yield pads inputs and targets with 0, and neither the BERT call
# nor the CRF call below receives a mask, so padded positions are attended to and
# scored. Consider passing attention_mask / mask once the mask dtype expected by the
# installed torchcrf version is confirmed.
loss_cal = []            # per-batch losses of the current epoch
min_loss = float('inf')  # best epoch-mean loss seen so far
logging_steps = 0
for inputs,targets,epoch,finished in train_yield:
    if finished:
        break
    out = model(inputs)
    # The CRF returns a log-likelihood; negate it to obtain a loss to minimise.
    loss = -model.crf(out,targets)
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=5)
    optimizer.step()
    scheduler.step()
    loss_cal.append(loss.item())
    logging_steps += 1
    if logging_steps%20 == 0:
        # Running mean of the current epoch.
        print(sum(loss_cal)/len(loss_cal))
    if epoch is not None:
        # Last batch of an epoch: report the mean loss and keep the best checkpoint.
        epoch_loss = sum(loss_cal)/len(loss_cal)
        if epoch_loss < min_loss:
            min_loss = epoch_loss
            torch.save(model.state_dict(), 'bert_crf.h5')
        print('epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, \
                                               parameter['epoch'],epoch_loss))
        # Start the next epoch with an empty accumulator so no batch is counted twice.
        loss_cal = []

Some weights of the model checkpoint at prev_trained_model were not used when initializing bert_crf: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing bert_crf from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing bert_crf from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of bert_crf were not initialized from the model checkpoint at prev_trained_model and are newly initialized: ['crf.end_transitions', 'crf.start_t

4957.164440917969


 20%|███████████████▍                                                             | 2000/10000 [09:15<29:13,  4.56it/s]

3420.3659912109374


 30%|███████████████████████                                                      | 3000/10000 [12:51<24:41,  4.72it/s]

2837.127402750651


 40%|██████████████████████████████▊                                              | 4000/10000 [16:24<21:00,  4.76it/s]

2486.2299530029295


 50%|██████████████████████████████████████▌                                      | 5000/10000 [20:06<18:51,  4.42it/s]

2210.516372680664


 60%|██████████████████████████████████████████████▏                              | 6000/10000 [23:49<16:04,  4.15it/s]

1991.5955556233723


 70%|█████████████████████████████████████████████████████▉                       | 7000/10000 [27:28<11:15,  4.44it/s]

1809.7765890938895


 80%|█████████████████████████████████████████████████████████████▌               | 8000/10000 [31:06<07:24,  4.49it/s]

1648.2958082199098


 90%|█████████████████████████████████████████████████████████████████████▎       | 9000/10000 [34:52<03:36,  4.62it/s]

1513.8999418470594


100%|████████████████████████████████████████████████████████████████████████████▌| 9950/10000 [38:25<00:11,  4.34it/s]

1399.1111784362793


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [38:37<00:00,  4.31it/s]


epoch [1/20], Loss: 1399.1112


  2%|█▌                                                                            | 200/10000 [00:44<36:16,  4.50it/s]

KeyboardInterrupt: 