In [None]:
import re
import os
from pprint import pprint


import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.autograd as autograd
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch import Tensor
from transformers import AlbertModel, BertTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences  # padding
from pytorchcrf import CRF

In [None]:
# Path of the NER corpus opened by the data-loading cell below, which expects
# tab-separated "token<TAB>label" lines with a blank line between sentences.
file = "dh_msra.txt"

In [None]:
class Config(object):
    """Hyper-parameters and filesystem paths shared by every cell of the notebook."""

    def __init__(self):
        current_path = os.getcwd()
        self.model_name = "pytorch_model.bin"
        # Pass the components separately so os.path.join actually joins them.
        self.bert_path = os.path.join(current_path, "albert_chinese_tiny")
        # self.train_file = '../datas/THUCNews/train.txt'
        self.num_classes = 10  # number of token-level NER classes
        self.hidden_size = 312  # width of the encoder output fed to the classifier
        self.hidden_dropout_prob = 0.1  # dropout probability
        self.batch_size = 64  # mini-batch size
        self.max_len = 128  # every sentence is padded / truncated to this length
        self.epochs = 3  # number of training epochs
        self.learning_rate = 2e-5  # learning rate for the ALBERT encoder
        # Read by the optimizer cell for the CRF and linear-head parameter groups;
        # it was missing, which made that cell raise AttributeError.
        # NOTE(review): the notebook never stated a value; 1e-3 is a starting point, tune it.
        self.crf_learning_rate = 1e-3
        self.save_path = os.path.join(current_path, "finetuned_albert")  # output directory for the fine-tuned model
        self.use_cuda = True
        self.device_id = 5


config = Config()
print(config.bert_path)

In [None]:
# GPU check: pick the torch device that the rest of the notebook moves tensors to.

print("CUDA Available: ", torch.cuda.is_available())
n_gpu = torch.cuda.device_count()

if torch.cuda.is_available() and config.use_cuda:
    print("GPU numbers: ", n_gpu)
    device_id = config.device_id  # index of the GPU to use
    if not 0 <= device_id < n_gpu:
        # The configured index does not exist on this machine; selecting it
        # would fail inside set_device, so fall back to the first GPU.
        print(f"device_id {device_id} is out of range for {n_gpu} GPU(s); falling back to 0")
        device_id = 0
    # Report the GPU that is actually selected, not always device 0.
    print("device_name: ", torch.cuda.get_device_name(device_id))
    torch.cuda.set_device(device_id)
    device = torch.device(f"cuda:{device_id}")
    print(f"当前设备：{torch.cuda.current_device()}")
else:
    device = torch.device("cpu")
    print(f"当前设备：{device}")

In [None]:
# Parse the corpus: one "token<TAB>label" pair per line, blank line between sentences.
all_sentences_separate = []  # one list of tokens per sentence
all_letter_labels = []  # one list of label strings per sentence, parallel to the tokens
label_set = set()  # every distinct label string seen in the file

single_sentence = []
single_sentence_labels = []
with open(file, encoding="utf-8") as f:
    # Iterate the handle directly instead of loading the whole file with readlines().
    for line_no, raw_line in enumerate(f, start=1):
        line = raw_line.rstrip("\n")
        if line:
            fields = line.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{file}:{line_no}: expected 'token<TAB>label', got {line!r}"
                )
            word, label = fields
            single_sentence.append(word)
            single_sentence_labels.append(label)
            label_set.add(label)
        elif single_sentence:
            # Blank line closes the current sentence; repeated blank lines are
            # ignored so that no empty sentences are emitted.
            all_sentences_separate.append(single_sentence)
            all_letter_labels.append(single_sentence_labels)
            single_sentence = []
            single_sentence_labels = []

# Keep the final sentence when the file does not end with a blank line.
if single_sentence:
    all_sentences_separate.append(single_sentence)
    all_letter_labels.append(single_sentence_labels)
    single_sentence = []
    single_sentence_labels = []

In [None]:
# Quick sanity check: first two sentences, their labels, and the label inventory.
for preview in (all_sentences_separate, all_letter_labels):
    print(preview[:2])
print(f"\n所有的标签：{label_set}")

In [None]:
# Tag <-> index lookup tables. The position of a tag in this tuple is its index,
# so both dictionaries are derived from a single ordered list.
_tag_order = (
    "B-LOC",
    "I-LOC",
    "B-ORG",
    "I-ORG",
    "B-PER",
    "I-PER",
    "O",
    "[CLS]",
    "[SEP]",
    "[PAD]",
)

tag_to_ix = {tag: ix for ix, tag in enumerate(_tag_order)}

ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

In [None]:
# Re-assemble each sentence into a single string by concatenating its tokens.
all_sentences = ["".join(tokens) for tokens in all_sentences_separate]

pprint(all_sentences[15:20])

In [None]:
# Map every label string to its integer id (raises KeyError on a tag missing from tag_to_ix).
all_labels = [
    [tag_to_ix[tag] for tag in sentence_tags]
    for sentence_tags in all_letter_labels
]

print(all_labels[:2])
print(len(all_labels[0]))

In [None]:
# Convert every sentence to vocabulary ids, one id per labelled token.
tokenizer = BertTokenizer.from_pretrained(config.bert_path, do_lower_case=True)


def _encode_tokens(tokens):
    """Return [CLS] + one vocabulary id per corpus token + [SEP].

    Calling ``tokenizer.encode`` on the joined sentence lets the tokenizer merge
    digit/Latin runs into word pieces and drop whitespace, so the number of ids
    no longer matches the number of labels. Looking each token up individually
    keeps ids and labels position-aligned.
    """
    # Lower-case to match the do_lower_case=True setting used above.
    pieces = [tok.lower() for tok in tokens]
    return [tokenizer.cls_token_id, *tokenizer.convert_tokens_to_ids(pieces), tokenizer.sep_token_id]


tokenized_texts = [_encode_tokens(tokens) for tokens in all_sentences_separate]

In [None]:
# Sentence padding

# Bring every id sequence to exactly config.max_len entries using the Keras helper:
# longer sequences lose their tail, shorter ones are filled at the end.
_padding_options = dict(
    maxlen=config.max_len,
    dtype="long",
    padding="post",
    truncating="post",
)
input_ids = pad_sequences(list(tokenized_texts), **_padding_options)

In [None]:
# Turn each sentence's tags into a fixed-length row of label ids:
# [CLS] + tag ids + [SEP], cut to config.max_len, then filled with [PAD].
def _pad_label_ids(tag_ids, max_len, cls_id, sep_id, pad_id):
    """Wrap ``tag_ids`` in CLS/SEP ids and return a row of exactly ``max_len`` ids.

    Over-long rows are cut at the end, the same side on which the token ids are
    truncated (truncating="post"), so labels and inputs keep the same shape.
    """
    row = [cls_id, *tag_ids, sep_id][:max_len]
    return row + [pad_id] * (max_len - len(row))


# Rebuilt from the raw tag strings rather than mutated in place, so re-running
# this cell does not wrap the rows in another [CLS]/[SEP] pair.
all_labels = [
    _pad_label_ids(
        [tag_to_ix[tag] for tag in letter_labels],
        config.max_len,
        tag_to_ix["[CLS]"],
        tag_to_ix["[SEP]"],
        tag_to_ix["[PAD]"],
    )
    for letter_labels in all_letter_labels
]

In [None]:
# Build attention masks: 1.0 where the token id is > 0, 0.0 elsewhere.
_token_matrix = np.asarray(input_ids)
attention_masks = (_token_matrix > 0).astype("float32")

# Fail early, with a readable message, if labels and inputs are not the same shape.
if len(all_labels) != _token_matrix.shape[0]:
    raise ValueError(
        f"{len(all_labels)} label rows but {_token_matrix.shape[0]} input rows"
    )
_bad_rows = [i for i, row in enumerate(all_labels) if len(row) != _token_matrix.shape[1]]
if _bad_rows:
    raise ValueError(
        f"{len(_bad_rows)} label rows do not have length {_token_matrix.shape[1]} "
        f"(first offending index: {_bad_rows[0]})"
    )
_label_matrix = np.asarray(all_labels, dtype="int64")

# train/validation split: a single call splits inputs, labels and masks with the
# same permutation, instead of relying on two calls that share a random_state.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(_token_matrix,
                                                   _label_matrix,
                                                   attention_masks,
                                                   random_state=2019,
                                                   test_size=0.1)

# Convert to tensors; ids and labels are forced to int64 regardless of what
# the platform maps NumPy's "long" to.
train_inputs = torch.tensor(train_inputs, dtype=torch.long)
validation_inputs = torch.tensor(validation_inputs, dtype=torch.long)
train_labels = torch.tensor(train_labels, dtype=torch.long)
validation_labels = torch.tensor(validation_labels, dtype=torch.long)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# dataloaders

# Training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Batches are drawn in random order
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=config.batch_size)


# Validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# Batches are drawn in dataset order (sequential, not random)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=config.batch_size)

In [None]:
class ModelAlBert(nn.Module):
    """
    Token-classification model: a pretrained ALBERT encoder followed by dropout
    and a linear layer that scores every token position over ``num_classes`` tags.

    When labels are supplied, the loss is a cross-entropy computed only over
    positions whose attention-mask value equals 1.
    """

    def __init__(self, config):
        super().__init__()
        self.num_labels = config.num_classes
        self.albert = AlbertModel.from_pretrained(config.bert_path)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_classes)

    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels: Tensor = None,
    ) -> tuple:
        """
        Run the encoder and the per-token classifier.

        Args:
            input_ids (Tensor, optional): Token ids of the batch. Defaults to None.
            attention_mask (Tensor, optional): Mask forwarded to the encoder; positions
                equal to 1 are also the only ones that contribute to the loss. Defaults to None.
            token_type_ids (optional): Forwarded to the encoder unchanged. Defaults to None.
            position_ids (optional): Forwarded to the encoder unchanged. Defaults to None.
            head_mask (optional): Forwarded to the encoder unchanged. Defaults to None.
            inputs_embeds (optional): Forwarded to the encoder unchanged. Defaults to None.
            labels (Tensor, optional): Per-token label ids; when given, a loss is
                computed and returned first. Defaults to None.

        Returns:
            tuple: ``(loss, logits, *extra)`` when ``labels`` is given, otherwise
            ``(logits, *extra)``; ``extra`` is whatever the encoder returned after
            its first two outputs.
        """
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First encoder output is the per-token hidden state sequence.
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            if attention_mask is not None:
                # Only keep the positions the mask marks as real tokens.
                active_loss = attention_mask.view(-1) == 1
                loss = loss_fct(flat_logits[active_loss], flat_labels[active_loss])
            else:
                loss = loss_fct(flat_logits, flat_labels)

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output
                

# Build the plain token-classification model and move it to the device chosen
# in the GPU-check cell (works on CPU-only machines too, unlike model.cuda()).
# NOTE(review): `model` is rebound to ModelAlBertCRF in a later cell.
model = ModelAlBert(config)
model.to(device)

In [None]:
class ModelAlBertCRF(nn.Module):
    """
    Sequence-labelling model: a pretrained ALBERT encoder, dropout, a linear
    layer producing per-token tag scores (emissions), and a CRF layer on top.

    When labels are supplied, the loss is the negated value returned by the CRF layer.
    """

    def __init__(self, config):
        super().__init__()
        self.num_labels = config.num_classes
        self.albert = AlbertModel.from_pretrained(config.bert_path)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_classes)
        self.crf = CRF(num_tags=config.num_classes, batch_first=True)

    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels: Tensor = None,
    ) -> tuple:
        """
        Run the encoder and classifier and, when labels are given, the CRF loss.

        Args:
            input_ids (Tensor, optional): Token ids of the batch. Defaults to None.
            attention_mask (Tensor, optional): Mask forwarded to the encoder and,
                converted to bool, to the CRF layer. Defaults to None.
            token_type_ids (optional): Forwarded to the encoder unchanged. Defaults to None.
            position_ids (optional): Forwarded to the encoder unchanged. Defaults to None.
            head_mask (optional): Forwarded to the encoder unchanged. Defaults to None.
            inputs_embeds (optional): Forwarded to the encoder unchanged. Defaults to None.
            labels (Tensor, optional): Per-token label ids. Defaults to None.

        Returns:
            tuple: ``(loss, logits)`` when ``labels`` is given, otherwise ``(logits,)``.
        """
        outputs = self.albert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First encoder output is the per-token hidden state sequence.
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        outputs = (logits,)
        if labels is not None:
            # The data pipeline builds float masks; the CRF layer expects a
            # Byte/Bool mask, so convert before handing it over.
            crf_mask = attention_mask.bool() if attention_mask is not None else None
            log_likelihood = self.crf(emissions=logits, tags=labels, mask=crf_mask)
            # The CRF returns a log likelihood, so negate it to obtain a loss.
            outputs = (-1 * log_likelihood,) + outputs
        return outputs  # (loss), scores
                

# Build the ALBERT+CRF model and move it to the device chosen in the GPU-check
# cell (works on CPU-only machines too, unlike model.cuda()).
model = ModelAlBertCRF(config)
model.to(device)

In [None]:
# Fine-tuning parameter groups: encoder, CRF layer and linear head.
bert_param_optimizer = list(model.albert.named_parameters())
crf_param_optimizer = list(model.crf.named_parameters())
linear_param_optimizer = list(model.classifier.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']

# Learning rate for the CRF and the linear head. Fall back to the encoder rate
# when the config does not define a dedicated one, instead of raising AttributeError.
_head_lr = getattr(config, "crf_learning_rate", config.learning_rate)


def _weight_decay_groups(named_params, lr, weight_decay=0.01):
    """Split ``named_params`` into a decayed and a non-decayed optimizer group.

    A parameter goes into the non-decayed group when its name contains any of
    the substrings listed in ``no_decay``.
    """
    decayed = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
    undecayed = [p for n, p in named_params if any(nd in n for nd in no_decay)]
    return [
        {'params': decayed, 'weight_decay': weight_decay, 'lr': lr},
        {'params': undecayed, 'weight_decay': 0.0, 'lr': lr},
    ]


# Weight decay: same six groups, in the same order, as before.
optimizer_grouped_parameters = (
    _weight_decay_groups(bert_param_optimizer, config.learning_rate)
    + _weight_decay_groups(crf_param_optimizer, _head_lr)
    + _weight_decay_groups(linear_param_optimizer, _head_lr)
)
# Optimizer
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=config.learning_rate)


In [None]:
# Container for recorded training loss values
train_loss_set = []

In [None]:
# Training loop
for epoch in range(config.epochs):
    ## Training
    print(f"当前epoch： {epoch}")
    # Switch to training mode
    model.train()
    tr_loss = 0  # running sum of batch losses for this epoch
    nb_tr_examples, nb_tr_steps = 0, 0
    # Wrap the dataloader itself (not enumerate) so tqdm can show the total.
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch
        # Reset gradients
        optimizer.zero_grad()
        # Forward pass; the model returns the loss first when labels are given
        output = model(input_ids=b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output[0]
        # Backward pass
        loss.backward()
        # Update model parameters using the computed gradients
        optimizer.step()
        # Update tracking variables
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)  # keep the per-batch history
        tr_loss += batch_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # Guard against an empty dataloader, which would otherwise divide by zero.
    if nb_tr_steps:
        print(f"当前 epoch 的 Train loss: {tr_loss/nb_tr_steps}")

In [None]:
# Switch the model to evaluation mode
model.eval()

# Initialise the evaluation tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [None]:
# The validation set is read batch by batch as well.
# Local counters are reset here so re-running the cell gives the same result.
_eval_correct, _eval_total = 0, 0
for batch in tqdm(validation_dataloader):
    # Move the batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the batch
    b_input_ids, b_input_mask, b_labels = batch
    # Predict without tracking gradients
    with torch.no_grad():
        # token_type_ids / position_ids are left as None (encoder defaults)
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        token_type_ids=None,
                        position_ids=None)

    # Move scores and labels to CPU
    scores = outputs[0].detach().cpu().numpy()  # per-token tag scores for the whole batch
    # Arg-max over the tag axis for EVERY example, not just the first one.
    # NOTE(review): for the CRF model, Viterbi decoding would be the matching
    # inference procedure; arg-max over the emissions is kept from the original.
    pred_ids = np.argmax(scores, axis=-1)
    pred_flat = pred_ids.flatten()
    label_ids = b_labels.to('cpu').numpy()  # gold label ids

    # Score only the positions marked as real tokens by the attention mask.
    active = b_input_mask.to('cpu').numpy() > 0
    _eval_correct += int((pred_ids[active] == label_ids[active]).sum())
    _eval_total += int(active.sum())

eval_accuracy = _eval_correct / _eval_total if _eval_total else 0.0
print(f"Validation token accuracy: {eval_accuracy:.4f}")