In [1]:
# BERT imports
import os

import numpy as np
import pandas as pd
import torch
from keras.preprocessing.sequence import pad_sequences  # padding句子用
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from transformers import AdamW, BertForSequenceClassification
from transformers import BertTokenizer, BertConfig

Using TensorFlow backend.


In [2]:
# Report the installed PyTorch version so the run can be reproduced.
torch_version = torch.__version__
print(f"PyTorch 版本： {torch_version}")

PyTorch 版本： 1.3.1


In [3]:
# GPU check: report CUDA availability and choose the device used by later cells.
cuda_ok = torch.cuda.is_available()
print("Is CUDA available: ", cuda_ok)

n_gpu = torch.cuda.device_count()
device = torch.device("cuda") if cuda_ok else torch.device("cpu")

if cuda_ok:
    # Only query GPU details when a GPU is actually present.
    print("GPU numbers: ", n_gpu)
    print("device_name: ", torch.cuda.get_device_name(0))

Is CUDA available:  True
GPU numbers:  2
device_name:  Tesla M40 24GB


In [4]:
# Load the labelled training set and give the Chinese column headers English names.
train_labeled = pd.read_csv(
    './input/train_dataset/nCoV_100k_train_labled_utf8.csv',
    encoding='utf-8',
).rename(
    columns={
        "微博id": "Weibo_ID",
        "微博发布时间": "Publish_Time",
        "发布人账号": "Account_ID",
        "微博中文内容": "Chinese_Content",
        "微博图片": "Pictures",
        "微博视频": "Videos",
        "情感倾向": "Labels",
    }
)

In [5]:
# Preview the first three rows
train_labeled.head(3)

Unnamed: 0,Weibo_ID,Publish_Time,Account_ID,Chinese_Content,Pictures,Videos,Labels
0,4456072029125500,01月01日 23:50,存曦1988,写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早...,['https://ww2.sinaimg.cn/orj360/005VnA1zly1gah...,[],0
1,4456074167480980,01月01日 23:58,LunaKrys,开年大模型…累到以为自己发烧了腰疼膝盖疼腿疼胳膊疼脖子疼#Luna的Krystallife#?,[],[],-1
2,4456054253264520,01月01日 22:39,小王爷学辩论o_O,邱晨这就是我爹，爹，发烧快好，毕竟美好的假期拿来养病不太好，假期还是要好好享受快乐，爹，新...,['https://ww2.sinaimg.cn/thumb150/006ymYXKgy1g...,[],1


In [6]:
# Clean the data: keep only rows whose label is one of the three valid
# sentiment strings, then drop every row that has a missing value in any column.
valid_label_mask = train_labeled['Labels'].isin(['-1', '0', '1'])
train_labeled_clean = train_labeled[valid_label_mask].dropna()

In [7]:
# Collect the post texts into a plain Python list
sentences = list(train_labeled_clean['Chinese_Content'].values)

In [8]:
# Display the first cleaned sentence as a sanity check
sentences[0]

'写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早晨给孩子穿上红色的羽绒服羽绒裤，祈祷新的一年，孩子们身体康健。仍然会有一丝焦虑，焦虑我的孩子为什么会过早的懂事，从两岁多开始关注我的情绪，会深沉地说：妈妈，你终于笑了！这句话像刀子一样扎入我?展开全文c'

In [11]:
# Collect the label column into a plain Python list, aligned with `sentences`
labels = list(train_labeled_clean['Labels'].values)

In [12]:
# Every sentence must have exactly one label
n_sentences = len(sentences)
assert n_sentences == len(labels)

print(n_sentences)

99560


In [14]:
# Show the first three sentences and their labels
print(sentences[:3])
print(labels[:3])

['写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早晨给孩子穿上红色的羽绒服羽绒裤，祈祷新的一年，孩子们身体康健。仍然会有一丝焦虑，焦虑我的孩子为什么会过早的懂事，从两岁多开始关注我的情绪，会深沉地说：妈妈，你终于笑了！这句话像刀子一样扎入我?展开全文c', '开年大模型…累到以为自己发烧了腰疼膝盖疼腿疼胳膊疼脖子疼#Luna的Krystallife#?', '\ue627邱晨这就是我爹，爹，发烧快好，毕竟美好的假期拿来养病不太好，假期还是要好好享受快乐，爹，新年快乐，发烧好了就去浪吧，快快乐乐的度过这个美好假期，说不定以后就没有了嗷@邱晨虫仔2泉州·泉州理工学院?']
['0', '-1', '1']


In [15]:
# Load the tokenizer from the local Chinese BERT checkpoint directory
tokenizer = BertTokenizer.from_pretrained(
    '../BERT-NER/bert-chinese/',
    do_lower_case=True,
)
tokenizer

<transformers.tokenization_bert.BertTokenizer at 0x7f2e53a3f350>

In [16]:
# Encode every sentence into vocabulary ids, including the special tokens
tokenized_texts = list(map(
    lambda text: tokenizer.encode(text, add_special_tokens=True),
    sentences,
))

In [17]:
# Show the first sentence before tokenization and its input_ids after tokenization
print(f"Tokenize 前的第一句话：\n{sentences[0]}\n")
print(f"Tokenize 后的第一句话: \n{tokenized_texts[0]}")

Tokenize 前的第一句话：
写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早晨给孩子穿上红色的羽绒服羽绒裤，祈祷新的一年，孩子们身体康健。仍然会有一丝焦虑，焦虑我的孩子为什么会过早的懂事，从两岁多开始关注我的情绪，会深沉地说：妈妈，你终于笑了！这句话像刀子一样扎入我?展开全文c

Tokenize 后的第一句话: 
[101, 1091, 1762, 2399, 3314, 1100, 1159, 2111, 2094, 3837, 2697, 4638, 5018, 758, 1921, 8024, 2769, 812, 793, 4197, 3766, 3300, 2563, 6381, 4178, 2658, 2881, 2849, 6821, 8439, 2399, 4638, 5018, 671, 1921, 511, 2372, 4708, 671, 692, 6837, 928, 8024, 3193, 3247, 5314, 2111, 2094, 4959, 677, 5273, 5682, 4638, 5417, 5309, 3302, 5417, 5309, 6175, 8024, 4857, 4876, 3173, 4638, 671, 2399, 8024, 2111, 2094, 812, 6716, 860, 2434, 978, 511, 793, 4197, 833, 3300, 671, 692, 4193, 5991, 8024, 4193, 5991, 2769, 4638, 2111, 2094, 711, 784, 720, 833, 6814, 3193, 4638, 2743, 752, 8024, 794, 697, 2259, 1914, 2458, 1993, 1068, 3800, 2769, 4638, 2658, 5328, 8024, 833, 3918, 3756, 1765, 6432, 8038, 1968, 1968, 8024, 872, 5303, 754, 5010, 749, 8013, 6821, 1368, 6413, 1008, 1143, 2094, 671, 3416, 2799, 1057, 2769, 136, 2245, 2

In [18]:
# Number of tokenized sentences (should equal the number of input sentences)
n_tokenized = len(tokenized_texts)
print(n_tokenized)

99560


In [19]:
# Maximum sequence length (in tokens) fed to the model; longer inputs are truncated
MAX_LEN = 300

# Pad with zeros or truncate every sequence to MAX_LEN, both at the end.
# pad_sequences is the Keras helper imported at the top of the notebook.
input_ids = pad_sequences(tokenized_texts,
                          maxlen=MAX_LEN,
                          dtype="long",
                          truncating="post",
                          padding="post")

# Post-truncation cuts off the trailing [SEP] that tokenizer.encode appended.
# Write it back into the last position of every truncated row so that each
# sequence still ends with [SEP]; rows that were only padded are left untouched.
was_truncated = np.array([len(txt) > MAX_LEN for txt in tokenized_texts], dtype=bool)
input_ids[was_truncated, -1] = tokenizer.sep_token_id

In [20]:
# Compare the first sentence at each stage: raw text, token ids, padded ids
print(f"Tokenize 前的第一句话：\n\n{sentences[0]}\n\n")
print(f"Tokenize 后的第一句话: \n\n{tokenized_texts[0]}\n\n")
print(f"Padding 后的第一句话： \n\n{input_ids[0]}")

Tokenize 前的第一句话：

写在年末冬初孩子流感的第五天，我们仍然没有忘记热情拥抱这2020年的第一天。带着一丝迷信，早晨给孩子穿上红色的羽绒服羽绒裤，祈祷新的一年，孩子们身体康健。仍然会有一丝焦虑，焦虑我的孩子为什么会过早的懂事，从两岁多开始关注我的情绪，会深沉地说：妈妈，你终于笑了！这句话像刀子一样扎入我?展开全文c


Tokenize 后的第一句话: 

[101, 1091, 1762, 2399, 3314, 1100, 1159, 2111, 2094, 3837, 2697, 4638, 5018, 758, 1921, 8024, 2769, 812, 793, 4197, 3766, 3300, 2563, 6381, 4178, 2658, 2881, 2849, 6821, 8439, 2399, 4638, 5018, 671, 1921, 511, 2372, 4708, 671, 692, 6837, 928, 8024, 3193, 3247, 5314, 2111, 2094, 4959, 677, 5273, 5682, 4638, 5417, 5309, 3302, 5417, 5309, 6175, 8024, 4857, 4876, 3173, 4638, 671, 2399, 8024, 2111, 2094, 812, 6716, 860, 2434, 978, 511, 793, 4197, 833, 3300, 671, 692, 4193, 5991, 8024, 4193, 5991, 2769, 4638, 2111, 2094, 711, 784, 720, 833, 6814, 3193, 4638, 2743, 752, 8024, 794, 697, 2259, 1914, 2458, 1993, 1068, 3800, 2769, 4638, 2658, 5328, 8024, 833, 3918, 3756, 1765, 6432, 8038, 1968, 1968, 8024, 872, 5303, 754, 5010, 749, 8013, 6821, 1368, 6413, 1008, 1143, 2094, 671, 3416, 2799, 1057, 2769, 136, 2245

In [21]:
# Decode the padded ids of the first sentence back into text
first_decoded = tokenizer.decode(input_ids[0])
raw_texts = [first_decoded]
print(raw_texts)
print(len(raw_texts))

['[CLS] 写 在 年 末 冬 初 孩 子 流 感 的 第 五 天 ， 我 们 仍 然 没 有 忘 记 热 情 拥 抱 这 2020 年 的 第 一 天 。 带 着 一 丝 迷 信 ， 早 晨 给 孩 子 穿 上 红 色 的 羽 绒 服 羽 绒 裤 ， 祈 祷 新 的 一 年 ， 孩 子 们 身 体 康 健 。 仍 然 会 有 一 丝 焦 虑 ， 焦 虑 我 的 孩 子 为 什 么 会 过 早 的 懂 事 ， 从 两 岁 多 开 始 关 注 我 的 情 绪 ， 会 深 沉 地 说 ： 妈 妈 ， 你 终 于 笑 了 ！ 这 句 话 像 刀 子 一 样 扎 入 我? 展 开 全 文 c [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

## BERT输入准备

In [22]:
# Build the attention masks: 1.0 for every real token, 0.0 for padding (id 0).
# The comparison is vectorized over the whole id matrix and then converted back
# to a list of per-sentence float lists.
attention_masks = (np.asarray(input_ids) > 0).astype(float).tolist()

In [23]:
# Attention mask of the first sentence, and its length
print(np.array(attention_masks[0]))
print(len(np.array(attention_masks[0])))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
300


In [24]:
# Number of labels and a peek at the first ten raw (string) labels
n_labels = len(labels)
print(n_labels)
print(labels[:10])

99560
['0', '-1', '1', '1', '1', '-1', '-1', '0', '-1', '1']


In [25]:
# Shift the labels from {-1, 0, 1} to the non-negative class ids {0, 1, 2}
clean_labels = [int(raw_label) + 1 for raw_label in labels]

print(clean_labels[:10])

[1, 0, 2, 2, 2, 0, 0, 1, 0, 2]


In [26]:
# Split inputs, labels and attention masks in ONE call so the three stay
# row-aligned by construction. train_test_split applies the same shuffle to
# every array it receives and returns a (train, test) pair per array, in order.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, clean_labels, attention_masks,
    random_state=2019, test_size=0.2)

In [27]:
# First ten training labels after the split
train_labels[:10]

[0, 1, 1, 0, 2, 0, 1, 2, 0, 1]

In [28]:
# Sizes of the full label set and of the train / validation splits
print("      标签总数：", len(labels))
print("训练集标签总数：", len(train_labels))
print("验证集标签总数：", len(validation_labels))

      标签总数： 99560
训练集标签总数： 79648
验证集标签总数： 19912


In [29]:
# Convert every split into a torch tensor
train_inputs, validation_inputs = (
    torch.tensor(train_inputs),
    torch.tensor(validation_inputs),
)
train_labels, validation_labels = (
    torch.tensor(train_labels),
    torch.tensor(validation_labels),
)
train_masks, validation_masks = (
    torch.tensor(train_masks),
    torch.tensor(validation_masks),
)

In [30]:
# The three validation tensors must contain the same number of rows
print(len(validation_inputs),
      len(validation_labels),
      len(validation_masks),
      sep="\n")

19912
19912
19912


In [31]:
# Number of examples per batch, used by both the training and validation DataLoaders
batch_size = 32

In [32]:
# Training set: bundle inputs, masks and labels; batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data,
                              batch_size=batch_size,
                              sampler=train_sampler)


# Validation set: same bundling, but batches are read sequentially (no shuffling)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(dataset=validation_data,
                                   batch_size=batch_size,
                                   sampler=validation_sampler)

## 准备微调

In [33]:
# Count the distinct label values; this becomes the size of the classifier output
distinct_labels = set(labels)
label_count = len(distinct_labels)
print(label_count)

3


In [34]:
# Load BertForSequenceClassification: the pretrained BERT encoder with a linear
# classification layer on top, sized for `label_count` classes.
model = BertForSequenceClassification.from_pretrained("../BERT-NER/bert-chinese/",
                                                      num_labels=label_count)
# Move the model to the device chosen in the GPU-check cell. Unlike a bare
# model.cuda(), this also works on CPU-only machines and stays consistent with
# the `.to(device)` calls used for the batches.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [35]:
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.weight']

# Two optimizer groups: weight decay 0.01 for everything except the names
# matched by `no_decay`, and 0.0 for the matched ones.
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in param_optimizer
                   if all(tag not in name for tag in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [param for name, param in param_optimizer
                   if any(tag in name for tag in no_decay)],
        'weight_decay': 0.0,
    },
]

In [36]:
# Optimizer: AdamW over the two parameter groups, learning rate 5e-5
optimizer = AdamW(params=optimizer_grouped_parameters, lr=5e-5)

In [37]:
# f1_score(y_true, y_pred, average='macro')  

In [38]:
# List intended to hold training loss values
train_loss_set = []
# Number of passes over the training set
epochs = 4

In [None]:
# BERT training loop.
# A named `epoch` variable is used instead of `_`, which IPython reserves for
# the last cell output.
for epoch in range(epochs):
    print(f"当前epoch： {epoch}")
    # Switch to training mode
    model.train()
    tr_loss = 0  # running sum of batch losses for this epoch
    nb_tr_examples, nb_tr_steps = 0, 0
    # Wrap the dataloader (not the enumerate object) so tqdm knows the total
    # number of batches and can show a real progress bar / ETA.
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch
        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()
        # Forward pass. Because `labels` is passed, the first element of the
        # model output is the classification loss.
        output = model(input_ids=b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output[0]
        # Backward pass
        loss.backward()
        # Update the model parameters using the computed gradients
        optimizer.step()
        # Update tracking variables; also record the per-batch loss so that
        # train_loss_set is actually populated.
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)
        tr_loss += batch_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # max(..., 1) avoids a ZeroDivisionError if the dataloader yielded no batches
    print(f"当前 epoch 的 Train loss: {tr_loss/max(nb_tr_steps, 1)}")

0it [00:00, ?it/s]

当前epoch： 0


2489it [1:10:08,  1.69s/it]
0it [00:00, ?it/s]

当前 epoch 的 Train loss: 0.6141455009214464
当前epoch： 1


2489it [1:10:19,  1.70s/it]
0it [00:00, ?it/s]

当前 epoch 的 Train loss: 0.5282878417981203
当前epoch： 2


407it [11:29,  1.69s/it]

### 验证数据集

In [None]:
# Switch the model to evaluation mode
model.eval()

# Accumulators for the validation pass. eval_f1 has to be initialised here
# because the validation loop accumulates into it with `+=`.
eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch

In [None]:
# Run the validation set through the model batch by batch, collect the
# predicted class of every example, and compute macro-F1 once over the whole
# set. (Averaging per-batch macro-F1 gives a different, batch-size-dependent number.)
all_preds, all_labels = [], []
for batch in tqdm(validation_dataloader):
    # Move the batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the batch
    b_input_ids, b_input_mask, b_labels = batch
    # Predict without tracking gradients
    with torch.no_grad():
        # token_type_ids / position_ids are left as None so the model uses
        # its defaults for them.
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        token_type_ids=None,
                        position_ids=None)

    # Move logits and labels to CPU. The logits are pre-softmax scores, one
    # row per example; argmax over the row gives the predicted class id.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    all_preds.append(np.argmax(logits, axis=1))
    all_labels.append(label_ids)

# f1_score expects (y_true, y_pred) as class ids, in that order
eval_f1 = f1_score(np.concatenate(all_labels),
                   np.concatenate(all_preds),
                   average='macro')
print(f"Validation F1: {eval_f1}")

### 保存模型

In [None]:
# Save the fine-tuned model and tokenizer.
# They can then be reloaded using `from_pretrained()`.

output_dir = "./model_save"
# Make sure the target directory exists before anything is written into it
os.makedirs(output_dir, exist_ok=True)
# Unwrap the model if it was wrapped for distributed/parallel training
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save the training arguments together with the trained model.
# The weights are already written by save_pretrained above, so this file holds
# the hyper-parameters only rather than a second copy of the state dict.
training_args = {
    "epochs": epochs,
    "batch_size": batch_size,
    "max_len": MAX_LEN,
    "learning_rate": optimizer.defaults["lr"],
}
torch.save(training_args, os.path.join(output_dir, 'training_args.bin'))

### 读取模型

In [None]:
# GPU check. `device` and `n_gpu` are assigned unconditionally so this cell
# also works on a fresh kernel without CUDA.
print("Is CUDA available: ", torch.cuda.is_available())
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU numbers: ", n_gpu)
    print("device_name: ", torch.cuda.get_device_name(0))
    # Use the second GPU only when there is one; set_device(1) raises on a
    # single-GPU machine.
    if n_gpu > 1:
        torch.cuda.set_device(1)
    print(f"Current device: {torch.cuda.current_device()}")

# Load the fine-tuned model and vocabulary saved earlier.
# The checkpoint was trained as BertForSequenceClassification, so it must be
# reloaded with the same class to restore the classification head.
output_dir = "./model_save"
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
model.to(device)