# Bert+MLP

In [None]:
from fastNLP.io import WeiboNERLoader
# Load the Weibo NER data into a bundle of datasets and print a summary.
data_bundle = WeiboNERLoader().load()
print(data_bundle)
# Preview the first four training instances.
print(data_bundle.get_dataset('train')[:4])

In [None]:
from matplotlib import pyplot as plt

# Gather the tag sequence of every instance, in train / test / dev order.
labels = [
    tag_seq
    for split in ('train', 'test', 'dev')
    for tag_seq in data_bundle.get_dataset(split)['target']
]

# Count how many times each tag occurs over all splits.
dic_label = {}
for label_list in labels:
    for label in label_list:
        dic_label[label] = dic_label.get(label, 0) + 1

In [None]:
# Bar chart of the per-tag counts collected in dic_label.
plt.figure(figsize=(20,10),dpi=200)
plt.title('num_labels')
bar = plt.bar(dic_label.keys(),dic_label.values())
# Write each bar's value at its edge.
plt.bar_label(bar,label_type='edge')
plt.show()

In [None]:
from fastNLP.transformers.torch import BertTokenizer
from fastNLP import cache_results, Vocabulary

def process_data(data_bundle, model_name):
    """Tokenize every dataset in ``data_bundle`` and index its tags.

    For each instance the ``raw_chars`` field is encoded with the tokenizer
    loaded from ``model_name`` and the fields ``input_ids``, ``input_len``,
    ``first`` and ``seq_len`` are added. The ``target`` tags are mapped to
    integers in a new ``new_target`` field, and the tag vocabulary is attached
    to the bundle under the name ``new_target``.

    Returns:
        The same ``data_bundle`` (modified) and the tokenizer.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    def bpe(raw_words):
        """Encode one word sequence as ``CLS + pieces + SEP``.

        ``first`` records, for CLS, every word and SEP, the position inside
        ``input_ids`` of its first piece.
        """
        piece_ids = [tokenizer.cls_token_id]
        first = [0]  # CLS sits at position 0
        for word in raw_words:
            # The next id appended is the first piece of this word.
            first.append(len(piece_ids))
            piece_ids.extend(tokenizer.encode(word, add_special_tokens=False))
        first.append(len(piece_ids))  # position SEP is about to take
        piece_ids.append(tokenizer.sep_token_id)
        return {
            'input_ids': piece_ids,
            'input_len': len(piece_ids),
            'first': first,
            'seq_len': len(raw_words),
        }

    # Apply bpe to the raw_chars field of every instance in every dataset and
    # add the returned entries to the instance as new fields.
    data_bundle.apply_field_more(bpe, field_name='raw_chars', num_proc=4)

    # Tag vocabulary: created without padding or unknown entries.
    tag_vocab = Vocabulary(padding=None, unknown=None)
    # Build the vocabulary from the target field of the training set only.
    train_set = data_bundle.get_dataset('train')
    tag_vocab.from_dataset(train_set, field_name='target')
    # Convert target to indices in every dataset, stored as new_target.
    tag_vocab.index_dataset(
        data_bundle.datasets.values(),
        field_name='target',
        new_field_name='new_target',
    )

    # Attach the vocabulary to the bundle so later code can look it up.
    data_bundle.set_vocab(tag_vocab, field_name='new_target')

    return data_bundle, tokenizer

# Preprocess with the 'hfl/rbt3' tokenizer, then show the bundle summary and
# the first four processed training instances.
data_bundle, tokenizer = process_data(data_bundle, 'hfl/rbt3')
print(data_bundle)
print(data_bundle.get_dataset("train")[:4])

In [None]:
from fastNLP import prepare_torch_dataloader

# Build the dataloaders for the bundle (indexed below by dataset name).
dataloaders = prepare_torch_dataloader(data_bundle, batch_size=8)

for dl in dataloaders.values():
    # set_pad changes how a field is padded.
    dl.set_pad('input_ids', pad_val=tokenizer.pad_token_id)
    # -100 is the default ignore_index of F.cross_entropy, which the model's
    # train_step calls without overriding it.
    dl.set_pad('new_target', pad_val=-100)

In [None]:
import torch
from torch import nn
from fastNLP.transformers.torch import BertModel
from fastNLP import seq_len_to_mask
import torch.nn.functional as F


class BertNER(nn.Module):
    """BERT encoder followed by an MLP that scores one tag per word.

    Args:
        model_name: name or path handed to ``BertModel.from_pretrained``.
        num_class: number of output tags.
    """

    def __init__(self, model_name, num_class):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        # NOTE(review): there is no activation between the two Linear layers;
        # left as is because inserting one would shift the Sequential indices
        # of already-saved weights.
        self.mlp = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                 nn.Dropout(0.3),
                                 nn.Linear(hidden_size, num_class))

    def forward(self, input_ids, input_len, first):
        """Return ``{'pred': scores}`` with one score vector per word.

        ``first`` holds, per sequence, the positions in ``input_ids`` whose
        hidden states are gathered (CLS, the first piece of every word, SEP).
        """
        attention_mask = seq_len_to_mask(input_len)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        # Expand the indices over the hidden dimension so gather can pick the
        # whole hidden vector at every listed position.
        first = first.unsqueeze(-1).repeat(1, 1, last_hidden_state.size(-1))
        first_bpe_state = last_hidden_state.gather(dim=1, index=first)
        # Drop the first and last gathered positions (CLS and SEP).
        # NOTE(review): for sequences shorter than the batch maximum the last
        # column is presumably padding rather than SEP - confirm.
        first_bpe_state = first_bpe_state[:, 1:-1]

        pred = self.mlp(first_bpe_state)
        return {'pred': pred}

    def train_step(self, input_ids, input_len, first, target):
        """Return ``{'loss': loss}``, the cross-entropy of the scores vs ``target``."""
        # The per-step debug prints of tensor shapes were removed: they
        # flooded the output on every optimisation step.
        pred = self(input_ids, input_len, first)['pred']
        # cross_entropy expects the class dimension at index 1.
        loss = F.cross_entropy(pred.transpose(1, 2), target)
        return {'loss': loss}

    def evaluate_step(self, input_ids, input_len, first):
        """Return ``{'pred': indices}``, the arg-max tag index per word."""
        pred = self(input_ids, input_len, first)['pred'].argmax(dim=-1)
        return {'pred': pred}

# One output class per entry of the tag vocabulary.
model = BertNER('hfl/rbt3', len(data_bundle.get_vocab('new_target')))


In [None]:
from torch import optim
from fastNLP import Trainer, LoadBestModelCallback, TorchWarmupCallback
from fastNLP import SpanFPreRecMetric

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# Callbacks are created with their default settings.
callbacks = [
    LoadBestModelCallback(),
    TorchWarmupCallback(),
]
# Span-level metric built on the tag vocabulary, registered under the name "f".
metrics = {
    "f": SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('new_target')),
}

def input_mapping(data):
    """Expose the indexed tags of a batch under the key ``target``.

    The value stored at ``data['new_target']`` is also stored at
    ``data['target']``; the batch is modified in place and returned.
    """
    indexed_tags = data['new_target']
    data['target'] = indexed_tags
    return data

# Train on the train dataloader for 50 epochs, evaluating on the dev
# dataloader; input_mapping is applied to the batches so the model receives
# the indexed tags as `target`.
trainer = Trainer(model=model, train_dataloader=dataloaders['train'], optimizers=optimizer,
                  evaluate_dataloaders=dataloaders['dev'],
                  metrics=metrics, n_epochs=50, callbacks=callbacks,
                  monitor='f#f',device='cuda',driver="torch",input_mapping=input_mapping)
trainer.run()

In [None]:
from fastNLP import Evaluator

def output_labeling(evaluator, batch):
    """Print each sentence of ``batch`` with its predicted and reference tags.

    Args:
        evaluator: object whose ``evaluate_step(batch)`` returns a dict with
            the predicted tag indices under ``"pred"``.
        batch: batch providing ``"raw_chars"`` and ``"target"``.
    """
    outputs = evaluator.evaluate_step(batch)["pred"]
    raw_words, raw_targets = batch["raw_chars"], batch["target"]
    # Looked up once per batch; the original repeated this lookup for every
    # single predicted token.
    idx2word = data_bundle.get_vocab("new_target").idx2word
    for words, raw_target, output in zip(raw_words, raw_targets, outputs):

        print("sentence:", words)
        # Keep only as many predictions as the sentence has characters.
        labels = [idx2word[idx] for idx in output[:len(words)].tolist()]
        print("labels:", labels)
        print("target:", raw_target)

# Evaluate on the test dataloader, handling each batch with output_labeling.
evaluator = Evaluator(model=model, dataloaders=dataloaders["test"],
                      device=0, evaluate_batch_step_fn=output_labeling)
# NOTE(review): the positional 1 presumably limits the run to a single batch -
# confirm against Evaluator.run.
evaluator.run(1)

In [None]:
import os

# Saves the whole module object rather than a state_dict, so loading it back
# requires the BertNER class definition to be available.
torch.save(model,'rbt3-mlp-ner.pth')

In [None]:
# 通过函数处理数据
from fastNLP.io import DataBundle
from fastNLP import DataSet, Instance

def text2dataset(text:str):
    """Wrap ``text`` in a DataSet holding one instance.

    The instance has a single field ``raw_words`` containing the characters of
    ``text``. An empty string yields an empty DataSet.
    """
    ds = DataSet()
    if text == '':
        return ds
    chars = list(text)
    ds.append(Instance(raw_words=chars))
    return ds

# Sample sentence to run prediction on.
text = '我今天就要在中国传媒大学吃上崔永元真面！'

# Bundle with a single dataset named "predict" built from the sentence.
predict_data_bundle = DataBundle(datasets={
    "predict": text2dataset(text),
})
print(predict_data_bundle)
print(predict_data_bundle.get_dataset("predict"))

In [None]:
from fastNLP.transformers.torch import BertTokenizer
from fastNLP import cache_results, Vocabulary

def process_predict_data(data_bundle, model_name):
    """Tokenize the ``raw_words`` field of every dataset in ``data_bundle``.

    Adds the fields ``input_ids``, ``input_len``, ``first`` and ``seq_len`` to
    every instance, using the tokenizer loaded from ``model_name``.

    Returns:
        The same ``data_bundle`` (modified) and the tokenizer.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    def bpe(raw_words):
        """Encode one word sequence as ``CLS + pieces + SEP``.

        ``first`` records, for CLS, every word and SEP, the position inside
        ``input_ids`` of its first piece.
        """
        piece_ids = [tokenizer.cls_token_id]
        first = [0]  # CLS sits at position 0
        for word in raw_words:
            # The next id appended is the first piece of this word.
            first.append(len(piece_ids))
            piece_ids.extend(tokenizer.encode(word, add_special_tokens=False))
        first.append(len(piece_ids))  # position SEP is about to take
        piece_ids.append(tokenizer.sep_token_id)
        return {
            'input_ids': piece_ids,
            'input_len': len(piece_ids),
            'first': first,
            'seq_len': len(raw_words),
        }

    # Apply bpe to the raw_words field of every instance in every dataset and
    # add the returned entries to the instance as new fields.
    data_bundle.apply_field_more(bpe, field_name='raw_words', num_proc=1)

    return data_bundle, tokenizer

# Tokenize the prediction bundle with the same 'hfl/rbt3' tokenizer name that
# was used for training.
predict_data_bundle, predict_tokenizer = process_predict_data(predict_data_bundle, 'hfl/rbt3')

print(predict_data_bundle)
print(predict_data_bundle.get_dataset("predict"))

from fastNLP import prepare_torch_dataloader

# One instance per batch.
predict_dataloaders = prepare_torch_dataloader(predict_data_bundle, batch_size=1)

In [None]:
# Reload the module object written by torch.save(model, ...).
# NOTE(review): this unpickles arbitrary objects, so only load trusted files.
# Recent PyTorch releases default torch.load to weights_only=True, which
# rejects full-module pickles - confirm against the installed version.
ner_model = torch.load('rbt3-mlp-ner.pth')

# presumably iterating the vocabulary yields (label, index) pairs, which is
# how the helper cells below consume this list - verify.
label_idx_list = list(data_bundle.get_vocab("new_target"))

In [None]:
def write_list_into_text(path,label_idx_list):
    """Write ``(label, index)`` pairs to ``path``, one ``label,index`` per line.

    Args:
        path: destination file; an existing file is overwritten.
        label_idx_list: iterable of pairs; element 0 is the label and
            element 1 the index. Both are converted with ``str``.
    """
    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            str(pair[0]) + ',' + str(pair[1]) + '\n'
            for pair in label_idx_list
        )
        print('write over!')
# Persist the label/index pairs to label_idx_list.txt.
write_list_into_text('label_idx_list.txt',label_idx_list)

In [None]:
def read_list_from_text(path):
    """Read ``label,index`` lines from ``path`` into ``(label, index)`` pairs.

    Blank lines are skipped. Each remaining line is split at its last comma,
    so a label may itself contain commas.

    Returns:
        list of ``(str, int)`` tuples in file order.

    Raises:
        ValueError: if a non-blank line has no comma or its index part is not
            an integer.
    """
    final_list = []
    # Explicit UTF-8 so reading does not depend on the platform's default
    # encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray blank line previously raised IndexError.
                continue
            label, idx = line.rsplit(',', 1)
            final_list.append((label, int(idx)))
    return final_list
# Replace the in-memory list with the pairs read back from label_idx_list.txt.
label_idx_list = read_list_from_text('label_idx_list.txt')

In [None]:
def idx2label(label_idx_list:list,idx:int):
    """Return the label paired with ``idx`` in ``label_idx_list``.

    ``label_idx_list`` holds ``(label, index)`` pairs, e.g.
    ``[('O', 0), (label, idx), ...]``. If several pairs carry ``idx`` the last
    one wins; ``None`` is returned when no pair matches.
    """
    matches = [pair[0] for pair in label_idx_list if pair[1] == idx]
    return matches[-1] if matches else None

def predict_output_labeling(evaluator, batch):
    """Print each sentence of ``batch`` with its predicted labels.

    After all sentences, the raw predictions returned by
    ``evaluator.evaluate_step`` are printed as well.
    """
    outputs = evaluator.evaluate_step(batch)["pred"]
    sentences = batch["raw_words"]
    for words, output in zip(sentences, outputs):
        print("sentence:", words)
        # Keep only as many predictions as the sentence has words.
        predicted = output[:len(words)].tolist()
        labels = [idx2label(label_idx_list, idx) for idx in predicted]
        print("labels:", labels)
    print('outputs:',outputs)
# Run the "predict" dataloader through an Evaluator, printing labels per batch.
# NOTE(review): this passes the in-memory `model`, not the `ner_model` that
# was reloaded from disk - confirm which one is intended.
predictor = Evaluator(model=model, dataloaders=predict_dataloaders["predict"],
                      device=0, evaluate_batch_step_fn=predict_output_labeling)
predictor.run()

In [None]:
# Run on whichever device the reloaded model's parameters live on.
dev = next(ner_model.parameters()).device
ner_model.eval()

# NOTE(review): no torch.no_grad() is used here - consider adding one for
# inference.
for data in predict_dataloaders['predict']:
    input_ids = torch.LongTensor(data['input_ids']).to(dev)
    input_len = torch.LongTensor(data['input_len']).to(dev)
    first =     torch.LongTensor(data['first']).to(dev)
    
    result = ner_model.evaluate_step(input_ids,input_len,first)['pred']
    
# `result` is overwritten on every iteration, so this prints the prediction of
# the last batch only.
print(result)

In [None]:
# Map every predicted index of the first sequence in `result` to its label.
print([idx2label(label_idx_list,r) for r in result[0]])

# BiLSTM+CRF

In [1]:
from fastNLP.io import WeiboNERLoader
# Reload the Weibo NER data from scratch for the second model and preview it.
data_bundle = WeiboNERLoader().load()
print(data_bundle)
print(data_bundle.get_dataset('train')[:4])

  from .autonotebook import tqdm as notebook_tqdm


In total 3 datasets:
	dev has 270 instances.
	test has 270 instances.
	train has 1350 instances.

+------------------------------------------+------------------------------------------+
| raw_chars                                | target                                   |
+------------------------------------------+------------------------------------------+
| ['科', '技', '全', '方', '位', '资', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['对', '，', '输', '给', '一', '个', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER... |
| ['今', '天', '下', '午', '起', '来', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['今', '年', '拜', '年', '不', '短', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
+------------------------------------------+------------------------------------------+


In [2]:
from fastNLP.transformers.torch import BertTokenizer
from fastNLP import cache_results, Vocabulary

def process_data(data_bundle, model_name):
    """Tokenize every dataset in ``data_bundle`` and index its tags.

    For each instance the whole ``raw_chars`` sequence is encoded in a single
    tokenizer call and the fields ``input_ids``, ``input_len`` and ``seq_len``
    are added. The ``target`` tags are mapped to integers in a new
    ``new_target`` field, and the tag vocabulary is attached to the bundle
    under the name ``new_target``.

    Returns:
        The same ``data_bundle`` (modified) and the tokenizer.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)

    def get_tokens(raw_words):
        """Encode one sequence as ``CLS + ids + SEP``."""
        inner_ids = tokenizer.encode(raw_words, add_special_tokens=False)
        tokens = [tokenizer.cls_token_id, *inner_ids, tokenizer.sep_token_id]
        return {
            'input_ids': tokens,
            'input_len': len(tokens),
            'seq_len': len(raw_words),
        }

    # Apply get_tokens to the raw_chars field of every instance in every
    # dataset and add the returned entries to the instance as new fields.
    data_bundle.apply_field_more(get_tokens, field_name='raw_chars', num_proc=4)

    # Tag vocabulary: created without padding or unknown entries.
    tag_vocab = Vocabulary(padding=None, unknown=None)
    # Build the vocabulary from the target field of the training set only.
    train_set = data_bundle.get_dataset('train')
    tag_vocab.from_dataset(train_set, field_name='target')
    # Convert target to indices in every dataset, stored as new_target.
    tag_vocab.index_dataset(
        data_bundle.datasets.values(),
        field_name='target',
        new_field_name='new_target',
    )

    # Attach the vocabulary to the bundle so later code can look it up.
    data_bundle.set_vocab(tag_vocab, field_name='new_target')

    return data_bundle, tokenizer

# Preprocess with the 'hfl/rbt3' tokenizer, then show the bundle summary and
# training instances 790-799.
data_bundle, tokenizer = process_data(data_bundle, 'hfl/rbt3')
print(data_bundle)
print(data_bundle.get_dataset("train")[790:800])

In total 3 datasets:
	dev has 270 instances.
	test has 270 instances.
	train has 1350 instances.
In total 1 vocabs:
	new_target has 17 entries.

+------------------+------------------+------------------+-----------+---------+------------------+
| raw_chars        | target           | input_ids        | input_len | seq_len | new_target       |
+------------------+------------------+------------------+-----------+---------+------------------+
| ['【', '小', ... | ['O', 'O', 'O... | [101, 523, 22... | 68        | 66      | [0, 0, 0, 0, ... |
| ['当', '明', ... | ['O', 'O', 'O... | [101, 2496, 3... | 111       | 109     | [0, 0, 0, 0, ... |
| ['人', '生', ... | ['O', 'O', 'O... | [101, 782, 44... | 132       | 130     | [0, 0, 0, 0, ... |
| ['詹', '姆', ... | ['B-PER.NAM',... | [101, 6285, 1... | 35        | 33      | [4, 2, 2, 0, ... |
| ['他', '剪', ... | ['O', 'O', 'O... | [101, 800, 11... | 63        | 61      | [0, 0, 0, 0, ... |
| ['#', 'T', 'S... | ['O', 'O', 'O... | [101, 108, 10... | 73    

In [13]:
from fastNLP import prepare_torch_dataloader

# Build shuffled dataloaders with 128 instances per batch.
dataloaders = prepare_torch_dataloader(data_bundle, batch_size=128,shuffle=True)

for dl in dataloaders.values():
    # set_pad changes how a field is padded.
    dl.set_pad('input_ids', pad_val=tokenizer.pad_token_id)
    # Padded tag positions get index 0; the model builds a length mask
    # (seq_len_to_mask(input_len-2)) that it passes to the CRF.
    dl.set_pad('new_target', pad_val=0)

In [12]:
# 第2\3种
import torch
from torch import nn
from fastNLP.transformers.torch import BertModel
from fastNLP import seq_len_to_mask
import torch.nn.functional as F
from fastNLP.modules.torch import ConditionalRandomField

class BertBilstmCrfNER(nn.Module):
    """BERT features fed to a 2-layer BiLSTM, a linear layer and a CRF.

    BERT is run under ``torch.no_grad()`` in ``forward``, so no gradients flow
    into it from this model.

    Args:
        model_name: name or path handed to ``BertModel.from_pretrained``.
        num_class: number of tags.
        embedding_dim: input size of the LSTM.
            NOTE(review): presumably must equal the BERT hidden size - confirm
            when switching to another checkpoint.
        hidden_size: LSTM hidden size per direction.
        dropout: dropout probability applied to the LSTM output.
    """

    def __init__(self, model_name,num_class, embedding_dim = 768,hidden_size=512,dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            num_layers=2,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # The LSTM is bidirectional, hence twice the hidden size.
        self.fc = nn.Linear(hidden_size * 2, num_class)
        self.crf = ConditionalRandomField(num_class)

    def forward(self, input_ids, input_len,target=None):
        """Return ``{'loss': ...}`` when ``target`` is given, else ``{'pred': ...}``."""
        attention_mask = seq_len_to_mask(input_len)
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Drop the first and the last position along the sequence axis.
        first_bpe_state = last_hidden_state[:, 1:-1]
        feats, _ = self.lstm(first_bpe_state)
        # Dropout now regularises the LSTM features *before* the projection.
        # It used to run after `fc`, randomly zeroing tag scores right before
        # log_softmax and the CRF. Parameters are unchanged and dropout is the
        # identity in eval mode, so saved models predict exactly as before.
        feats = self.dropout(feats)
        feats = self.fc(feats)
        logits = F.log_softmax(feats, dim=-1)

        # Lengths without the two special tokens.
        mask = seq_len_to_mask(input_len-2)

        if target is None:
            pred, _ = self.crf.viterbi_decode(logits, mask)
            return {'pred': pred}
        else:
            # Average the values the CRF returns into a single scalar loss.
            loss = self.crf(logits, target, mask).mean()
            return {'loss': loss}

    def train_step(self, input_ids, input_len, target):
        """Return ``{'loss': loss}`` for one batch."""
        return self(input_ids, input_len,target)

    def evaluate_step(self, input_ids, input_len):
        """Return ``{'pred': pred}`` for one batch."""
        return self(input_ids, input_len)

# One output class per entry of the tag vocabulary.
model = BertBilstmCrfNER('hfl/rbt3', len(data_bundle.get_vocab('new_target')))

In [None]:
from torch import optim
from fastNLP import Trainer, LoadBestModelCallback, TorchWarmupCallback
from fastNLP import SpanFPreRecMetric

# NOTE(review): lr=2e-2 is far larger than the 2e-5 used for the MLP model.
# BERT runs under torch.no_grad() in forward, so only the LSTM, linear and CRF
# layers receive gradients - confirm this rate is intended.
optimizer = optim.AdamW(model.parameters(), lr=2e-2, eps=1e-6)
# Callbacks are created with their default settings.
callbacks = [
    LoadBestModelCallback(),
    TorchWarmupCallback(),
]
# Span-level metric built on the tag vocabulary, registered under the name "f".
metrics = {
    "f": SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('new_target')),
}

def input_mapping(data):
    """Expose the indexed tags of a batch under the key ``target``.

    The value stored at ``data['new_target']`` is also stored at
    ``data['target']``; the batch is modified in place and returned.
    """
    indexed_tags = data['new_target']
    data['target'] = indexed_tags
    return data

# Train on the train dataloader for 100 epochs, evaluating on the dev
# dataloader; input_mapping is applied to the batches so the model receives
# the indexed tags as `target`.
trainer = Trainer(model=model, 
                  train_dataloader=dataloaders['train'],
                  evaluate_dataloaders=dataloaders['dev'], 
                  metrics=metrics, 
                  optimizers=optimizer,
                  n_epochs=100, 
                  callbacks=callbacks,
                  monitor='f#f',
                  device='cuda',
                  driver="torch",
                  input_mapping=input_mapping)
trainer.run()

In [16]:
# Saves the whole module object rather than a state_dict, so loading it back
# requires the BertBilstmCrfNER class definition to be available.
torch.save(model,'rbt3-bilstm-crf-ner70.pth')

In [None]:
# Display the dev dataset as the cell's output.
data_bundle.get_dataset('dev')

In [None]:
# Debug helper: find the first dev instance whose input_len is not
# seq_len + 2, i.e. where the tokenizer output is not one id per character
# plus CLS and SEP.
dev_set = data_bundle.get_dataset('dev')
input_lens = dev_set.get_field('input_len')
seq_lens = dev_set.get_field('seq_len')

# The variable name is kept because a later cell reads it.
fuck = None
# Bounded by the dataset size; the previous hard-coded range(1000) ran past
# the end of the 270-instance dev set when no mismatch existed.
for i in range(len(dev_set)):
    # NOTE(review): .item() assumes the seq_len entries are tensor/NumPy
    # scalars - kept from the original, confirm.
    if input_lens[i]-seq_lens[i].item()!=2:
        fuck = dev_set[i]
        break

if fuck is None:
    print('no dev instance with input_len - seq_len != 2')
else:
    print(len(fuck['raw_chars']))
    print(fuck['raw_chars'])
    print(len(fuck['target']))
    print(fuck['target'])

    print(len(fuck['input_ids']))
    print(fuck['input_ids'])

    print(fuck['input_len'])

In [None]:
# Display the first 100 dev instances as the cell's output.
data_bundle.get_dataset('dev')[:100]

In [None]:
# Turn the ids of the mismatching instance back into text for inspection.
words = tokenizer.decode(fuck['input_ids'])
print(words)

In [None]:
# Decode a hard-coded id sequence (starts with 101 and ends with 102) to
# inspect the text it corresponds to.
tokenizer.decode([101, 138, 5862, 1383, 140, 4510, 6413, 2458, 6858, 5018, 753, 2476, 928, 4500, 1305, 704, 8024, 119, 119, 119, 100, 2445, 1044, 4495, 8024, 2644, 4638, 6821, 697, 2476, 1305, 3221, 1066, 775, 4638, 671, 702, 2600, 7583, 2428, 511, 100, 100, 738, 2218, 3221, 6432, 6206, 3221, 2769, 5314, 671, 2476, 1305, 1957, 1351, 8024, 1961, 1170, 749, 2600, 7583, 2428, 4638, 126, 126, 110, 8024, 2769, 2218, 1372, 5543, 1170, 2600, 7583, 2428, 4638, 125, 126, 110, 749, 8043, 100, 100, 928, 4500, 1305, 788, 7361, 3315, 782, 886, 4500, 8024, 6435, 1257, 2199, 1305, 769, 5314, 2644, 4638, 2157, 782, 2772, 3301, 1351, 8013, 100, 7561, 3198, 2682, 6629, 5635, 4415, 1399, 6241, 100, 154, 151, 161, 162, 147, 156, 8024, 156, 147, 164, 147, 160, 147, 164, 147, 160, 149, 151, 8148, 9074, 12298, 10585, 12333, 12298, 10905, 12026, 12298, 11537, 151, 8144, 10927, 11854, 10301, 11345, 11089, 10927, 157, 8144, 167, 8144, 157, 8144, 163, 8152, 160, 8152, 165, 8152, 12224, 11651, 10440, 107, 138, 1677, 1677, 140, 102])

In [None]:
# Decode the single id 102 (presumably the SEP token - verify).
tokenizer.decode(102)