In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
import pandas as pd
import os
from config import *

## 源数据预处理

In [2]:
# 获取文件名（编号）
def get_doc_ids(data_dir):
    fileids = set()
    for filename in os.listdir(data_dir):
        fileids.add(filename.split('.')[0])
    return fileids

fileids = get_doc_ids(raw_train_data)
# fileids

In [3]:
# 读取单个txt文件内容
def read_single_txt(data_dir, doc_id):
    filename = os.path.join(data_dir, doc_id + '.txt')
    with open(filename, encoding='utf-8') as f:
        text = f.read()
    return text

text = read_single_txt(raw_train_data, '0')
# text

In [4]:
# 读取单个标注文件内容
def read_single_ann(data_dir, doc_id):
    ann = pd.read_csv(data_dir + doc_id + '.ann', header=None, sep='\t', encoding='utf-8')
    return ann

ann = read_single_ann(raw_train_data, '0')    
ann.head()

Unnamed: 0,0,1,2
0,T1,Disease 1845 1850,1型糖尿病
1,T2,Disease 1983 1988,1型糖尿病
2,T4,Disease 30 35,2型糖尿病
3,T5,Disease 1822 1827,2型糖尿病
4,T6,Disease 2055 2060,2型糖尿病


In [5]:
def get_char_data(data_dir):
    texts = []
    tags = []

    # 获取所有文件编号
    fileids = get_doc_ids(data_dir)

    for fileid in fileids:
        text = read_single_txt(data_dir, fileid)
        text_list = [char for char in text]
        
        tag_list = ['O' for _ in range(len(text_list))]
        tag = read_single_ann(data_dir, fileid)
        
        for i in range(tag.shape[0]):
            tag_item = tag.iloc[i][1].split(' ')
            label, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])

            tag_list[start] = 'B-' + label
            for j in range(start + 1, end):
                tag_list[j] = 'I-' + label
                
        assert (len(text_list) == len(tag_list))
        texts.append(text_list)
        tags.append(tag_list)
    return texts,tags

texts, tags = get_char_data(mytrain_data)
print(texts[:1])

[['大', '学', '第', '一', '医', '院', '内', '分', '泌', '科', '(', '郭', '晓', '蕙', ')', ',', '心', '内', '科', '(', '霍', '勇', ')', ';', '南', '京', '鼓', '楼', '医', '院', '内', '分', '泌', '科', '(', '朱', '大', '龙', ')', ';', '四', '川', '大', '学', '华', '西', '医', '院', '内', '分', '泌', '科', '(', '童', '南', '伟', ')', ';', '北', '京', '安', '贞', '医', '院', '肾', '内', '\n', '科', '(', '谌', '贻', '璞', ')', ',', '流', '行', '病', '研', '究', '室', '(', '赵', '冬', ')', ';', '第', '四', '军', '医', '大', '学', '唐', '都', '医', '院', '内', '分', '泌', '科', '(', '焦', '凯', ')', ';', '火', '箭', '军', '总', '医', '院', '内', '分', '泌', '科', '(', '李', '全', '民', ')', ';', '中', '山', '大', '学', '孙', '≥', '仙', '纪', '念', '医', '院', '内', '分', '\n', '泌', '科', '(', '李', '焱', ')', ';', '哈', '尔', '滨', '医', '科', '大', '学', '附', '属', '第', '一', '医', '院', '内', '分', '泌', '科', '(', '李', '艳', '波', ')', ';', '北', '京', '协', '和', '医', '院', '内', '分', '泌', '科', '(', '李', '玉', '秀', ')', ';', '南', '昌', '大', '学', '第', '一', '附', '属', '医', '院', '内', '分', '泌', '科', '(', '刘', '建', '英', ')', '

In [6]:
print(tags[:1])

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

## 拆分数据集

In [7]:
def split_train_data(texts, tags, rate=0.1):
#     if os.path.exists(train_file):
#         os.remove(train_file)
#     if os.path.exists(dev_file):
#         os.remove(dev_file)

    split_chars = ['。', '！', '？', '，']
    train_num = 0
    dev_num = 0
    doc_dev_num = int(len(texts) * rate)
    print('文档总数:', len(texts))
    print('测试集文档数量:', doc_dev_num)
    
    with open(my_train_data, 'a', encoding='utf-8') as f:
        for k in range(len(texts) - doc_dev_num):
            text_ = texts[k]
            tag_ = tags[k]
            for p in range(len(text_)):
                if text_[p] == '\n': 
                    f.write('LB' + '\t' + tag_[p] + '\n')
                elif text_[p] == ' ':
                    f.write('SPACE' + '\t' + tag_[p] + '\n')
                elif text_[p] in split_chars:
                    train_num += 1
                    f.write(text_[p] + '\t' + tag_[p] + '\n\n')
                else:
                    f.write(text_[p] + '\t' + tag_[p] + '\n')
                    
        with open(my_dev_data, 'a', encoding='utf-8') as f:
            for k in range(len(texts) - doc_dev_num, len(texts)):
                text_ = texts[k]
                tag_ = tags[k]
                for p in range(len(text_)):
                    if text_[p] == '\n':
                        f.write('LB' + '\t' + tag_[p] + '\n')
                    elif text_[p] == ' ':
                        f.write('SPACE' + '\t' + tag_[p] + '\n')
                    elif text_[p] in split_chars:
                        dev_num += 1
                        f.write(text_[p] + '\t' + tag_[p] + '\n\n')
                    else:
                        f.write(text_[p] + '\t' + tag_[p] + '\n')
    
    print('train_num:{}, dev_num:{}'.format(train_num, dev_num))
    
split_train_data(texts, tags)

文档总数: 24
测试集文档数量: 2
train_num:2637, dev_num:239


## 构建字典

In [8]:
def read_corpus(corpus_path):
    data = []
    with open(corpus_path, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    text_, tag_ = [], []
    for line in lines:
        if line != '\n':
            [char, label] = line.strip().split()
            text_.append(char)
            tag_.append(label)
        else:
            data.append((text_, tag_))
            text_, tag_ = [], []

    return data

train_data = read_corpus(my_train_data)
dev_data = read_corpus(my_dev_data)

print(train_data[:2])

[(['心', '血', '管', '疾', '病', '合', '并', '糖', '尿', '病', '口', '服', '降', '糖', '药', '物', '应', '用', '专', '家', '共', '识', 'LB', '高', '颖', 'SPACE', '杨', '光', '燃', 'SPACE', '周', '迎', '生', 'SPACE', '洪', '天', '配', 'SPACE', '姜', '红', '孙', '宁', '玲', '严', '晓', '伟', 'SPACE', '李', '建', '军', 'LB', '董', '吁', '钢', 'SPACE', '李', '新', '立', 'SPACE', '李', '凌', 'SPACE', '杨', '萍', 'SPACE', '李', '全', '民', '唐', '梅', '谌', '贻', '璞', 'SPACE', '马', '长', '生', 'LB', '2', '0', '0', '8', '年', '中', '国', '糖', '尿', '病', '流', '行', '病', '学', '调', '查', '显', '示', ',', '中', '国', '2', '0', '岁', '及', '以', 'LB', '上', '人', '群', '2', '型', '糖', '尿', '病', '患', '病', '率', '为', '9', '.', '7', '%', '⋯', '。'], ['B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'B-Drug', 'I-Drug', 'I-Drug', 'I-Drug', 'I-Drug', 'I-Drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [9]:
import pickle

def build_vocab(data, min_count=3):
    char2id = {}
    for text_, tag_ in data:
        for char in text_:
            if char.isdigit():
                char = '<NUM>'
            elif ('\u0041' <= char <='\u005a') or ('\u0061' <= char <='\u007a'):  # A-Z or a-z
                char = '<ENG>'
            if char not in char2id:
                char2id[char] = [len(char2id)+1, 1]
            else:
                char2id[char][1] += 1
    # 低频字           
    low_freq_chars = []
    for char, [char_id, char_freq] in char2id.items():
        if char_freq < min_count and char != '<NUM>' and char != '<ENG>':
            low_freq_chars.append(char)
    
    for char in low_freq_chars:
        del char2id[char]

    new_id = 1
    for char in char2id.keys():
        char2id[char] = new_id
        new_id += 1
    char2id['<UNK>'] = new_id
    char2id['<PAD>'] = 0
    
    print(char2id)

    with open(vocab_pkl, 'wb') as fw:
        pickle.dump(char2id, fw)
        
        
def read_dictionary():
    with open(vocab_pkl, 'rb') as fr:
        char2id = pickle.load(fr)
    print('vocab_size:', len(char2id))
    id2char = dict([(idx, char) for char, idx in char2id.items()])
    return char2id, id2char

build_vocab(train_data + dev_data)
char2id, id2char = read_dictionary()

{'心': 1, '血': 2, '管': 3, '疾': 4, '病': 5, '合': 6, '并': 7, '糖': 8, '尿': 9, '口': 10, '服': 11, '降': 12, '药': 13, '物': 14, '应': 15, '用': 16, '专': 17, '家': 18, '共': 19, '识': 20, '<ENG>': 21, '高': 22, '颖': 23, '杨': 24, '光': 25, '燃': 26, '周': 27, '迎': 28, '生': 29, '洪': 30, '天': 31, '配': 32, '姜': 33, '红': 34, '孙': 35, '宁': 36, '玲': 37, '严': 38, '晓': 39, '伟': 40, '李': 41, '建': 42, '军': 43, '董': 44, '吁': 45, '钢': 46, '新': 47, '立': 48, '凌': 49, '萍': 50, '全': 51, '民': 52, '唐': 53, '梅': 54, '谌': 55, '贻': 56, '璞': 57, '马': 58, '长': 59, '<NUM>': 60, '年': 61, '中': 62, '国': 63, '流': 64, '行': 65, '学': 66, '调': 67, '查': 68, '显': 69, '示': 70, ',': 71, '岁': 72, '及': 73, '以': 74, '上': 75, '人': 76, '群': 77, '型': 78, '患': 79, '率': 80, '为': 81, '.': 82, '%': 83, '⋯': 84, '。': 85, '的': 86, '成': 87, '际': 88, '联': 89, '盟': 90, '发': 91, '布': 92, '第': 93, '版': 94, '“': 95, '地': 96, '图': 97, '”': 98, '公': 99, '数': 100, '万': 101, '居': 102, '球': 103, '首': 104, '位': 105, '而': 106, '与': 107, '关': 108, '系': 109, '密': 110,

## 构造训练数据集

In [10]:
tags = ['O', 'B-Disease', 'I-Disease', 'B-Reason', 'I-Reason', "B-Symptom", "I-Symptom", "B-Test", "I-Test",
        "B-Test_Value", "I-Test_Value", "B-Drug", "I-Drug", "B-Frequency", "I-Frequency", "B-Amount","I-Amount",
        "B-Treatment", "I-Treatment", "B-Operation", "I-Operation", "B-Method", "I-Method","B-SideEff", "I-SideEff",
        "B-Anatomy", "I-Anatomy", "B-Level", "I-Level", "B-Duration", "I-Duration"]

tag2idx = dict(zip(tags, range(len(tags))))

idx2tag = {idx: tag for tag, idx in tag2idx.items()}

def build_data(data, char2id, tag2idx):
    datas, tags = [], []
    with open(data, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
        
    text_, tag_ = [], []
    for line in lines:
        if line != '\n':
            [char, tag] = line.strip().split()
            text_.append(char)
            tag_.append(tag)
        else:
            text_ids = [char2id[char] if char in char2id else char2id['<UNK>'] for char in text_]
            tag_ids = [tag2idx[tag] if tag in tag2idx else 0 for tag in tag_]
            datas.append(text_ids)
            tags.append(tag_ids)
            text_, tag_ = [], []
    return datas, tags

# 加载训练集
x_train, y_train = build_data(my_train_data, char2id, tag2idx)
# 加载测试集
x_dev, y_dev = build_data(my_dev_data, char2id, tag2idx)

print(x_train[:2])
print(y_train[:2])

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 10, 11, 12, 8, 13, 14, 15, 16, 17, 18, 19, 20, 2078, 22, 23, 2078, 24, 25, 26, 2078, 27, 28, 29, 2078, 30, 31, 32, 2078, 33, 34, 35, 36, 37, 38, 39, 40, 2078, 41, 42, 43, 2078, 44, 45, 46, 2078, 41, 47, 48, 2078, 41, 49, 2078, 24, 50, 2078, 41, 51, 52, 53, 54, 55, 56, 57, 2078, 58, 59, 29, 2078, 2078, 2078, 2078, 2078, 61, 62, 63, 8, 9, 5, 64, 65, 5, 66, 67, 68, 69, 70, 71, 62, 63, 2078, 2078, 72, 73, 74, 2078, 75, 76, 77, 2078, 78, 8, 9, 5, 79, 5, 80, 81, 2078, 82, 2078, 83, 84, 85], [2078, 2078, 2078, 2078, 61, 67, 68, 86, 51, 63, 2078, 87, 61, 76, 8, 9, 5, 79, 5, 80, 81, 2078, 2078, 82, 2078, 83, 85]]
[[1, 2, 2, 2, 2, 0, 0, 1, 2, 2, 11, 12, 12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 

In [11]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# padding
x_train = pad_sequences(x_train, maxlen)
y_train = pad_sequences(y_train, maxlen)

x_dev = pad_sequences(x_dev, maxlen)
y_dev = pad_sequences(y_dev, maxlen)
print('x_train shape:', x_train.shape)
print('x_dev shape:', x_dev.shape)

# 将整型标签转为onehot,y为int数组
num_classes = len(tags)
print('num_classes:',num_classes)
y_train = to_categorical(y_train, num_classes)
y_dev = to_categorical(y_dev, num_classes)
print('y_train shape:', y_train.shape)
print('y_dev shape:', y_dev.shape)

Using TensorFlow backend.


x_train shape: (13261, 200)
x_dev shape: (1119, 200)
num_classes: 31
y_train shape: (13261, 200, 31)
y_dev shape: (1119, 200, 31)


## 模型构建

In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras_contrib.layers import CRF
from keras.layers import Input, Embedding, Bidirectional, LSTM
from keras import Sequential
from keras import backend as K
K.clear_session() # 防止OOM

class BiLSTM_CRF(object):
    def __init__(self, maxlen, vocab_size, embedding_dims, class_num):
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.model = self.build_model()
        
    def build_model(self):
        model = Sequential() # 顺序模型是多个网络层的线性堆叠
        model.add(Embedding(self.vocab_size, self.embedding_dims, mask_zero=False)) # mask_zero:是否将输入中的0看作是被忽略的padding值
        model.add(Bidirectional(LSTM(128, return_sequences=True))) # return_sequences:只返回最后一个状态的输出
        crf = CRF(self.class_num, sparse_target=True)
        model.add(crf)
        model.summary()
        model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model    

## 模型训练

In [13]:
from keras.callbacks import ModelCheckpoint

vocab_size = len(char2id)
print(vocab_size)

model = BiLSTM_CRF(maxlen, vocab_size, embedding_dims, num_classes).build_model()

# model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,verbose=1,validation_split=0.1) # batch_size：默认32
  
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,verbose=1,validation_data=[x_dev, y_dev],callbacks=[checkpoint])


# # 模型评估
# score = model.evaluate(x_dev, y_dev, batch_size=batch_size)
# print(model.metrics_names)
# print(score)

# # 模型保存
# model.save("./model/lstm_crf.h5")

2079
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          133056    
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 256)         197632    
_________________________________________________________________
crf_1 (CRF)                  (None, None, 31)          8990      
Total params: 339,678
Trainable params: 339,678
Non-trainable params: 0
_________________________________________________________________




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          133056    
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 256)         197632    
_________________________________________________________________
crf_2 (CRF)                  (None, None, 31)          8990      
Total params: 339,678
Trainable params: 339,678
Non-trainable params: 0
_________________________________________________________________

Train on 13261 samples, validate on 1119 samples
Epoch 1/1




<keras.callbacks.callbacks.History at 0x16132f8b888>

## 模型预测

### 多个文件预测

In [14]:
from tqdm import tqdm
import numpy as np

def read_txt(txt_file):
    with open(txt_file, encoding='utf-8') as f:
        text = f.read()
    return text


def process_data(text, char2id, maxlen = 2000):
    x = [char2id.get(w, 1) for w in text]
    length = len(x)
    print('length:', length)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length


def savefile(tag_file, result_tags, predict_text):
    tags = []
    for tag in result_tags:
        if tag != 'O':
            tag_ = tag.split('-')[1]
        else:
            tag_ = tag
        tags.append(tag_)
    
    # write here
    prev = tags[0]
    start = 0
    num = 0
    for i in range(1, len(tags)):
        cur = tags[i]
        if cur != prev:
            end = i
            if prev != 'O':
                num += 1
                content = predict_text[start:end]
                content = content.replace('\n',' ')
                tag_file.write('T' + str(num) + '\t' + prev + ' ' + str(start) + ' ' + str(end) + '\t'+ content + '\n') 
            start = i
            prev = cur
    tag_file.close()


def test(test_dir, submit_dir, model, char2id, tags):
    for filename in tqdm(os.listdir(test_dir)):
        fileid = filename.split('.')[0]
        txt_file = test_dir + filename
        tag_file = open(submit_dir + fileid +'.ann', 'w', encoding='utf-8')
        # 读取txt内容
        predict_text = read_txt(txt_file)
        str_, length = process_data(predict_text, char2id, len(predict_text))
        # 预测
        raw = model.predict(str_)[0][-length:]
        result = [np.argmax(row) for row in raw]
        result_tags = [tags[i] for i in result]
        savefile(tag_file, result_tags, predict_text)
        

# model.load_weights(filepath)
test(mytest_data, submit_dir, model, char2id, tags)

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

length: 11584


 25%|█████████████████████                                                               | 1/4 [00:03<00:10,  3.42s/it]

length: 4558


 50%|██████████████████████████████████████████                                          | 2/4 [00:04<00:05,  2.71s/it]

length: 16626


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:08<00:03,  3.12s/it]

length: 14625


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.99s/it]


### 本地预测

In [15]:
# 读取单个txt文件
# with open(txt_file, 'rb') as f:
#     predict_text = f.read().decode('utf-8')

def read_txt(txt_file):
    with open(txt_file, encoding='utf-8') as f:
        text = f.read()
    return text


test_txt_file =  './data/local_test/30_3.txt'
predict_text = read_txt(test_txt_file)
# predict_text

In [16]:
def get_test_data(test_txt_file, test_tag_file):
    predict_text = read_txt(test_txt_file)
    predict_list = [char for char in predict_text]
    char_num = len(predict_list)

    tag = pd.read_csv(test_tag_file, header=None, sep='\t', encoding='utf-8')
    tag_list = ['O' for _ in range(char_num)]
    for i in range(tag.shape[0]):
        tag_item = tag.iloc[i][1].split(' ')
        label, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[start] = 'B-'+ label
        for j in range(start+1, end):
            tag_list[j] = 'I-'+ label
            
    return predict_text, tag_list


test_txt_file =  './data/local_test/30_3.txt'
test_tag_file =  './data/local_test/30_3.ann'
predict_text, tag_list = get_test_data(test_txt_file, test_tag_file)

In [17]:
def process_data(text, char2id, maxlen = 2000):
    x = [char2id.get(w, 1) for w in text]
    length = len(x)
    print('length:', length)
    x = pad_sequences([x], maxlen)  # left padding
    return x, length

x, length = process_data(predict_text, char2id, len(predict_text))
x.shape
length

length: 6419


(1, 6419)

6419

In [18]:
import numpy as np
from sklearn.metrics import classification_report

def local_test(test_txt_file, test_tag_file, model, char2id, tags):
    predict_text, tag_list = get_test_data(test_txt_file, test_tag_file)
    str_, length = process_data(predict_text, char2id, len(tag_list))
    raw = model.predict(str_)[0][-length:]
    result = [np.argmax(row) for row in raw]
    result_tags = [tags[i] for i in result]
    print(classification_report(tag_list, result_tags))
    

local_test(test_txt_file, test_tag_file, model, char2id, tags)

length: 6419
              precision    recall  f1-score   support

    B-Amount       0.00      0.00      0.00         4
   B-Anatomy       0.00      0.00      0.00         6
   B-Disease       0.02      0.46      0.03       175
      B-Drug       0.00      0.00      0.00        38
  B-Duration       0.00      0.00      0.00         2
     B-Level       0.00      0.00      0.00         8
    B-Method       0.00      0.00      0.00         1
    B-Reason       0.00      0.00      0.00        17
   B-SideEff       0.00      0.00      0.00         1
   B-Symptom       0.00      0.00      0.00         9
      B-Test       0.00      0.00      0.00        95
B-Test_Value       0.00      0.00      0.00        23
 B-Treatment       0.00      0.00      0.00         7
    I-Amount       0.00      0.00      0.00        31
   I-Anatomy       0.00      0.00      0.00         7
   I-Disease       0.00      0.00      0.00       517
      I-Drug       0.00      0.00      0.00       162
  I-Duration  

  _warn_prf(average, modifier, msg_start, len(result))
