In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import os
import numpy as np
import tensorflow as tf
import codecs
import pickle

from bert_base.train import tf_metrics
from bert_base.bert import modeling
from bert_base.bert import optimization
from bert_base.bert import tokenization

# import

from bert_base.train.models import create_model, InputFeatures, InputExample

__version__ = '0.1.0'

__all__ = ['__version__', 'DataProcessor', 'NerProcessor', 'write_tokens', 'convert_single_example',
           'filed_based_convert_examples_to_features', 'file_based_input_fn_builder',
           'model_fn_builder', 'train']


In [2]:
import os
from train.train_helper import get_args_parser
from train.bert_lstm_ner import train
args = get_args_parser()
os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map
args.task_name="NER"
args.do_train=True
args.do_eval=True
args.do_predict=True
args.data_dir="D:/project/python_project/bert-lstm-crf-ner\data_demo"
args.vocab_file="D:/project/python_project/bert-lstm-crf-ner/bert\chinese_L-12_H-768_A-12/vocab.txt"
args.bert_config_file="D:/project/python_project/bert-lstm-crf-ner/bert/chinese_L-12_H-768_A-12/bert_config.json"
args.init_checkpoint="D:/project/python_project/bert-lstm-crf-ner/bert/chinese_L-12_H-768_A-12/bert_model.ckpt"
args.max_seq_length=128
args.train_batch_size=32
args.learning_rate=2e-5
args.num_train_epochs=3.0
args.output_dir="D:/project/python_project/bert-lstm-crf-ner/output"

In [3]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_data(cls, input_file):
        """Reads a BIO data."""
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            lines = []
            words = []
            labels = []
            for line in f:
                contends = line.strip()
                tokens = contends.split(' ')
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[1])
                else:
                    if len(contends) == 0:
                        l = ' '.join([label for label in labels if len(label) > 0])
                        w = ' '.join([word for word in words if len(word) > 0])
                        lines.append([l, w])
                        words = []
                        labels = []
                        continue
                if contends.startswith("-DOCSTART-"):
                    words.append('')
                    continue
            return lines

In [4]:
class NerProcessor(DataProcessor):
    def __init__(self, output_dir):
        self.labels = set()
        self.output_dir = output_dir

    def get_train_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "train.txt")), "train"
        )

    def get_dev_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "dev.txt")), "dev"
        )

    def get_test_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "test.txt")), "test")

    def get_labels(self, labels=None):
        #传入参数可能是labels文件路径，也可能是逗号分隔的labels文本
        if labels is not None:
            try:
                # 支持从文件中读取标签类型
                if os.path.exists(labels) and os.path.isfile(labels):
                    with codecs.open(labels, 'r', encoding='utf-8') as fd:
                        for line in fd:
                            self.labels.append(line.strip())
                else:
                    # 否则通过传入的参数，按照逗号分割
                    self.labels = labels.split(',')
                self.labels = set(self.labels) # to set
            except Exception as e:
                print(e)
        # 通过读取train文件获取标签的方法会出现一定的风险。
        if os.path.exists(os.path.join(self.output_dir, 'label_list.pkl')):
            with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'), 'rb') as rf:
                self.labels = pickle.load(rf)
        else:
            if len(self.labels) > 0:
                #pkl文件不存在，就按照读取的标签集合加上一些其他标签，写入pkl
                self.labels = self.labels.union(set(["X", "[CLS]", "[SEP]"]))
                with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'), 'wb') as rf:
                    pickle.dump(self.labels, rf)
            else:
                #如果什么都没有，都按照代码写好的标签集合
                self.labels = ["O", 'B-TIM', 'I-TIM', "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]"]
        return self.labels

    def _create_example(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            # if i == 0:
            #     print('label: ', label)
            examples.append(InputExample(guid=guid, text=text, label=label))
        return examples

    def _read_data(self, input_file):
        """Reads a BIO data."""
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            lines = []
            words = []
            labels = []
            for line in f:
                contends = line.strip()
                tokens = contends.split(' ')
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[-1])
                else:
                    if len(contends) == 0 and len(words) > 0:
                        label = []
                        word = []
                        for l, w in zip(labels, words):
                            if len(l) > 0 and len(w) > 0:
                                label.append(l)
                                self.labels.add(l)
                                word.append(w)
                        lines.append([' '.join(label), ' '.join(word)])
                        words = []
                        labels = []
                        continue
                if contends.startswith("-DOCSTART-"):
                    continue
            return lines

In [5]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_data(cls, input_file):
        """Reads a BIO data."""
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            lines = []
            words = []
            labels = []
            for line in f:
                contends = line.strip()
                tokens = contends.split(' ')
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[1])
                else:
                    if len(contends) == 0:
                        l = ' '.join([label for label in labels if len(label) > 0])
                        w = ' '.join([word for word in words if len(word) > 0])
                        lines.append([l, w])
                        words = []
                        labels = []
                        continue
                if contends.startswith("-DOCSTART-"):
                    words.append('')
                    continue
            return lines

In [6]:
class NerProcessor(DataProcessor):
    def __init__(self, output_dir):
        self.labels = set()
        self.output_dir = output_dir

    def get_train_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "train.txt")), "train"
        )

    def get_dev_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "dev.txt")), "dev"
        )

    def get_test_examples(self, data_dir):
        return self._create_example(
            self._read_data(os.path.join(data_dir, "test.txt")), "test")

    def get_labels(self, labels=None):
        #传入参数可能是labels文件路径，也可能是逗号分隔的labels文本
        if labels is not None:
            try:
                # 支持从文件中读取标签类型
                if os.path.exists(labels) and os.path.isfile(labels):
                    with codecs.open(labels, 'r', encoding='utf-8') as fd:
                        for line in fd:
                            self.labels.append(line.strip())
                else:
                    # 否则通过传入的参数，按照逗号分割
                    self.labels = labels.split(',')
                self.labels = set(self.labels) # to set
            except Exception as e:
                print(e)
        # 通过读取train文件获取标签的方法会出现一定的风险。
        if os.path.exists(os.path.join(self.output_dir, 'label_list.pkl')):
            with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'), 'rb') as rf:
                self.labels = pickle.load(rf)
        else:
            if len(self.labels) > 0:
                #pkl文件不存在，就按照读取的标签集合加上一些其他标签，写入pkl
                self.labels = self.labels.union(set(["X", "[CLS]", "[SEP]"]))
                with codecs.open(os.path.join(self.output_dir, 'label_list.pkl'), 'wb') as rf:
                    pickle.dump(self.labels, rf)
            else:
                #如果什么都没有，都按照代码写好的标签集合
                self.labels = ["O", 'B-TIM', 'I-TIM', "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]"]
        return self.labels

    def _create_example(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            # if i == 0:
            #     print('label: ', label)
            examples.append(InputExample(guid=guid, text=text, label=label))
        return examples

    def _read_data(self, input_file):
        """Reads a BIO data."""
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            lines = []
            words = []
            labels = []
            for line in f:
                contends = line.strip()
                tokens = contends.split(' ')
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[-1])
                else:
                    if len(contends) == 0 and len(words) > 0:
                        label = []
                        word = []
                        for l, w in zip(labels, words):
                            if len(l) > 0 and len(w) > 0:
                                label.append(l)
                                self.labels.add(l)
                                word.append(w)
                        lines.append([' '.join(label), ' '.join(word)])
                        words = []
                        labels = []
                        continue
                if contends.startswith("-DOCSTART-"):
                    continue
            return lines


In [7]:
os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

#一个处理的类，包括训练数据的输入等
processors = {
    "ner": NerProcessor
}
#载入bert配置文件
bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

#检查序列的最大长度是否超出范围
if args.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (args.max_seq_length, bert_config.max_position_embeddings))

# 在re train 的时候，才删除上一轮产出的文件，在predicted 的时候不做clean
if args.clean and args.do_train:
    if os.path.exists(args.output_dir):
        def del_file(path):
            ls = os.listdir(path)
            for i in ls:
                c_path = os.path.join(path, i)
                if os.path.isdir(c_path):
                    del_file(c_path)
                else:
                    os.remove(c_path)

        try:
            del_file(args.output_dir)
        except Exception as e:
            print(e)
            print('pleace remove the files of output dir and data.conf')
            exit(-1)

#check output dir exists
if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

#通过output_dir初始化数据处理类，processor
processor = processors[args.ner](args.output_dir)

#通过bert字典，初始化bert自带分词类
tokenizer = tokenization.FullTokenizer(
    vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

#创建session的时候，对session进行配置
session_config = tf.ConfigProto(
    log_device_placement=False,#记录各项操作在哪台机器运行
    inter_op_parallelism_threads=0,
    intra_op_parallelism_threads=0,
    allow_soft_placement=True)

#estimator运行配置，包括模型保存等
run_config = tf.estimator.RunConfig(
    model_dir=args.output_dir,
    save_summary_steps=500,
    save_checkpoints_steps=500,
    session_config=session_config
)

train_examples = None
eval_examples = None
num_train_steps = None
num_warmup_steps = None

In [8]:
os.environ['CUDA_VISIBLE_DEVICES'] = args.device_map

    #一个处理的类，包括训练数据的输入等
processors = {
        "ner": NerProcessor
    }
    #载入bert配置文件
bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    #检查序列的最大长度是否超出范围
if args.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (args.max_seq_length, bert_config.max_position_embeddings))

# 在re train 的时候，才删除上一轮产出的文件，在predicted 的时候不做clean
if args.clean and args.do_train:
    if os.path.exists(args.output_dir):
        def del_file(path):
            ls = os.listdir(path)
            for i in ls:
                c_path = os.path.join(path, i)
                if os.path.isdir(c_path):
                    del_file(c_path)
                else:
                    os.remove(c_path)

        try:
            del_file(args.output_dir)
        except Exception as e:
            print(e)
            print('pleace remove the files of output dir and data.conf')
            exit(-1)

#check output dir exists
if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

#通过output_dir初始化数据处理类，processor
processor = processors[args.ner](args.output_dir)

#通过bert字典，初始化bert自带分词类
tokenizer = tokenization.FullTokenizer(
    vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

#创建session的时候，对session进行配置
session_config = tf.ConfigProto(
    log_device_placement=False,#记录各项操作在哪台机器运行
    inter_op_parallelism_threads=0,
    intra_op_parallelism_threads=0,
    allow_soft_placement=True)

#estimator运行配置，包括模型保存等
run_config = tf.estimator.RunConfig(
    model_dir=args.output_dir,
    save_summary_steps=500,
    save_checkpoints_steps=500,
    session_config=session_config
)

train_examples = None
eval_examples = None
num_train_steps = None
num_warmup_steps = None

In [9]:
print(args.data_dir)

D:/project/python_project/bert-lstm-crf-ner\data_demo


In [10]:
train_examples = processor.get_train_examples(args.data_dir)

In [11]:
lines=processor._read_data(os.path.join(args.data_dir, "train.txt"))
print(lines)

[['O O O O O O O B-LOC I-LOC O B-LOC I-LOC O O O O O O', '海 钓 比 赛 地 点 在 厦 门 与 金 门 之 间 的 海 域 。'], ['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', '这 座 依 山 傍 水 的 博 物 馆 由 国 内 一 流 的 设 计 师 主 持 设 计 ， 整 个 建 筑 群 精 美 而 恢 宏 。'], ['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', '但 作 为 一 个 共 产 党 员 、 人 民 公 仆 ， 应 当 胸 怀 宽 阔 ， 真 正 做 到 “ 先 天 下 之 忧 而 忧 ， 后 天 下 之 乐 而 乐 ” ， 淡 化 个 人 的 名 利 得 失 和 宠 辱 悲 喜 ， 把 改 革 大 业 摆 在 首 位 ， 这 样 才 能 超 越 自 我 ， 摆 脱 世 俗 ， 有 所 作 为 。'], ['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', '在 发 达 国 家 ， 急 救 保 险 十 分 普 及 ， 已 成 为 社 会 保 障 体 系 的 重 要 组 成 部 分 。'], ['B-LOC B-LOC O O O O O O O O O O O O O O B-LOC B-LOC O O O O O O O O O O O O O O O O O O O O O O', '日 俄 两 国 国 内 政 局 都 充 满 变 数 ， 尽 管 日 俄 关 系 目 前 是 历 史 最 佳 时 期 ， 但 其 脆 弱 性 不 言 自 明 。'], ['B-PER I-PER I-PER O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O

In [12]:
#训练步数
num_train_steps = int(
    len(train_examples) *1.0 / args.batch_size * args.num_train_epochs)
if num_train_steps < 1:
    raise AttributeError('training data is so small...')
#
num_warmup_steps = int(num_train_steps * args.warmup_proportion)

tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_examples))
tf.logging.info("  Batch size = %d", args.batch_size)
tf.logging.info("  Num steps = %d", num_train_steps)
#读取验证集
eval_examples = processor.get_dev_examples(args.data_dir)

# 打印验证集数据信息
tf.logging.info("***** Running evaluation *****")
tf.logging.info("  Num examples = %d", len(eval_examples))
tf.logging.info("  Batch size = %d", args.batch_size)

INFO:tensorflow:***** Running training *****
INFO:tensorflow:  Num examples = 43
INFO:tensorflow:  Batch size = 64
INFO:tensorflow:  Num steps = 2
INFO:tensorflow:***** Running evaluation *****
INFO:tensorflow:  Num examples = 43
INFO:tensorflow:  Batch size = 64


In [35]:
#获取标签集合，是一个list
label_list = processor.get_labels()
print(label_list)

{'I-LOC', 'I-ORG', '[CLS]', 'B-LOC', 'O', 'B-ORG', '[SEP]', 'B-PER', 'X', 'I-PER'}


In [38]:
with tf.name_scope('input'):
    input_ids = tf.placeholder(tf.int32, [None, args.max_seq_length])
    input_mask = tf.placeholder(tf.int32, [None, args.max_seq_length])
    segment_ids  = tf.placeholder(tf.int32, [None, args.max_seq_length])
    label_ids = tf.placeholder(tf.int32, [None, args.max_seq_length])
#对参数赋值，对于训练模型来说
is_training=True
num_labels=len(label_list) + 1
init_checkpoint = args.init_checkpoint
learning_rate = args.learning_rate
total_loss, logits, trans, pred_ids = create_model(
    bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
    num_labels, False, args.dropout_rate, args.lstm_size, args.cell, args.num_layers)
#输出loss的smmary
tf.summary.scalar('total_loss', total_loss)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


<tf.Tensor 'total_loss:0' shape=() dtype=string>

In [39]:
#加载预训练隐变量
    tvars = tf.trainable_variables()
    # 加载BERT模型，assignmen_map，加载的预训练变量值
    if init_checkpoint:
        (assignment_map, initialized_variable_names) = \
            modeling.get_assignment_map_from_checkpoint(tvars,
                                                        init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    #优化loss
    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, False)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [40]:
# 1. 将数据转化为tf_record 数据,并把训练数据序列化，并写出到文件
train_file = os.path.join(args.output_dir, "train.tf_record")
#ok
if not os.path.exists(train_file):
    filed_based_convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer, train_file, args.output_dir)

# 2.读取record 数据，组成batch，把上一部输出到文件的训练数据读取
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=args.max_seq_length,
    is_training=True,
    drop_remainder=True,
    batch_size=args.batch_size)
# estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

eval_file = os.path.join(args.output_dir, "eval.tf_record")
if not os.path.exists(eval_file):
    filed_based_convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, eval_file, args.output_dir)
#构建验证集数据
eval_input_fn = file_based_input_fn_builder(
    input_file=eval_file,
    seq_length=args.max_seq_length,
    is_training=False,
    drop_remainder=False,
    batch_size=args.batch_size)

INFO:tensorflow:Writing example 0 of 43
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-0
INFO:tensorflow:tokens: 海 钓 比 赛 地 点 在 厦 门 与 金 门 之 间 的 海 域 。
INFO:tensorflow:input_ids: 101 3862 7157 3683 6612 1765 4157 1762 1336 7305 680 7032 7305 722 7313 4638 3862 1818 511 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [42]:
train_input=train_input_fn.make_one_shot_iterator()
sess = tf.InteractiveSession()
max_step=100
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter('./log', sess.graph)



In [52]:
meta_train_data = train_input.get_next()
train_data=sess.run([meta_train_data])
print(len(train_data))
# for t in train_data:
#     print(t)

1


In [None]:
init_op = tf.initialize_all_variables()

sess = tf.Session()
sess.run(init_op)

meta_train_data = train_input.get_next()
#一个batch的数据大小是64，repeat后，batch=32*2
for i in range(max_step):

    #把tensor转化为numpy输入
    train_data=sess.run([meta_train_data])[0]
    sess.run(train_op,feed_dict={input_ids:train_data['input_ids'],input_mask:train_data['input_mask'],
                                 segment_ids:train_data['segment_ids'],label_ids:train_data['label_ids']})
    if i%10==0:
        train_summary = sess.run(merged, feed_dict={input_ids:train_data['input_ids'],input_mask:train_data['input_mask'],
                                 segment_ids:train_data['segment_ids'],label_ids:train_data['label_ids']})
        train_writer.add_summary(train_summary, i)
        print('Saving summary loss at %s'%(i))
train_writer.close()

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Saving summary loss at 0
