In [21]:
import pickle
import pdb
import codecs
import re
import sys
import math

import numpy as np

import tensorflow as tf
from Batch import BatchGenerator
from bilstm_crf import Model
from utils import *

In [23]:
with open('../Bosondata.pkl', 'rb') as inp:
	word2id = pickle.load(inp)
	id2word = pickle.load(inp)
	tag2id = pickle.load(inp)
	id2tag = pickle.load(inp)
	x_train = pickle.load(inp)
	y_train = pickle.load(inp)
	x_test = pickle.load(inp)
	y_test = pickle.load(inp)
	x_valid = pickle.load(inp)
	y_valid = pickle.load(inp)
print("train len:",len(x_train))
print("valid len:",len(x_valid))
print("test len:",len(x_test))
print("word2id len", len(word2id))
print('Creating the data generator ...')

train len: 10721
valid len: 2681
test len: 3351
word2id len 3435
Creating the data generator ...


In [24]:
data_train = BatchGenerator(x_train, y_train, shuffle=True)
data_valid = BatchGenerator(x_valid, y_valid, shuffle=False)
data_test = BatchGenerator(x_test, y_test, shuffle=False)
print('Finished creating the data generator.')

Finished creating the data generator.


In [26]:
data_train.x.shape

(10721, 60)

In [27]:
epochs = 31
batch_size = 32

config = {}
config["lr"] = 0.001
config["embedding_dim"] = 100 # 词向量的维度
config["sen_len"] = len(x_train[0]) # 句子长度
config["batch_size"] = batch_size # 批次大小
config["embedding_size"] = len(word2id)+1 # 有多少个词向量
config["tag_size"] = len(tag2id) # 有多少个标签
config["pretrained"]=False # 预训练

In [28]:
embedding_pre = []

print("use pretrained embedding")
config["pretrained"]=True
word2vec = {}
with codecs.open('vec.txt','r','utf-8') as input_data:   
    for line in input_data.readlines():
        word2vec[line.split()[0]] = list(map(eval,line.split()[1:]))

unknow_pre = []
unknow_pre.extend([1]*100)
embedding_pre.append(unknow_pre) #wordvec id 0
for word in id2word:
    if word in word2vec:
        embedding_pre.append(word2vec[word])
    else:
        embedding_pre.append(unknow_pre)

embedding_pre = np.asarray(embedding_pre)

use pretrained embedding


In [29]:
embedding_pre

array([[ 1.      ,  1.      ,  1.      , ...,  1.      ,  1.      ,
         1.      ],
       [ 0.087197, -0.083435,  0.057956, ...,  0.045114, -0.08465 ,
         0.111534],
       [ 0.084424, -0.118023,  0.135428, ...,  0.273897,  0.150512,
        -0.172031],
       ...,
       [ 0.163283,  0.341556,  0.380325, ..., -0.099092,  0.167281,
        -0.46092 ],
       [-0.233846, -0.435442,  0.366504, ..., -0.282544, -0.129922,
         0.357925],
       [-0.578525, -0.012686, -0.060288, ...,  0.013066, -0.469461,
        -0.127309]])

In [30]:
config["batch_size"],config["sen_len"],config["embedding_size"],config["embedding_dim"],config["pretrained"]

(32, 60, 3436, 100, True)

In [19]:
class Model:
    def __init__(self,config,embedding_pretrained,dropout_keep=1):
        self.lr = config["lr"]
        self.batch_size = config["batch_size"]
        self.embedding_size = config["embedding_size"]
        self.embedding_dim = config["embedding_dim"] 
        self.sen_len = config["sen_len"]
        self.tag_size = config["tag_size"]
        self.pretrained = config["pretrained"]
        self.dropout_keep = dropout_keep
        self.embedding_pretrained = embedding_pretrained
        self.input_data = tf.placeholder(tf.int32, shape=[self.batch_size,self.sen_len], name="input_data") 
        self.labels = tf.placeholder(tf.int32,shape=[self.batch_size,self.sen_len], name="labels")
        self.embedding_placeholder = tf.placeholder(tf.float32,shape=[self.embedding_size,self.embedding_dim], name="embedding_placeholder")
        with tf.variable_scope("bilstm_crf") as scope:
            self._build_net()
    def _build_net(self):
        word_embeddings = tf.get_variable("word_embeddings",[self.embedding_size, self.embedding_dim])
        if self.pretrained:
            embeddings_init = word_embeddings.assign(self.embedding_pretrained)

        input_embedded = tf.nn.embedding_lookup(word_embeddings, self.input_data)
        input_embedded = tf.nn.dropout(input_embedded,self.dropout_keep)

        lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
        lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
        (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, 
                                                                         lstm_bw_cell, 
                                                                         input_embedded,
                                                                         dtype=tf.float32,
                                                                         time_major=False,
                                                                         scope=None)

        bilstm_out = tf.concat([output_fw, output_bw], axis=2)


        # Fully connected layer.
        W = tf.get_variable(name="W", shape=[self.batch_size,2 * self.embedding_dim, self.tag_size],
                        dtype=tf.float32)

        b = tf.get_variable(name="b", shape=[self.batch_size, self.sen_len, self.tag_size], dtype=tf.float32,
                        initializer=tf.zeros_initializer())

        bilstm_out = tf.tanh(tf.matmul(bilstm_out, W) + b)

        # Linear-CRF.
        log_likelihood, self.transition_params = tf.contrib.crf.crf_log_likelihood(bilstm_out, self.labels, tf.tile(np.array([self.sen_len]),np.array([self.batch_size])))

        loss = tf.reduce_mean(-log_likelihood)

        # Compute the viterbi sequence and score (used for prediction and test time).
        self.viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(bilstm_out, self.transition_params,tf.tile(np.array([self.sen_len]),np.array([self.batch_size])))

        # Training ops.
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(loss)


In [20]:
model = Model(config,embedding_pre,dropout_keep=0.5)

ValueError: Variable bilstm_crf/word_embeddings already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "D:\Anaconda\envs\tfenv\lib\site-packages\tensorflow\python\framework\ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()
  File "D:\Anaconda\envs\tfenv\lib\site-packages\tensorflow\python\framework\ops.py", line 3300, in create_op
    op_def=op_def)
  File "D:\Anaconda\envs\tfenv\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
