In [1]:
import gensim
import numpy as np
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf
import re
from tqdm import tqdm, tqdm_notebook



In [2]:
class corpusClean():

    def __init__(self):
        self.num_re = re.compile('[0-9一二三四五六七八九十]+')
        self.char_re = re.compile('[a-zA-Z\-\,\.\。\，]+')

    def clean(self, line):
        raw_words = [i for i in line]
#         print(raw_words)
        new_words = []
        for i in range(len(raw_words)):
            word = raw_words[i]
            if 65281 <= ord(word) and ord(word) <= 65375:
                word = chr(ord(word)-65248)
            if (33 <= ord(word) and ord(word) < 48) or (8208 <= ord(word) and ord(word) <= 8232):
                word = ''
            new_words.append(word)
        sentence = ''.join(new_words)
        
        removed_char = re.sub(self.char_re, ' <CHAR> ', sentence)
        removed_num = re.sub(self.num_re, ' <NUMBER> ', removed_char)
        words = removed_num.split()
        saved_words = ['<BEGIN>']
        for word in words:
            if len(word) == 1 or word.find('<') != -1:
                saved_words.append(word)
            else:
                for w in word:
                    saved_words.append(w)
        saved_words.append('<END>')
        return saved_words
    

    
class layer(object):
    def __init__(self):
        raise NotImplementedError
    
    def __str__(self):
        raise NotImplementedError
    
class affine_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('affine', reuse=reuse):
            self.weights = tf.get_variable(name = name+'_weights', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.matmul(inputs, self.weights), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class conv_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('conv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.nn.conv2d(inputs, self.kernel, padding='VALID', strides=[1,1,1,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class pooling_layer(layer):
    def __init__(self, name, inputs, reuse=False):
        with tf.variable_scope('pooling', reuse=reuse):
            self.outputs = tf.nn.max_pool(name = name+'_maxpooling', value=inputs, ksize=[1,2,2,1], padding='VALID', strides=[1,2,2,1])
    def __str__(self):
        return self.outputs.__str__()
        
class reshape_layer(layer):
    def __init__(self, name, inputs, shape, reuse=False):
        with tf.variable_scope('reshape', reuse=reuse):
            self.outputs = tf.reshape(name=name, tensor=inputs, shape=shape)
    def __str__(self):
        return self.outputs.__str__()
        
class deconv_layer(layer):
    def __init__(self, name, inputs, kernel_shape, output_shape, activation, reuse=False):
        with tf.variable_scope('deconv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(kernel_shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([kernel_shape[-2]]))
            self.outputs = activation(tf.add(tf.nn.conv2d_transpose(inputs, self.kernel, output_shape=output_shape, padding='VALID', strides=[1,2,2,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class lstm_layer(layer):
    def __init__(self, name, inputs, n_units, reuse=False):
        with tf.variable_scope('lstm', reuse=reuse):
            self.outputs, self.states = tf.nn.dynamic_rnn(
                cell = tf.contrib.rnn.BasicLSTMCell(n_units),
                inputs = inputs,
                dtype = tf.float32)
    def __str__(self):
        return self.outputs.__str__()

In [38]:
sentences = open('../data/cleaned_corpus.txt', encoding='utf-8').readlines()
lines = open('../data/TrainSet-eCarX-171019.txt', encoding='gbk').readlines()
vector_size = 200
ngram = 3
min_count = 2
workers = 4
# models = gensim.models.Word2Vec.load('../word2vec/word2vec.model')
models = gensim.models.Word2Vec(sentences, size=vector_size, window=ngram, min_count=min_count, workers=workers)
cleaner = corpusClean()

In [39]:
models.save('../word2vec/word2vec.model')

In [None]:
pickle.dump(train_data, open('../word2vec/train_inputs.npz', 'wb'))
pickle.dump(train_target_one_hot, open('../word2vec/train_targets.npz', 'wb'))

In [None]:
train_data = pickle.load(open('../word2vec/train_inputs.npz', 'rb'))
train_target_one_hot = pickle.load(open('../word2vec/train_targets.npz', 'rb'))

In [None]:
batched_inputs = dict()
batched_targets = dict()
for key in train_data.keys():
    count = int(len(train_data[key])/50)+1
    for i in range(count):
        new_key = str(key)+'_'+str(i)
        if i+1 != count:
            batched_inputs[new_key] = train_data[key][i*50:(i+1)*50]
            batched_targets[new_key] = train_target_one_hot[key][i*50:(i+1)*50]
        else:
            batched_inputs[new_key] = train_data[key][i*50:]
            batched_targets[new_key] = train_target_one_hot[key][i*50:]

In [4]:
# pickle.dump(batched_inputs, open('../word2vec/batched_inputs.npz', 'wb'))
# pickle.dump(batched_targets, open('../word2vec/batched_targets.npz', 'wb'))
batched_inputs = pickle.load(open('../word2vec/batched_inputs.npz', 'rb'))
batched_targets = pickle.load(open('../word2vec/batched_targets.npz', 'rb'))

In [None]:
graph = tf.Graph()
with graph.as_default():
    
    input_placeholder = tf.placeholder(tf.float32, [None, None, 200])
    target_placeholder = tf.placeholder(tf.float32, [None, 134])
    
    lstm_layer1 = lstm_layer('lstm1', input_placeholder, 200)
    affine1 = affine_layer('affine1', lstm_layer1.outputs[:,-1,:], [200, 256], tf.nn.relu)
    affine2 = affine_layer('affine2', affine1.outputs, [256, 256], tf.nn.relu)
    affine3 = affine_layer('affine3', affine2.outputs, [256, 134], tf.identity)
    
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=affine3.outputs, labels=target_placeholder))
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
    accuracy = tf.reduce_sum(tf.cast(tf.equal(tf.argmax(affine3.outputs, 1), tf.argmax(target_placeholder, 1)), tf.float32))
    
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    
    for i in range(500):
        
        accs = 0.0
        errs = 0.0
        n = 0
        for key in batched_inputs.keys():
            inputs_batch = batched_inputs[key]
            targets_batch = batched_targets[key]
#             print(inputs_batch.shape)
#             print(targets_batch.shape)
            feed_dict = {input_placeholder:inputs_batch, target_placeholder:targets_batch}
            _, acc, err = sess.run([optimizer, accuracy, loss], feed_dict=feed_dict)
            accs += acc
            errs += err
            n += len(inputs_batch)
        print('#Training Epoch %d, acc: %f, err: %f, n: %d'%(i, accs/n, errs/n, n))

#Training Epoch 0, acc: 0.004216, err: nan, n: 134000
#Training Epoch 1, acc: 0.013530, err: nan, n: 134000
#Training Epoch 2, acc: 0.030985, err: nan, n: 134000
#Training Epoch 3, acc: 0.063888, err: nan, n: 134000
#Training Epoch 4, acc: 0.090015, err: nan, n: 134000
#Training Epoch 5, acc: 0.132321, err: nan, n: 134000
#Training Epoch 6, acc: 0.174157, err: nan, n: 134000
#Training Epoch 7, acc: 0.207254, err: nan, n: 134000
#Training Epoch 8, acc: 0.245701, err: nan, n: 134000
#Training Epoch 9, acc: 0.284731, err: nan, n: 134000
#Training Epoch 10, acc: 0.327642, err: nan, n: 134000
#Training Epoch 11, acc: 0.350366, err: nan, n: 134000
#Training Epoch 12, acc: 0.376813, err: nan, n: 134000
#Training Epoch 13, acc: 0.409552, err: nan, n: 134000
#Training Epoch 14, acc: 0.433149, err: nan, n: 134000
#Training Epoch 15, acc: 0.458679, err: nan, n: 134000
#Training Epoch 16, acc: 0.474097, err: nan, n: 134000
#Training Epoch 17, acc: 0.501836, err: nan, n: 134000


# Use raw corpus to generate word vector

In [48]:
import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
vector_size = 200
ngram = 3
min_count = 2
workers = 4
lines = open('../data/cleaned_corpus.txt', encoding='utf-8').readlines()
sentences = [i.split() for i in lines]
# print(sentences[0])
models = gensim.models.Word2Vec(sentences, size=vector_size, window=ngram, min_count=min_count, workers=workers)
models.save('../word2vec/word2vec.model')

In [74]:
lines = open('../data/TrainSet-eCarX-171019.txt').readlines()
target_set = set()
for line in lines:
    target = line.split()[0]
    target_set.add(target)
target_set = list(target_set)
print(len(target_set))
models = gensim.models.Word2Vec.load('../word2vec/word2vec.model')
cleaner = corpusClean()
data = dict()
data['train_inputs'] = dict()
data['train_targets'] = dict()
data['valid_inputs'] = dict()
data['valid_targets'] = dict()
data['label_map'] = target_set
data['batched_train_inputs'] = dict()
data['batched_train_targets'] = dict()
data['batched_valid_inputs'] = dict()
data['batched_valid_targets'] = dict()

batch_size = 50

for line in lines:
    values = line.split()
    sentence = values[-1]
    tag = values[0]
    cleaned_sentence = cleaner.clean(sentence)
    inputs = np.zeros((len(cleaned_sentence), 200))
    for word, i in zip(cleaned_sentence, range(len(cleaned_sentence))):
        try:
            inputs[i] = models.wv[word]
        except:
            continue
    targets = np.zeros(len(target_set))
    targets[target_set.index(tag)] = 1.0
    key = len(inputs)
    if key not in data['train_inputs'].keys():
        data['train_inputs'][key] = []
        data['train_targets'][key] = []
    data['train_inputs'][key].append(inputs)
    data['train_targets'][key].append(targets)
    
data['train_keys'] = data['train_inputs'].keys()
print('Feature Extraction for training data finished.')


for key in data['train_keys']:
    count = int(len(data['train_inputs'][key])/batch_size)+1
    for i in range(count):
        new_key = str(key)+'_'+str(i)
        if i+1 != count:
            data['batched_train_inputs'][new_key] = np.array(data['train_inputs'][key][i*batch_size:(i+1)*batch_size])
            data['batched_train_targets'][new_key] = np.array(data['train_targets'][key][i*batch_size:(i+1)*batch_size])
        else:
            data['batched_train_inputs'][new_key] = np.array(data['train_inputs'][key][i*batch_size:])
            data['batched_train_targets'][new_key] = np.array(data['train_targets'][key][i*batch_size:])
    if count != 1:
        if (data['batched_train_inputs'][str(key)+'_1'][0] == data['train_inputs'][key][batch_size+1]).all() == True:
            print('SUCCESS!')
            
print('Batch data extraction finished.')

lines = open('../data/TestSet-eCarX-171019.txt', encoding='gbk').readlines()
for line in lines:
    values = line.split('#')
    sentence = values[0]
    tag = values[2]
    cleaned_sentence = cleaner.clean(sentence)
    inputs = np.zeros((len(cleaned_sentence), 200))
    for word, i in zip(cleaned_sentence, range(len(cleaned_sentence))):
        try:
            inputs[i] = models.wv[word]
        except:
            continue
    targets = np.zeros(len(target_set))
    targets[target_set.index(tag)] = 1.0
    key = len(inputs)
    if key not in data['valid_inputs'].keys():
        data['valid_inputs'][key] = []
        data['valid_targets'][key] = []
    data['valid_inputs'][key].append(inputs)
    data['valid_targets'][key].append(targets)
#     break
data['valid_keys'] = data['valid_inputs'].keys()
print('Feature Extraction for validation data finished.')

for key in data['valid_keys']:
    count = int(len(data['valid_inputs'][key])/batch_size)+1
    for i in range(count):
        new_key = str(key)+'_'+str(i)
        if i+1 != count:
            data['batched_valid_inputs'][new_key] = np.array(data['valid_inputs'][key][i*batch_size:(i+1)*batch_size])
            data['batched_valid_targets'][new_key] = np.array(data['valid_targets'][key][i*batch_size:(i+1)*batch_size])
        else:
            data['batched_valid_inputs'][new_key] = np.array(data['valid_inputs'][key][i*batch_size:])
            data['batched_valid_targets'][new_key] = np.array(data['valid_targets'][key][i*batch_size:])
    if count != 1:
        if (data['batched_valid_inputs'][str(key)+'_1'][0] == data['valid_inputs'][key][batch_size+1]).all() == True:
            print('SUCCESS!')
print('Batch data extraction finished.')

134
Feature Extraction for training data finished.
SUCCESS!
Batch data extraction finished.
Feature Extraction for validation data finished.
SUCCESS!
SUCCESS!
SUCCESS!
Batch data extraction finished.


In [78]:
pickle.dump(data['train_inputs'], open('../word2vec/gensim_word2vec_train_inputs.npz', 'wb'))
pickle.dump(data['train_targets'], open('../word2vec/gensim_word2vec_train_targets.npz', 'wb'))
pickle.dump(data['valid_inputs'], open('../word2vec/gensim_word2vec_valid_inputs.npz', 'wb'))
pickle.dump(data['valid_targets'], open('../word2vec/gensim_word2vec_valid_targets.npz', 'wb'))
pickle.dump(data['label_map'], open('../word2vec/gensim_word2vec_label_map.npz', 'wb'))
pickle.dump(data['batched_train_inputs'], open('../word2vec/gensim_word2vec_batched_train_inputs.npz', 'wb'))
pickle.dump(data['batched_train_targets'], open('../word2vec/gensim_word2vec_batched_train_targets.npz', 'wb'))
pickle.dump(data['batched_valid_inputs'], open('../word2vec/gensim_word2vec_batched_valid_inputs.npz', 'wb'))
pickle.dump(data['batched_valid_targets'], open('../word2vec/gensim_word2vec_batched_valid_targets.npz', 'wb'))

# Use id to generate word vector

In [6]:
cleaner = corpusClean()
train_lines = open('../data/TrainSet-eCarX-171019.txt').readlines()
vocabulary = set()
for line in train_lines:
    values = line.split()
    tag = values[0]
    sentence = values[-1]
    cleaned_sentence = cleaner.clean(sentence)
    for word in cleaned_sentence:
        vocabulary.add(word)
vocabulary.add('UNKNOWN')
vocabulary = list(vocabulary)
pickle.dump(vocabulary, open('../word2vec/dictionary.dict', 'wb'))


In [7]:
dictionary = pickle.load(open('../word2vec/dictionary.dict', 'rb'))
label_map = pickle.load(open('../word2vec/gensim_word2vec_label_map.npz', 'rb'))
anouymous = open('../word2vec/train.txt','w')
for line in train_lines:
    values = line.split()
    tag = values[0]
    sentence = values[-1]
    cleaned_sentence = cleaner.clean(sentence)
    ids = [None]*len(cleaned_sentence)
    for word, i in zip(cleaned_sentence, range(len(cleaned_sentence))):
        if word not in dictionary:
            ids[i] = dictionary.index('UNKNOWN')
        else:
            ids[i] = dictionary.index(word)
    save_sentence = [str(label_map.index(tag))] + [str(i) for i in ids]
#     print(save_sentence)
    save_sentence = ' '.join(save_sentence)
#     print(save_sentence)
#     print(ids)
    anouymous.write(save_sentence+'\n')
anouymous.close()

In [None]:
anouymous = open('../word2vec/valid.txt','w')
valid_lines = open('../data/TestSet-eCarX-171019.txt').readlines()
for line in valid_lines:
    values = line.split('#')
    tag = values[2]
    sentence = values[0]
    cleaned_sentence = cleaner.clean(sentence)
    ids = [None]*len(cleaned_sentence)
    for word, i in zip(cleaned_sentence, range(len(cleaned_sentence))):
        if word not in dictionary:
            ids[i] = dictionary.index('UNKNOWN')
        else:
            ids[i] = dictionary.index(word)
    save_sentence = [str(label_map.index(tag))] + [str(i) for i in ids]
#     print(save_sentence)
    save_sentence = ' '.join(save_sentence)
#     print(save_sentence)
#     print(ids)
    anouymous.write(save_sentence+'\n')
anouymous.close()

In [None]:
lines = open('../word2vec/train.txt').readlines()
corpus = [line.split()[1:] for line in lines]

In [214]:
vector_size = 200
ngram = 3
min_count = 2
workers = 4

models = gensim.models.Word2Vec(corpus, size=vector_size, window=ngram, min_count=min_count, workers=workers)
models.save('../word2vec/word2vec_anouymous.model')

In [3]:
class dataProvider():
    
    def __init__(self,filename, models, label_map):
        
        self.filename = filename
        self.models = models
        self.size = self.models.vector_size
        self.label_map = label_map
        self.data = dict()
        self.data['inputs'] = dict()
        self.data['targets'] = dict()
        self.lines = open(filename).readlines()
    
    def extract(self):
        
        for line in self.lines:
            values = line.split()
            words = values[1:]
            tag = values[0]
            key = len(words)
            inputs = np.zeros((len(words), self.size))
            targets = np.zeros(len(self.label_map))
            for word, i in zip(words, range(len(words))):
                try:
                    inputs[i] = self.models.wv[word]
                except:
                    continue
            targets[int(tag)] = 1.0
            if key not in self.data['inputs'].keys():
                self.data['inputs'][key] = []
                self.data['targets'][key] = []
            self.data['inputs'][key].append(inputs)
            self.data['targets'][key].append(targets)

In [8]:
models = gensim.models.Word2Vec.load('../word2vec/word2vec_anouymous.model')
label_map = pickle.load(open('../word2vec/gensim_word2vec_label_map.npz', 'rb'))
train_provider = dataProvider('../word2vec/train.txt', models, label_map)
train_provider.extract()
pickle.dump(train_provider.data, open('../word2vec/anouymous_train_data.npz', 'wb'))

valid_provider = dataProvider('../word2vec/valid.txt', models, label_map)
valid_provider.extract()
pickle.dump(valid_provider.data, open('../word2vec/anouymous_valid_data.npz', 'wb'))

In [9]:
def batch(data, batch_size):
    
    keys = data['inputs'].keys()
    results = dict()
    results['inputs'] = dict()
    results['targets'] = dict()
    
    for key in keys:
        count = int(len(data['inputs'][key])/batch_size)+1
        for i in range(count):
            new_key = str(key)+'_'+str(i)
            if i+1 != count:
                results['inputs'][new_key] = np.array(data['inputs'][key][i*batch_size:(i+1)*batch_size])
                results['targets'][new_key] = np.array(data['targets'][key][i*batch_size:(i+1)*batch_size])
            else:
                results['inputs'][new_key] = np.array(data['inputs'][key][i*batch_size:])
                results['targets'][new_key] = np.array(data['targets'][key][i*batch_size:])
    return results

In [10]:
train_batch = batch(train_provider.data, 200)
valid_batch = batch(valid_provider.data, 200)
pickle.dump(train_batch, open('../word2vec/anouymous_train_data_200.npz', 'wb'))
pickle.dump(valid_batch, open('../word2vec/anouymous_valid_data_200.npz', 'wb'))

In [253]:
import numpy as np
import pickle
import tensorflow as tf

    
class layer(object):
    def __init__(self):
        raise NotImplementedError
    
    def __str__(self):
        raise NotImplementedError
    
class affine_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('affine', reuse=reuse):
            self.weights = tf.get_variable(name = name+'_weights', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.matmul(inputs, self.weights), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class conv_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('conv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.nn.conv2d(inputs, self.kernel, padding='VALID', strides=[1,1,1,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class pooling_layer(layer):
    def __init__(self, name, inputs, reuse=False):
        with tf.variable_scope('pooling', reuse=reuse):
            self.outputs = tf.nn.max_pool(name = name+'_maxpooling', value=inputs, ksize=[1,2,2,1], padding='VALID', strides=[1,2,2,1])
    def __str__(self):
        return self.outputs.__str__()
        
class reshape_layer(layer):
    def __init__(self, name, inputs, shape, reuse=False):
        with tf.variable_scope('reshape', reuse=reuse):
            self.outputs = tf.reshape(name=name, tensor=inputs, shape=shape)
    def __str__(self):
        return self.outputs.__str__()
        
class deconv_layer(layer):
    def __init__(self, name, inputs, kernel_shape, output_shape, activation, reuse=False):
        with tf.variable_scope('deconv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(kernel_shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([kernel_shape[-2]]))
            self.outputs = activation(tf.add(tf.nn.conv2d_transpose(inputs, self.kernel, output_shape=output_shape, padding='VALID', strides=[1,2,2,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class lstm_layer(layer):
    def __init__(self, name, inputs, n_units, reuse=False):
        with tf.variable_scope('lstm', reuse=reuse):
            self.outputs, self.states = tf.nn.dynamic_rnn(
                cell = tf.contrib.rnn.BasicLSTMCell(n_units),
                inputs = inputs,
                dtype = tf.float32)
    def __str__(self):
        return self.outputs.__str__()

    
train = pickle.load(open('../word2vec/anouymous_train_data_50.npz', 'rb'))
valid = pickle.load(open('../word2vec/anouymous_valid_data_50.npz', 'rb'))
batched_inputs = train['inputs']
batched_targets = train['targets']
v_batched_inputs = valid['inputs']
v_batched_targets = valid['targets']

In [256]:
graph = tf.Graph()
with graph.as_default():
    
    input_placeholder = tf.placeholder(tf.float32, [None, None, 200])
    target_placeholder = tf.placeholder(tf.float32, [None, 134])
    
    lstm_layer1 = lstm_layer('lstm1', input_placeholder, 200)
    affine1 = affine_layer('affine1', lstm_layer1.outputs[:,-1,:], [200, 256], tf.nn.relu)
    affine2 = affine_layer('affine2', affine1.outputs, [256, 256], tf.nn.relu)
    affine3 = affine_layer('affine3', affine2.outputs, [256, 134], tf.identity)
    
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=affine3.outputs, labels=target_placeholder))
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
    accuracy = tf.reduce_sum(tf.cast(tf.equal(tf.argmax(affine3.outputs, 1), tf.argmax(target_placeholder, 1)), tf.float32))
    
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    
    for i in range(500):
        
        accs = 0.0
        errs = 0.0
        n = 0
        for key in batched_inputs.keys():
            inputs_batch = batched_inputs[key]
            targets_batch = batched_targets[key]
#             print(inputs_batch.shape)
#             print(targets_batch.shape)
            if len(inputs_batch) == 0:
                continue
            feed_dict = {input_placeholder:inputs_batch, target_placeholder:targets_batch}
            _, acc, err = sess.run([optimizer, accuracy, loss], feed_dict=feed_dict)
            accs += acc
            errs += err
            n += len(inputs_batch)
        print('#Training Epoch %d, acc: %f, err: %f, n: %d'%(i, accs/n, errs/n, n))
        if (i+1)%5 == 0:
            for key in v_batched_inputs.keys():
                inputs_batch = v_batched_inputs[key]
                targets_batch = v_batched_targets[key]
#                 print(inputs_batch.shape)
#                 print(targets_batch.shape)
                if len(v_inputs_batch) == 0:
                    continue
                feed_dict = {input_placeholder:inputs_batch, target_placeholder:targets_batch}
                acc, err = sess.run([accuracy, loss], feed_dict=feed_dict)
                accs += acc
                errs += err
                n += len(inputs_batch)
            print('__________________________________________________________________________')
            print('#Testing Epoch %d, acc: %f, err: %f, n: %d'%(i, accs/n, errs/n, n))
            print('__________________________________________________________________________')

KeyboardInterrupt: 

# Generate with padding

In [None]:
models = gensim.models.Word2Vec.load('../word2vec/word2vec_anouymous.model')
label_map = pickle.load(open('../word2vec/gensim_word2vec_label_map.npz', 'rb'))
train_provider = dataProvider('../word2vec/train.txt', models, label_map)
train_provider.extract()
pickle.dump(train_p rovider.data, open('../word2vec/anouymous_train_data.npz', 'wb'))

valid_provider = dataProvider('../word2vec/valid.txt', models, label_map)
valid_provider.extract()
pickle.dump(valid_provider.data, open('../word2vec/anouymous_valid_data.npz', 'wb'))