In [1]:
import gensim
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
from tqdm import tqdm, tqdm_notebook



In [2]:
class corpusClean():

    def __init__(self):
        self.num_re = re.compile('[0-9一二三四五六七八九十]+')
        self.char_re = re.compile('[a-zA-Z\-\,\.\。\，]+')

    def clean(self, line):
        sentence = line
        removed_char = re.sub(self.char_re, ' <CHAR> ', sentence)
        removed_num = re.sub(self.num_re, ' <NUMBER> ', removed_char)
        words = removed_num.split()
        saved_words = ['<BEGIN>']
        for word in words:
            if len(word) == 1 or word.find('<') != -1:
                saved_words.append(word)
            else:
                for w in word:
                    saved_words.append(w)
        saved_words.append('<END>')
        return saved_words
    
    
class layer(object):
    def __init__(self):
        raise NotImplementedError
    
    def __str__(self):
        raise NotImplementedError
    
class affine_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('affine', reuse=reuse):
            self.weights = tf.get_variable(name = name+'_weights', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.matmul(inputs, self.weights), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class conv_layer(layer):
    def __init__(self, name, inputs, shape, activation, reuse=False):
        with tf.variable_scope('conv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([shape[-1]]))
            self.outputs = activation(tf.add(tf.nn.conv2d(inputs, self.kernel, padding='VALID', strides=[1,1,1,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class pooling_layer(layer):
    def __init__(self, name, inputs, reuse=False):
        with tf.variable_scope('pooling', reuse=reuse):
            self.outputs = tf.nn.max_pool(name = name+'_maxpooling', value=inputs, ksize=[1,2,2,1], padding='VALID', strides=[1,2,2,1])
    def __str__(self):
        return self.outputs.__str__()
        
class reshape_layer(layer):
    def __init__(self, name, inputs, shape, reuse=False):
        with tf.variable_scope('reshape', reuse=reuse):
            self.outputs = tf.reshape(name=name, tensor=inputs, shape=shape)
    def __str__(self):
        return self.outputs.__str__()
        
class deconv_layer(layer):
    def __init__(self, name, inputs, kernel_shape, output_shape, activation, reuse=False):
        with tf.variable_scope('deconv', reuse=reuse):
            self.kernel = tf.get_variable(name = name+'_kernel', initializer=tf.truncated_normal(kernel_shape, stddev=0.05), dtype=tf.float32)
            self.bias = tf.get_variable(name = name+'_bias', initializer=tf.zeros([kernel_shape[-2]]))
            self.outputs = activation(tf.add(tf.nn.conv2d_transpose(inputs, self.kernel, output_shape=output_shape, padding='VALID', strides=[1,2,2,1]), self.bias))
    def __str__(self):
        return self.outputs.__str__()
    
class lstm_layer(layer):
    def __init__(self, name, inputs, n_units, reuse=False):
        with tf.variable_scope('lstm', reuse=reuse):
            self.outputs, self.states = tf.nn.dynamic_rnn(
                cell = tf.contrib.rnn.BasicLSTMCell(n_units),
                inputs = inputs,
                dtype = tf.float32)
    def __str__(self):
        return self.outputs.__str__()

In [3]:
sentences = open('../data/cleaned_corpus.txt', encoding='utf-8').readlines()
lines = open('../data/TrainSet-eCarX-171019.txt', encoding='gbk').readlines()
vector_size = 200
ngram = 3
min_count = 2
workers = 4
models = gensim.models.Word2Vec(sentences, size=vector_size, window=ngram, min_count=min_count, workers=workers)
cleaner = corpusClean()

In [None]:
models.save('../word2vec/word2vec.model')

In [None]:
train_data = dict()
train_target = dict()
for sentence, line in zip(sentences, lines):
    words = sentence.split()
    vector = np.zeros((len(words), vector_size))
    for word,i in zip(words, range(len(words))):
        try:
            vector[i] = models.wv[word]
        except:
            continue
    target = line.split()[0]
    if len(words) not in train_data.keys():
        train_data[len(words)] = []
        train_target[len(words)] = []
    train_data[len(words)].append(vector)
    train_target[len(words)].append(target)
    
print(train_data.keys())
for key in train_data.keys():
    train_data[key] = np.array(train_data[key])


In [None]:
def one_hot(targets, n_target):
    targets_set = list(set(targets))
    results = np.zeros((len(targets), n_target))
    for i in range(len(targets)):
        results[i][targets_set.index(targets[i])] = 1.0
    return results

def one_hot_dict(targets, n_target):
    results = targets
    for key in targets.keys():
        results[key] = one_hot(targets[key], n_target)
    return results

In [None]:
targets = [i.split()[0] for i in lines]
n_target = len(set(targets))
train_target_one_hot = one_hot_dict(train_target, n_target)

In [9]:
print(train_data[13].shape)
print(train_target[13].shape)
print(train_target_one_hot[13].shape)

(5843, 13, 200)
(5843, 134)
(5843, 134)


In [None]:
graph = tf.Graph()
with graph.as_default():
    
    input_placeholder = tf.placeholder(tf.float32, [None, None, 200])
    target_placeholder = tf.placeholder(tf.float32, [None, 134])
    
    lstm_layer1 = lstm_layer('lstm1', input_placeholder, 200)
    affine1 = affine_layer('affine1', lstm_layer1.outputs[:,-1,:], [200, 256], tf.nn.relu)
    affine2 = affine_layer('affine2', affine1.outputs, [256, 256], tf.nn.relu)
    affine3 = affine_layer('affine3', affine2.outputs, [256, 134], tf.identity)
    
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=affine3.outputs, labels=target_placeholder))
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
    accuracy = tf.reduce_sum(tf.cast(tf.equal(tf.argmax(affine3.outputs, 1), tf.argmax(target_placeholder, 1)), tf.float32))
    
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    
    for i in range(500):
        
        accs = 0.0
        errs = 0.0
        n = 0
        for key,i in zip(train_data.keys(), range(len(train_data.keys()))):
            inputs_batch = train_data[key]
            targets_batch = train_target_one_hot[key]
#             print(inputs_batch.shape)
#             print(targets_batch.shape)
            feed_dict = {input_placeholder:inputs_batch, target_placeholder:targets_batch}
            _, acc, err = sess.run([optimizer, accuracy, loss], feed_dict=feed_dict)
            accs += acc
            errs += err
            n += len(inputs_batch)
        print('#Training Epoch %d, acc: %f, err: %f, n: %d'%(i, accs/n, errs/n, n))

#Training Epoch 35, acc: 0.008209, err: 0.000053, n: 134000


In [None]:
a = 1