In [2]:
import ast
import pickle
import random
from datetime import datetime

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim import corpora, models
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from text_embedder import TextEmbedder

In [3]:
# load pre-trained data
business = pd.read_csv('../data/chinese_business_clean.csv')
reviews = pd.read_csv('../data/chinese_reviews_clean_offsets.csv')
# 'date_tuple' comes back from the CSV as text. ast.literal_eval only accepts
# Python literals, whereas eval() would execute whatever code a cell contains.
reviews['date_tuple'] = [ast.literal_eval(i) for i in reviews['date_tuple']]

# load gensim model
lda = models.LdaModel.load('../data/gensim/lda.model')
# NOTE(review): 'chinsese_dict.dict' looks like a typo, but it has to match the
# file name on disk, so it is left unchanged -- confirm before renaming.
dictionary = corpora.Dictionary.load('../data/gensim/chinsese_dict.dict')

# load idf matrices
# NOTE(review): pickle.load can run arbitrary code stored in the file; only
# load pickles that were produced by this project.
with open('../data/u_idf.pickle', 'rb') as f:
    uidf_data = pickle.load(f)
with open('../data/b_idf.pickle', 'rb') as f:
    bidf_data = pickle.load(f)

In [11]:
# Bundle the LDA model, its dictionary and the two idf tables into one embedder.
embedder = TextEmbedder(
    model = lda,
    dictionary = dictionary,
    user_idf = uidf_data,
    business_idf = bidf_data,
)

In [5]:
def layer(input_data, size_in, size_out, name):
    '''
    Build one fully connected hidden layer with an arctangent activation.

    input_data -- tensor that is matrix-multiplied with the layer weights
    size_in    -- number of rows of the weight matrix
    size_out   -- number of columns of the weight matrix and length of the bias
    name       -- TensorFlow name scope that holds the layer's ops and variables

    Returns the tensor atan(input_data . W + B).
    '''
    with tf.name_scope(name):
        # weights first, then bias: both are initialised from tf.random_normal
        weights = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        bias = tf.Variable(tf.random_normal([size_out]), name = 'B')

        pre_activation = tf.matmul(input_data, weights) + bias
        squashed = tf.atan(pre_activation)

        return squashed

In [6]:
def output_layer(input_data, size_in, size_out, name):
    '''
    Build the final fully connected layer and expose its weight matrix.

    input_data -- tensor that is matrix-multiplied with the layer weights
    size_in    -- number of rows of the weight matrix
    size_out   -- number of columns of the weight matrix and length of the bias
    name       -- TensorFlow name scope that holds the layer's ops and variables

    Returns (activation, w): the tensor atan(input_data . W + B) and the
    weight variable W itself.
    '''
    with tf.name_scope(name):
        # weights first, then bias: both are initialised from tf.random_normal
        weights = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        bias = tf.Variable(tf.random_normal([size_out]), name = 'B')

        pre_activation = tf.matmul(input_data, weights) + bias
        squashed = tf.atan(pre_activation)

        return squashed, weights

In [7]:
def build_model(x,input_size, hidden, out_size, gate_dimention = 0.0, drop_out = 1.0):
    '''
    Assemble the feed-forward network: optional random input gate, optional
    dropout, a stack of hidden layers and an output layer.

    x              -- input tensor with input_size features per row
    input_size     -- number of input features
    hidden         -- sequence of hidden-layer widths; may be empty, in which
                      case the output layer is fed directly from the input
    out_size       -- width of the output layer
    gate_dimention -- how many randomly chosen input features pass the gate
                      (all others are multiplied by zero); 0 disables the gate
    drop_out       -- second argument given to tf.nn.dropout; 1.0 disables dropout

    Returns (out, embedding_in, weights): the output-layer activation, the
    tensor that feeds the output layer, and the output layer's weight variable.
    '''
    activation = x

    # random filter gate: keep `gate_dimention` features, zero out the rest
    if gate_dimention != 0.0:
        gate = np.zeros(input_size)
        # sample from a range: random.sample() no longer accepts sets
        kept = random.sample(range(input_size), gate_dimention)
        gate[kept] = 1.0
        activation = tf.multiply(activation, gate)

    # add dropout layer with specified probability; it is applied to the
    # (possibly gated) activation so that an active gate is not thrown away.
    # NOTE(review): drop_out is a plain Python value baked into the graph, so
    # dropout stays active whenever the returned tensors are evaluated.
    if drop_out != 1.0:
        activation = tf.nn.dropout(activation, drop_out)

    # build a series of hidden layers, tracking the width of the latest one
    prev = input_size
    for index, width in enumerate(hidden):
        activation = layer(activation, prev, width, 'hiddenlayer-'+str(index))
        prev = width

    # build an output layer; `prev` equals hidden[-1] when hidden is non-empty
    embedding_in = activation
    if out_size == 1:
        # NOTE(review): output_layer_sm is not defined in the visible part of
        # this notebook -- confirm it exists before calling with out_size == 1.
        out, weights = output_layer_sm(activation, prev, out_size, 'output')
    else:
        out, weights = output_layer(activation, prev, out_size, 'output')

    return out, embedding_in, weights

In [8]:
# try basic methods with January of 2013 - 2017
def modified_split(df, year_month, embedder, t_size = 0.2, enum = 0, binary = False):
    '''
    Select the reviews of one period and embed their text.

    df         -- DataFrame with 'date_tuple', 'offset' and 'text' columns, plus
                  'user_id' / 'business_id' for the embeddings that use them
    year_month -- value compared for equality with each 'date_tuple' entry
    embedder   -- object providing the embedding methods listed below
    t_size     -- unused; kept so existing positional calls keep working.
                  Because it sits before enum, pass enum by keyword.
    enum       -- embedding to use:
                  0 augmented_embed_text(text)
                  1 user_tfidf_embed(text, user_id)
                  2 user_tf_business_idf(text, business_id)
                  3 user_tfidf_business_idf(text, user_id, business_id)
                  4 embed(text)
                  5 embed_sent(text)
    binary     -- when True, rows whose offset equals 0 are dropped

    Returns (embed, label): an array with one embedding per selected row and
    the matching 'offset' values. For an unsupported enum a message is printed
    and None is returned.
    '''
    # select regions; compare element by element so the tuple is treated as a
    # single value and not as an array-like to be aligned with the column
    in_period = df['date_tuple'].map(lambda stamp: stamp == year_month).astype(bool)
    data = df[in_period]
    if binary:
        data = data[data['offset'] != 0]
    # create labels
    label = data['offset'].values
    texts = data['text'].values
    if enum == 0:
        rows = [embedder.augmented_embed_text(t) for t in texts]
    elif enum == 1:
        rows = [embedder.user_tfidf_embed(t, u) for t, u in zip(texts, data['user_id'].values)]
    elif enum == 2:
        rows = [embedder.user_tf_business_idf(t, b) for t, b in zip(texts, data['business_id'].values)]
    elif enum == 3:
        rows = [embedder.user_tfidf_business_idf(t, u, b) for t, u, b in zip(texts, data['user_id'].values, data['business_id'].values)]
    elif enum == 4:
        rows = [embedder.embed(t) for t in texts]
    elif enum == 5:
        rows = [embedder.embed_sent(t) for t in texts]
    else:
        print ('enum {} is not supported'.format(enum))
        return None
    return np.array(rows), label

In [30]:
# label data, try to predict simple labels -- positive(1), negative(-1) or average(0)
def labels(data):
    '''
    One-hot encode offsets into three classes.

    A negative offset becomes [1, 0, 0], an offset equal to 0.0 becomes
    [0, 1, 0] and anything else becomes [0, 0, 1].

    Returns an array with one encoded row per element of data.
    '''
    def encode(offset):
        if offset < 0.0:
            return np.array([1., 0., 0.])
        if offset == 0.0:
            return np.array([0., 1., 0.])
        return np.array([0., 0., 1.0])

    return np.array([encode(offset) for offset in data])

In [45]:
def labels_binary(data):
    '''
    One-hot encode offsets into two classes.

    A negative offset becomes [1, 0]; every other value (including 0.0)
    becomes [0, 1].

    Returns an array with one encoded row per element of data.
    '''
    encoded = [
        np.array([1., 0.]) if offset < 0.0 else np.array([0., 1.0])
        for offset in data
    ]
    return np.array(encoded)

In [31]:
def train(x_data, y_data, x_test, y_test, training_epoch, beta = 0.0, gate_size = 0.0, drop_out = 1.0, learning_rate = 0.01, hidden_layer = [100, 80], out_layer = 5, batch_size = 10, report_every = 50000):
    '''
    Build the network, train it with Adam on random mini-batches and print
    the metric obtained on the test set.

    x_data, y_data -- training inputs (2-D array) and targets with out_layer columns
    x_test, y_test -- held-out inputs and targets, evaluated once after training
    training_epoch -- number of optimisation steps (one mini-batch per step)
    beta           -- weight of the L2 penalty on the output-layer weights
                      (only applied when out_layer != 1)
    gate_size      -- forwarded to build_model as gate_dimention
    drop_out       -- forwarded to build_model as drop_out
    learning_rate  -- learning rate of the Adam optimiser
    hidden_layer   -- hidden-layer widths forwarded to build_model (not modified here)
    out_layer      -- number of outputs; 1 switches the loss to 10 * mean squared error
    batch_size     -- rows drawn without replacement per step (capped at the
                      number of training rows)
    report_every   -- print the progress metric every this many steps;
                      0 or None disables progress output

    The reported metric is the share of rows whose arg-max prediction matches
    the arg-max target, or the loss itself when out_layer == 1.
    Prints progress and the final test metric; returns None.
    '''
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)
    if x_data.ndim != 2 or len(x_data) == 0:
        raise ValueError('x_data must be a non-empty 2-D array')

    n_samples = len(x_data)
    input_size = x_data.shape[1]   # derived from the data instead of a fixed 128
    rows_per_step = min(batch_size, n_samples)
    row_ids = range(n_samples)     # built once; sampled from on every step

    # Each call gets its own graph. Building into the global default graph
    # would pile up the ops and variables of every earlier call.
    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, shape = [None, input_size], name = 'input_topic') # number of topics
        y = tf.placeholder(tf.float32, shape = [None, out_layer], name = 'softmax')

        out, embedding_in, weights = build_model(x, input_size, hidden_layer, out_layer, gate_size, drop_out)

        # loss
        with tf.name_scope("loss"):
            if out_layer == 1:
                cross_entropy = tf.multiply(tf.reduce_mean(tf.pow(out - y, 2)), 10)
            else:
                cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits= out, labels = y))
                regularizer = tf.nn.l2_loss(weights)
                cross_entropy = cross_entropy + beta * regularizer
        # optimization
        with tf.name_scope("train"):
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
        # reports
        with tf.name_scope("accuracy"):
            if out_layer == 1:
                correct = tf.reduce_mean(tf.cast(cross_entropy , dtype = tf.float32))
            else:
                correct = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), tf.argmax(y, 1)), dtype = tf.float32))

        print ('training starts ...')
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(training_epoch):
                idx = random.sample(row_ids, rows_per_step)
                x_in = x_data[idx]
                y_out = y_data[idx]

                if report_every and epoch % report_every == 0:
                    # coarse progress check on the first 10 training rows only
                    [accuracy] = sess.run([correct], feed_dict = {x:x_data[:10], y:y_data[:10]})
                    print ('%.4f' % accuracy)
                sess.run(opt, feed_dict = {x:x_in, y:y_out})

            # Calculate accuracy
            print("Accuracy:", sess.run(correct, feed_dict = {x: x_test, y: y_test}))

In [12]:
%time x0, y0 = modified_split(reviews, (2013, 1), embedder, 0)

CPU times: user 19.6 s, sys: 56.2 ms, total: 19.7 s
Wall time: 18.5 s


In [32]:
# One-hot encode the offsets into three classes, then hold out 20% of the rows.
# y0 is overwritten here, so re-run the modified_split cell before re-running this one.
y0 = labels(data = y0)
X_train, X_test, y_train, y_test = train_test_split(
    x0, y0,
    test_size = 0.2,
)

In [34]:
# 500000 optimisation steps, three output classes
train(X_train, y_train, X_test, y_test, training_epoch = 500000, out_layer = 3)

training starts ...
0.1000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
0.7000
Accuracy: 0.433692


In [35]:
%time x1, y1 = modified_split(reviews, (2013, 1), embedder, 1)

CPU times: user 19.6 s, sys: 58.7 ms, total: 19.7 s
Wall time: 18.6 s


In [36]:
# One-hot encode the offsets into three classes, then hold out 20% of the rows.
# y1 is overwritten here, so re-run the modified_split cell before re-running this one.
y1 = labels(data = y1)
X_train, X_test, y_train, y_test = train_test_split(
    x1, y1,
    test_size = 0.2,
)

In [37]:
# 200000 optimisation steps, three output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 3)

training starts ...
0.3000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
Accuracy: 0.480287


In [38]:
%time x2, y2 = modified_split(reviews, (2013, 1), embedder, 2)

CPU times: user 19.7 s, sys: 69.2 ms, total: 19.7 s
Wall time: 18.6 s


In [39]:
# One-hot encode the offsets into three classes, then hold out 20% of the rows.
# y2 is overwritten here, so re-run the modified_split cell before re-running this one.
y2 = labels(data = y2)
X_train, X_test, y_train, y_test = train_test_split(
    x2, y2,
    test_size = 0.2,
)

In [40]:
# 200000 optimisation steps, three output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 3)

training starts ...
0.3000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
Accuracy: 0.451613


In [41]:
%time x3, y3 = modified_split(reviews, (2013, 1), embedder, 3)

CPU times: user 18.9 s, sys: 29.1 ms, total: 19 s
Wall time: 17.8 s


In [42]:
# One-hot encode the offsets into three classes, then hold out 20% of the rows.
# y3 is overwritten here, so re-run the modified_split cell before re-running this one.
y3 = labels(data = y3)
X_train, X_test, y_train, y_test = train_test_split(
    x3, y3,
    test_size = 0.2,
)

In [43]:
# 200000 optimisation steps, three output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 3)

training starts ...
0.5000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
0.2000
Accuracy: 0.448029


## Test with binary classes (reviews with zero offset removed)

In [44]:
%time x0, y0 = modified_split(reviews, (2013, 1), embedder, 0, binary = True)

CPU times: user 11.9 s, sys: 49.7 ms, total: 12 s
Wall time: 11.4 s


In [46]:
# One-hot encode into two classes. y0 is overwritten, so re-run the
# modified_split cell before re-running this one.
y0 = labels_binary(data = y0)

In [48]:
# hold out 20% of the rows for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(
    x0, y0,
    test_size = 0.2,
)

In [50]:
# 200000 optimisation steps, two output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 2)

training starts ...
0.4000
0.4000
0.4000
0.6000
0.6000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
Accuracy: 0.470199


In [52]:
%time x1, y1 = modified_split(reviews, (2013, 1), embedder, 1, binary = True)

CPU times: user 11.4 s, sys: 28.9 ms, total: 11.5 s
Wall time: 10.9 s


In [53]:
# One-hot encode into two classes. y1 is overwritten, so re-run the
# modified_split cell before re-running this one.
y1 = labels_binary(data = y1)

In [54]:
# hold out 20% of the rows for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(
    x1, y1,
    test_size = 0.2,
)

In [55]:
# 200000 optimisation steps, two output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 2)

training starts ...
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.7000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
Accuracy: 0.496689


In [56]:
%time x2, y2 = modified_split(reviews, (2013, 1), embedder, 2, binary = True)

CPU times: user 11.3 s, sys: 17.3 ms, total: 11.3 s
Wall time: 10.7 s


In [57]:
# One-hot encode into two classes. y2 is overwritten, so re-run the
# modified_split cell before re-running this one.
y2 = labels_binary(data = y2)

In [58]:
# hold out 20% of the rows for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(
    x2, y2,
    test_size = 0.2,
)

In [59]:
# 200000 optimisation steps, two output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 2)

training starts ...
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
0.4000
Accuracy: 0.536424


In [60]:
%time x3, y3 = modified_split(reviews, (2013, 1), embedder, 3, binary = True)

CPU times: user 11.3 s, sys: 19.1 ms, total: 11.3 s
Wall time: 10.7 s


In [61]:
# One-hot encode into two classes. y3 is overwritten, so re-run the
# modified_split cell before re-running this one.
y3 = labels_binary(data = y3)

In [62]:
# hold out 20% of the rows for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(
    x3, y3,
    test_size = 0.2,
)

In [63]:
# 200000 optimisation steps, two output classes
train(X_train, y_train, X_test, y_test, training_epoch = 200000, out_layer = 2)

training starts ...
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
0.5000
Accuracy: 0.503311
