# Base line model 

## Adding random gate after input layer, see if the training result contains variance

In [2]:
from gensim_lda_model import Gensimembedder
from gensim import corpora, models
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import random

In [3]:
def gen_data(ids):
    '''
    generate embedding from the set of ids
    '''
    data = reviews[reviews['review_id'].isin(ids)]
    out = []
    for d in data['text']:
        out.append(model.embed_sent(d))
    return np.array(out)

In [4]:
def one_hot(stars):
    '''
    build one hot encoding for the star rating
    '''
    res = []
    for s in stars:
        out = np.array([0.0]*5)
        out[s-1] = 1.0
        res.append(out)
    return np.array(res)

In [6]:
def layer(input_data, size_in, size_out, name):
    '''
    Implement tensor
    '''
    with tf.name_scope(name):
        # weight as random normal variables
        w = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        # bias as random normal variables
        b = tf.Variable(tf.random_normal([size_out]), name = 'B')
        activation = tf.atan(tf.matmul(input_data, w) + b)

        return activation

In [7]:
def output_layer(input_data, size_in, size_out, name):
    '''
    output tensor
    '''
    with tf.name_scope(name):
        # weight as random normal variables
        w = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        # bias as random normal variables
        b = tf.Variable(tf.random_normal([size_out]), name = 'B')

        activation = tf.atan(tf.matmul(input_data, w) + b)

        return activation

In [8]:
def build_model_with_gate(x, gate_dimention ,input_size, hidden, out_size):
    '''
    implement a filter gate before input
    '''
    # implement random filter here
    gate = np.zeros(input_size)
    i = random.sample(set(np.arange(input_size)), gate_dimention)
    gate[[i]] = 1.0

    prev = input_size
    activation = tf.multiply(x, gate)

    # build a series of hidden layers
    for name,i in enumerate(hidden):
        activation = layer(activation, prev, i, 'hiddenlayer-'+str(name))
        prev = i

    # build an output layer
    embedding_in = activation
    out = output_layer(activation, hidden[-1], out_size, 'output')

    return out, embedding_in

In [45]:
def build_model(x,input_size, hidden, out_size, gate_dimention = 0.0, drop_out = 1.0):
    '''
    implement a filter gate before input
    '''
    # implement random filter here
    
    prev = input_size
    if gate_dimention != 0.0:
        gate = np.zeros(input_size)
        i = random.sample(set(np.arange(input_size)), gate_dimention)
        gate[[i]] = 1.0
        activation = tf.multiply(x, gate)
    else:
        activation = x
        
    # add dropout layer with specified probability
    if drop_out != 1.0:
        activation = tf.nn.dropout(x, drop_out)

    # build a series of hidden layers
    for name,i in enumerate(hidden):
        activation = layer(activation, prev, i, 'hiddenlayer-'+str(name))
        prev = i

    # build an output layer
    embedding_in = activation
    out = output_layer(activation, hidden[-1], out_size, 'output')

    return out, embedding_in

In [10]:
# load pre-trained data
business = pd.read_csv('chinese_business_clean.csv')
reviews = pd.read_csv('chinese_reviews_clean.csv')

lda =  models.LdaModel.load('gensim/lda.model')
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')

In [11]:
model = Gensimembedder(model = lda, dictionary = dictionary)

In [16]:
reviews.groupby('business_id').count().reset_index().sort_values(by = 'review_id', ascending = False)[:5]

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
3692,yfxDa8RFOvJPQh0rNtakHA,2446,2446,2446,2446,2446,2446,2446,2446
3088,pH0BLkL4cbxKzu471VZnuA,1972,1972,1972,1972,1972,1972,1972,1972
1944,X8c23dur0ll2D9XTu-I8Qg,1548,1548,1548,1548,1548,1548,1548,1548
953,GJ_bXUPv672YwNg4TneJog,1319,1319,1319,1319,1319,1319,1319,1319
2312,cHdJXLlKNWixBXpDwEGb_A,1262,1262,1262,1262,1262,1262,1262,1262


In [20]:
# focus on multiple business
case_review1 = reviews[reviews['business_id'] == 'yfxDa8RFOvJPQh0rNtakHA']
case_review2 = reviews[reviews['business_id'] == 'pH0BLkL4cbxKzu471VZnuA']
case_review3 = reviews[reviews['business_id'] == 'X8c23dur0ll2D9XTu-I8Qg']
case_review4 = reviews[reviews['business_id'] == 'GJ_bXUPv672YwNg4TneJog']
case_review5 = reviews[reviews['business_id'] == 'cHdJXLlKNWixBXpDwEGb_A']

In [43]:
def train(data, gate_size, training_epoch, drop_out = 1.0, learning_rate = 0.01, hidden_layer = [100, 80], out_layer = 5):
    id_train, id_test, star_train, star_test = train_test_split(data['review_id'], data['stars'], test_size=0.2)

    # generate data by specific embedding 
    embed_train = gen_data(id_train)
    embed_test = gen_data(id_test)

    # generate labels
    one_hot_star = one_hot(star_train)
    one_hot_star_test = one_hot(star_test)

    x = tf.placeholder(tf.float32, shape = [None, 128], name = 'input_topic') # number of topics
    y = tf.placeholder(tf.float32, shape = [None, out_layer], name = 'softmax') # 5 stars

    embedded_size = hidden_layer[-1]
    out, embedding_in = build_model(x, 128, hidden_layer, out_layer, gate_size, drop_out) # shape of (?, 5)
    
    # loss
    with tf.name_scope("loss"):
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits= out, labels = y))
    # optimization
    with tf.name_scope("train"):
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
    # reports
    with tf.name_scope("accuracy"):
        correct = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), tf.argmax(y, 1)), dtype = tf.float32))
        
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(training_epoch):

            idx = random.sample(set(np.arange(len(embed_train))), 10)
            x_in = embed_train[idx]
            y_out = one_hot_star[idx]            
            if epoch % 10000 == 0:
                [accuracy] = sess.run([correct], feed_dict = {x:embed_train[0:10], y:one_hot_star[0:10]})
                print ('%.2f' % accuracy)
            sess.run(opt, feed_dict = {x:x_in, y:y_out})

        pred = tf.nn.softmax(out)  # Apply softmax to logits
        # Calculate accuracy
        print("Accuracy:", sess.run(correct, feed_dict = {x: embed_test, y: one_hot_star_test}))

In [36]:
# gate with 100 topics
for _ in range(5):
    train(case_review1, 100, 100000)

0.10
0.60
0.50
0.50
0.70
0.60
0.60
1.00
1.00
1.00
Accuracy: 0.218367
0.10
0.20
0.20
0.20
0.20
0.30
0.20
0.50
0.50
0.50
Accuracy: 0.244898
0.20
0.40
0.50
0.50
0.50
0.60
0.60
0.80
0.80
0.90
Accuracy: 0.259184
0.00
0.50
0.50
0.50
0.40
0.50
0.60
0.70
0.70
0.80
Accuracy: 0.259184
0.20
0.30
0.40
0.50
0.50
0.50
0.60
0.60
0.60
0.60
Accuracy: 0.259184


In [37]:
# gate with 110 topics
for _ in range(5):
    train(case_review1, 110, 100000)

0.00
0.40
0.40
0.40
0.50
0.50
0.60
0.70
0.80
0.70
Accuracy: 0.234694
0.40
0.10
0.10
0.10
0.10
0.30
0.60
0.60
0.70
0.80
Accuracy: 0.240816
0.40
0.50
0.50
0.50
0.50
0.50
0.50
0.50
0.60
0.60
Accuracy: 0.269388
0.10
0.20
0.30
0.40
0.60
0.60
0.60
0.70
0.70
0.80
Accuracy: 0.244898
0.30
0.30
0.30
0.50
0.40
0.50
0.60
0.70
0.50
0.90
Accuracy: 0.218367


In [38]:
# gate with 120 topics
for _ in range(5):
    train(case_review1, 120, 100000)

0.30
0.30
0.30
0.20
0.50
0.70
0.40
0.70
0.70
0.70
Accuracy: 0.240816
0.00
0.30
0.30
0.40
0.60
0.70
0.90
0.70
0.90
0.90
Accuracy: 0.255102
0.10
0.20
0.20
0.30
0.30
0.40
0.60
0.90
0.80
1.00
Accuracy: 0.271429
0.40
0.50
0.60
0.60
0.60
0.70
0.60
0.70
0.70
0.70
Accuracy: 0.263265
0.40
0.20
0.20
0.40
0.60
0.80
0.80
0.90
0.90
0.90
Accuracy: 0.255102


In [46]:
# no gate, dropout rate 10%
for _ in range(5):
    train(case_review1, 0.0, 150000, drop_out = 0.9)

0.30
0.60
0.60
0.60
0.60
0.60
0.60
0.50
0.60
0.50
Accuracy: 0.285714
0.20
0.20
0.20
0.20
0.30
0.40
0.40
0.40
0.60
0.40
Accuracy: 0.293878
0.40
0.40
0.60
0.60
0.50
0.50
0.60
0.60
0.50
0.50
Accuracy: 0.306122
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.50
0.60
0.50
Accuracy: 0.283673
0.10
0.30
0.30
0.30
0.40
0.50
0.40
0.40
0.40
0.40
Accuracy: 0.285714


In [47]:
# no gate, dropout rate 20%
for _ in range(5):s
    train(case_review1, 0.0, 150000, drop_out = 0.8)

0.10
0.60
0.70
0.70
0.60
0.70
0.70
0.70
0.80
0.70
0.40
0.70
0.70
0.60
0.90
Accuracy: 0.308163
0.30
0.30
0.30
0.30
0.30
0.30
0.30
0.30
0.30
0.40
0.30
0.40
0.30
0.30
0.30
Accuracy: 0.283673
0.20
0.30
0.30
0.30
0.30
0.30
0.30
0.40
0.30
0.40
0.30
0.30
0.30
0.20
0.20
Accuracy: 0.283673
0.00
0.50
0.50
0.40
0.50
0.60
0.50
0.50
0.40
0.40
0.60
0.30
0.60
0.40
0.60
Accuracy: 0.3
0.20
0.40
0.40
0.40
0.50
0.50
0.50
0.50
0.20
0.40
0.30
0.30
0.20
0.40
0.30
Accuracy: 0.295918


In [49]:
# default
for _ in range(5):
    train(case_review1, 0.0, 150000, drop_out = 1.0)

0.10
0.50
0.40
0.20
0.40
0.40
0.50
0.60
0.70
0.70
0.70
0.90
1.00
1.00
1.00
Accuracy: 0.240816
0.10
0.50
0.60
0.60
0.60
0.60
0.50
0.60
0.70
0.70
0.70
0.70
0.70
0.70
0.70
Accuracy: 0.210204
0.10
0.30
0.30
0.40
0.50
0.70
0.90
0.90
1.00
1.00
1.00
1.00
0.90
1.00
1.00
Accuracy: 0.236735
0.10
0.30
0.30
0.40
0.40
0.50
0.60
0.60
0.60
0.60
0.70
0.70
0.70
0.70
0.80
Accuracy: 0.279592
0.40
0.50
0.50
0.70
0.70
0.70
0.80
0.80
0.80
0.70
0.80
0.80
0.80
0.70
0.80
Accuracy: 0.277551
