### Hypothesis: If one review is talking about the representitive latent topic of given business, people tend to verify as useful 

In [55]:
from gensim_lda_model import Gensimembedder
from gensim import corpora, models
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
import numpy as np
import tensorflow as tf
import random

In [56]:
business = pd.read_csv('chinese_business_clean.csv')
reviews = pd.read_csv('chinese_reviews_clean.csv')

In [57]:
lda =  models.LdaModel.load('gensim/lda.model')
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')

In [58]:
model = Gensimembedder(model = lda, dictionary = dictionary)

### Case study with yfxDa8RFOvJPQh0rNtakHA

In [59]:
business[business['business_id'] == 'yfxDa8RFOvJPQh0rNtakHA']

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
3109,4029 Spring Mountain Rd,"{'RestaurantsTableService': True, 'GoodForMeal...",yfxDa8RFOvJPQh0rNtakHA,"['Vietnamese', 'Restaurants', 'Chinese']",Las Vegas,"{'Monday': '0:00-0:00', 'Tuesday': '0:00-0:00'...",1,36.126194,-115.193445,Pho Kim Long,Chinatown,89102,2445,3.5,NV


In [60]:
case_review = reviews[reviews['business_id'] == 'yfxDa8RFOvJPQh0rNtakHA']

In [61]:
case_review = case_review.sort_values(by = 'date')

In [62]:
len(case_review)

2446

In [63]:
id_train, id_test, star_train, star_test = train_test_split(case_review['review_id'], case_review['stars'], test_size=0.33)

In [64]:
def gen_data(ids):
    '''
    generate embedding from the set of ids
    '''
    data = reviews[reviews['review_id'].isin(ids)]
    out = [] 
    for d in data['text']:
        out.append(model.embed_sent(d))
    return np.array(out)

In [65]:
data = reviews[reviews['review_id'] == id_train.values[0]]

In [66]:
def gen_bow_data(ids):
    '''
    generate bag of words embedding from the set of ids
    '''
    data = reviews[reviews['review_id'].isin(ids)]
    out = [] 
    for d in data['text']:
        out.append(model.embed_bow(d))
    return np.array(out)

In [68]:
%time embed_train = gen_data(id_train)

CPU times: user 20.5 s, sys: 83.7 ms, total: 20.6 s
Wall time: 18.8 s


In [69]:
embed_test = gen_data(id_test)

In [70]:
def one_hot(stars):
    res = []
    for s in stars:
        out = np.array([0.0]*5)
        out[s-1] = 1.0
        res.append(out)
    return np.array(res)

In [71]:
def one_hot(stars):
    res = []
    for s in stars:
        out = np.array([0.0]*3)
        if s == 5:
            out[2] = 1.0
        elif s==1:
            out[0] = 1.0
        else:
            out[1] = 1.0
        res.append(out)
    return np.array(res)

In [72]:
one_hot_star = one_hot(star_train)

In [73]:
one_hot_star_test = one_hot(star_test)

## Build a feedforward nn

In [74]:
def layer(input_data, size_in, size_out, name):
    with tf.name_scope(name):
        # weight as random normal variables
        w = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        # bias as random normal variables
        b = tf.Variable(tf.random_normal([size_out]), name = 'B')

        activation = tf.atan(tf.matmul(input_data, w) + b)

        return activation

In [75]:
def output_layer(input_data, size_in, size_out, name):
    with tf.name_scope(name):
        # weight as random normal variables
        w = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        # bias as random normal variables
        b = tf.Variable(tf.random_normal([size_out]), name = 'B')

        activation = tf.atan(tf.matmul(input_data, w) + b)

        return activation

In [76]:
def build_model(x, input_size, hidden, out_size):
    with tf.name_scope("model"):
        prev = input_size
        activation = x 
        # build a series of hidden layers
        for name,i in enumerate(hidden):
            activation = layer(activation, prev, i, 'hiddenlayer-'+str(name))
            prev = i

        # build an output layer
        out = output_layer(activation, hidden[-1], out_size, 'output')

        return out

In [77]:
x = tf.placeholder(tf.float32, shape = [None, 128], name = 'input_topic') # number of topics
x_sparse = tf.sparse_placeholder(tf.float32, name = 'input_topic')

y = tf.placeholder(tf.float32, shape = [None, 3], name = 'softmax') # 5 stars
learning_rate = 0.05
out = build_model(x, 128, [100], 3) # shape of (?, 5)

# loss
with tf.name_scope("loss"):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits= out, labels = y))
# optimization
with tf.name_scope("train"):
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
# reports
with tf.name_scope("accuracy"):
    correct = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), tf.argmax(y, 1)), dtype = tf.float32))

In [78]:
training_epoch = 50000

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(training_epoch):
        
        idx = random.sample(set(np.arange(1638)), 10)
        x_in = embed_train[idx]
        y_out = one_hot_star[idx]

        r = random.randint(0,1638-11)
        if epoch % 5000 == 0:
            [accuracy] = sess.run([correct], feed_dict = {x:embed_train[:10], y:one_hot_star[:10]})
            print ('%.2f' % accuracy)
        sess.run(opt, feed_dict = {x:x_in, y:y_out})
    
    pred = tf.nn.softmax(out)  # Apply softmax to logits
    # Calculate accuracy

    print("Accuracy:", sess.run(correct, feed_dict = {x: embed_test, y: one_hot_star_test}))

0.30
0.60
0.50
0.40
0.70
0.70
0.60
0.60
0.50
0.60
Accuracy: 0.587871
