In [1]:
from text_embedder import TextEmbedder
from gensim import corpora, models
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import random
import pickle

In [2]:
def one_hot(stars):
    '''
    Build a one-hot encoding for 1-5 star ratings.

    Parameters
    ----------
    stars : iterable of numbers
        Star ratings; every value must be a whole number in 1..5.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(stars), 5). Row i holds 1.0 in column
        stars[i] - 1 and 0.0 elsewhere. Empty input yields shape (0, 5).

    Raises
    ------
    ValueError
        If any rating is not a whole number between 1 and 5.
    '''
    ratings = np.asarray(list(stars), dtype=float)
    # Validate up front: a rating of 0 would otherwise index column -1 and be
    # silently encoded as a 5-star label.
    if ratings.size and not np.all(np.isin(ratings, [1, 2, 3, 4, 5])):
        raise ValueError('star ratings must be integers between 1 and 5')
    encoded = np.zeros((ratings.size, 5))
    encoded[np.arange(ratings.size), ratings.astype(int) - 1] = 1.0
    return encoded

In [3]:
def one_hot_three(stars):
    '''
    Encode star ratings into three one-hot buckets.

    Column 0 marks a rating equal to 1, column 2 marks a rating equal to 5,
    and column 1 marks every other value.

    Returns a numpy array with one 3-element float row per rating.
    '''
    def bucket(star):
        # 5 -> last column, 1 -> first column, anything else -> middle column
        if star == 5:
            return 2
        return 0 if star == 1 else 1

    identity = np.eye(3)
    return np.array([identity[bucket(star)] for star in stars])

In [16]:
def layer(input_data, size_in, size_out, name):
    '''
    Fully connected hidden layer with an arctangent activation.

    input_data : tensor multiplied on the right by the weight matrix
    size_in    : number of rows of the weight matrix
    size_out   : number of columns of the weight matrix / length of the bias
    name       : name scope under which the variables are created

    Returns atan(input_data . W + B).
    '''
    with tf.name_scope(name):
        # both parameters start from random-normal draws
        weights = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        bias = tf.Variable(tf.random_normal([size_out]), name = 'B')
        pre_activation = tf.matmul(input_data, weights) + bias
        return tf.atan(pre_activation)

In [17]:
def output_layer(input_data, size_in, size_out, name):
    '''
    Fully connected output layer with an arctangent activation.

    input_data : tensor multiplied on the right by the weight matrix
    size_in    : number of rows of the weight matrix
    size_out   : number of columns of the weight matrix / length of the bias
    name       : name scope under which the variables are created

    Returns (activation, W) where activation = atan(input_data . W + B) and
    W is the weight variable (train uses it for the L2 penalty).

    NOTE(review): train feeds this activation to
    softmax_cross_entropy_with_logits as logits; atan keeps every logit
    inside (-pi/2, pi/2) -- confirm that bound is intended.
    '''
    with tf.name_scope(name):
        # both parameters start from random-normal draws
        weights = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        bias = tf.Variable(tf.random_normal([size_out]), name = 'B')
        pre_activation = tf.matmul(input_data, weights) + bias
        squashed = tf.atan(pre_activation)
        return squashed, weights

In [6]:
def build_model(x,input_size, hidden, out_size, gate_dimention = 0.0, drop_out = 1.0):
    '''
    Assemble the feed-forward graph: optional random input gate, optional
    dropout, a stack of hidden layers and an output layer.

    Parameters
    ----------
    x : input tensor with input_size columns per example
    input_size : number of input features
    hidden : sequence of hidden layer widths (may be empty)
    out_size : number of outputs; 1 selects output_layer_sm, any other value
        selects output_layer
    gate_dimention : how many randomly chosen input features stay switched
        on; 0 disables the gate
    drop_out : second argument handed to tf.nn.dropout (the keep probability
        in the TF 1.x API); 1.0 disables dropout

    Returns
    -------
    (out, embedding_in, weights) : the output-layer activation, the
        activation that was fed into the output layer, and the output-layer
        weight variable.
    '''
    activation = x
    if gate_dimention != 0.0:
        # Random filter: keep a random subset of input features, zero the rest.
        # Sample from a range (not a set): random.sample rejects sets on
        # current Python versions, and the count must be an int.
        gate = np.zeros(input_size)
        kept = random.sample(range(input_size), int(gate_dimention))
        gate[kept] = 1.0
        activation = tf.multiply(activation, gate)

    # Dropout acts on the (possibly gated) activation; applying it to the raw
    # input x would silently throw the gate away.
    if drop_out != 1.0:
        activation = tf.nn.dropout(activation, drop_out)

    # build a series of hidden layers
    prev = input_size
    for index, width in enumerate(hidden):
        activation = layer(activation, prev, width, 'hiddenlayer-'+str(index))
        prev = width

    # build an output layer; prev is the width of whatever feeds it, which
    # also holds when hidden is empty
    embedding_in = activation
    if out_size == 1:
        out, weights = output_layer_sm(activation, prev, out_size, 'output')
    else:
        out, weights = output_layer(activation, prev, out_size, 'output')

    return out, embedding_in, weights

In [7]:
def gen_embedding(enum, df, embedder):
    '''
    Embed every row of df with the embedder method selected by enum.

    enum 0 -> embedder.augmented_embed_text(text)
    enum 1 -> embedder.user_tfidf_embed(text, user_id)
    enum 2 -> embedder.user_tf_business_idf(text, business_id)
    enum 3 -> embedder.user_tfidf_business_idf(text, user_id, business_id)

    Returns a numpy array with one embedding per row, or None (after
    printing a message) when enum is not one of the values above.
    '''
    if enum == 0:
        vectors = [embedder.augmented_embed_text(text)
                   for text in df['text'].values]
    elif enum == 1:
        vectors = [embedder.user_tfidf_embed(text, user)
                   for text, user in zip(df['text'].values, df['user_id'].values)]
    elif enum == 2:
        vectors = [embedder.user_tf_business_idf(text, biz)
                   for text, biz in zip(df['text'].values, df['business_id'].values)]
    elif enum == 3:
        vectors = [embedder.user_tfidf_business_idf(text, user, biz)
                   for text, user, biz in zip(df['text'].values,
                                              df['user_id'].values,
                                              df['business_id'].values)]
    else:
        print ('invalid enum')
        return None

    return np.array(vectors)

In [8]:
# load pre-trained data
# Business and review tables read from CSV.
business = pd.read_csv('../data/chinese_business_clean.csv')
reviews = pd.read_csv('../data/chinese_reviews_clean.csv')

# Saved gensim LDA model and gensim dictionary.
# NOTE(review): the dictionary file name is spelled 'chinsese' -- confirm it
# matches the file on disk.
lda =  models.LdaModel.load('../data/gensim/lda.model')
dictionary = corpora.Dictionary.load('../data/gensim/chinsese_dict.dict')

# NOTE(review): pickle.load can execute arbitrary code while loading; only
# read these files if they come from a trusted source.
# Passed to TextEmbedder as user_idf.
with open('../data/u_idf.pickle', 'rb') as f:
    uidf_data = pickle.load(f)

# Passed to TextEmbedder as business_idf.
with open('../data/b_idf.pickle', 'rb') as f:
    bidf_data = pickle.load(f)

In [9]:
# Single embedder instance shared by every gen_embedding / train call below.
embedder = TextEmbedder(model = lda, dictionary = dictionary, user_idf = uidf_data, business_idf = bidf_data)

In [10]:
# Rank businesses by number of reviews and display the five most-reviewed.
(
    reviews
    .groupby('business_id')
    .count()
    .reset_index()
    .sort_values(by = 'review_id', ascending = False)
    .head(5)
)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
3692,yfxDa8RFOvJPQh0rNtakHA,2380,2380,2380,2380,2380,2380,2380,2380
3088,pH0BLkL4cbxKzu471VZnuA,1948,1948,1948,1948,1948,1948,1948,1948
1944,X8c23dur0ll2D9XTu-I8Qg,1525,1525,1525,1525,1525,1525,1525,1525
953,GJ_bXUPv672YwNg4TneJog,1302,1302,1302,1302,1302,1302,1302,1302
2312,cHdJXLlKNWixBXpDwEGb_A,1246,1246,1246,1246,1246,1246,1246,1246


In [11]:
# Slice out the reviews of each of the five most-reviewed businesses
# (ids taken from the ranking displayed by the previous cell).
case_review1, case_review2, case_review3, case_review4, case_review5 = (
    reviews[reviews['business_id'] == business_id]
    for business_id in (
        'yfxDa8RFOvJPQh0rNtakHA',
        'pH0BLkL4cbxKzu471VZnuA',
        'X8c23dur0ll2D9XTu-I8Qg',
        'GJ_bXUPv672YwNg4TneJog',
        'cHdJXLlKNWixBXpDwEGb_A',
    )
)

In [14]:
def train(data, training_epoch, embedder, enum, beta = 0.0, gate_size = 0.0, drop_out = 1.0, learning_rate = 0.01, hidden_layer = [100, 80], out_layer = 5, seed = None):
    '''
    Train a feed-forward rating model on one review DataFrame and print its
    accuracy on a held-out 20% split.

    Parameters
    ----------
    data : DataFrame with a 'stars' column plus the columns gen_embedding
        reads for the chosen enum
    training_epoch : number of optimisation steps; each step uses a random
        minibatch of up to 10 training rows
    embedder : object handed to gen_embedding
    enum : embedding variant selector, see gen_embedding
    beta : weight of the L2 penalty on the output-layer weights
        (classification heads only)
    gate_size : forwarded to build_model as gate_dimention
    drop_out : forwarded to build_model as drop_out
    learning_rate : Adam learning rate
    hidden_layer : hidden layer widths (the default list is never mutated)
    out_layer : 5 (one_hot labels), 3 (one_hot_three labels) or
        1 (one_hot_soft_max scalar target with a squared-error loss)
    seed : optional int; when given, seeds the train/test split, the random
        module and the TensorFlow graph

    Raises
    ------
    ValueError
        If out_layer is not 1, 3 or 5, or gen_embedding rejects enum.

    Prints a progress metric every 10000 steps (computed on the first 10
    training rows) and the final metric on the test split. For out_layer == 1
    the metric is the loss value, not an accuracy.

    NOTE(review): build_model bakes dropout into the graph, so when
    drop_out != 1.0 the printed metrics are computed with dropout active.
    '''
    # Fail early: an unsupported head would otherwise surface later as a
    # NameError on the label arrays.
    if out_layer not in (1, 3, 5):
        raise ValueError('out_layer must be 1, 3 or 5')
    if seed is not None:
        random.seed(seed)

    id_train, id_test, star_train, star_test = train_test_split(data, data['stars'], test_size=0.2, random_state=seed)

    # generate labels (encoders are looked up lazily, only for the head in use)
    if out_layer == 5:
        one_hot_star = one_hot(star_train)
        one_hot_star_test = one_hot(star_test)
    elif out_layer == 3:
        one_hot_star = one_hot_three(star_train)
        one_hot_star_test = one_hot_three(star_test)
    else:
        one_hot_star = one_hot_soft_max(star_train)
        one_hot_star_test = one_hot_soft_max(star_test)

    print ('embedding dataset ...')
    # training set
    embed_in = gen_embedding(enum, id_train, embedder)
    # test set
    embed_test = gen_embedding(enum, id_test, embedder)
    if embed_in is None or embed_test is None:
        raise ValueError('invalid enum: %r' % (enum,))

    # Take the input width from the embeddings instead of hard-coding 128.
    input_size = embed_in.shape[1]
    n_train = len(id_train)
    batch_size = min(10, n_train)

    # Build every run in its own graph; reusing the default graph would keep
    # piling up ops and variables across successive train() calls.
    with tf.Graph().as_default():
        if seed is not None:
            tf.set_random_seed(seed)

        x = tf.placeholder(tf.float32, shape = [None, input_size], name = 'input_topic')
        y = tf.placeholder(tf.float32, shape = [None, out_layer], name = 'softmax')

        out, _, weights = build_model(x, input_size, hidden_layer, out_layer, gate_size, drop_out)

        # loss
        with tf.name_scope("loss"):
            if out_layer == 1:
                cross_entropy = tf.multiply(tf.reduce_mean(tf.pow(out - y, 2)), 10)
            else:
                cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits= out, labels = y))
                regularizer = tf.nn.l2_loss(weights)
                cross_entropy = tf.reduce_mean(cross_entropy + beta * regularizer)
        # optimization
        with tf.name_scope("train"):
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)
        # reports
        with tf.name_scope("accuracy"):
            if out_layer == 1:
                correct = tf.reduce_mean(tf.cast(cross_entropy , dtype = tf.float32))
            else:
                correct = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(out, 1), tf.argmax(y, 1)), dtype = tf.float32))

        print ('training starts ...')
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(training_epoch):
                # Sample row positions from a range: random.sample rejects
                # sets on current Python versions.
                idx = random.sample(range(n_train), batch_size)

                if epoch % 10000 == 0:
                    accuracy = sess.run(correct, feed_dict = {x:embed_in[:10], y:one_hot_star[:10]})
                    print ('%.4f' % accuracy)
                sess.run(opt, feed_dict = {x:embed_in[idx], y:one_hot_star[idx]})

            # final metric on the held-out split
            print("Accuracy:", sess.run(correct, feed_dict = {x: embed_test, y: one_hot_star_test}))

In [98]:
# 5-class run on embedding variant 0 (augmented_embed_text).
train(
    data = case_review1,
    training_epoch = 2000000,
    embedder = embedder,
    enum = 0,
)

embedding dataset ...
training starts ...
0.20
0.20
0.20
0.20
0.20
0.10
0.30
0.10
0.30
0.20
0.20
0.20
0.20
0.20
0.30
0.30
0.20
0.20
0.20
0.20
Accuracy: 0.35084


In [94]:
# 5-class run on embedding variant 1 (user_tfidf_embed).
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 1,
)

embedding dataset ...
training starts ...
0.10
0.40
0.50
0.70
1.00
1.00
1.00
1.00
0.90
1.00
1.00
1.00
1.00
1.00
1.00
0.90
1.00
1.00
1.00
1.00
Accuracy: 0.355042


In [100]:
# 5-class run on embedding variant 2 (user_tf_business_idf).
train(
    data = case_review1,
    training_epoch = 2000000,
    embedder = embedder,
    enum = 2,
)

embedding dataset ...
training starts ...
0.00
0.40
0.40
0.00
0.00
0.40
0.30
0.40
0.00
0.10
0.30
0.40
0.40
0.40
0.40
0.30
0.40
0.40
0.10
0.10
Accuracy: 0.405462


In [118]:
# 5-class run on embedding variant 3 (user_tfidf_business_idf).
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 3,
)

embedding dataset ...
training starts ...
0.30
0.30
0.80
0.90
0.90
0.90
0.90
0.80
0.90
0.90
0.90
0.90
0.90
0.90
0.90
0.90
0.90
0.90
0.90
0.90
Accuracy: 0.371849


In [119]:
# Variant 3 again, now with drop_out = 0.9.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 3,
    drop_out = 0.9,
)

embedding dataset ...
training starts ...
0.30
0.10
0.20
0.30
0.30
0.40
0.20
0.20
0.30
0.20
0.50
0.50
0.30
0.60
0.70
0.70
0.70
0.70
0.70
0.60
Accuracy: 0.380252


In [123]:
# Variant 3 with drop_out = 0.9 plus an L2 penalty weight of 0.5.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 3,
    beta = 0.5,
    drop_out = 0.9,
)

embedding dataset ...
training starts ...
0.00
0.60
0.70
0.80
0.90
0.90
1.00
1.00
1.00
1.00
1.00
0.70
1.00
0.90
1.00
1.00
1.00
0.90
1.00
0.90
Accuracy: 0.32563


In [144]:
# Variant 3 with drop_out = 0.5.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 3,
    drop_out = 0.5,
)

embedding dataset ...
training starts ...
0.20
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
0.10
Accuracy: 0.157563


In [108]:
# Variant 2 with the 3-bucket labels from one_hot_three.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 2,
    out_layer = 3,
)

embedding dataset ...
training starts ...
0.00
0.90
0.90
0.90
0.90
0.90
0.90
0.10
0.90
1.00
0.90
0.90
0.90
0.90
0.10
0.90
0.90
0.10
0.90
0.90
Accuracy: 0.668067


In [110]:
# Variant 2, 3-bucket labels, drop_out = 0.9.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.9,
    out_layer = 3,
)

embedding dataset ...
training starts ...
0.10
0.60
0.20
0.60
0.60
0.60
0.60
0.20
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
Accuracy: 0.684874


In [112]:
# Same configuration as the previous run, trained for 500000 steps.
train(
    data = case_review1,
    training_epoch = 500000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.9,
    out_layer = 3,
)

embedding dataset ...
training starts ...
0.30
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.70
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.40
0.30
0.30
0.60
0.30
0.60
0.60
0.60
0.30
0.60
0.30
0.60
0.30
0.60
0.60
0.60
0.60
0.60
0.60
0.60
0.30
0.30
0.60
0.60
0.60
0.40
Accuracy: 0.665966


In [111]:
# Variant 2, 3-bucket labels, drop_out = 0.8.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.8,
    out_layer = 3,
)

embedding dataset ...
training starts ...
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
0.70
Accuracy: 0.64916


In [18]:
# Variant 2, 3-bucket labels, drop_out = 0.8, wider hidden layers.
train(
    data = case_review1,
    training_epoch = 200000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.8,
    hidden_layer = [120, 100],
    out_layer = 3,
)

embedding dataset ...
training starts ...
0.1000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.7000
0.4000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
0.8000
Accuracy: 0.668067


## Compare Embedding Performance

In [88]:
# Time embedding variant 0 (augmented_embed_text).
# NOTE(review): d_train is not defined in any cell visible in this notebook -- confirm where it comes from.
%time gen_embedding(0, d_train, embedder)

CPU times: user 22.7 s, sys: 87 ms, total: 22.8 s
Wall time: 22.9 s


array([[ 0.00800386,  0.00751197,  0.00751197, ...,  0.00751197,
         0.00751197,  0.00751197],
       [ 0.0074958 ,  0.0074958 ,  0.0074958 , ...,  0.0074958 ,
         0.0074958 ,  0.0074958 ],
       [ 0.00749794,  0.00749794,  0.00749794, ...,  0.00749794,
         0.00864102,  0.00749794],
       ..., 
       [ 0.00755739,  0.00755739,  0.00755739, ...,  0.00755739,
         0.00755739,  0.00755739],
       [ 0.00755062,  0.00755062,  0.00755062, ...,  0.00755062,
         0.00755062,  0.00755062],
       [ 0.00746532,  0.00746532,  0.00746532, ...,  0.00746532,
         0.00746532,  0.00746532]])

In [80]:
# Time embedding variant 1 (user_tfidf_embed).
# NOTE(review): d_train is not defined in any cell visible in this notebook -- confirm where it comes from.
%time gen_embedding(1, d_train, embedder)

CPU times: user 22.9 s, sys: 78.9 ms, total: 23 s
Wall time: 23 s


array([[ 0.04904512,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.01719956,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.02732329, ...,  0.        ,
         0.        ,  0.02998012],
       [ 0.        ,  0.        ,  0.02121954, ...,  0.02121954,
         0.02121954,  0.        ],
       [ 0.        ,  0.        ,  0.01717248, ...,  0.02694746,
         0.        ,  0.        ]])

In [81]:
# Time embedding variant 2 (user_tf_business_idf).
# NOTE(review): d_train is not defined in any cell visible in this notebook -- confirm where it comes from.
%time gen_embedding(2, d_train, embedder)

CPU times: user 23 s, sys: 90.8 ms, total: 23.1 s
Wall time: 23.2 s


array([[ 0.00847174,  0.00901358,  0.00538159, ...,  0.00750334,
         0.00979433,  0.00845681],
       [ 0.00799792,  0.00891608,  0.00532337, ...,  0.00742217,
         0.00968838,  0.00836532],
       [ 0.00796717,  0.0088818 ,  0.00530291, ...,  0.00739364,
         0.00965114,  0.00833316],
       ..., 
       [ 0.00790171,  0.00880883,  0.00525934, ...,  0.00733289,
         0.00957184,  0.00914302],
       [ 0.00805194,  0.00897631,  0.00535933, ...,  0.00747231,
         0.00975383,  0.00842183],
       [ 0.00796804,  0.00888277,  0.00530348, ...,  0.00832823,
         0.00965219,  0.00833407]])

In [82]:
# Time embedding variant 3 (user_tfidf_business_idf).
# NOTE(review): d_train is not defined in any cell visible in this notebook -- confirm where it comes from.
%time gen_embedding(3, d_train, embedder)

CPU times: user 22.9 s, sys: 86.3 ms, total: 23 s
Wall time: 23.1 s


array([[ 0.06876797,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.01900364,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.02111689, ...,  0.        ,
         0.        ,  0.03653553],
       [ 0.        ,  0.        ,  0.01671909, ...,  0.02331078,
         0.03042826,  0.        ],
       [ 0.        ,  0.        ,  0.01375876, ...,  0.0279899 ,
         0.        ,  0.        ]])

## Try with Softmax Regression

In [20]:
def output_layer_sm(input_data, size_in, size_out, name):
    '''
    Output layer that squashes its result into the range of a probability.

    input_data : tensor multiplied on the right by the weight matrix
    size_in    : number of rows of the weight matrix
    size_out   : number of columns of the weight matrix / length of the bias
    name       : name scope under which the variables are created

    Returns (activation, W) where W is the weight variable.

    For size_out > 1 the activation is softmax(atan(input_data . W + B)).
    For size_out == 1 it is sigmoid(input_data . W + B): a softmax over a
    single unit is identically 1.0, which would make the output constant and
    leave the layer without any gradient. atan is skipped in that case
    because sigmoid(atan(z)) cannot reach 1.0, the scaled 5-star target
    produced by one_hot_soft_max.
    '''
    with tf.name_scope(name):
        # weight as random normal variables
        w = tf.Variable(tf.random_normal([size_in, size_out]), name = 'W')
        # bias as random normal variables
        b = tf.Variable(tf.random_normal([size_out]), name = 'B')

        logits = tf.matmul(input_data, w) + b
        if size_out == 1:
            activation = tf.sigmoid(logits, name = 'sigmoid')
        else:
            # the op name belongs on the softmax, not on the inner atan
            activation = tf.nn.softmax(tf.atan(logits), name = 'soft_max')

        return activation, w

In [21]:
def one_hot_soft_max(stars):
    '''
    Turn each star rating into a one-element target holding stars / 5.

    Despite the name, nothing here is one-hot: the result is a numpy array
    with one single-value row per rating.
    '''
    return np.array([np.array([1.0 * star / 5.0]) for star in stars])

In [None]:
# Variant 2 with the single-output head (out_layer = 1) and drop_out = 0.9.
train(
    data = case_review1,
    training_epoch = 100000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.9,
    out_layer = 1,
)

embedding dataset ...


In [24]:
# Variant 2, 3-bucket labels, drop_out = 0.8, wider hidden layers, 50000 steps.
train(
    data = case_review1,
    training_epoch = 50000,
    embedder = embedder,
    enum = 2,
    drop_out = 0.8,
    hidden_layer = [120, 100],
    out_layer = 3,
)

embedding dataset ...
training starts ...
1.9600
1.9600
1.9600
1.9600
1.9600


KeyboardInterrupt: 