# Finalizing text embeddings

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
import nltk
import pickle
import gensim
from itertools import chain
from gensim_lda_model import Gensimembedder

In [2]:
# Load the review and business tables from their CSV exports
reviews = pd.read_csv('chinese_reviews_clean.csv')
business = pd.read_csv('chinese_business_clean.csv')

In [3]:
# Restore the saved gensim dictionary and LDA topic model from disk
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')
lda = models.LdaModel.load('gensim/lda.model')

In [4]:
# Wrap the LDA model and dictionary in the project's Gensimembedder helper.
# NOTE: from here on `model` is this wrapper, not the raw LdaModel held in `lda`.
model = Gensimembedder(model = lda, dictionary = dictionary)

In [64]:
def tokenize(text):
    """Tokenize ``text`` with gensim's ``simple_preprocess`` and drop stopwords.

    Returns the tokens that are not members of gensim's ``STOPWORDS``,
    in the order ``simple_preprocess`` produced them.
    """
    kept = []
    for word in simple_preprocess(text):
        if word not in STOPWORDS:
            kept.append(word)
    return kept

def embed(text, model, dictionary):
    """Return the topic distribution of ``text`` as a dense vector.

    text       -- raw string; it is tokenized with ``tokenize``.
    model      -- object exposing ``get_document_topics`` and ``num_topics``.
    dictionary -- object exposing ``doc2bow`` for the token list.

    The result is a numpy array with ``model.num_topics`` entries; entry ``i``
    is the probability reported for topic ``i``, and topics the model does not
    report stay at 0.
    """
    tokens = tokenize(text)
    topic_probs = model.get_document_topics(
        dictionary.doc2bow(tokens), minimum_probability = 0.0
    )
    # Start from an all-zero list and scatter the (topic_id, probability) pairs into it.
    dense = [0] * model.num_topics
    for topic_id, prob in topic_probs:
        dense[topic_id] = prob
    return np.array(dense)

def embed_sent(text, model, dictionary, sent_length = False):
    """Average the per-sentence topic embeddings of ``text``.

    text        -- raw review text; split into sentences with ``nltk.sent_tokenize``.
    model       -- LDA model used by ``embed``. Callers in this notebook also pass
                   the ``Gensimembedder`` wrapper here; when the argument does not
                   expose ``get_document_topics`` the notebook-level ``lda`` is
                   used instead (this is what the function always did before).
    dictionary  -- gensim dictionary forwarded to ``embed``.
    sent_length -- when True, return ``(mean_embedding, number_of_sentences)``.

    Returns the mean of ``embed(sentence)`` over all sentences. Text with no
    sentences yields an all-zero vector instead of a 0/0 = NaN vector.
    """
    # Honour the `model` argument when it really is an LDA-like model,
    # otherwise keep the historical behaviour of using the global `lda`.
    topic_model = model if hasattr(model, 'get_document_topics') else lda

    # Tokenize once; the list gives both the sentences and their count.
    sentence_list = nltk.sent_tokenize(text)
    sentences = len(sentence_list)

    # Size the accumulator from the model rather than a hard-coded 128.
    out = np.zeros(topic_model.num_topics)
    for sentence in sentence_list:
        out += embed(sentence, topic_model, dictionary)

    # Guard the empty case: dividing by zero sentences would produce NaN.
    mean = out / sentences if sentences else out
    if sent_length:
        return mean, sentences
    return mean

In [6]:
# Three users with the most reviews (rows counted per user_id, ranked by review_id count)
(
    reviews
    .groupby('user_id')
    .count()
    .reset_index()
    .sort_values(by = 'review_id', ascending = False)
    .head(3)
)

Unnamed: 0,user_id,business_id,cool,date,funny,review_id,stars,text,useful
24136,CxDOIDnH8gp9KXzpBHJYXw,528,528,528,528,528,528,528,528
27171,EiP1OFgs-XGcKZux0OKWIA,189,189,189,189,189,189,189,189
6341,2e5V6M4GNufEnbGJpVdCjw,142,142,142,142,142,142,142,142


In [7]:
# All reviews written by the user with the highest review count in the table above
user1 = reviews.loc[reviews['user_id'] == 'CxDOIDnH8gp9KXzpBHJYXw']

In [8]:
# Show the raw text of the first review in `user1`; it is the sample input for the embeddings below.
user1['text'].values[0]

'Vince Seafood Restaurant & BBQ has one of the most value in weekday dim sum offering around on a weekday morning! Before 11 am, it is 10% off $2.28  S/M/L size and free tea.\n\nNot surprisingly that it was packed with seniors that ordering a lot food to take out for lunch and dinner later to enjoy.\n\nDim Sum menu is quite interesting with a mix of classical and chiu chow style dim sum. And at this price point, my Yelp friend ordered quite a few and more from the menu.\n \nShrimp Dumpling "Har Gow" - A-OK\nPork Dumpling with Liver on Top "Siu Mai" - Meh, not crazy about overcooked liver\nPork Rib with Black Bean and Olive Sauce - good portion \nBean Curd Skin Roll with Mix Chinese Vegetable - lacking filling\nDumpling in Lotus Leaf - love this! great contrast of taste and texture! \nBeef Tender in Spicy Sauce - Tender enough but really lacking any spice, more on the sweet side\nCrystal Dumplings in Fish Papaya Soup - huge dumplings!!! shrimp has great bite!\nRoast Pork Belly Roll - No

### 1. get_document_topics function in gensim

In [9]:
# Embed the whole sample review at once with the Gensimembedder wrapper's `embed` method.
model.embed(user1['text'].values[0])

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.13152346,  0.        ,  0.  

### 2. sentence tokenize before embed

$V_{t} = \frac{1}{|S|}\sum_{s \in S} embed_{t}(s)$, where $S$ is the set of sentences in the review.

In [65]:
# Embed the same review through the wrapper's `embed_sent` method (see the formula above).
model.embed_sent(user1['text'].values[0])

array([ 0.        ,  0.        ,  0.05784255,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.05784255,  0.        ,
        0.        ,  0.        ,  0.00640918,  0.05580751,  0.        ,
        0.        ,  0.        ,  0.00899581,  0.        ,  0.        ,
        0.00903655,  0.08504895,  0.01107164,  0.        ,  0.        ,
        0.        ,  0.        ,  0.04915021,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.06165865,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.06420272,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.04510536,  0.00772431,  0.  

### 3. Augmented frequency to normalize for sentence length

$V_{t}(doc) = \alpha + \alpha \cdot \frac{embed_{t}}{\max_{t' \in d} embed_{t'}}$, after which the vector is rescaled so that its entries sum to 1.

In [11]:
def augmented_embed_sent(text, model, dictionary, alpha = 0.5):
    """Augmented-frequency topic embedding of ``text``, normalized to sum to 1.

    The per-sentence embeddings are summed, each entry is mapped to
    ``alpha + alpha * value / max(value)``, and the result is divided by its sum.

    text       -- raw review text; split into sentences with ``nltk.sent_tokenize``.
    model      -- LDA model used by ``embed``. When the argument does not expose
                  ``get_document_topics`` (callers in this notebook pass the
                  ``Gensimembedder`` wrapper) the notebook-level ``lda`` is used,
                  which is what the function always did before.
    dictionary -- gensim dictionary forwarded to ``embed``.
    alpha      -- smoothing constant of the augmented-frequency formula.

    Text whose summed embedding is all zero (for example empty text) gives a
    uniform vector instead of NaN.
    """
    # Honour the `model` argument when it really is an LDA-like model,
    # otherwise keep the historical behaviour of using the global `lda`.
    topic_model = model if hasattr(model, 'get_document_topics') else lda

    # Size the accumulator from the model rather than a hard-coded 128.
    out = np.zeros(topic_model.num_topics)
    for sentence in nltk.sent_tokenize(text):
        out += embed(sentence, topic_model, dictionary)

    peak = out.max()
    if peak > 0:
        out = alpha + alpha * out / peak
    else:
        # Nothing to scale by: every topic gets the bare smoothing term (avoids 0/0).
        out = np.full_like(out, alpha)

    total = out.sum()
    # total is 0 only when alpha is 0; return the zero vector rather than NaN.
    return out / total if total else out

In [12]:
# Augmented-frequency embedding of the sample review with the default alpha of 0.5.
augmented_embed_sent(user1['text'].values[0], model, dictionary)

array([ 0.00750733,  0.00750733,  0.01011315,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.01011315,  0.00750733,
        0.00750733,  0.00750733,  0.00779539,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00791543,  0.00750733,  0.00750733,
        0.00791586,  0.01134502,  0.008001  ,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00972391,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.00750733,  0.00750733,
        0.01028687,  0.00750733,  0.00750733,  0.00750733,  0.00750733,
        0.00750733,  0.01117768,  0.00750733,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.00784335,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00750733,  0.00750733,  0.00750733,
        0.00750733,  0.00750733,  0.00954575,  0.00799417,  0.00

### 4. Personal tf-idf - most characteristic topic for particular user

$idf(t,D) = \log\frac{N}{|\{d \in D : t \in d\}|}$

In [66]:
def user_idf(df, model, dictionary, log = True, num_topics = 128):
    """Inverse document frequency of every topic over the reviews in ``df``.

    df         -- object whose ``'text'`` entry iterates over review strings.
    model      -- forwarded to ``embed_sent``.
    dictionary -- forwarded to ``embed_sent``.
    log        -- when True (default) return ``log(N / df_t)``, otherwise the raw ratio.
    num_topics -- length of the topic vector; defaults to the 128 that used to
                  be hard-coded.

    A topic counts as present in a review when its averaged weight is above
    that review's minimum weight. Returns a float array of length
    ``num_topics``; topics never present in any review get 0, and negative
    values are clamped to 0.

    NOTE(review): N is the total number of *sentences* while the denominator
    counts *reviews*; this is kept as in the original implementation.
    """
    doc_freq = np.zeros(num_topics)
    s_count = 0
    for d in df['text']:
        x, s = embed_sent(d, model, dictionary, sent_length = True)
        if s == 0:
            # No sentences means no topic evidence; the averaged vector may even be NaN.
            continue
        # 1 for every topic above the review's minimum weight, 0 for the minimum itself.
        doc_freq += np.ceil(x - np.min(x))
        s_count += s

    # Track unused topics with an explicit mask. The previous sentinel test compared
    # against log(s_count / 1e-07) even when log=False, so it never matched in that case.
    used = doc_freq > 0
    idf = np.zeros(num_topics)
    # Keep the 1e-07 smoothing term so values for used topics are unchanged.
    idf[used] = s_count / (doc_freq[used] + 1e-07)
    if log:
        idf[used] = np.log(idf[used])

    # A ratio just below 1 (topic present in every review, one sentence each)
    # gives a slightly negative log; clamp it to 0.
    idf[idf < 0.0] = 0.0
    return idf

In [57]:
# Per-topic idf computed over all reviews in `user1`.
user_idf(user1, model, dictionary)

array([ 3.09104245,  3.37872452,  1.04874046,  2.97325941,  1.5777484 ,
        2.2617631 ,  2.07944154,  2.8678989 ,  1.02207221,  2.74273576,
        3.27336401,  2.63151012,  2.04958858,  1.87464713,  2.19155884,
        1.70474809,  2.93689177,  2.44045489,  2.19155884,  2.37727598,
        1.19392247,  1.33462235,  2.31785256,  2.07944154,  2.04958858,
        3.871201  ,  2.2617631 ,  1.92529086,  2.93689177,  1.63436729,
        5.17048396,  2.80336038,  2.83510908,  3.70414692,  1.86237704,
        3.871201  ,  1.71521939,  2.03498978,  2.1419619 ,  2.74273576,
        1.38629436,  3.22457384,  2.93689177,  4.4773368 ,  2.58021683,
        2.41894868,  1.26514998,  1.22567117,  3.37872452,  3.09104245,
        1.64412347,  1.85025567,  0.52930337,  1.41706602,  1.96503119,
        3.56104608,  2.65817837,  2.53142666,  2.83510908,  2.97325941,
        3.01099974,  1.79175947,  1.63436729,  3.27336401,  2.68557734,
        1.75823678,  2.33727065,  2.1419619 ,  1.59626745,  2.07

In [15]:
def user_tfidf_embed(text, df, model, dictionary, alpha = 0.5):
    """Personal tf-idf embedding of ``text``, normalized to sum to 1.

    text       -- review text to embed (term-frequency part).
    df         -- the user's reviews, used to compute the idf with ``user_idf``.
    model      -- forwarded to ``augmented_embed_sent`` and ``user_idf``.
    dictionary -- forwarded to ``augmented_embed_sent`` and ``user_idf``.
    alpha      -- smoothing constant for ``augmented_embed_sent``.

    Returns ``tf * idf`` divided by its sum. When every product is 0 (an
    all-zero idf vector) the zero vector is returned instead of NaN.

    Note that the idf is recomputed from ``df`` on every call.
    """
    # alpha used to be accepted but never forwarded, so it had no effect.
    tf = augmented_embed_sent(text, model, dictionary, alpha = alpha)
    idf = user_idf(df, model, dictionary)
    out = np.multiply(tf, idf)
    total = out.sum()
    if total == 0:
        # Nothing to normalize; avoid 0/0.
        return out
    return out / total

In [16]:
# Personal tf-idf embedding of the sample review, with idf taken from all of user1's reviews.
user_tfidf_embed(user1['text'].values[0], user1, model, dictionary)

array([ 0.00866581,  0.01084059,  0.00442297,  0.00846665,  0.00460409,
        0.00686505,  0.00616505,  0.00827994,  0.00444582,  0.00793827,
        0.00935805,  0.00856457,  0.00628035,  0.00741363,  0.00639666,
        0.00493936,  0.00899192,  0.00797763,  0.00659583,  0.00692194,
        0.00404757,  0.00666485,  0.00761704,  0.00709939,  0.00639666,
        0.01130094,  0.00670062,  0.00744276,  0.00837183,  0.00493936,
        0.01458179,  0.00785863,  0.00910905,  0.01337093,  0.00567303,
        0.0110619 ,  0.00525072,  0.0061208 ,  0.00630185,  0.00846665,
        0.00622713,  0.00992861,  0.00899192,  0.01184542,  0.00763171,
        0.00692194,  0.0060888 ,  0.0035842 ,  0.0094908 ,  0.0094908 ,
        0.00627243,  0.00548976,  0.00151422,  0.00409506,  0.00538496,
        0.01044182,  0.00770545,  0.00735371,  0.00899192,  0.00923096,
        0.00877061,  0.00521808,  0.00506004,  0.0094908 ,  0.00819079,
        0.00535083,  0.00709939,  0.00893716,  0.00529875,  0.00

### 5. Personal tf * Business idf

In [92]:
def business_idf(df, model, dictionary, log = True):
    """Inverse document frequency of every topic over one business's reviews.

    df         -- reviews of the business; its ``'text'`` entry is iterated.
    model      -- forwarded to ``user_idf``.
    dictionary -- forwarded to ``user_idf``.
    log        -- when True (default) return the log of the ratio.

    The computation is exactly the one ``user_idf`` performs, only applied to
    a business's reviews, so it is delegated instead of duplicated.
    """
    return user_idf(df, model, dictionary, log = log)

In [93]:
# Reviews of a single example business, selected by its business_id
business1 = reviews.loc[reviews['business_id'] == 'v95ot_TNwTk1iJ5n56dR0g']

In [94]:
def user_tf_business_idf(text, df, model, dictionary, alpha = 0.5):
    """Embedding of ``text`` weighted by a business's idf, normalized to sum to 1.

    text       -- review text to embed (term-frequency part).
    df         -- reviews of the business, used to compute ``business_idf``.
    model      -- forwarded to ``augmented_embed_sent`` and ``business_idf``.
    dictionary -- forwarded to ``augmented_embed_sent`` and ``business_idf``.
    alpha      -- smoothing constant for ``augmented_embed_sent``; the default
                  0.5 matches that function's own default.

    Returns ``tf * idf`` divided by its sum. When every product is 0 (an
    all-zero idf vector) the zero vector is returned instead of NaN.
    """
    tf = augmented_embed_sent(text, model, dictionary, alpha = alpha)
    idf = business_idf(df, model, dictionary)
    out = np.multiply(tf, idf)
    total = out.sum()
    if total == 0:
        # Nothing to normalize; avoid 0/0.
        return out
    return out / total

In [95]:
# Sample review's tf weighted by the idf computed from business1's reviews.
user_tf_business_idf(user1['text'].values[0], business1, model, dictionary)

array([ 0.01021194,  0.01278958,  0.01100246,  0.        ,  0.00858564,
        0.01278958,  0.00953696,  0.        ,  0.01185225,  0.        ,
        0.01278958,  0.01116327,  0.00863401,  0.00858564,  0.01278958,
        0.00901341,  0.01021194,  0.01011446,  0.01116327,  0.01278958,
        0.01367903,  0.01088368,  0.00892194,  0.01021194,  0.        ,
        0.        ,  0.        ,  0.01682207,  0.01116327,  0.00953696,
        0.        ,  0.01021194,  0.01278958,  0.        ,  0.00858564,
        0.01278958,  0.00901341,  0.        ,  0.01116327,  0.00953696,
        0.01325511,  0.01116327,  0.01116327,  0.01278958,  0.01021194,
        0.01278958,  0.01290797,  0.0073871 ,  0.        ,  0.        ,
        0.01116327,  0.        ,  0.00901341,  0.00822396,  0.00901341,
        0.        ,  0.01116327,  0.01278958,  0.01278958,  0.01116327,
        0.        ,  0.00953696,  0.01021194,  0.        ,  0.        ,
        0.        ,  0.01278958,  0.01372438,  0.01019627,  0.01

### 6. Personal tf-idf * Business idf

In [96]:
def user_tfidf_business_idf(text, udf, bdf, model, dictionary, alpha = 0.5):
    """Embedding of ``text`` weighted by both the user's and the business's idf.

    text       -- review text to embed (term-frequency part).
    udf        -- the user's reviews, used to compute ``user_idf``.
    bdf        -- the business's reviews, used to compute ``business_idf``.
    model      -- forwarded to the tf and idf helpers.
    dictionary -- forwarded to the tf and idf helpers.
    alpha      -- smoothing constant for ``augmented_embed_sent``; the default
                  0.5 matches that function's own default.

    Returns ``tf * user_idf * business_idf`` divided by its sum. When every
    product is 0 the zero vector is returned instead of NaN.
    """
    tf = augmented_embed_sent(text, model, dictionary, alpha = alpha)
    uidf = user_idf(udf, model, dictionary)
    idf = business_idf(bdf, model, dictionary)
    out = np.multiply(np.multiply(tf, uidf), idf)
    total = out.sum()
    if total == 0:
        # Nothing to normalize; avoid 0/0.
        return out
    return out / total

In [98]:
# Sample review's tf weighted by both user1's idf and business1's idf.
user_tfidf_business_idf(user1['text'].values[0], user1, business1, model, dictionary)

array([ 0.01073831,  0.        ,  0.00777841,  0.01284868,  0.00666943,
        0.01206795,  0.00874771,  0.01335381,  0.00781532,  0.        ,
        0.01410924,  0.01277353,  0.00759198,  0.00691124,  0.00986501,
        0.00700173,  0.01375743,  0.0103883 ,  0.01138609,  0.        ,
        0.00953139,  0.00780205,  0.00804533,  0.00997581,  0.01118133,
        0.        ,  0.        ,  0.01296637,  0.01326227,  0.00662182,
        0.        ,  0.01019913,  0.0136503 ,  0.        ,  0.00674966,
        0.01585406,  0.00809205,  0.01081587,  0.00975952,  0.01032098,
        0.00964127,  0.01191454,  0.01142427,  0.        ,  0.01063329,
        0.01180273,  0.00956448,  0.00545583,  0.01173868,  0.01354754,
        0.00791371,  0.01074874,  0.00531086,  0.00512413,  0.00905983,
        0.        ,  0.01014087,  0.01096348,  0.01277353,  0.01326227,
        0.        ,  0.00824259,  0.00880105,  0.        ,  0.        ,
        0.        ,  0.01185358,  0.01239577,  0.00794183,  0.00

### Precompute idf vectors for future use

In [69]:
# Compute one idf vector per user; collect the users whose vector sums to 0 after ceil
uidf_data, zero_user = {}, set()
for u, df in reviews.groupby('user_id'):
    uidf_data[u] = user_idf(df, model, dictionary)
    if np.ceil(uidf_data[u]).sum() == 0.0:
        zero_user.add(u)

In [110]:
# Compute one idf vector per business (`u` holds a business_id here);
# collect the businesses whose vector sums to 0 after ceil
bidf_data, zero_business = {}, set()
for u, df in reviews.groupby('business_id'):
    bidf_data[u] = business_idf(df, model, dictionary)
    if np.ceil(bidf_data[u]).sum() == 0.0:
        zero_business.add(u)

In [86]:
# Write the per-user idf dictionary (`uidf_data`) to u_idf.pickle.
import pickle
with open('u_idf.pickle', 'wb') as f:
    pickle.dump(uidf_data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [99]:
# Write the set of users flagged in `zero_user` to zero_user.pickle.
with open('zero_user.pickle', 'wb') as f:
    pickle.dump(zero_user, f, protocol=pickle.HIGHEST_PROTOCOL)

In [112]:
# Write the per-business idf dictionary (`bidf_data`) to b_idf.pickle.
# NOTE(review): `zero_business` is not written out in the cells visible here — confirm whether that is intended.
with open('b_idf.pickle', 'wb') as f:
    pickle.dump(bidf_data, f, protocol=pickle.HIGHEST_PROTOCOL)