# Finalizing text embeddings

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim import corpora, models, similarities
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
import nltk
import pickle
import gensim
from itertools import chain
from gensim_lda_model import Gensimembedder

In [3]:
# Load the business and review tables from the *_clean.csv files
# (paths are relative to the notebook's working directory).
business = pd.read_csv('chinese_business_clean.csv')
reviews = pd.read_csv('chinese_reviews_clean.csv')

In [4]:
# Load the saved LDA model and the gensim dictionary from the gensim/ directory.
# NOTE(review): 'chinsese_dict.dict' looks like a misspelling of "chinese"; the path has to
# match the file on disk, so it is left untouched here — confirm before renaming.
lda =  models.LdaModel.load('gensim/lda.model')
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')

In [6]:
# Wrap the loaded LDA model and dictionary in the project's Gensimembedder helper.
# Note that `model` is this wrapper object, while `lda` remains the raw gensim LdaModel.
model = Gensimembedder(model = lda, dictionary = dictionary)

In [51]:
def tokenize(text):
    """Return the gensim ``simple_preprocess`` tokens of ``text`` that are not in ``STOPWORDS``.

    Token order is preserved; repeated tokens are kept.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

def embed(text, model, dictionary):
    """Return the topic weights of ``text`` as an array with one entry per topic.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with ``tokenize`` first.
    model
        Object exposing ``get_document_topics`` and ``num_topics`` (the loaded LDA model).
    dictionary
        Object exposing ``doc2bow`` to turn the tokens into a bag-of-words.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.num_topics``; position ``i`` holds the weight the model
        reports for topic ``i``, and topics the model does not report stay at 0.
    """
    bag_of_words = dictionary.doc2bow(tokenize(text))
    weights = [0] * model.num_topics
    # minimum_probability = 0.0 asks the model not to filter out low-weight topics.
    topic_weights = model.get_document_topics(bag_of_words, minimum_probability = 0.0)
    for topic_id, weight in topic_weights:
        weights[topic_id] = weight
    return np.array(weights)

def embed_sent(text, model, dictionary):
    """Return the mean of the per-sentence topic embeddings of ``text``.

    The text is split into sentences with ``nltk.sent_tokenize``; each sentence is embedded
    with ``embed`` and the vectors are averaged.

    Parameters
    ----------
    text : str
        Raw review text.
    model
        LDA model to embed with. If the object has no ``get_document_topics`` method (the
        notebook passes its ``Gensimembedder`` wrapper here), the module-level ``lda`` is
        used instead, which is what this function always did before.
    dictionary
        Dictionary passed through to ``embed``.

    Returns
    -------
    numpy.ndarray
        Vector with one entry per topic. A text with no sentences yields an all-zero vector
        instead of the NaNs produced by dividing by a sentence count of zero.
    """
    # Honour the `model` argument when it really is a topic model; otherwise keep the
    # previous behaviour of embedding with the global `lda`.
    topic_model = model if hasattr(model, 'get_document_topics') else lda
    sentences = nltk.sent_tokenize(text)  # tokenize once and reuse for both count and loop
    total = np.zeros(topic_model.num_topics)
    if not sentences:
        return total
    for sentence in sentences:
        total += embed(sentence, topic_model, dictionary)
    return total / len(sentences)

In [87]:
# Top users: the three user_ids with the highest review_id counts.
(
    reviews
    .groupby('user_id')
    .count()
    .reset_index()
    .sort_values(by = 'review_id', ascending = False)
    .head(3)
)

Unnamed: 0,user_id,business_id,cool,date,funny,review_id,stars,text,useful
24136,CxDOIDnH8gp9KXzpBHJYXw,528,528,528,528,528,528,528,528
27171,EiP1OFgs-XGcKZux0OKWIA,189,189,189,189,189,189,189,189
6341,2e5V6M4GNufEnbGJpVdCjw,142,142,142,142,142,142,142,142


In [81]:
# All reviews written by the user with the most reviews in the table above.
user1 = reviews.loc[reviews['user_id'] == 'CxDOIDnH8gp9KXzpBHJYXw']

In [85]:
# Text of this user's first review; it is the running example for every embedding below.
user1['text'].values[0]

'Vince Seafood Restaurant & BBQ has one of the most value in weekday dim sum offering around on a weekday morning! Before 11 am, it is 10% off $2.28  S/M/L size and free tea.\n\nNot surprisingly that it was packed with seniors that ordering a lot food to take out for lunch and dinner later to enjoy.\n\nDim Sum menu is quite interesting with a mix of classical and chiu chow style dim sum. And at this price point, my Yelp friend ordered quite a few and more from the menu.\n \nShrimp Dumpling "Har Gow" - A-OK\nPork Dumpling with Liver on Top "Siu Mai" - Meh, not crazy about overcooked liver\nPork Rib with Black Bean and Olive Sauce - good portion \nBean Curd Skin Roll with Mix Chinese Vegetable - lacking filling\nDumpling in Lotus Leaf - love this! great contrast of taste and texture! \nBeef Tender in Spicy Sauce - Tender enough but really lacking any spice, more on the sweet side\nCrystal Dumplings in Fish Papaya Soup - huge dumplings!!! shrimp has great bite!\nRoast Pork Belly Roll - No

### 1. get_document_topics function in gensim

In [83]:
# Embed the full review text in a single call through the Gensimembedder wrapper.
model.embed(user1['text'].values[0])

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.13109937,  0.        ,  0.  

### 2. sentence tokenize before embed

$V_{t} = \frac{1}{|S|}\sum_{s \in S} embed(s)$, where $S$ is the set of sentences in the review.

In [84]:
# Same review through the wrapper's embed_sent(); presumably the per-sentence average from
# the formula above — confirm against gensim_lda_model.
model.embed_sent(user1['text'].values[0])

array([ 0.        ,  0.        ,  0.05784255,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.05784255,  0.        ,
        0.        ,  0.        ,  0.00660969,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.00909441,  0.08505146,  0.01112667,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0492811 ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.06708648,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.08927966,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.04504784,  0.00682388,  0.  

### 3. Augmented Frequency to normalize sentence length

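$V_{t}(d) = \alpha + (1 - \alpha) \cdot \frac{embed_{t}(d)}{\max_{t'} embed_{t'}(d)}$ with $\alpha = 0.5$, where $embed_{t}(d)$ is the weight of topic $t$ summed over the sentences of document $d$; the resulting vector is then rescaled to sum to 1.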

In [68]:
def augmented_embed_sent(text, model, dictionary, alpha = 0.5):
    """Return the augmented-frequency topic vector of ``text``, normalised to sum to 1.

    Sentence embeddings (from ``embed``) are summed, divided by their largest entry and
    smoothed as ``alpha + (1 - alpha) * ratio``, then rescaled so the entries sum to 1.

    Parameters
    ----------
    text : str
        Raw review text; split into sentences with ``nltk.sent_tokenize``.
    model
        LDA model to embed with. If the object has no ``get_document_topics`` method (the
        notebook passes its ``Gensimembedder`` wrapper here), the module-level ``lda`` is
        used instead, which is what this function always did before.
    dictionary
        Dictionary passed through to ``embed``.
    alpha : float, default 0.5
        Smoothing floor of the augmented frequency. The earlier form ``alpha + alpha * ratio``
        made this parameter a no-op, because the final sum-normalisation cancels a common
        factor; both forms give the same result at the default of 0.5.

    Returns
    -------
    numpy.ndarray
        Vector with one entry per topic. If the text has no sentences the ratio is treated as
        0 everywhere (a uniform vector for ``alpha > 0``) rather than dividing by zero.
    """
    topic_model = model if hasattr(model, 'get_document_topics') else lda
    total = np.zeros(topic_model.num_topics)
    for sentence in nltk.sent_tokenize(text):
        total += embed(sentence, topic_model, dictionary)

    peak = total.max()
    # An all-zero sum (no sentences / no tokens) would otherwise give 0/0 = NaN.
    ratio = total / peak if peak > 0 else np.zeros_like(total)
    scaled = alpha + (1.0 - alpha) * ratio

    norm = scaled.sum()
    if norm == 0:
        # Only reachable with alpha == 0 and an all-zero ratio; nothing to normalise.
        return scaled
    return scaled / norm

In [90]:
# Pass the raw LDA model: embed() calls get_document_topics / num_topics on its `model`
# argument, and `model` in this notebook is the Gensimembedder wrapper, not the LDA model.
augmented_embed_sent(user1['text'].values[0], lda, dictionary)

array([ 0.00744331,  0.00744331,  0.0105958 ,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.0105958 ,  0.00744331,
        0.00744331,  0.00744331,  0.00779619,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00793982,  0.01208403,  0.00804577,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.01012955,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.01007682,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.01409856,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00744331,  0.00744331,  0.00744331,
        0.00744331,  0.00744331,  0.00990621,  0.0078049 ,  0.00

### 4. Personal tf-idf - most characteristic topic for particular user

$idf(t,D) = \log\frac{N}{|\{d \in D : t \in d\}|}$

In [169]:
def user_idf(df, model, dictionary):
    """Return per-topic inverse document frequencies over the texts in ``df``.

    Each text in ``df['text']`` is embedded with ``embed_sent``. A topic counts as present in
    a document when its weight is strictly above that document's smallest topic weight. The
    result for a topic is ``log(len(df) / number_of_documents_containing_it)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column; one row per document.
    model, dictionary
        Passed through unchanged to ``embed_sent``.

    Returns
    -------
    numpy.ndarray
        One value per topic; topics present in no document get 0.0.
    """
    n_docs = len(df)
    doc_freq = None  # sized from the first embedding instead of a hard-coded topic count
    for doc in df['text']:
        weights = np.asarray(embed_sent(doc, model, dictionary), dtype = float)
        if not np.isfinite(weights).all():
            # A non-finite embedding (e.g. from a text with no sentences) would turn every
            # count into NaN and zero out the whole result; such a document contains no topic.
            continue
        # Weights are averages of probabilities, so the difference is in [0, 1) and ceil()
        # gives a 0/1 "topic present" indicator.
        present = np.ceil(weights - weights.min())
        doc_freq = present if doc_freq is None else doc_freq + present

    if doc_freq is None:
        # No usable document, so nothing fixes the vector length; 128 is the length the
        # previous implementation hard-coded.
        return np.zeros(getattr(model, 'num_topics', 128))

    idf = np.zeros_like(doc_freq)
    seen = doc_freq > 0
    # Only divide where the topic occurs; unseen topics keep 0.0 without divide-by-zero warnings.
    idf[seen] = np.log(n_docs / doc_freq[seen])
    return idf

In [171]:
# Pass the raw LDA model: the embedding helpers expect an object with get_document_topics,
# and `model` in this notebook is the Gensimembedder wrapper, not the LDA model.
user_idf(user1, lda, dictionary)

array([ 3.01099975,  3.37872453,  1.02207221,  2.74273576,  1.58696506,
        2.28011224,  2.07944154,  2.8678989 ,  1.05961013,  2.80336038,
        3.22457385,  2.80336038,  1.96503119,  1.76928661,  2.17475172,
        1.6343673 ,  2.97325942,  2.44045489,  2.20865327,  2.39789527,
        1.16922986,  1.3564414 ,  2.33727065,  2.24374459,  2.04958858,
        3.78418963,  2.31785257,  1.87464713,  3.01099975,  1.67397643,
        4.88280192,  2.55552422,  3.01099975,  4.32318613,  1.87464713,
        4.07187171,  1.70474809,  2.02060104,  2.1419619 ,  2.74273576,
        1.39389896,  3.22457385,  2.77258872,  4.47733681,  2.58021683,
        2.35707328,  1.18149995,  1.27188401,  2.97325942,  2.97325942,
        1.56861592,  1.91238746,  0.51652364,  1.37125648,  1.81474899,
        3.49650756,  2.74273576,  2.55552422,  2.71374822,  3.09104245,
        2.90180045,  1.7258015 ,  1.7258015 ,  3.3246573 ,  2.71374822,
        1.78045991,  2.2617631 ,  2.31785257,  1.64412347,  2.04

In [172]:
def user_tfidf_embed(text, df, model, dictionary, alpha = 0.5):
    """Return the tf-idf topic vector of ``text`` relative to the documents in ``df``.

    The term-frequency part comes from ``augmented_embed_sent(text, ...)`` and the idf part
    from ``user_idf(df, ...)``; their element-wise product is rescaled to sum to 1.

    Parameters
    ----------
    text : str
        Review text to embed.
    df : pandas.DataFrame
        Documents (``text`` column) used to compute the idf weights.
    model, dictionary
        Passed through unchanged to the helpers.
    alpha : float, default 0.5
        Forwarded to ``augmented_embed_sent`` (it was previously accepted but ignored).

    Returns
    -------
    numpy.ndarray
        One value per topic. If every product is zero the all-zero vector is returned as is
        instead of dividing by zero.
    """
    tf = augmented_embed_sent(text, model, dictionary, alpha = alpha)
    idf = user_idf(df, model, dictionary)
    weighted = np.multiply(tf, idf)
    total = weighted.sum()
    if total == 0:
        # e.g. every idf weight is 0; 0/0 would give a NaN vector.
        return weighted
    return weighted / total

In [173]:
# Pass the raw LDA model: the embedding helpers expect an object with get_document_topics,
# and `model` in this notebook is the Gensimembedder wrapper, not the LDA model.
user_tfidf_embed(user1['text'].values[0], user1, lda, dictionary)

array([ 0.00914555,  0.00954063,  0.00439538,  0.00868946,  0.00453543,
        0.00691535,  0.00597841,  0.00879706,  0.00441809,  0.00794589,
        0.00999672,  0.00858564,  0.0064462 ,  0.00554657,  0.00629014,
        0.00495284,  0.00868946,  0.00754475,  0.00669194,  0.00735216,
        0.00412028,  0.00655292,  0.00735437,  0.00658626,  0.00624355,
        0.01279113,  0.00697394,  0.00780466,  0.00868946,  0.00489366,
        0.01444688,  0.00770907,  0.00902477,  0.01239605,  0.00554657,
        0.01279113,  0.00516981,  0.00593644,  0.00633748,  0.00829438,
        0.00612192,  0.00983675,  0.00829438,  0.01279113,  0.00778592,
        0.00669194,  0.00638858,  0.00374323,  0.00968499,  0.00940299,
        0.00486451,  0.00533514,  0.00151887,  0.004386  ,  0.00554657,
        0.01053616,  0.00735216,  0.00742018,  0.00829438,  0.00927147,
        0.00868946,  0.00530133,  0.00507487,  0.00983675,  0.00829438,
        0.00533514,  0.00697394,  0.00960942,  0.00521903,  0.00

### 5. Personal tf * Business idf

In [164]:
# All reviews of one example business, used for the business-level idf below.
business1 = reviews.loc[reviews['business_id'] == 'v95ot_TNwTk1iJ5n56dR0g']

In [174]:
def business_idf(df, model, dictionary):
    """Return per-topic inverse document frequencies over the texts in ``df``.

    Each text in ``df['text']`` is embedded with ``embed_sent``. A topic counts as present in
    a document when its weight is strictly above that document's smallest topic weight. The
    result for a topic is ``log(len(df) / number_of_documents_containing_it)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` column; one row per document (here: the reviews of a business).
    model, dictionary
        Passed through unchanged to ``embed_sent``.

    Returns
    -------
    numpy.ndarray
        One value per topic; topics present in no document get 0.0.
    """
    n_docs = len(df)
    doc_freq = None  # sized from the first embedding instead of a hard-coded topic count
    for doc in df['text']:
        weights = np.asarray(embed_sent(doc, model, dictionary), dtype = float)
        if not np.isfinite(weights).all():
            # A non-finite embedding (e.g. from a text with no sentences) would turn every
            # count into NaN and zero out the whole result; such a document contains no topic.
            continue
        # Weights are averages of probabilities, so the difference is in [0, 1) and ceil()
        # gives a 0/1 "topic present" indicator.
        present = np.ceil(weights - weights.min())
        doc_freq = present if doc_freq is None else doc_freq + present

    if doc_freq is None:
        # No usable document, so nothing fixes the vector length; 128 is the length the
        # previous implementation hard-coded.
        return np.zeros(getattr(model, 'num_topics', 128))

    idf = np.zeros_like(doc_freq)
    seen = doc_freq > 0
    # Only divide where the topic occurs; unseen topics keep 0.0 without divide-by-zero warnings.
    idf[seen] = np.log(n_docs / doc_freq[seen])
    return idf

In [178]:
def user_tf_business_idf(text, df, model, dictionary):
    """Return the topic vector of ``text`` weighted by a business-level idf.

    The term-frequency part comes from ``augmented_embed_sent(text, ...)`` and the idf part
    from ``business_idf(df, ...)``; their element-wise product is rescaled to sum to 1.

    NOTE(review): a later cell of this notebook defines another function with this same name
    and a different signature ``(text, udf, bdf, model, dictionary)``; once that cell has run,
    this definition is no longer reachable.

    Parameters
    ----------
    text : str
        Review text to embed.
    df : pandas.DataFrame
        Documents (``text`` column) used to compute the idf weights.
    model, dictionary
        Passed through unchanged to the helpers.

    Returns
    -------
    numpy.ndarray
        One value per topic. If every product is zero (for instance when ``df`` holds a single
        review, so every idf weight is 0) the all-zero vector is returned as is instead of
        the NaN vector that 0/0 would produce.
    """
    tf = augmented_embed_sent(text, model, dictionary)
    idf = business_idf(df, model, dictionary)
    weighted = np.multiply(tf, idf)
    total = weighted.sum()
    if total == 0:
        return weighted
    return weighted / total

In [179]:
# Pass the raw LDA model: the embedding helpers expect an object with get_document_topics,
# and `model` in this notebook is the Gensimembedder wrapper, not the LDA model.
user_tf_business_idf(user1['text'].values[0], business1, lda, dictionary)

  


array([ 0.00852935,  0.        ,  0.00655224,  0.01528912,  0.00949032,
        0.01190924,  0.00852935,  0.01528912,  0.00840161,  0.        ,
        0.        ,  0.01528912,  0.00781218,  0.00744127,  0.        ,
        0.00744127,  0.01528912,  0.00906416,  0.01528912,  0.        ,
        0.01634783,  0.00345941,  0.00629688,  0.01190924,  0.01528912,
        0.        ,  0.        ,  0.0164636 ,  0.01190924,  0.00655224,
        0.        ,  0.00993213,  0.01190924,  0.        ,  0.00580058,
        0.01528912,  0.00744127,  0.        ,  0.01528912,  0.00655224,
        0.01100044,  0.01190924,  0.01190924,  0.        ,  0.00993213,
        0.        ,  0.00617543,  0.00457514,  0.        ,  0.        ,
        0.01190924,  0.01528912,  0.00993213,  0.00655224,  0.00852935,
        0.        ,  0.00744127,  0.00852935,  0.01528912,  0.        ,
        0.        ,  0.00744127,  0.01190924,  0.        ,  0.        ,
        0.        ,  0.01528912,  0.01152132,  0.00927256,  0.00

### 6. Personal tf-idf * Business idf

In [181]:
def user_tf_business_idf(text, udf, bdf, model, dictionary):
    """Return the topic vector of ``text`` weighted by both a user idf and a business idf.

    The term-frequency part comes from ``augmented_embed_sent(text, ...)``; it is multiplied
    element-wise by ``user_idf(udf, ...)`` and ``business_idf(bdf, ...)`` and the product is
    rescaled to sum to 1.

    NOTE(review): this reuses the name of the function defined in the previous section with
    a different signature, so it replaces that definition once this cell has run.

    Parameters
    ----------
    text : str
        Review text to embed.
    udf : pandas.DataFrame
        Documents (``text`` column) used for the user-level idf.
    bdf : pandas.DataFrame
        Documents (``text`` column) used for the business-level idf.
    model, dictionary
        Passed through unchanged to the helpers.

    Returns
    -------
    numpy.ndarray
        One value per topic. If every product is zero the all-zero vector is returned as is
        instead of the NaN vector that 0/0 would produce.
    """
    tf = augmented_embed_sent(text, model, dictionary)
    user_weights = user_idf(udf, model, dictionary)
    business_weights = business_idf(bdf, model, dictionary)
    weighted = np.multiply(np.multiply(tf, user_weights), business_weights)
    total = weighted.sum()
    if total == 0:
        return weighted
    return weighted / total

In [182]:
# Pass the raw LDA model: the embedding helpers expect an object with get_document_topics,
# and `model` in this notebook is the Gensimembedder wrapper, not the LDA model.
user_tf_business_idf(user1['text'].values[0], user1, business1, lda, dictionary)

  


array([ 0.01077619,  0.        ,  0.00436463,  0.01956185,  0.00463277,
        0.01531342,  0.007272  ,  0.        ,  0.00352725,  0.        ,
        0.0209494 ,  0.01373315,  0.0060182 ,  0.0050527 ,  0.01100551,
        0.00542355,  0.0188524 ,  0.00857802,  0.0115387 ,  0.        ,
        0.00866978,  0.00179267,  0.00609366,  0.00897252,  0.        ,
        0.        ,  0.        ,  0.00864915,  0.02008187,  0.00522989,
        0.        ,  0.01170158,  0.        ,  0.        ,  0.00498869,
        0.02645412,  0.00549083,  0.01370962,  0.01100551,  0.00849183,
        0.00487731,  0.02159962,  0.01781899,  0.02721933,  0.01078547,
        0.        ,  0.00251529,  0.00191317,  0.        ,  0.        ,
        0.0067766 ,  0.        ,  0.00187207,  0.00275493,  0.0088424 ,
        0.        ,  0.00917483,  0.01331698,  0.01821286,  0.01956185,
        0.        ,  0.00614108,  0.00728366,  0.        ,  0.        ,
        0.        ,  0.01557863,  0.01052528,  0.00640685,  0.01