## Fix lda embedding with Gensim

In [132]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
import numpy as np

In [49]:
# load file 
business = pd.read_csv('chinese_business_clean.csv')
reviews = pd.read_csv('chinese_reviews_clean.csv')

### Tokenize and build dictionary

In [90]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

def tokenize_noun(text):
    tokenized = [token for token in simple_preprocess(text) if token not in STOPWORDS]
    return [token for (token,flag) in nltk.pos_tag(tokenized) if flag[0] == 'N']

def tokenize_adj(text):
    tokenized = [token for token in simple_preprocess(text) if token not in STOPWORDS]
    return [token for (token,flag) in nltk.pos_tag(tokenized) if flag[0] == 'J']

def tokenize_noun_adj(text):
    tokenized = [token for token in simple_preprocess(text) if token not in STOPWORDS]
    return [token for (token,flag) in nltk.pos_tag(tokenized) if flag[0] in set(['J', 'N'])] 

In [53]:
%time texts = [tokenize(text) for text in reviews['text']]

CPU times: user 31.7 s, sys: 476 ms, total: 32.1 s
Wall time: 32.3 s


In [55]:
%time dictionary = gensim.corpora.Dictionary(texts)

CPU times: user 12.4 s, sys: 36.3 ms, total: 12.4 s
Wall time: 12.5 s


In [94]:
print(dictionary)

Dictionary(83678 unique tokens: ['place', 'horrible', 'excited', 'try', 'got']...)


In [103]:
# save vocabulary
dictionary.save('gensim/chinsese_dict.dict')

### Build count vectorizer

In [104]:
dictionary.doc2bow(texts[0])

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 2),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1)]

In [105]:
% time corpus = [dictionary.doc2bow(text) for text in texts]

CPU times: user 11.2 s, sys: 363 ms, total: 11.5 s
Wall time: 11.7 s


In [113]:
corpora.MmCorpus.serialize('gensim/chinsese_count.mm', corpus)

#### Making sure you can load

In [116]:
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')
print(dictionary)

Dictionary(83678 unique tokens: ['place', 'horrible', 'excited', 'try', 'got']...)


In [119]:
corpus = corpora.MmCorpus('gensim/chinsese_count.mm')
print (corpus[10])

[(9, 1.0), (17, 4.0), (27, 1.0), (34, 1.0), (40, 1.0), (41, 1.0), (45, 1.0), (49, 3.0), (57, 1.0), (70, 1.0), (137, 2.0), (139, 1.0), (154, 1.0), (155, 1.0), (158, 2.0), (171, 2.0), (172, 2.0), (177, 1.0), (179, 1.0), (184, 1.0), (224, 1.0), (252, 1.0), (254, 1.0), (280, 1.0), (281, 1.0), (282, 1.0), (283, 3.0), (284, 1.0), (285, 1.0), (286, 2.0), (287, 1.0), (288, 1.0), (289, 1.0), (290, 1.0), (291, 1.0), (292, 1.0), (293, 1.0), (294, 1.0), (295, 1.0), (296, 1.0), (297, 1.0), (298, 1.0), (299, 1.0), (300, 1.0), (301, 1.0), (302, 1.0), (303, 1.0), (304, 1.0), (305, 1.0), (306, 1.0), (307, 1.0), (308, 1.0), (309, 1.0), (310, 1.0), (311, 1.0), (312, 1.0), (313, 1.0), (314, 1.0), (315, 1.0), (316, 2.0), (317, 1.0), (318, 1.0), (319, 1.0), (320, 1.0), (321, 1.0), (322, 1.0), (323, 1.0), (324, 1.0), (325, 2.0), (326, 1.0), (327, 1.0)]


## Tf-Idf

In [121]:
%time tfidf = models.TfidfModel(corpus)

CPU times: user 22 s, sys: 156 ms, total: 22.2 s
Wall time: 22.2 s


In [144]:
top10 = sorted(tfidf[corpus[10]], key = lambda x: x[1], reverse=True)[:10]
sorted(tfidf[corpus[10]], key = lambda x: x[1], reverse=True)[:10]

[(325, 0.3741926942388568),
 (283, 0.28156348561746436),
 (316, 0.25286082918224934),
 (286, 0.22408108169375784),
 (319, 0.22215474330370155),
 (49, 0.20727418233048003),
 (302, 0.16094462978635815),
 (314, 0.15363823517695105),
 (282, 0.15189856268780144),
 (287, 0.1508827099006586)]

In [146]:
for key, value in top10: 
    print (dictionary.get(key))

chx
issue
culture
employee
rive
free
resolved
rarity
unknown
notified


In [149]:
reviews['text'][10]

"This review is based upon consistency of flavor and great customer service.  We came and there was an unknown issue that required a 25 minute wait for food.  The employee notified us, and although hesitant, we decided to stay.  We have been here numerous times before in the past years so we are familiar with this location.  The employee was apologetic and gave us a free drink.  That was a simple gesture but rarely do you see decent customer service anymore.  We received our food and had an issue with an incorrect order.  It was explained and the issue was resolved quickly.  They gave us a free appetizer.  We do not expect perfection, nor free food.  This restaurant cares for customers and works to provide a positive experience.  We would return again because they have good food and they care.  That is a rarity in today's restaurant culture.  Kudos to the manager for creating this culture.  Ordered- fried rive and Tofu, edamame, won ton soup, dynamite chx, and Thai curry chx"

## LDA

In [159]:
dictionary = corpora.Dictionary.load('gensim/chinsese_dict.dict')
print(dictionary)

Dictionary(83678 unique tokens: ['place', 'horrible', 'excited', 'try', 'got']...)


In [155]:
#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=3, no_above=0.8)

In [156]:
print (dictionary)

Dictionary(33277 unique tokens: ['place', 'horrible', 'excited', 'try', 'got']...)


Run multicore LDA

In [179]:
%time lda = models.LdaMulticore(corpus, num_topics=128, id2word=dictionary, chunksize=10000, passes=2)



CPU times: user 11min 33s, sys: 4min 30s, total: 16min 4s
Wall time: 12min 49s


In [182]:
lda.save('gensim/lda.model')

In [183]:
model =  models.LdaModel.load('gensim/lda.model')

In [195]:
lda.get_document_topics(corpus[5])

[(21, 0.9291294642857143)]

In [223]:
def embed(text, model, dictionary):
    text = tokenize(text)
    bow = dictionary.doc2bow(text)
    kindex = model.get_document_topics(bow)
    out = [0] * model.num_topics
    for i, p in kindex:
        out[i] = p
    return np.array(out) 

In [228]:
embed(reviews['text'][3], lda, dictionary)

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.14009262,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.42144656,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.18