In [1]:
import pickle
# Despite the .txt extension, the file is opened in binary mode and decoded
# with pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("review_user_top10_list.txt", "rb") as f:
    review_user_top10_list = pickle.load(f)

In [2]:
# Preview (at most) the first ten entries of the loaded list.
review_user_top10_list[:10]

["Convenient subway system. Service now extends all the way to Laval, which you can't beat.\n\nFast. During normal hours, trains come by quite often, unlike other cities I could name.\n\nClean. Once I asked a janitor (in English) how to buy a metro ticket, whereupon he directed me (in French) to a kiosk where I could buy bus tickets. At least he kept the station clean. Luckily I was able to call my lifeline for directions on how to properly buy tickets. Bonus: They weren't expensive.\n\nCan't speak for the buses, but I love not having to drive.",
 "The drive-thru waitresses Cindy E. and Rosa have very good customer service. They are very good at taking your order and getting it right the first time, all the time. The food is always hot with no errors. This Panda Express is one of the better I've experienced. I'd recommend anyone passing through and wanting a good experience to visit this Panda Express location.",
 '**MAIL ORDER REVIEW**\nhttp://www.yelp.com/biz_photos/OGQ_6nIn4QQL2U6t0

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
no_features = 1000
# Compare with traditional LDA.
# LDA is a probabilistic graphical model over word counts, so it is fed raw
# term counts (CountVectorizer) rather than TF-IDF weights.
tf_vectorizer = CountVectorizer(max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(review_user_top10_list)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
# list(...) keeps the result a plain list, as the old accessor returned.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20
no_top_words = 10
# `n_topics` was renamed to `n_components` (deprecated in scikit-learn 0.19,
# removed in 0.21), so the old keyword raises a TypeError on current releases.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,  # fixed seed so the fitted topics are reproducible
).fit(tf)
# topic words = 10
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted entries of every topic in ``model``.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the matching ``feature_names``
    entries separated by single spaces, ordered from the largest weight in
    the row to the smallest.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``; it is iterated row by row and each
        row must provide ``argsort()``.
    feature_names : sequence
        Indexed with the positions returned by ``argsort()``.
    no_top_words : int
        How many entries to list per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        # argsort() is ascending: reverse it, then keep the leading positions.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[position] for position in ranked_positions]
        print (" ".join(top_words))


# Print the top `no_top_words` vocabulary entries for each topic of the fitted
# scikit-learn LDA model.
display_topics(lda, tf_feature_names, no_top_words)
# def get_topics(model, feature_names, no_top_words):
#     topic_lists = []
#     for topic_idx, topic in enumerate(model.components_):
#         topic_list = []
#         for i in topic.argsort()[:-no_top_words - 1:-1]:
#             topic_list += [feature_names[i]]
#         topic_lists.append(topic_list)
#     return topic_lists
# d = get_topics(lda, tf_feature_names, no_top_words)


# import texttable as tt
# tab = tt.Texttable()
# headings = ['food','order','service','price','environment','place']
# tab.header(headings)
# d0 = d[0]
# d1 = d[1]
# d2 = d[2]
# d3 = d[3]
# d4 = d[4]
# d5 = d[5]
# for row in zip(d0,d1,d2,d3,d4,d5):
#     tab.add_row(row)

# s = tab.draw()
# print (s)

Topic 0:
pretty chicken lunch leave thing time review end drink word
Topic 1:
und der das vegas die las hotel mit man ist
Topic 2:
bbq lettuce korean spicy burger king man recommended prices sho
Topic 3:
hotel night und time kids mit das like thai roy
Topic 4:
duck perfect tasted bottles sure conveniently work accommodating crowd sandwich
Topic 5:
thai place table like 95 sauce don menu circus good
Topic 6:
incline car cost change station historic wash gas road views
Topic 7:
ordered did right like onion soup vegas available make trip
Topic 8:
lighthouse der die man history und house didn west historic
Topic 9:
good casino bar just area place friend food vegas know
Topic 10:
food chicken lunch hot good dishes didn interesting 95 service
Topic 11:
good location ve food just store service right like place
Topic 12:
lighthouse like just places old know history plenty time petit
Topic 13:
toast french le pool water make caesar salad ve especially
Topic 14:
check fi lots wi okay affordable 

In [24]:
import numpy as np
# components_[i, j] can be viewed as a pseudocount: the number of times
# word j was assigned to topic i.
np.shape(lda.components_)
# Dividing every row by its own total turns those pseudocounts into a
# distribution over the words for each topic.
topic_word_totals = lda.components_.sum(axis=1, keepdims=True)
lda_word_distribution = lda.components_ / topic_word_totals
lda_word_distribution

array([[ 0.00114101,  0.00100498,  0.00115182, ...,  0.00109311,
         0.00091689,  0.00117244],
       [ 0.00617627,  0.00277089,  0.00217438, ...,  0.00168376,
         0.00171006,  0.00364717],
       [ 0.00086001,  0.00069024,  0.00084302, ...,  0.00129601,
         0.00092718,  0.00099737],
       ..., 
       [ 0.00056811,  0.00052393,  0.00148134, ...,  0.00065805,
         0.00061423,  0.00079779],
       [ 0.00117369,  0.00083935,  0.00156531, ...,  0.00077063,
         0.00099632,  0.00109062],
       [ 0.00038612,  0.00057464,  0.00127539, ...,  0.00035749,
         0.00039617,  0.00039228]])

In [33]:
# LDA by gensim
from gensim import corpora
# Tokenise each review: lowercase, split on whitespace, and drop the handful
# of very common words in `stoplist`.
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in review_user_top10_list]

# Remove words that appear only once across the whole corpus.
# Counter replaces the hand-written defaultdict counting loop; defaultdict is
# still imported so the names this cell made available are unchanged.
from collections import Counter, defaultdict
frequency = Counter(token for text in texts for token in text)

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

In [44]:
from gensim import corpora, models, similarities
# Map every token to an integer id, then encode each document as a sparse
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# This cell imports only `corpora`, `models` and `similarities` from gensim,
# so the models are reached through `models` rather than a bare `gensim` name
# that is never imported here.
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]       # step 2 -- apply it to the whole corpus
# id2word is passed (as for the LSI models) so topics are reported as words
# instead of integer ids.
# NOTE(review): this rebinds `lda`, which earlier held the scikit-learn model.
# NOTE(review): LDA is a count-based model; fitting it on TF-IDF weights is
# unusual -- confirm this is intended.
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=10)

In [47]:

# Fit a 10-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
# Apply the fitted model to the same TF-IDF corpus to get its LSI representation.
corpus_lsi = lsi[corpus_tfidf]
# Up to 20 topics are requested here although the model was built with
# num_topics=10.
lsi.print_topics(20)

[(0,
  '-0.375*"und" + -0.281*"der" + -0.274*"das" + -0.255*"die" + -0.180*"mit" + -0.176*"den" + -0.154*"man" + -0.150*"von" + -0.149*"ist" + -0.133*"es"'),
 (1,
  '-0.158*"und" + 0.142*"was" + 0.136*"i" + 0.125*"we" + -0.120*"der" + 0.119*"good" + -0.115*"das" + 0.110*"-" + -0.107*"die" + 0.105*"as"'),
 (2,
  '0.207*"good" + -0.183*"-" + 0.176*"very" + -0.176*"we" + -0.154*"our" + 0.144*"park" + 0.142*"location" + -0.133*"were" + -0.123*"was" + 0.110*"your"'),
 (3,
  '-0.280*"wash" + -0.254*"pumps" + -0.234*"car" + -0.138*"stations." + -0.138*"gas," + -0.138*"pump/water" + -0.138*"bays" + -0.138*"2-3" + -0.138*"diesel" + -0.138*"vacuum"'),
 (4,
  '-0.177*"drive-thru" + -0.164*"very" + -0.151*"food" + 0.148*"wash" + 0.140*"pumps" + 0.140*"car" + 0.117*"hotel" + -0.111*"good" + -0.109*"panda" + -0.108*"fast"'),
 (5,
  '-0.339*"(" + -0.326*")" + 0.218*"-" + -0.199*"pizza" + -0.095*"was" + 0.094*"their" + -0.094*"caesars" + -0.091*"would" + 0.089*"vietnamese" + -0.086*"section"'),
 (6,
 

In [48]:
# Print the LSI representation of every document, one per line.
# NOTE(review): this dumps the whole corpus; consider printing only a small
# slice when a preview is enough.
for doc in corpus_lsi:
    print(doc)

[(0, -0.052941729409481685), (1, 0.13620408280632612), (2, -0.01140446797152277), (3, 0.026602861448545574), (4, 0.068885499004260123), (5, -0.0045745596906245538), (6, 0.066475667705673355), (7, 0.055539868805046251), (8, 0.022880915329761619), (9, 0.081113280192189177)]
[(0, -0.077651180503146983), (1, 0.20284598058245243), (2, 0.20463934506666162), (3, 0.0018353286530534876), (4, -0.27100824417609121), (5, -0.015948310567770945), (6, 0.070774761714645071), (7, -0.11224052143547243), (8, 0.21557359231833778), (9, -0.10268610044292664)]
[(0, -0.07542896182163808), (1, 0.17971118182407866), (2, -0.018899512467136649), (3, 0.0092929874453543445), (4, 0.0040354145551849564), (5, -0.020863020563312351), (6, 0.013246032318094444), (7, 0.040556439969869539), (8, -0.040956113394541235), (9, -0.060374424960133205)]
[(0, -0.076796147882331328), (1, 0.19697379554575214), (2, 0.03070305273559144), (3, -0.011473781715391665), (4, 0.1111421870001591), (5, -0.054828024397519395), (6, -0.00975139065

In [49]:
# TF-IDF model over the bag-of-words corpus with normalization explicitly on.
# NOTE(review): `model` is rebound by a later cell and is not read in between
# in the visible cells -- confirm this assignment is still needed.
model = models.TfidfModel(corpus, normalize=True)

In [51]:
# 300-topic LSI model fitted on the TF-IDF corpus.
# NOTE(review): this reuses the name `model`, which previously held a
# TfidfModel; the later similarity cells use `lsi` (10 topics), not this model.
model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

In [53]:
import logging
# Configure the root logger: INFO level and above, formatted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [54]:
# Build a similarity index over the corpus projected into LSI space.
# NOTE(review): `lsi` was fitted on `corpus_tfidf`, but here it is applied to
# the raw bag-of-words `corpus` -- confirm whether lsi[corpus_tfidf] was
# intended (the query cell would then need the same TF-IDF weighting).
index = similarities.MatrixSimilarity(lsi[corpus])

2017-11-03 01:00:03,148 : INFO : creating matrix with 227 documents and 10 features


In [65]:
# Query text: joining a one-element slice yields the first review on its own
# (or an empty string if the list is empty).
review_user_top10_text = " ".join(review_user_top10_list[:1])

In [67]:
# Similarity query: compare the query text against every indexed document.
# The pasted ">>> " doctest prompts were removed; they only worked because
# IPython strips them, and they are a syntax error in plain Python.
# NOTE(review): `lsi` was fitted on the TF-IDF corpus, yet the query is
# projected from raw bag-of-words counts -- confirm whether it should be
# TF-IDF weighted first (and the index built the same way).
vec_bow = dictionary.doc2bow(review_user_top10_text.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
sims = index[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 1.0), (1, 0.34869328), (2, 0.85245013), (3, 0.71395826), (4, 0.54902458), (5, 0.90134001), (6, 0.76287186), (7, 0.80699676), (8, 0.69941252), (9, 0.63923585), (10, 0.75164193), (11, 0.85603136), (12, 0.50255549), (13, 0.3903358), (14, 0.68076038), (15, 0.79986286), (16, 0.85524768), (17, 0.75570714), (18, 0.69133854), (19, 0.8497327), (20, 0.89046001), (21, 0.7003305), (22, 0.033951882), (23, 0.80228961), (24, 0.77562869), (25, 0.73982167), (26, 0.78233749), (27, 0.70830983), (28, 0.83217865), (29, -0.023041731), (30, 0.70926303), (31, 0.85236502), (32, 0.89160025), (33, 0.93854541), (34, 0.81914419), (35, 0.83667624), (36, 0.007517565), (37, 0.78825748), (38, 0.82890368), (39, 0.78773946), (40, 0.47680023), (41, 0.86979353), (42, 0.87565029), (43, 0.84433007), (44, -0.014052775), (45, 0.2894398), (46, 0.78073663), (47, 0.13729274), (48, 0.86623883), (49, 0.82521325), (50, 0.89344794), (51, 0.8831265), (52, 0.77844024), (53, 0.34320223), (54, 0.71913469), (55, 0.22126162), (56, 0.