In [None]:
# Step 1 Read in data
# Step 2 Preprocess text data
# Step 3 Word Embedding
# Step 4 LDA

importing packages

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim 

In [20]:
yelp=pd.read_csv('all_data20180608.csv')

### PreProcessing
#step 1 lower case
#step 2 punctuation
#step 3 stop word
#step 4 common word removal
#step 5 rare word removal
#step 6 token
#step 7 stemming
#step 8 lemma

In [21]:
#step 1
yelp['lower'] = yelp.text.apply(lambda x: " ".join(x.lower() for x in x.split()))
yelp.lower.head()

0    my friend gabi, i love your cute parisian inte...
1     had a good waiter, all the staff were very cool.
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint, the service ...
4    his name is carlos if you ever want to request...
Name: lower, dtype: object

In [22]:
# step 2
reg_tok = RegexpTokenizer(r'\w+')#+ is one or more
yelp['no_punc'] = yelp['lower'].apply(lambda x: ' '.join(reg_tok.tokenize(x)))
yelp.no_punc.head()

0    my friend gabi i love your cute parisian inter...
1       had a good waiter all the staff were very cool
2    my only regret is not catching the name of our...
3    lotus of siam did not disappoint the service w...
4    his name is carlos if you ever want to request...
Name: no_punc, dtype: object

In [23]:
# step 3
yelp['no_stop'] = yelp['no_punc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
yelp.no_stop.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret catching name server best experienced f...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: no_stop, dtype: object

In [24]:
# top most common words
freq = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[:20]#combining all rows and then splitting and converitign and value count
freq
#looking at these, we actually want to keep them so no need to carry out this step

food          3504
good          1925
buffet        1556
service       1554
great         1365
place         1111
vegas          882
like           764
restaurant     656
one            642
get            641
best           635
really         625
quality        611
price          596
would          552
time           539
go             539
selection      470
better         463
dtype: int64

In [25]:
# least common words
rare = pd.Series(' '.join(yelp['no_stop']).split()).value_counts()[-600:]

In [26]:
rare = list(rare.index)
yelp['no_rare'] = yelp['no_stop'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))

In [27]:
# from textblob import TextBlob
# # not really doing that for tutorial, this is just demo of it
# yelp['no_stop'][:5].apply(lambda x: str(TextBlob(x).correct()))

In [28]:
# step 6
_word_tokenize = TreebankWordTokenizer()
yelp['token'] = yelp['no_rare'].apply(lambda x: ' '.join(_word_tokenize.tokenize(x)))
yelp.token.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2    regret name server best experienced far trip v...
3        lotus siam disappoint service great attentive
4          name carlos ever want request service great
Name: token, dtype: object

In [29]:
# step 7
st = SnowballStemmer("english")
yelp['stemed']=yelp['token'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
yelp.stemed.head()

0    friend gabi love cute parisian interior dim li...
1                               good waiter staff cool
2      regret name server best experienc far trip vega
3            lotus siam disappoint servic great attent
4            name carlo ever want request servic great
Name: stemed, dtype: object

In [30]:
#step 8
wordnet_lemmatizer = WordNetLemmatizer()
yelp['lemma']=yelp['stemed'].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in x.split()]))
yelp.lemma.head(20)

0     friend gabi love cute parisian interior dim li...
1                                good waiter staff cool
2       regret name server best experienc far trip vega
3             lotus siam disappoint servic great attent
4             name carlo ever want request servic great
5                               room beauti server good
6     servic quick price ok get pretti darn good san...
7                                 good servic good food
8     say locat decor lotus siam never life find bet...
9                              servic snappi food tasti
10    came month ago food ok initi encount cashier g...
11                       hostess waitress friend attent
12                     shout boy wesley host cool peopl
13                            waitress awesom help ball
14     servic great busi afternoon outdoor set look day
15    arriv 3pm weekday prompt seat busi patio time ...
16    happi help take mani pictur request alway kept...
17    item order mon ami gabi oyster du jour 15 

In [31]:
yelp.lemma.str.len().max()

610

In [32]:
sentences=yelp['lemma'].tolist()
print(sentences[:5])

['friend gabi love cute parisian interior dim light knowledg waiter delici food', 'good waiter staff cool', 'regret name server best experienc far trip vega', 'lotus siam disappoint servic great attent', 'name carlo ever want request servic great']


In [33]:
tkn_sentences = []
for s in sentences:
    sent = _word_tokenize.tokenize(s)
    #print(sent)
    #break
    tkn_sentences.append(sent)

In [34]:
# creating word dictionary
dictionary=[]
for s in tkn_sentences:
    num_free = [x for x in s if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]
    #print(num_free)
    legit_words = [x for x in num_free if not len(x) < 2]
    dictionary.append(legit_words)

In [35]:
print(dictionary)

[['friend', 'gabi', 'love', 'cute', 'parisian', 'interior', 'dim', 'light', 'knowledg', 'waiter', 'delici', 'food'], ['good', 'waiter', 'staff', 'cool'], ['regret', 'name', 'server', 'best', 'experienc', 'far', 'trip', 'vega'], ['lotus', 'siam', 'disappoint', 'servic', 'great', 'attent'], ['name', 'carlo', 'ever', 'want', 'request', 'servic', 'great'], ['room', 'beauti', 'server', 'good'], ['servic', 'quick', 'price', 'ok', 'get', 'pretti', 'darn', 'good', 'sandwich', 'definit', 'back', 'next', 'time', 'vega'], ['good', 'servic', 'good', 'food'], ['say', 'locat', 'decor', 'lotus', 'siam', 'never', 'life', 'find', 'better', 'servic', 'receiv'], ['servic', 'snappi', 'food', 'tasti'], ['came', 'month', 'ago', 'food', 'ok', 'initi', 'encount', 'cashier', 'good', 'one', 'accus', 'cut', 'line', 'tri', 'walk', 'around', 'crowd', 'tri', 'figur', 'payment', 'fast', 'forward', 'month', 'later', 'comp', 'buffet', 'spoke', 'manag', 'verifi', 'comp', 'quit', 'charact', 'mani', 'red', 'bull', 'enoug

In [37]:


#count=Counter(" ".join(dictionary)).items()
print(sorted(count))

TypeError: sequence item 0: expected str instance, list found

In [None]:
#len(count)

In [None]:
print(max(yelp.lemma, key=len))

convert to word 2 vec
check word to vec model
bring in lda
create lda model
check results

In [38]:
model = gensim.models.Word2Vec(dictionary, sg=1)

In [39]:
print(model.wv['friend'])

[ 2.20085844e-01  1.33751705e-01 -1.26855046e-01 -4.10607100e-01
 -1.71559468e-01 -1.10338412e-01 -2.68061489e-01 -2.80084670e-01
 -1.34559959e-01 -2.38033921e-01  2.26818457e-01  4.42651100e-02
  5.56680793e-03 -1.92120761e-01  5.46712242e-02  1.98707387e-01
  5.93305342e-02 -1.14575781e-01  3.39166135e-01 -3.67623791e-02
 -2.44950846e-01 -4.49614316e-01  1.19434705e-03 -2.83697367e-01
 -6.80893138e-02 -3.75482053e-01 -1.21857822e-01 -4.35722411e-01
  1.89393997e-01  1.36069089e-01  2.76094645e-01 -5.73641360e-01
  1.79207727e-01 -1.17764615e-01 -4.56820838e-02  3.54684686e-04
 -1.54923659e-03  6.31416261e-01  4.01974246e-02 -4.98285830e-01
 -2.65538305e-01 -3.49599302e-01  3.46334875e-01  1.69771686e-02
 -3.78077060e-01 -3.64983290e-01 -3.44539657e-02 -2.49983996e-01
 -1.13225259e-01  1.37350038e-01  3.01715732e-01  1.74711987e-01
  2.36550704e-01  2.90854275e-01  1.53898448e-01  8.95950794e-02
  2.63062775e-01 -6.74505457e-02 -7.67312497e-02 -2.18023643e-01
  3.06011677e-01 -2.25366

In [40]:
model.wv.most_similar(positive='waiter', topn=10)

[('server', 0.9909631609916687),
 ('help', 0.9713982939720154),
 ('waitress', 0.9664373397827148),
 ('drink', 0.964133620262146),
 ('refil', 0.962772786617279),
 ('manag', 0.9568043351173401),
 ('friend', 0.9323858022689819),
 ('empti', 0.929429292678833),
 ('took', 0.9262838363647461),
 ('ask', 0.925493597984314)]

In [41]:
model.wv.most_similar(positive='waitress', topn=10)

[('refil', 0.9723803997039795),
 ('help', 0.9720377922058105),
 ('waiter', 0.9664373397827148),
 ('prompt', 0.96534264087677),
 ('effici', 0.9579280614852905),
 ('friend', 0.9546313285827637),
 ('server', 0.9546089768409729),
 ('fast', 0.9527144432067871),
 ('attent', 0.9513038992881775),
 ('drink', 0.9500843286514282)]

In [42]:

# Create Dictionary
id2word = corpora.Dictionary(dictionary)

# Create Corpus
texts = dictionary

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]]


In [43]:
id2word[0]

'cute'

In [45]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus]

[[('cute', 1),
  ('delici', 1),
  ('dim', 1),
  ('food', 1),
  ('friend', 1),
  ('gabi', 1),
  ('interior', 1),
  ('knowledg', 1),
  ('light', 1),
  ('love', 1),
  ('parisian', 1),
  ('waiter', 1)],
 [('waiter', 1), ('cool', 1), ('good', 1), ('staff', 1)],
 [('best', 1),
  ('experienc', 1),
  ('far', 1),
  ('name', 1),
  ('regret', 1),
  ('server', 1),
  ('trip', 1),
  ('vega', 1)],
 [('attent', 1),
  ('disappoint', 1),
  ('great', 1),
  ('lotus', 1),
  ('servic', 1),
  ('siam', 1)],
 [('name', 1),
  ('great', 1),
  ('servic', 1),
  ('carlo', 1),
  ('ever', 1),
  ('request', 1),
  ('want', 1)],
 [('good', 1), ('server', 1), ('beauti', 1), ('room', 1)],
 [('good', 1),
  ('vega', 1),
  ('servic', 1),
  ('back', 1),
  ('darn', 1),
  ('definit', 1),
  ('get', 1),
  ('next', 1),
  ('ok', 1),
  ('pretti', 1),
  ('price', 1),
  ('quick', 1),
  ('sandwich', 1),
  ('time', 1)],
 [('food', 1), ('good', 2), ('servic', 1)],
 [('lotus', 1),
  ('servic', 1),
  ('siam', 1),
  ('better', 1),
  ('decor

In [46]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [47]:
# Print the Keyword in the 10 topics
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.211*"food" + 0.148*"good" + 0.127*"great" + 0.123*"servic" + 0.029*"amaz" '
  '+ 0.024*"select" + 0.023*"alway" + 0.019*"perfect" + 0.019*"year" + '
  '0.018*"everyth"'),
 (1,
  '0.057*"tasti" + 0.047*"feel" + 0.044*"meal" + 0.038*"serv" + 0.037*"fri" + '
  '0.036*"come" + 0.032*"enjoy" + 0.031*"dinner" + 0.028*"burger" + '
  '0.024*"went"'),
 (2,
  '0.087*"dish" + 0.078*"decor" + 0.049*"high" + 0.046*"bar" + 0.037*"pasta" + '
  '0.035*"night" + 0.033*"thin" + 0.032*"disappoint" + 0.024*"garden" + '
  '0.024*"waiter"'),
 (3,
  '0.047*"fresh" + 0.036*"littl" + 0.028*"special" + 0.027*"portion" + '
  '0.026*"thai" + 0.023*"crab" + 0.023*"sauc" + 0.022*"bit" + 0.020*"everi" + '
  '0.019*"must"'),
 (4,
  '0.078*"pizza" + 0.062*"would" + 0.058*"sushi" + 0.047*"never" + '
  '0.032*"fish" + 0.030*"salad" + 0.029*"lasagna" + 0.026*"roll" + '
  '0.025*"overal" + 0.024*"crust"'),
 (5,
  '0.127*"go" + 0.092*"time" + 0.038*"first" + 0.037*"tabl" + 0.037*"experi" + '
  '0.033*"made" + 0.0

In [48]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=dictionary, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.593850934962384

Coherence Score:  0.36811374329032


In [49]:

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
# from numpy import array
# from keras.preprocessing.text import one_hot
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Flatten
# from keras.layers.embeddings import Embedding
# from sklearn.preprocessing import LabelEncoder
# from keras.utils import np_utils
# from sklearn.model_selection import train_test_split
# # define documents

# # define class labels
# encoder = LabelEncoder()
# encoder.fit(yelp.category)
# encoded_Y = encoder.transform(yelp.category)
# # convert integers to dummy variables (i.e. one hot encoded)
# dummy_y = np_utils.to_categorical(encoded_Y)
# import gensim
# from gensim import corpora
# from gensim.corpora import Dictionary
# dictionary=Dictionary([list(yelp.lemma)])
# # integer encode the documents
# vocab_size = 5248
# #encoded_docs = [one_hot(d, vocab_size) for d in yelp.lemma]
# doc_term_matrix = [dictionary.doc2bow(doc) for doc in yelp.lemma]
# print(doc_term_matrix)
# #print(encoded_docs)
# # pad documents to a max length of 4 words
# max_length = 102
# #padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


# #X_train, X_test, y_train, y_test = train_test_split(padded_docs,dummy_y,test_size=0.1)

# Lda = gensim.models.ldamodel.LdaModel

# # Running and Trainign LDA model on the document term matrix.
# ldamodel = Lda(doc_term_matrix, num_topics=12, id2word = dictionary, passes=50)

# # evaluate the model
# #y_pred=model.predict(X_test)
# print(ldamodel.print_topics(num_topics=3, num_words=3))

In [None]:
# from sklearn.metrics import confusion_matrix
# conf_mat = confusion_matrix(y_test, y_pred)
# fig, ax = plt.subplots(figsize=(10,10))
# sns.heatmap(conf_mat, annot=True, fmt='d',
#             xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()
loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %f' % (accuracy*100))
print('loss: %f' % (loss))

In [None]:
from sklearn import metrics
#print(metrics.classification_report(y_test, y_pred, target_names=my_df['category'].unique()))
print(metrics.accuracy_score(X_test, y_test))

In [None]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=12, id2word = dictionary, passes=50)

In [None]:
print(y_test[0])

In [None]:
# from lda2vec import LDA2Vec
# n_words = 10
# n_docs = 15
# n_hidden = 8
# n_topics = 2
# n_obs = 300
# words = np.random.randint(n_words, size=(n_obs))
# _, counts = np.unique(words, return_counts=True)
# model = LDA2Vec(n_words, n_hidden, counts)
# model.add_categorical_feature(n_docs, n_topics, name='document id')
# model.finalize()
# doc_ids = np.arange(n_obs) % n_docs
# loss = model.fit_partial(words, 1.0, categorical_features=doc_ids)