In [0]:
!pip install gensim
!pip install pyLDAvis
# Run in python console
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:

import spacy.cli
spacy.cli.download("en_core_web_sm")


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



##3. Import Packages

In [0]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)



##5. Prepare Stopwords

In [0]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

##6. Import Newsgroups Data

In [0]:
#from google.colab import files
#uploaded = files.upload()

# Import Dataset
from google.colab import drive
drive.mount('/content/drive/')
#users = pd.read_csv('drive/My Drive/yelp/user.csv')
df = pd.read_csv('drive/My Drive/Colab Notebooks/LDA/yelp/restaurants_arizonacsv.csv')

Mounted at /content/drive/


In [0]:
# Import Dataset
#df = pd.read_csv('restaurants_arizonacsv (1).csv')
#print(df.target_names.unique())
df =  df[df['stars_x']>3]
df.head()

Unnamed: 0,votes,user_id,review_id,stars_x,date,text,type_x,business_id
0,"{'funny': 0, 'useful': 1, 'cool': 0}",t95D1tnWvAOy2sxXnI3GUA,4ibY4fPQYM7FBSVPrvrYvg,4,2010-04-02,Not the same Domino's pizza I ate too much of ...,review,x5Mv61CnZLohZWxfCVCPTQ
1,"{'funny': 0, 'useful': 1, 'cool': 0}",t95D1tnWvAOy2sxXnI3GUA,1JlopVxrAeDNQCRnSHLHrQ,4,2010-05-03,Easy ordering-on-line. Made exactly as I asked...,review,x5Mv61CnZLohZWxfCVCPTQ
2,"{'funny': 4, 'useful': 4, 'cool': 6}",9ellV9VrEOPA3vX2pZptSA,9--jL__9efnmXZEm9o0HIw,5,2010-11-02,OK OK... as a Proud Italian I hope my momma do...,review,x5Mv61CnZLohZWxfCVCPTQ
7,"{'funny': 0, 'useful': 0, 'cool': 0}",lzu5sYkqKRYDHnGGDlHcWw,3w9IoOVjTTWIZeNG13ZGfA,5,2014-06-04,I have always ordered pizza from dominos and t...,review,x5Mv61CnZLohZWxfCVCPTQ
10,"{'funny': 0, 'useful': 0, 'cool': 0}",lkPszfTzL-Hb2zGJfMktWQ,SHRKu4T-PI5WJuoGN7Mo4w,5,2015-02-04,Love the online ordering ....... order was acc...,review,x5Mv61CnZLohZWxfCVCPTQ


##7. Remove emails and newline characters

In [0]:
# Convert to list
data = df.text.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['Not the same Dominos pizza I ate too much of in College. Vastly different '
 'and a major improvement! Ordering on-line couldnt be easier, and pizza was '
 'made exactly how I ordered. Piping hot, flavorful, better than average '
 'crust, nicely blackened in all the right spots, tasty toppings, robust '
 'sauce. Friendly delivery by John. Price is fair. Much, much better than '
 'Pizza Hut!']


##8. Tokenize words and Clean-up text

In [0]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['not', 'the', 'same', 'dominos', 'pizza', 'ate', 'too', 'much', 'of', 'in', 'college', 'vastly', 'different', 'and', 'major', 'improvement', 'ordering', 'on', 'line', 'couldnt', 'be', 'easier', 'and', 'pizza', 'was', 'made', 'exactly', 'how', 'ordered', 'piping', 'hot', 'flavorful', 'better', 'than', 'average', 'crust', 'nicely', 'blackened', 'in', 'all', 'the', 'right', 'spots', 'tasty', 'toppings', 'robust', 'sauce', 'friendly', 'delivery', 'by', 'john', 'price', 'is', 'fair', 'much', 'much', 'better', 'than', 'pizza', 'hut']]


##9. Creating Bigram and Trigram Models

In [0]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])



['not', 'the', 'same', 'dominos', 'pizza', 'ate', 'too', 'much', 'of', 'in', 'college', 'vastly', 'different', 'and', 'major', 'improvement', 'ordering', 'on', 'line', 'couldnt', 'be', 'easier', 'and', 'pizza', 'was', 'made', 'exactly', 'how', 'ordered', 'piping_hot', 'flavorful', 'better', 'than', 'average', 'crust', 'nicely', 'blackened', 'in', 'all', 'the', 'right', 'spots', 'tasty', 'toppings', 'robust', 'sauce', 'friendly', 'delivery', 'by', 'john', 'price', 'is', 'fair', 'much', 'much', 'better', 'than', 'pizza', 'hut']


## *10. Remove Stopwords, Make Bigrams and Lemmatize*

In [0]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [0]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['dominos', 'pizza', 'eat', 'much', 'college', 'vastly', 'different', 'major', 'improvement', 'ordering', 'line', 'could', 'not', 'easy', 'pizza', 'make', 'exactly', 'order', 'piping_hot', 'flavorful', 'good', 'average', 'crust', 'nicely', 'blacken', 'right', 'spot', 'tasty', 'topping', 'robust', 'sauce', 'friendly', 'delivery', 'john', 'price', 'fair', 'much', 'much', 'good', 'pizza', 'hut']]


## 11. Create the Dictionary and Corpus needed for Topic Modeling

In [0]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 3), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 3), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1)]]


In [0]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('average', 1),
  ('blacken', 1),
  ('college', 1),
  ('could', 1),
  ('crust', 1),
  ('delivery', 1),
  ('different', 1),
  ('dominos', 1),
  ('easy', 1),
  ('eat', 1),
  ('exactly', 1),
  ('fair', 1),
  ('flavorful', 1),
  ('friendly', 1),
  ('good', 2),
  ('hut', 1),
  ('improvement', 1),
  ('john', 1),
  ('line', 1),
  ('major', 1),
  ('make', 1),
  ('much', 3),
  ('nicely', 1),
  ('not', 1),
  ('order', 1),
  ('ordering', 1),
  ('piping_hot', 1),
  ('pizza', 3),
  ('price', 1),
  ('right', 1),
  ('robust', 1),
  ('sauce', 1),
  ('spot', 1),
  ('tasty', 1),
  ('topping', 1),
  ('vastly', 1)]]

## 12. Building the Topic Model

In [0]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

##13. View the topics in LDA model

In [0]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.034*"go" + 0.030*"get" + 0.025*"time" + 0.023*"order" + 0.021*"come" + '
  '0.017*"make" + 0.015*"back" + 0.014*"sushi" + 0.014*"eat" + 0.014*"would"'),
 (1,
  '0.052*"chicken" + 0.045*"roll" + 0.034*"hot" + 0.031*"fry" + 0.020*"sauce" '
  '+ 0.018*"spicy" + 0.017*"mexican" + 0.017*"rice" + 0.016*"soup" + '
  '0.015*"fish"'),
 (2,
  '0.076*"not" + 0.046*"be" + 0.037*"do" + 0.019*"know" + 0.015*"can" + '
  '0.015*"bagel" + 0.014*"say" + 0.014*"s" + 0.014*"think" + 0.012*"star"'),
 (3,
  '0.081*"good" + 0.060*"food" + 0.057*"place" + 0.053*"great" + '
  '0.035*"service" + 0.029*"always" + 0.028*"love" + 0.017*"have" + '
  '0.017*"friendly" + 0.015*"lunch"'),
 (4,
  '0.037*"bar" + 0.016*"sit" + 0.014*"old" + 0.011*"coffee" + 0.011*"cafe" + '
  '0.011*"local" + 0.010*"kid" + 0.010*"line" + 0.010*"walk" + 0.009*"cool"'),
 (5,
  '0.072*"pie" + 0.062*"pizza" + 0.035*"salad" + 0.027*"cheese" + '
  '0.019*"bread" + 0.018*"slice" + 0.015*"sauce" + 0.013*"italian" + '
  '0.013*"cream" +

##14. Compute Model Perplexity and Coherence Score

In [0]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.435416695853526

Coherence Score:  0.4358264454981806


##15. Visualize the topics-keywords

In [0]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

##16. Building LDA Mallet Model

In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
from gensim.test.utils import common_corpus, common_dictionary
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'gdrive/My Drive/Colab Notebooks/LDA/yelp/mallet/bin/mallet'
#mallet_path = 'gdrive/My Drive/Colab Notebooks/LDA/yelp/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [0]:
#load pretrained model
ldamallet = gensim.models.wrappers.LdaMallet.load('drive/My Drive/Colab Notebooks/LDA/yelp/mallet.gensim')

In [0]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('pizza', 0.027078807830862014),
   ('make', 0.022334956892480808),
   ('time', 0.021417488565376366),
   ('sandwich', 0.018238730655585088),
   ('order', 0.0178231714721319),
   ('eat', 0.014574254219679696),
   ('find', 0.012593602007636574),
   ('location', 0.011587085284077876),
   ('give', 0.01139819474614461),
   ('star', 0.01115803391934374)]),
 (1,
  [('good', 0.03146730769752532),
   ('chicken', 0.02888165980687679),
   ('order', 0.024263850037848676),
   ('sauce', 0.019485963431938855),
   ('fresh', 0.019339452428691197),
   ('fry', 0.017106516212527775),
   ('lunch', 0.014813890328374569),
   ('hot', 0.014604976490410312),
   ('taste', 0.012835991784531152),
   ('roll', 0.01213870793574136)]),
 (2,
  [('food', 0.10964246956511454),
   ('good', 0.0884023644143225),
   ('great', 0.0873319317170391),
   ('place', 0.06462515635118472),
   ('love', 0.04900925835573258),
   ('service', 0.0489353334456992),
   ('friendly', 0.025371029123457556),
   ('price', 0.0207462867517

In [0]:
lda_model2 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model2, corpus, id2word)
vis

## 18. Finding the dominant topic in each sentence

In [0]:
def format_topics_sentences(ldamodel, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: ignored

##19. Find the most representative document for each topic

In [0]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

NameError: ignored

##20. Topic distribution across documents

In [0]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

NameError: ignored