# Importing Necessary Libraries

In [1]:
import regex as re
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

# Loading, Cleaning, and Tokenizing the data

In [2]:
# Read the raw review table and discard every row that has a missing value.
df = pd.read_csv('RestaurantReviews.csv').dropna()
df

Unnamed: 0,Restaurant,Review,No_Of_Raters,Time,Cost,American,Asian,French,Indian,Mediterranean
0,Beyond Flavours,"The ambience was good, food was quite good . h...",5,5/25/19 15:54,800,0,1,0,0,0
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,5,5/25/19 14:20,800,0,1,0,0,0
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,5,5/24/19 22:54,800,0,1,0,0,0
3,Beyond Flavours,Soumen das and Arun was a great guy. Only beca...,5,5/24/19 22:11,800,0,1,0,0,0
4,Beyond Flavours,Food is good.we ordered Kodi drumsticks and ba...,5,5/24/19 21:37,800,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9956,Chinese Pavilion,Madhumathi Mahajan Well to start with nice cou...,3,6/5/16 0:08,1000,0,1,0,0,0
9957,Chinese Pavilion,This place has never disappointed us.. The foo...,5,6/4/16 22:01,1000,0,0,0,1,0
9958,Chinese Pavilion,"Bad rating is mainly because of ""Chicken Bone ...",2,6/3/16 10:37,1000,0,0,0,1,0
9959,Chinese Pavilion,I personally love and prefer Chinese Food. Had...,4,5/31/16 17:22,1000,0,1,0,0,0


In [3]:
# Pull the review text out of the frame as a plain Python list.
reviews = df['Review'].tolist()
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space. The pattern is a raw string so the backslash reaches the regex
# engine untouched instead of being parsed as an invalid Python string escape.
reviews = [re.sub(r'\s+', ' ', thing) for thing in reviews]
# Delete apostrophes (e.g. "don't" -> "dont").
reviews = [re.sub("'", "", thing) for thing in reviews]
print(reviews[0])

The ambience was good, food was quite good . had Saturday lunch , which was cost effective . Good place for a sate brunch. One can also chill with friends and or parents. Waiter Soumen Das was really courteous and helpful.


In [4]:
def tokenizer(review):
    """Lazily tokenize each document in ``review``.

    Despite the singular name, ``review`` is an iterable of documents. Every
    item is coerced to ``str`` and handed to ``gensim.utils.simple_preprocess``
    with ``deacc=True``; one token list is yielded per item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in review
    )
        
# Materialize the generator so the token lists can be indexed and reused by later cells.
tokensets = [tokens for tokens in tokenizer(reviews)]

print(tokensets[0])

['the', 'ambience', 'was', 'good', 'food', 'was', 'quite', 'good', 'had', 'saturday', 'lunch', 'which', 'was', 'cost', 'effective', 'good', 'place', 'for', 'sate', 'brunch', 'one', 'can', 'also', 'chill', 'with', 'friends', 'and', 'or', 'parents', 'waiter', 'soumen', 'das', 'was', 'really', 'courteous', 'and', 'helpful']


# Creating Bigrams, Removing Stopwords, and Lemmatizing

In [5]:
# Collect bigram statistics from the tokenized reviews (stop words still included).
# NOTE(review): min_count and threshold control how readily word pairs are merged;
# per the gensim Phrases docs a higher threshold yields fewer phrases — confirm
# against the installed gensim version.
bigram = gensim.models.Phrases(tokensets, min_count=5, threshold=100)
# Wrap the trained model in a Phraser; later cells index this object with a token list.
bigram_mod = gensim.models.phrases.Phraser(bigram)
# Show the first review after bigram detection; merged pairs appear joined by "_".
print(bigram_mod[tokensets[0]])

['the', 'ambience', 'was', 'good', 'food', 'was', 'quite', 'good', 'had', 'saturday', 'lunch', 'which', 'was', 'cost_effective', 'good', 'place', 'for', 'sate', 'brunch', 'one', 'can', 'also', 'chill', 'with', 'friends', 'and', 'or', 'parents', 'waiter', 'soumen_das', 'was', 'really', 'courteous', 'and', 'helpful']


In [6]:
def stopword_remover(tokensets, stop_list=None):
    """Remove stop words from every token list.

    Args:
        tokensets: Iterable of token lists (one list of string tokens per review).
        stop_list: Optional iterable of words to drop. When omitted, the
            notebook-level ``stop_words`` list is used.

    Returns:
        A list holding one filtered token list per input, in the same order.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # linearly for every single token.
    blocked = set(stop_words if stop_list is None else stop_list)
    # Filter the tokens directly instead of rendering each list with str()
    # and re-tokenizing its repr.
    return [[token for token in tokenset if token not in blocked] for tokenset in tokensets]

def bigram_maker(tokensets):
    """Apply the notebook-level ``bigram_mod`` to every token list.

    Returns a list with one transformed token list per input, order preserved.
    """
    merged = []
    for tokenset in tokensets:
        merged.append(bigram_mod[tokenset])
    return merged

# Load the small English pipeline with the parser and NER components disabled;
# the function below reads only each token's POS tag and lemma.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatizer(tokensets, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize token lists, keeping only tokens with an allowed POS tag.

    Args:
        tokensets: Iterable of token lists. Each list is joined with single
            spaces and run through the notebook-level ``nlp`` pipeline.
        allowed_postags: Coarse POS tags (``token.pos_`` values) to keep.
            The default list is never mutated.

    Returns:
        A list with one list of lemmas per input token list, order preserved.

    NOTE(review): bigrams joined with "_" are lemmatized as a single word, so
    e.g. "soumen_das" comes out as "soumen_da" in the notebook output.
    """
    keep_tags = set(allowed_postags)
    # Feed the pipeline lazily; nlp.pipe batches the texts internally and
    # yields the parsed docs in input order.
    texts = (" ".join(tokens) for tokens in tokensets)
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_tags]
        for doc in nlp.pipe(texts)
    ]

In [7]:
# Stage 1: drop stop words from the tokenized reviews.
NoStop = stopword_remover(tokensets)

# Stage 2: merge detected bigrams into single "_"-joined tokens.
# NOTE(review): bigram_mod was trained on `tokensets` (stop words included) but is
# applied here after stop-word removal, so adjacent pairs may differ from those
# seen during training — confirm this ordering is intended.
BiGram = bigram_maker(NoStop)

# Stage 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs.
best_tokensets = lemmatizer(BiGram, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Spot-check the fully processed first review.
print(best_tokensets[0])

['ambience', 'good', 'food', 'quite', 'good', 'saturday', 'lunch', 'cost_effective', 'good', 'place', 'sate', 'brunch', 'also', 'chill', 'friend', 'parent', 'waiter', 'soumen_da', 'really', 'courteous', 'helpful']


# Using LDA to develop a topic model

In [8]:
# Build the token <-> integer id mapping from the processed reviews.
tokenDict = corpora.Dictionary(best_tokensets)
# Encode every review as a bag of words: a list of (token id, count) pairs.
corpus = list(map(tokenDict.doc2bow, best_tokensets))
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 3), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)]


In [9]:
# Fit an LDA model with 3 topics on the bag-of-words corpus, using tokenDict to
# map ids back to words. random_state is pinned to a constant.
# NOTE(review): presumably pinned so reruns give the same topics — confirm.
LDA = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                      id2word=tokenDict,
                                      num_topics=3,
                                      random_state=100,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10,
                                      alpha='auto',
                                      per_word_topics=True)
# Apply the trained model to the whole corpus.
# NOTE(review): newLDA is not read by any cell visible in this notebook.
newLDA = LDA[corpus]
# Print the top weighted words of each topic.
print(LDA.print_topics())

[(0, '0.073*"good" + 0.056*"food" + 0.052*"place" + 0.027*"service" + 0.018*"great" + 0.018*"visit" + 0.017*"ambience" + 0.015*"staff" + 0.014*"really" + 0.013*"nice"'), (1, '0.039*"chicken" + 0.026*"taste" + 0.016*"order" + 0.015*"try" + 0.014*"veg" + 0.014*"dish" + 0.013*"rice" + 0.012*"chinese" + 0.011*"biryani" + 0.010*"soup"'), (2, '0.028*"order" + 0.020*"do" + 0.019*"get" + 0.018*"restaurant" + 0.014*"bad" + 0.013*"time" + 0.012*"food" + 0.012*"even" + 0.012*"take" + 0.011*"give"')]


# Generating a Visualization of the 3-Topic Model

In [10]:
# NOTE(review): this import sits in the final cell rather than in the import
# cell at the top of the notebook; consider moving it there.
import pyLDAvis.gensim_models
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, the corpus and the dictionary.
visualization = pyLDAvis.gensim_models.prepare(LDA, corpus, tokenDict)
# Bare last expression so the notebook displays the result.
visualization