In [1]:
import spacy
import nltk
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import gensim
from gensim import corpora
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pyprojroot.here as here

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammarbagharib/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data1 = pd.read_csv(here("data/processed/cleaned_swissotel-the-stamford.csv"))
data2 = pd.read_csv(here("data/processed/cleaned_mbs_total.csv"))
data3 = pd.read_csv(here("data/processed/cleaned_pan-pacific.csv"))
data4 = pd.read_csv(here("data/processed/cleaned_parkroyal-collection-marina-bay.csv"))
data5 = pd.read_csv(here("data/processed/cleaned_fullerton.csv"))
data = pd.concat([data1, data2, data3, data4, data5], ignore_index = True)
print(len(data1))
print(len(data2))
print(len(data3))
print(len(data4))
print(len(data5))
len(data)

5058
10523
7430
6237
6374


35622

In [9]:
reviews = data["lem_review"]
reviews.head()

0    excellent stay fantastic scenery th stay still...
1    pampercation th floor crest suite harbour view...
2    excellent location facility conference venue h...
3    great experience stay swissotel great staying ...
4    decent stay something improve decent stay swis...
Name: lem_review, dtype: object

In [10]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents([comment.split() for comment in reviews])
# Filter only those that occur at least 10 times
finder.apply_freq_filter(10)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

In [11]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_documents([comment.split() for comment in reviews])
# Filter only those that occur at least 10 times
finder.apply_freq_filter(10)
trigram_scores = finder.score_ngrams(trigram_measures.pmi)

In [12]:
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
bigram_pmi

Unnamed: 0,bigram,pmi
0,"(pina, colada)",17.327900
1,"(laurent, perrier)",16.825399
2,"(foie, gras)",16.819753
3,"(sri, lanka)",16.590934
4,"(giovanni, viterale)",16.368542
...,...,...
25131,"(would, hotel)",-3.678364
25132,"(hotel, we)",-3.802143
25133,"(club, hotel)",-3.824414
25134,"(staff, singapore)",-3.899773


In [13]:
trigram_pmi = pd.DataFrame(trigram_scores)
trigram_pmi.columns = ['trigram', 'pmi']
trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
trigram_pmi

Unnamed: 0,trigram,pmi
0,"(ku, de, ta)",29.440543
1,"(din, tai, fung)",28.886434
2,"(ruth, chris, steak)",27.490997
3,"(mass, rapid, transit)",27.335074
4,"(scan, qr, code)",27.199812
...,...,...
8066,"(hotel, room, staff)",-0.481435
8067,"(hotel, hotel, good)",-0.770127
8068,"(stay, hotel, hotel)",-0.771328
8069,"(hotel, hotel, room)",-0.931946


In [14]:
stop_word_list = set(stopwords.words('english'))

In [15]:
# Filter for bigrams with only noun-type structures
def bigram_filter(bigram):
    tag = nltk.pos_tag(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if bigram[0] in stop_word_list or bigram[1] in stop_word_list:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram:
        return False
    return True

In [16]:
# Filter for trigrams with only noun-type structures
def trigram_filter(trigram):
    tag = nltk.pos_tag(trigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['JJ','NN']:
        return False
    if trigram[0] in stop_word_list or trigram[-1] in stop_word_list or trigram[1] in stop_word_list:
        return False
    if 'n' in trigram or 't' in trigram:
         return False
    if 'PRON' in trigram:
        return False
    return True 

In [17]:
# Can set pmi threshold to whatever makes sense - eyeball through and select threshold where n-grams stop making sense
# choose top 500 ngrams in this case ranked by PMI that have noun like structures
filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram: bigram_filter(bigram['bigram']) and bigram.pmi > 5, axis = 1)][:500]

filtered_trigram = trigram_pmi[trigram_pmi.apply(lambda trigram: \
                                                 trigram_filter(trigram['trigram'])\
                                                 and trigram.pmi > 5, axis = 1)][:500]


bigrams = [' '.join(x) for x in filtered_bigram.bigram.values if len(x[0]) > 2 or len(x[1]) > 2]
trigrams = [' '.join(x) for x in filtered_trigram.trigram.values if len(x[0]) > 2 or len(x[1]) > 2 and len(x[2]) > 2]

In [18]:
bigrams[:15]

['pina colada',
 'laurent perrier',
 'foie gras',
 'sri lanka',
 'giovanni viterale',
 'origin bloom',
 'kuala lumpur',
 'hor oeuvre',
 'tong yien',
 'skating rink',
 'roberto cavalli',
 'louis vuitton',
 'clifford pier',
 'veuve cliquot',
 'koh samui']

In [19]:
trigrams[:15]

['din tai fung',
 'ruth chris steak',
 'mass rapid transit',
 'scan qr code',
 'yvonne yee ming',
 'cut wolfgang puck',
 'crazy rich asians',
 'lau pa sit',
 'est la vie',
 'willow stream spa',
 'lobster nasi lemak',
 'hai tien lo',
 'spago wolfgang puck',
 'hai tien lou',
 'shampoo conditioner body']

In [20]:
# Concatenate n-grams
def replace_ngram(x):
    for gram in trigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x

In [21]:
reviews_w_ngrams = reviews.copy()
final_reviews = reviews_w_ngrams.map(lambda x: replace_ngram(x))
final_reviews.head()

0    excellent stay fantastic scenery th stay still...
1    pampercation th floor crest suite harbour view...
2    excellent location facility conference venue h...
3    great experience stay swissotel great staying ...
4    decent stay something improve decent stay swis...
Name: lem_review, dtype: object

In [22]:
#final_reviews = final_reviews.apply(lambda x: [x])
final_review = final_reviews.tolist()
final_reviews = ""
for x in final_review:
    final_reviews += x

In [23]:
final_reviews = final_reviews.split()

In [24]:
x = []
for i in final_reviews:
    pos_comment = nltk.pos_tag([i])
    word = pos_comment[0]
    if word[1] == 'NN':
        x.append(word[0])
x

['excellent',
 'stay',
 'scenery',
 'th',
 'stay',
 'propose',
 'wife',
 'hotel',
 'stay',
 'hotel',
 'time',
 'way',
 'staff',
 'request',
 'act',
 'receive',
 'complimentary',
 'cake',
 'check',
 'weekend',
 'stay',
 'minor',
 'comment',
 'pillow',
 'firm',
 'pillow',
 'thoughpampercation',
 'th',
 'floor',
 'crest',
 'suite',
 'harbour',
 'view',
 'suite',
 'spectacular',
 'view',
 'complimentary',
 'minibar',
 'access',
 'executive',
 'lounge',
 'indulge',
 'service',
 'suite',
 'meet',
 'greet',
 'hotel',
 'escort',
 'suite',
 'check',
 'touch',
 'birthday',
 'suite',
 'mention',
 'birthday',
 'surprise',
 'explore',
 'option',
 'breakfast',
 'buffet',
 'spread',
 'pool',
 'surround',
 'area',
 'capacity',
 'hotel',
 'share',
 'pool',
 'fairmont',
 'room',
 'service',
 'experience',
 'brilliant',
 'food',
 'serve',
 'service',
 'lady',
 'sorry_forgot_name',
 'personalityexcellent',
 'location',
 'facility',
 'conference',
 'venue',
 'hotel',
 'locate',
 'marina',
 'walk',
 'mornin

In [25]:
final_reviews = x

In [26]:
dictionary = corpora.Dictionary([final_reviews])

In [27]:
doc_term_matrix = [dictionary.doc2bow([doc]) for doc in final_reviews]

In [28]:
'''
coherence = []
for k in range(3,11):
    print('Round: '+str(k))
    Lda = gensim.models.ldamodel.LdaModel
    ldamodel = Lda(doc_term_matrix, num_topics=k, id2word = dictionary, passes=40,\
                   iterations=200, chunksize = 10000, eval_every = None)
    
    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=final_reviews,\
                                                     dictionary=dictionary, coherence='c_v')
    coherence.append((k,cm.get_coherence()))
'''

"\ncoherence = []\nfor k in range(3,11):\n    print('Round: '+str(k))\n    Lda = gensim.models.ldamodel.LdaModel\n    ldamodel = Lda(doc_term_matrix, num_topics=k, id2word = dictionary, passes=40,                   iterations=200, chunksize = 10000, eval_every = None)\n    \n    cm = gensim.models.coherencemodel.CoherenceModel(model=ldamodel, texts=final_reviews,                                                     dictionary=dictionary, coherence='c_v')\n    coherence.append((k,cm.get_coherence()))\n"

In [29]:
'''
x_val = [x[0] for x in coherence]
y_val = [x[1] for x in coherence]
'''

'\nx_val = [x[0] for x in coherence]\ny_val = [x[1] for x in coherence]\n'

In [30]:
'''
plt.plot(x_val,y_val)
plt.scatter(x_val,y_val)
plt.title('Number of Topics vs. Coherence')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence')
plt.xticks(x_val)
plt.show()
'''

"\nplt.plot(x_val,y_val)\nplt.scatter(x_val,y_val)\nplt.title('Number of Topics vs. Coherence')\nplt.xlabel('Number of Topics')\nplt.ylabel('Coherence')\nplt.xticks(x_val)\nplt.show()\n"

In [31]:
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=40,\
               iterations=200,  chunksize = 10000, eval_every = None, random_state=0)

In [32]:
ldamodel.show_topics(5, num_words=10, formatted=False)

[(0,
  [('room', 0.17086901),
   ('thank', 0.02778849),
   ('enjoy', 0.025550712),
   ('helpful', 0.021703754),
   ('buffet', 0.02161855),
   ('walk', 0.020139378),
   ('birthday', 0.019792272),
   ('visit', 0.019477786),
   ('everything', 0.014582665),
   ('star', 0.01430211)]),
 (1,
  [('stay', 0.12050929),
   ('view', 0.05989773),
   ('food', 0.054466013),
   ('excellent', 0.040791284),
   ('pool', 0.031735215),
   ('marina', 0.031172885),
   ('day', 0.029853014),
   ('place', 0.029173449),
   ('feel', 0.019838797),
   ('staycation', 0.013645659)]),
 (2,
  [('hotel', 0.2069512),
   ('service', 0.09985726),
   ('singapore', 0.062973276),
   ('breakfast', 0.04648913),
   ('location', 0.04638615),
   ('beautiful', 0.021788752),
   ('afternoon', 0.01906578),
   ('area', 0.018283708),
   ('bed', 0.01582549),
   ('building', 0.014789774)]),
 (3,
  [('fullerton', 0.07440514),
   ('time', 0.041130897),
   ('experience', 0.035579357),
   ('bay', 0.029889552),
   ('club', 0.027151763),
   ('r

In [33]:
topic_data =  pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, mds = 'pcoa')
pyLDAvis.display(topic_data)