In [1]:
import numpy as np
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
import matplotlib.pyplot as plt
import pickle

In [2]:
data = pd.read_csv('./reviews_lem.csv')

In [3]:
data.head()

Unnamed: 0,text
0,"['come', 'right', 'place', 'everything', 'wish..."
1,"['came', 'back', 'trip', 'cumbuco', 'one', 'be..."
2,"['week', 'cumbuco', 'best', 'place', 'stay', '..."
3,"['everything', 'nice', 'lovely', 'stay', 'mana..."
4,"['third', 'time', 'row', 'weve', 'great', 'pou..."


In [10]:
lem = WordNetLemmatizer()
def preprocess(txt):
    text = txt.lower()
    
    text_p = "".join([char for char in text if char not in string.punctuation])
    text_p = re.sub(r'[0-9]','',text_p)
    words = word_tokenize(text_p)
    
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    
    
    stemmed = [lem.lemmatize(word) for word in filtered_words]
    return stemmed

In [5]:
data['text_cleaned_lem'] = data['text'][:100000].apply(preprocess)

In [11]:
test = pd.DataFrame({'text':data['text_cleaned_lem'][:100000]})
test.to_csv('reviews_lem.csv',index=False)

In [18]:
data['text_cleaned'] = data['text_cleaned'].apply(lambda x : " ".join(x))

In [4]:
data.head()

Unnamed: 0,text
0,"['come', 'right', 'place', 'everything', 'wish..."
1,"['came', 'back', 'trip', 'cumbuco', 'one', 'be..."
2,"['week', 'cumbuco', 'best', 'place', 'stay', '..."
3,"['everything', 'nice', 'lovely', 'stay', 'mana..."
4,"['third', 'time', 'row', 'weve', 'great', 'pou..."


In [5]:
vec = CountVectorizer()
cv = vec.fit_transform(data['text'])

In [12]:
with open('cv.pkl','wb') as f:
    pickle.dump(vec,f)

In [6]:
lda_model = LatentDirichletAllocation(n_components=15,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(cv)

In [7]:
with open('lda_15_cv.pkl','wb') as f:
    pickle.dump((lda_output,lda_model),f)

In [8]:

for index,topic in enumerate(lda_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([vec.get_feature_names()[i] for i in topic.argsort()[-5:]])
    print('\n')


THE TOP 15 WORDS FOR TOPIC #0
['clean', 'breakfast', 'good', 'hotel', 'room']


THE TOP 15 WORDS FOR TOPIC #1
['home', 'house', 'stay', 'wonderful', 'breakfast']


THE TOP 15 WORDS FOR TOPIC #2
['immaculate', 'birthday', 'somewhere', 'visited', 'relax']


THE TOP 15 WORDS FOR TOPIC #3
['property', 'menu', 'experience', 'customer', 'service']


THE TOP 15 WORDS FOR TOPIC #4
['attending', 'campsite', 'fireplace', 'similar', 'hostel']


THE TOP 15 WORDS FOR TOPIC #5
['greek', 'boutique', 'bungalow', 'favorite', 'bike']


THE TOP 15 WORDS FOR TOPIC #6
['east', 'animal', 'tent', 'willing', 'lodge']


THE TOP 15 WORDS FOR TOPIC #7
['desk', 'floor', 'door', 'room', 'told']


THE TOP 15 WORDS FOR TOPIC #8
['paris', 'route', 'ski', 'de', 'la']


THE TOP 15 WORDS FOR TOPIC #9
['ok', 'carpet', 'dirty', 'poor', 'resort']


THE TOP 15 WORDS FOR TOPIC #10
['water', 'kid', 'area', 'also', 'pool']


THE TOP 15 WORDS FOR TOPIC #11
['food', 'stay', 'place', 'staff', 'great']


THE TOP 15 WORDS FOR TOPIC

In [13]:
def get_n_topic_words(lda_model , n_top_words , topic_idx):
  n_topic_words = []
  vocab = vec.get_feature_names()
  topic_words_idx = np.argsort(lda_model.components_[topic_idx])[::-1][:n_top_words]
  n_topic_words = [vocab[i] for i in topic_words_idx]
  return n_topic_words

In [14]:
def get_max_topics(probabilities,number_of_topics):
  probabilities = [[index,prob] for index,prob in enumerate(probabilities)]
  probabilities.sort(key = lambda k :k[1],reverse = True)
  probabilities = [i[0] for i in probabilities]
  return probabilities[:number_of_topics]

def get_review_asbects(review , lda_model):
  transformed_review = vec.transform(review)
  prob = lda_model.transform(transformed_review)
  prob = prob.tolist()[0]
  best_topics = get_max_topics(prob,3)
  aspects = []
  top_topic_words = []
  for topic in best_topics:
    x = get_n_topic_words(lda_model,10,topic)
    top_topic_words.extend(x)
  for word in top_topic_words:
    if word in review:
      aspects.append(word)
  return list(set(aspects))

In [15]:
text ="Overall, I enjoyed my stay at the Finest Playa Mujeres and would probably stay again in the future if the situation arose. I only give 4 out of 5 stars because I felt the rooms could have exhibited more luxury. The furniture seemed very cheap and basic. Especially the patio furniture. The towels and robes were very hard. Slippers were comfy and a nice touch, though. Too many kids for my personal comfort but access to adult only areas helped a little. Long waits for dinner were annoying and while my partner didn’t mind eating at the buffet for every meal, I most certainly mind. We had an Excellence Club Junior suite, Ocean View and for over $500 a night I should never have to wait 1.5 hours to eat at a place of my choice. That said, let’s get to the good stuff! At the airport we found our transportation company (Seasons) and waited just a few minutes for our private transportation which was provided with the room we booked. Check in was quick, easy and very hospitable. We had our own separate check in area in Building 6, adults only. It was a little unclear of where club access places were but we found them. The towel colors help. We started off our vacation with the free hydrotherapy session that came with our room. We absolutely loved it. Adriana was a great host. Highly recommended, Don’t skip it! It was my birthday so we also booked other services at the spa and we really enjoyed those too. We both got stress relief massages and I got a facial. Jacqueline ready did a nice job with both, but I really loved the facial! We mostly hung out at the Aqua Bar in terms of pool areas. Daniel was a great bartender. Despite his being left alone to serve many guests, He worked as fast as he could and was very friendly. Service was great for us the entire stay. (5 days) I felt really pampered and appreciated as a guest. We are cigar smokers so we enjoyed the cigar lounge in the evenings. It was always empty! We ate at Brass Steakhouse, Seaside, Duke’s Lobster and Seafood, Insieme Trattoria and the food truck on site. We enjoyed the food at all of them. We also ate at the Marketplace buffet. The omelettes, fruit and fresh juices were a fave. We recommend Dukes for breakfast. Never a wait and the options were more tasty than the buffet. Kudos to the manager and staff at Dukes for amazing customer service.   booked a tour to Chichén Itzá through seasons tours. Although a little expensive, we had an incredible experience. The bus ride was 2 hours long but we got to see a lot of Mexico along the way. The tour guide on the bus was awesome. Our actual Chichén Itzá guide, Wences was very good. We got an opportunity to eat authentic Mexican food and not the fancy resort stuff. We also got to swim in a cenote which was the highlight of the day. I recommend this tour if you have it in your budget. Water sports were too expensive for my liking. $150 for $10 min of parasailing (2ppl) is absolutely ridiculous. I don’t recommend. Covid testing was easy to schedule and we got it done in literally 2 min and went back to our vacation. Really enjoyed nightly turndown service, fresh bottles of champagne for my birthday and the special desserts and accommodations for my birthday. Last thing, although the resort doesn’t take reservations for most restaurants, I had breakfast at Seaside (thanks Deny!) and they really made me feel special. They sang to me and offered to guarantee me a seat without waiting at the restaurant of my choice for that evening. That was incredibly nice of the staff, however I was surprised by my partner with a dinner on the beach instead. It was amazing. There was a live saxophone player and it was scheduled right around sunset which made it soooo romantic. Transportation and check out were just as smooth as check in. The staff emailed me to confirm departure times and they arrived to take our bags right on time. Great vacation overall. Thanks to the entire staff for making it special!"
text = preprocess(text)

text

['overall',
 'enjoyed',
 'stay',
 'finest',
 'playa',
 'mujeres',
 'would',
 'probably',
 'stay',
 'future',
 'situation',
 'arose',
 'give',
 'star',
 'felt',
 'room',
 'could',
 'exhibited',
 'luxury',
 'furniture',
 'seemed',
 'cheap',
 'basic',
 'especially',
 'patio',
 'furniture',
 'towel',
 'robe',
 'hard',
 'slipper',
 'comfy',
 'nice',
 'touch',
 'though',
 'many',
 'kid',
 'personal',
 'comfort',
 'access',
 'adult',
 'area',
 'helped',
 'little',
 'long',
 'wait',
 'dinner',
 'annoying',
 'partner',
 '’',
 'mind',
 'eating',
 'buffet',
 'every',
 'meal',
 'certainly',
 'mind',
 'excellence',
 'club',
 'junior',
 'suite',
 'ocean',
 'view',
 'night',
 'never',
 'wait',
 'hour',
 'eat',
 'place',
 'choice',
 'said',
 'let',
 '’',
 'get',
 'good',
 'stuff',
 'airport',
 'found',
 'transportation',
 'company',
 'season',
 'waited',
 'minute',
 'private',
 'transportation',
 'provided',
 'room',
 'booked',
 'check',
 'quick',
 'easy',
 'hospitable',
 'separate',
 'check',
 'area'

In [16]:
get_review_asbects(text,lda_model)

['staff',
 'nice',
 'good',
 'would',
 'day',
 'pool',
 'water',
 'room',
 'great',
 'time',
 'night',
 'breakfast',
 'kid',
 'stay',
 'also',
 'area',
 'beach',
 'get']

In [43]:
with open('lda_cv.pkl','wb') as f:
    pickle.dump((lda_model,lda_output),f)

In [10]:
vec.transform(['test'])

<1x580958 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>