# Obtain Data

The Main data source was scraped from TripAdvisor, a popular travel review website, using Scrapy. I decided on scraping all hotel/resort reviews from Punta Cana, a Caribbean vacation destination that is rising in popularity. 

The Scrapy spider crawled and scraped all the data into a JSON format, although the framework allows for item pipelining into a MongoDB database.

Please see /src/tripdadvisor_reviews for the Scrapy source code.

In [1]:
import pandas as pd
import os
import functools
import pickle

In [2]:
# Decorator Functions
from functools import wraps

def my_logger(orig_func):
    import logging
    logging.basicConfig(filename='{}.log'.format(orig_func.__name__), level=logging.INFO)

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        logging.info(
            'Ran with args: {}, and kwargs: {}'.format(args, kwargs))
        return orig_func(*args, **kwargs)

    return wrapper


def my_timer(orig_func):
    import time

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = orig_func(*args, **kwargs)
        t2 = time.time() - t1
        print('{} ran in: {} sec'.format(orig_func.__name__, t2))
        return result

    return wrapper

def load_all_jsons(directory):
    json_files = [pd.read_json(directory+pos_json) for pos_json 
                  in os.listdir(directory) if pos_json.endswith('.json')]
    return functools.reduce(lambda x,y: pd.concat([x,y], ignore_index=True), json_files)

def load_or_make(filepath, overwrite='n'):
    def decorator(func):
        def wraps(*args, **kwargs):
            if overwrite == 'y':
                ow = input(f'Are you sure you want to overwrite {filepath}? y/n: ')
                if (os.path.exists(filepath)) and (ow == 'y'):
                    os.remove(filepath)
            try:
                with open(filepath, 'rb') as f:
                    data = pickle.load(f)
            except Exception:
                data = func(*args, **kwargs)
                with open(filepath, 'wb') as to_write:
                    pickle.dump(data, to_write)
            return data
        return wraps
    return decorator

In [64]:
# reviews = load_all_jsons("../data/raw/")
pc = pd.read_json("../data/raw/puntacana.json")
aspen = pd.read_json("../data/raw/aspen.json")
bor = pd.read_json("../data/raw/boracay.json")
whistler = pd.read_json("../data/raw/whistler.json")
gran = pd.read_json("../data/raw/grancanaria.json")
playa = pd.read_json("../data/raw/playadelcarmen.json")
vienna = pd.read_json("../data/raw/vienna.json")
budapest = pd.read_json("../data/raw/budapest.json")

In [65]:
pc["city"]="Punta Cana"
aspen["city"]="Aspen"
bor["city"]="Boracay"
whistler["city"]="Whistler"
gran["city"]="Gran Canaria"
playa["city"]="Playa Del Carmen"
vienna["city"]="Vienna"
budapest["city"]="Budapest"

In [5]:
reviews = functools.reduce(lambda x,y: pd.concat([x,y], ignore_index=True), [pc,aspen,bor,whistler,gran,playa,vienna,budapest])

In [6]:
pos = reviews[reviews.stars >= 4]
neg = reviews[reviews.stars < 3]

In [7]:
pos.head()

Unnamed: 0,content,hotel,stars,title,city
0,[Hands down great family vacation. Estefanía o...,[The Reserve at Paradisus Punta Cana],5,[Family concierge stay],Punta Cana
1,[Juan Batistae has got to be the best host/bar...,[Paradisus Punta Cana Resort],5,[Sunrise bartender],Punta Cana
2,[This was our first trip! We went with our fri...,[Dreams Palm Beach Punta Cana],5,[Overall Excellent Experience!],Punta Cana
3,[Me and my partner travelled from Glasgow Scot...,[Now Onyx Punta Cana],5,[Lovely 10 days],Punta Cana
4,[We had amazing time with family and friends. ...,[Paradisus Punta Cana Resort],4,[Amazing spring break vacation✌️👌👨‍👩‍👧‍👦😍😍👍],Punta Cana


# Scrub Data

This project is an NLP project, and therefore scrubbing the data takes on a different path than a supervised learning project. 

After importing the data, I have to take these steps:

1. Clean the data
2. Tokenize the data
3. Vectorize the data

In [8]:
# Imports
import numpy as np
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import wordnet as WN
from nltk.corpus import stopwords, words
import string
import re
import pickle

In [9]:
stop_words_en = stopwords.words('english')
newStopWords=['punta','cana','vienna','aspen','whistler',
              'gran','canaria','budapest','boracay',
              'playa', 'del', 'carmen','was','good','great']
typeWords=['hostel','hotel','apartment','resort','room']
stop_words_en.extend(newStopWords)
stop_words_en.extend(typeWords)
stop_words_en = set(stop_words_en)
english_words = words.words()

In [10]:
def punctuations(text):
    text = re.sub(r"[^a-z0-9(),!.?\'`]", " ", text)
    text = re.sub(r"\'s", "", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"[(),!.?\'`]", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\s[a-z]{1,2}\s"," ", text)
    return text 

def tokenize_single(row):
    return word_tokenize(row)

def clean_single(row):
    row = row.strip().lower()
    return punctuations(row)
#     return english_only(row)

@my_timer
# @load_or_make('../data/interim/preprocessed1_clean.pickle')
def clean(df):
    """
        returns dataframe after cleaning the review text
        1: lowercase
        2: strip any outer whitespace
        3: remove or replace punctuations with one space
        4: replace multiple spaces in a row with single space
    """
    columns = list(df.columns)
    df["clean_review"] =  df["content"].apply(lambda x: ' '.join(x) if len(x)> 0 else '')
    df = df.applymap(lambda x: x if not isinstance(x, list) else x[0] if len(x) else '')
    df.clean_review = df.clean_review.apply(lambda x: clean_single(x))
    df.clean_review = df.clean_review.apply(lambda x: x.translate(str.maketrans('','',string.digits)))
    return df[["hotel","city","content","clean_review"]]

@my_timer
# @load_or_make('../data/interim/preprocessed3_tokens.pickle')
def tokenize(df):
    df["tokens"] = df["clean_review"].apply(lambda row: tokenize_single(row))
    return df

## Clean the Text:

We first do some general cleaning, including:

- removing the lists inside the pandas dataframe
- lowercase all strings in the review 
- remove outer whitespaces
- remove digits, words 2 or less characters, punctuations
- remove non-English words

In [11]:
pos = clean(pos)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


clean ran in: 8.104113817214966 sec


In [12]:
pos.head()

Unnamed: 0,hotel,city,content,clean_review
0,The Reserve at Paradisus Punta Cana,Punta Cana,Hands down great family vacation. Estefanía ou...,hands down great family vacation estefan our f...
1,Paradisus Punta Cana Resort,Punta Cana,Juan Batistae has got to be the best host/bart...,juan batistae has got be the best host bartend...
2,Dreams Palm Beach Punta Cana,Punta Cana,This was our first trip! We went with our frie...,this was our first trip went with our friends ...
3,Now Onyx Punta Cana,Punta Cana,Me and my partner travelled from Glasgow Scotl...,me and partner travelled from glasgow scotland...
4,Paradisus Punta Cana Resort,Punta Cana,We had amazing time with family and friends. A...,we had amazing time with family and friends an...


In [13]:
pickle.dump(pos,open("../data/interim/reviews_cleaned.pickle","wb"))

## Tokenize the Text:

With the cleaned text, I will run NLTK to convert it into tokenized text:

In [14]:
pos = tokenize(pos)

tokenize ran in: 35.4605278968811 sec


In [15]:
pickle.dump(pos,open("../data/processed/reviews_tokens.pickle","wb"))

In [16]:
pos=pickle.load(open("../data/processed/reviews_tokens.pickle","rb"))

In [17]:
# # Working with Scott's
# mixed = reviews.clean_review.to_list()
# mixed2 = scott2.recipe.to_list()
# mixed2 = [clean_single(row) for row in mixed2]
# mixedall = mixed+mixed2

# Explore Data

In [85]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
whistler = tokenize(clean(whistler))

In [112]:
bor = tokenize(clean(bor))

clean ran in: 1.3403558731079102 sec
tokenize ran in: 7.933361053466797 sec


In [115]:
whistler_freq={}
bor_freq={}

def count_freq(freq,word):
    for w in word:
        try:
            if w not in stop_words_en:
                if w in list(freq.keys()):
                    freq[w] += 1
                else:
                    freq[w] = 1
        except:
            pass

In [None]:
for doc in bor.tokens:
    count_freq(bor,doc)

In [73]:
for doc in whistler.tokens:
    count_freq(whistler_freq,doc)

In [108]:
newwhistler=dict(Counter(whistler_freq).most_common(5))

In [109]:
newwhistler

{'stay': 2083, 'village': 1880, 'would': 1758, 'location': 1703, 'staff': 1606}

## N-grams and Lemmatization

After Cleaning and Tokenizing, the next step is to then generate a Document-Term Matrix. I can use either the CountVectorizer or the TF-IDF to vectorize into Document-Term Matrix with stop_words and ngrams included.

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer 

In [44]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vec_choices = {
    'cv': CountVectorizer,
    'tfidf': TfidfVectorizer,
}
    
@my_timer
def vectorize(documents, method='cv'):
    vectorizer = vec_choices[method](stop_words = stop_words_en, ngram_range=(1, 1))
    doc_word = vectorizer.fit_transform(documents)
    return doc_word, vectorizer

## CountVectorizer

In [45]:
# doc_cv, cv= vectorize(reviews.clean_review.to_list(), reviews.hotel,'cv')

In [46]:
# pd.DataFrame(doc_cv.toarray(), index=reviews.hotel.to_list(), columns=cv.get_feature_names()).head(10)

In [47]:
# pickle.dump(cv, open("../results/models/cv.pickle","wb"))

## TF-IDF

In [48]:
doc_tfidf, tfidf = vectorize(pos.clean_review.to_list(), 'tfidf')
# doc_tfidf, tfidf = vectorize(reviews.clean_review.to_list(), reviews.hotel,'tfidf')

vectorize ran in: 5.861459970474243 sec


In [49]:
pickle.dump(tfidf, open("../results/models/tfidf.pickle","wb"))

# Model & Interpret Data

I use some of the dimensionality reduction techniques here, specifically Topic Modelling. The three that are used here:

- Non-negative Matrix factorization (NMF)
- Latent Semantic Analysis (LSA)
- Latent Dirichlet Allocation (LDA)

In [50]:
import gensim
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models, similarities, matutils

In [51]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
dim_red_choices = {
    1: NMF,
    2: TruncatedSVD,
    3: models.LdaModel
}

def docs_with_topics(doc_topic,name,num_topics):
    return pd.DataFrame(doc_topic.round(5), index = name, columns = ["Topic "+str(i) for i in range(num_topics)])

@my_timer
def topic_analysis(model, num_topics, vectorizer):
    topic_word = pd.DataFrame(model.components_.round(3), index = ["Topic "+str(i) for i in range(num_topics)]
                              ,columns = vectorizer.get_feature_names())
    display_topics(model, vectorizer.get_feature_names(), 10)
    return topic_word
    
@my_timer
def dim_reduction_modeling(model_num, num_topics, doc_word,id2word=None):
    """
    Select 1 for 
    """
    try:
        if model_num == 3:
            return dim_red_choices[model_num](corpus=doc_word, num_topics=num_topics, id2word=id2word, passes=5)
        else:
            model = dim_red_choices[model_num](num_topics)
            doc_topic = model.fit_transform(doc_word)
            return model,doc_topic
    except KeyError as K:
        print("ERROR: Please select model numbers 1, 2, or 3")

## NMF

In [52]:
num_topics = 7
nmf, nmf_topic = dim_reduction_modeling(1,num_topics,doc_tfidf)

dim_reduction_modeling ran in: 17.996272087097168 sec


In [53]:
topic_word = topic_analysis(nmf, num_topics, tfidf)


Topic  0
one, night, get, bed, two, also, stayed, could, small, shower

Topic  1
city, station, metro, breakfast, walk, close, walking, center, minutes, distance

Topic  2
amazing, service, time, best, made, food, always, back, beautiful, make

Topic  3
nice, really, place, clean, rooms, big, breakfast, people, close, also

Topic  4
pool, bar, lovely, area, food, holiday, sun, plenty, entertainment, restaurant

Topic  5
beach, place, walk, white, away, restaurants, front, right, beautiful, water

Topic  6
stay, staff, friendly, helpful, recommend, location, would, clean, definitely, place
topic_analysis ran in: 0.19394898414611816 sec


In [54]:
nmf_results = docs_with_topics(nmf_topic,pos.hotel.to_list(),num_topics)

In [55]:
pickle.dump(nmf, open("../results/models/nmf.pickle","wb"))
pickle.dump(nmf_results, open("../results/models/nmf_results.pickle","wb"))

## LSA

In [None]:
lsa, lsa_topic = dim_reduction_modeling(2,num_topics,doc_tfidf)
lsa.explained_variance_ratio_

In [None]:
topic_word2 = topic_analysis(lsa, num_topics, tfidf)

In [None]:
lsa_results=docs_with_topics(lsa_topic,pos.hotel.to_list(),num_topics)

## LDA

In [None]:
corpus = matutils.Sparse2Corpus(doc_tfidf.transpose())

In [None]:
id2word = dict((v, k) for k, v in tfidf.vocabulary_.items())

In [None]:
lda = dim_reduction_modeling(3,num_topics,corpus,id2word)

In [None]:
lda.print_topics()

# Test using User Input Data

Now that I have a working model, I can use custom input to match with the closest review in terms of cosine similarities. This will allow me to suggest a hotel for someone who's looking for one.

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

## Sanity Check: Using one of the existing reviews:

I want to make sure the model works correctly. If the user input is an exact match of one of the reviews, the highest cosine similarity should return that exact same review (since the cosine similarity of two identical text should be 1). I implement that here before I do a custom input text:

In [32]:
sanity_check = pos.content[17]

In [33]:
sanity_check

'My girlfriend and I have traveled from Newfoundland, Canada and have been blown away by the resort, food and mostly the staff. From the moment we arrived we were tended to like royalty. We were given a great room close to all amenities. Whether it was breakfast , lunch , or supper the staff was polite and most helpful. The service on the beach from Jhoan Tavárez Jose Reyes Edwin Canarie and Johan Ventura is what has made our trip. We had rented the Bali beds every day and didn’t have to leave them. Everything we needed the guys made sure we had it. Will definitely be back !'

In [34]:
sanity_check2 = "They moved but  luckily we found them! We recived  great  hospitality, red roses for valentines day. The room was, as always, very clean, spacious and well furnished. The place is very quite with a pretty patio for breakfast.  Close to most things  you need. Great Michela & Fabrizio! "

In [35]:
# sanity_token = word_tokenize(clean_single(sanity_check))
sanity_clean = clean_single(sanity_check)

In [36]:
print(sanity_clean)

my girlfriend and have traveled from newfoundland canada and have been blown away the resort food and mostly the staff from the moment arrived were tended like royalty were given great room close all amenities whether was breakfast lunch supper the staff was polite and most helpful the service the beach from jhoan tav rez jose reyes edwin canarie and johan ventura what has made our trip had rented the bali beds every day and didn have leave them everything needed the guys made sure had will definitely back 


In [37]:
sanity_topic = nmf.transform(tfidf.transform([sanity_clean]))

In [38]:
index = np.argmax(cosine_similarity(sanity_topic, nmf_results))

In [39]:
index

17

In [40]:
returned_review = pos.iloc[index,:].content
returned_review

'My girlfriend and I have traveled from Newfoundland, Canada and have been blown away by the resort, food and mostly the staff. From the moment we arrived we were tended to like royalty. We were given a great room close to all amenities. Whether it was breakfast , lunch , or supper the staff was polite and most helpful. The service on the beach from Jhoan Tavárez Jose Reyes Edwin Canarie and Johan Ventura is what has made our trip. We had rented the Bali beds every day and didn’t have to leave them. Everything we needed the guys made sure we had it. Will definitely be back !'

## Custom User Input

With the sanity check performing correctly, I can now implement the model using a custom text:

In [42]:
@my_timer
def predict(vectorizer, model, data_results):
    description = input("Describe your dream vacation: ")
    
    # Preprocessing + Tokenizing
    description = clean_single(description)
#     desc_tokens = word_tokenize(description)
    
    # Vectorize and Tranform using model
    user_matrix = vectorizer.transform([description])
#     user_matrix = vectorizer.transform([desc_tokens])
    user_topic = model.transform(user_matrix)
    
    cosines = cosine_similarity(user_topic, data_results)
    
    # Find max cosine similarity and print the original review content
    idx = cosines[0].argsort()[-5:][::-1]

    score = np.max(cosines[0])
#     print("We suggest this hotel: ", reviews.iloc[index,:].hotel)
#     print("Read what a guest wrote: ", reviews.iloc[index,:].content)
    for i in range(len(idx)):
        print(f"Hotel #{i}: ", pos.iloc[idx[i],:].hotel)
        print(f"City #{i}", pos.iloc[idx[i],:].city)
        print("Read what a guest wrote: ", pos.iloc[idx[i],:].content)
    
    return score

In [43]:
matrix, indices= predict(tfidf, nmf, nmf_results)

Describe your dream vacation: I want to go to the beach
Hotel #0:  White Beach de Boracay
City #0 Boracay
Read what a guest wrote:  Very beautiful beach. The purest water, a white sand, numerous cafes and shops in close proximity to the sea.
Hotel #1:  Paamul Hotel
City #1 Playa Del Carmen
Read what a guest wrote:  Paamul is truly a little jewel in the Riviera Maya.  The hotel rooms are comfortable, spacious and very clean.  The views of the beach/ocean from the rooms are unbeatable.  The pool is awesome, overlooking the beach and the restaurant has delicious food.  Who can beat eating at a restaurant on the beach with the sea breeze blowing in your hair!  We can't wait to go back to Paamul!
Hotel #2:  Ancora Punta Cana Private Residence, Yacht Club & Marina
City #2 Punta Cana
Read what a guest wrote:  We have visited there couple of times during our stay in Punta Cana and overall good beach to relax and swim. It is not that luxury beach or most attractive clean beach - but a decent be

TypeError: 'numpy.float64' object is not iterable