In [258]:
# Imports and notebook-wide display configuration.
# NOTE(review): LatentDirichletAllocation, random, go, py, ff and the direct
# `iplot` import are not referenced in the cells visible here.
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
import plotly.graph_objs as go
import plotly.plotly as py
import cufflinks
# Show up to 30 columns when a DataFrame is rendered.
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# cufflinks supplies the DataFrame/Series `.iplot(...)` calls used below;
# presumably go_offline() renders them without a plotly account - confirm.
cufflinks.go_offline()
# NOTE(review): world_readable=True looks like a sharing/visibility setting - confirm it is intended.
cufflinks.set_config_file(world_readable=True, theme='solar')

In [259]:
# Load the hotel listings; the CSV is decoded as latin-1.
df = pd.read_csv('Seattle_Hotels.csv', encoding='latin-1')
print('{} hotels in the data'.format(len(df)))
# Preview the first rows.
df.head(5)

152 hotels in the data


Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


In [260]:
def description(index):
    '''
    Print the description and the name of the hotel stored at row label `index`.

    index: label compared against df.index
    return: None; nothing is printed when no row carries that label

    Reads the module-level `df`, which must still have both the 'desc' and
    'name' columns (i.e. before the frame is re-indexed by name).
    '''
    # Select first and test for emptiness: taking .values[0] of an empty
    # selection raises IndexError, and the previous len() check ran after that
    # point (and measured the two selected columns, so it was always true).
    matches = df.loc[df.index == index, ['desc', 'name']]
    if matches.empty:
        return
    desc, name = matches.values[0]
    print(desc)
    print('Name:', name)

In [261]:
# Print the description and name stored at row label 50.
description(index = 50)

Welcome to the Crowne Plaza Seattle Airport Located across the sky bridge from Seattle Tacoma International Airport and adjacent to the train station, where the light rail train leaves every 12 minutes to downtown Seattle. Take the train to visit the Space Needle, Pike Place Market, Westlake Shopping Center, Nordstroms flag ship store, Century Link, Safeco Field, and the University of Washington.Just beyond our front doors are headquarters of Fortune 500 companies such as Boeing, Microsoft, Amazon, Apple, Google, Tableaux, Facebook and Starbucks. Seattle offers everything from professional sports to hiking up Mt. Rainier, leisure guests planning cruises from Seattle love our hotel's "Park, Stay and Go" package. Join us in Reflections Bar and Grill for a Pacific Northwest dining experience featuring local beers and wine with light bites.Crowne Plaza Seattle Airport has over 12,000 square feet of private event space onsite, with 7 meeting rooms, and two Ballrooms, including our 5,000 sq

In [262]:
# Token frequencies with stop-words still present
def top_n_words(corpus, n = None):
    '''
    Count every token in `corpus` and return the `n` most frequent ones.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (token, count) pairs to keep; None keeps them all
    return: list of (token, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column sums give the corpus-wide count for each vocabulary column.
    totals = doc_term.sum(axis = 0)
    ranked = [(token, totals[0, col])
              for token, col in vectorizer.vocabulary_.items()]
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [263]:
# Twenty most frequent tokens in the raw descriptions.
top_words = top_n_words(df['desc'], 20)

# Horizontal bar chart of the counts, sorted ascending.
# With kind = 'barh' the counts run along the x-axis, so the 'count' label
# belongs on xTitle (it was previously attached to the word axis via yTitle).
df1 = pd.DataFrame(top_words, columns = ['desc', 'count'])
df1.groupby('desc').sum()['count'].sort_values().iplot(kind = 'barh',
                                                       xTitle = 'count',
                                                       linecolor = 'black',
                                                       title = 'Top 20 word frequency before removing stop-words')

In [264]:
# Token frequencies once English stop-words are dropped
# (replaces the earlier top_n_words definition from this point on)

def top_n_words(corpus, n = None):
    '''
    Count the non-stop-word tokens in `corpus` and return the `n` most frequent.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (token, count) pairs to keep; None keeps them all
    return: list of (token, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer(stop_words = 'english')
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis = 0)
    ranked = []
    for token, col in vectorizer.vocabulary_.items():
        ranked.append((token, totals[0, col]))
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [265]:
# Twenty most frequent tokens with English stop-words excluded.
common_words = top_n_words(df['desc'], 20)

df2 = pd.DataFrame(common_words, columns = ['desc', 'count'])
# Title corrected: this chart shows the counts AFTER stop-word removal.
# With kind = 'barh' the counts run along the x-axis, so 'count' goes on xTitle.
df2.groupby('desc').sum()['count'].sort_values(ascending = False).iplot(kind = 'barh',
                                                       xTitle = 'count',
                                                       linecolor = 'black',
                                                       title = 'Top 20 word frequency after removing stop-words')

In [266]:
# Bigram frequencies with stop-words still present
def top_n_bigrams(corpus, n = None):
    '''
    Count every two-word sequence in `corpus` and return the `n` most frequent.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (bigram, count) pairs to keep; None keeps them all
    return: list of (bigram, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer(ngram_range = (2, 2))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis = 0)
    ranked = [(bigram, totals[0, col])
              for bigram, col in vectorizer.vocabulary_.items()]
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [267]:
# Twenty most frequent bigrams, tabulated as (desc, count) rows.
common_bigrams = top_n_bigrams(df['desc'], n = 20)
df3 = pd.DataFrame(data = common_bigrams, columns = ['desc', 'count'])

In [268]:
df3.head(5)

Unnamed: 0,desc,count
0,in the,147
1,of the,133
2,pike place,86
3,place market,85
4,to the,81


In [269]:
# Bar chart of the bigram counts, sorted from least to most frequent.
(df3.groupby('desc')['count']
    .sum()
    .sort_values()
    .iplot(kind = 'bar',
           title = 'Top bigrams before removing stop-words',
           yTitle = 'count',
           linecolor = 'black'))

In [270]:
# Bigram frequency after removing stop-words
# (replaces the earlier top_n_bigrams definition from this point on)
def top_n_bigrams(corpus, n = None):
    '''
    Count two-word sequences in `corpus`, ignoring English stop-words, and
    return the `n` most frequent.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (bigram, count) pairs to keep; None keeps them all
    return: list of (bigram, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer(stop_words = 'english', ngram_range = (2, 2))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis = 0)
    ranked = []
    for bigram, col in vectorizer.vocabulary_.items():
        ranked.append((bigram, totals[0, col]))
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [271]:
# Twenty most frequent bigrams, tabulated and charted in ascending order.
top_bigrams = top_n_bigrams(df['desc'], n = 20)
df4 = pd.DataFrame(data = top_bigrams, columns = ['desc', 'count'])

(df4.groupby('desc')['count']
    .sum()
    .sort_values()
    .iplot(kind = 'bar',
           title = 'Top bigrams after removing stop-words',
           yTitle = 'count',
           linecolor = 'black'))

In [272]:
# Trigram frequencies with stop-words still present

def top_n_trigrams(corpus, n = None):
    '''
    Count every three-word sequence in `corpus` and return the `n` most frequent.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (trigram, count) pairs to keep; None keeps them all
    return: list of (trigram, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer(ngram_range = (3, 3))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis = 0)
    ranked = [(trigram, totals[0, col])
              for trigram, col in vectorizer.vocabulary_.items()]
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [273]:
# Twenty most frequent trigrams, tabulated as (desc, count) rows.
top_trigrams = top_n_trigrams(df['desc'], n = 20)
df5 = pd.DataFrame(data = top_trigrams, columns = ['desc', 'count'])

In [274]:
df5.head(5)

Unnamed: 0,desc,count
0,pike place market,85
1,the space needle,39
2,the heart of,33
3,in the heart,28
4,located in the,26


In [275]:
# Bar chart of the trigram counts, sorted ascending.
# yTitle added so the value axis is labelled like the bigram charts.
df5.groupby('desc').sum()['count'].sort_values().iplot(kind = 'bar',
                                                       yTitle = 'count',
                                                       linecolor = 'black',
                                                       title = 'Top trigrams before removing stop-words')

In [276]:
# Trigram frequencies once English stop-words are dropped
# (replaces the earlier top_n_trigrams definition from this point on)

def top_n_trigrams(corpus, n = None):
    '''
    Count three-word sequences in `corpus`, ignoring English stop-words, and
    return the `n` most frequent.

    corpus: collection of text documents (walked twice: fit, then transform)
    n: number of (trigram, count) pairs to keep; None keeps them all
    return: list of (trigram, count) tuples, highest count first
    '''
    vectorizer = CountVectorizer(stop_words = 'english', ngram_range = (3, 3))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis = 0)
    ranked = []
    for trigram, col in vectorizer.vocabulary_.items():
        ranked.append((trigram, totals[0, col]))
    ranked.sort(key = lambda pair: pair[1], reverse = True)
    return ranked[:n]

In [277]:
# Twenty most frequent trigrams, tabulated; preview the first rows.
top_trigrams = top_n_trigrams(df['desc'], n = 20)
df6 = pd.DataFrame(data = top_trigrams, columns = ['desc', 'count'])

df6.head(5)

Unnamed: 0,desc,count
0,pike place market,85
1,seattle tacoma international,21
2,tacoma international airport,21
3,free wi fi,19
4,washington state convention,17


In [278]:
# Bar chart of the trigram counts, sorted ascending.
# yTitle added so the value axis is labelled like the bigram charts.
df6.groupby('desc').sum()['count'].sort_values().iplot(kind = 'bar',
                                                      yTitle = 'count',
                                                      linecolor = 'black',
                                                      title = 'Top trigrams after removing stop-words')

In [279]:
# Number of whitespace-separated words in each description.
# Values are passed through str() first, so non-string entries are counted too.
df['word_count'] = df['desc'].map(lambda text: len(str(text).split()))

In [280]:
# Summary statistics of the per-description word counts.
desc_lengths = df['word_count'].tolist()

print(*['Description count:', len(desc_lengths),
        '\nAverage word count', np.average(desc_lengths),
        '\nMinimum word count', min(desc_lengths),
        '\nMaximum word count', max(desc_lengths)])

Description count: 152 
Average word count 156.94736842105263 
Minimum word count 16 
Maximum word count 494


In [281]:
# Histogram (50 bins) of description lengths in words.
df['word_count'].iplot(kind = 'hist',
                       title = 'Distribution of word counts in description',
                       xTitle = 'Desc word count',
                       yTitle = 'frequency',
                       bins = 50,
                       linecolor = 'black')

In [282]:
# Text cleaning
import nltk
nltk.download('stopwords')

# Raw strings keep the backslashes literal; in a normal string '\[' , '\]' and
# '\|' are invalid escape sequences that Python warns about.
# The compiled patterns are unchanged.
SPACE_REPLACE = re.compile(r'[/(){}\[\]\|@,;]')   # punctuation that is turned into a space
SYMBOL_REPLACE = re.compile(r'[^0-9a-z #+_]')     # anything outside this whitelist is deleted
STOPWORDS = set(stopwords.words('english'))       # set for O(1) membership tests

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ke117\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [283]:
def clean_text(text):
    '''
    Normalise one description for vectorisation.

    text: a string
    return: the lower-cased text with SPACE_REPLACE matches turned into
            spaces, SYMBOL_REPLACE matches deleted and STOPWORDS removed,
            re-joined with single spaces
    '''
    lowered = text.lower()
    spaced = SPACE_REPLACE.sub(' ', lowered)      # separators become a space
    stripped = SYMBOL_REPLACE.sub('', spaced)     # remaining symbols are dropped
    kept = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept)

# Cleaned copy of every description, stored next to the original.
df['clean_desc'] = df['desc'].map(clean_text)

In [284]:
df.head(5)

Unnamed: 0,name,address,desc,word_count,clean_desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",184,located southern tip lake union hilton garden ...
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",152,located citys vibrant core sheraton grand seat...
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",147,located heart downtown seattle awardwinning cr...
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,150,whats near hotel downtown seattle location bet...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...,151,situated amid incredible shopping iconic attra...


In [285]:
# Index the frame by hotel name. Guarded so that re-running this cell is a
# no-op instead of a KeyError (the 'name' column is gone after the first run).
if df.index.name != 'name':
    df = df.set_index('name')

In [286]:
# TF-IDF (term frequency inverse document frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

# Uni-, bi- and tri-gram TF-IDF features over the cleaned descriptions.
# min_df is 1 rather than 0: an integer min_df must be >= 1 in scikit-learn
# releases that validate parameters, and 1 filters nothing either (every
# vocabulary term occurs in at least one document).
tf = TfidfVectorizer(analyzer = 'word',
                    ngram_range = (1, 3),
                    min_df = 1,
                    stop_words = 'english')

tfidf_matrix = tf.fit_transform(df['clean_desc'])

# Pairwise dot products between all rows; with the vectorizer's default L2
# row normalisation these are cosine similarities.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [287]:
# Row position -> index label of df (assumes df is indexed by hotel name here).
indices = pd.Series(df.index)

def recommendations(name, cosine_similarities = cosine_similarities):
    '''
    Return up to ten hotels whose descriptions are most similar to `name`.

    name: hotel name as it appears in df.index
    cosine_similarities: square matrix of pairwise similarity scores, rows and
                         columns in the same order as df (bound at definition
                         time to the module-level matrix)
    return: list of hotel names, most similar first, never including `name`'s
            own row
    raises: IndexError when `name` is not present in the data
    '''
    positions = indices[indices == name].index
    if len(positions) == 0:
        raise IndexError('no hotel named {!r} in the data'.format(name))
    idx = positions[0]

    scores = pd.Series(cosine_similarities[idx])
    # Remove the query row explicitly instead of assuming it sorts to rank 0:
    # another hotel with an identical score could otherwise take that slot and
    # the query hotel itself would be returned as a recommendation.
    ranked = scores.drop(idx).sort_values(ascending = False)
    top_10_indexes = list(ranked.index[:10])

    # Single positional lookup instead of rebuilding list(df.index) per item.
    return indices.iloc[top_10_indexes].tolist()

In [288]:
# Sanity check: compare the result with a TripAdvisor search for the same hotel
recommendations(name = 'Hilton Seattle Airport & Conference Center')

['Embassy Suites by Hilton Seattle Tacoma International Airport',
 'DoubleTree by Hilton Hotel Seattle Airport',
 'Seattle Airport Marriott',
 'Motel 6 Seattle Sea-Tac Airport South',
 'Econo Lodge SeaTac Airport North',
 'Four Points by Sheraton Downtown Seattle Center',
 'Knights Inn Tukwila',
 'Econo Lodge Renton-Bellevue',
 'Hampton Inn Seattle/Southcenter',
 'Radisson Hotel Seattle Airport']

In [289]:
recommendations(name = 'Radisson Hotel Seattle Airport')

['Country Inn & Suites by Radisson, Seattle-Tacoma International Airport',
 'Red Roof Inn Seattle Airport - SEATAC',
 'Ramada by Wyndham SeaTac Airport',
 'DoubleTree by Hilton Hotel Seattle Airport',
 'Best Western Seattle Airport Hotel',
 'Homewood Suites by Hilton Seattle-Tacoma Airport/Tukwila',
 'Holiday Inn Express & Suites Seattle-City Center',
 'Red Lion Hotel Seattle Airport Sea-Tac',
 'Embassy Suites by Hilton Seattle Tacoma International Airport',
 'Four Points by Sheraton Seattle Airport South']