### Seattle Hotel Recommendation by Text Similarity

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import random
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

In [3]:
df = pd.read_csv('Seattle_Hotels.csv', encoding="latin-1")
df.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


In [4]:
df.shape

(152, 3)

In [5]:
df['desc'][100]

'On a budget in Seattle or looking for something different? The historic charm and "home away from home" atmosphere of The Baroness will be sure to make you feel like one of the family. Conveniently located on First Hill, we are proud to be part of the Virginia Mason Hospital campus and only minutes from Harborview Medical Center and Swedish Hospital. The Baroness Hotel is a great option for short or long term medical, patient or family stays. Whether you are visiting the area\'s world-class medical facilities or on a budget vacation, our goal is to ensure a wonderful stay. Guest Amenities: Complimentary Internet access, Two twin, one or two queen studios with mini fridge and microwave, Two twin or one queen suites with full kitchens, Laundry facilities available, Flat screen cable television with HBO, Complimentary local calls, Ice and vending machines located in the lobby, Coffee maker and hairdryers in all guestrooms, Room service available seven days a week from the Rhododendron Ca

### Hotel Information Analysis

In [6]:
vec = CountVectorizer().fit(df['desc'])
bag_of_words = vec.transform(df['desc'])

In [7]:
bag_of_words.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [8]:
bag_of_words.shape

(152, 3200)

In [9]:
sum_words = bag_of_words.sum(axis=0)
sum_words

matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)

In [10]:
words_freq = [(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
words_freq

[('located', 108),
 ('on', 129),
 ('the', 1258),
 ('southern', 1),
 ('tip', 1),
 ('of', 536),
 ('lake', 41),
 ('union', 33),
 ('hilton', 12),
 ('garden', 11),
 ('inn', 89),
 ('seattle', 533),
 ('downtown', 133),
 ('hotel', 295),
 ('is', 271),
 ('perfectly', 6),
 ('for', 216),
 ('business', 87),
 ('and', 1062),
 ('leisure', 18),
 ('neighborhood', 35),
 ('home', 57),
 ('to', 471),
 ('numerous', 1),
 ('major', 12),
 ('international', 32),
 ('companies', 6),
 ('including', 47),
 ('amazon', 19),
 ('google', 6),
 ('bill', 4),
 ('melinda', 4),
 ('gates', 5),
 ('foundation', 4),
 ('wealth', 1),
 ('eclectic', 8),
 ('restaurants', 35),
 ('bars', 7),
 ('make', 43),
 ('this', 63),
 ('area', 51),
 ('one', 75),
 ('most', 40),
 ('sought', 1),
 ('out', 23),
 ('by', 71),
 ('locals', 5),
 ('visitors', 4),
 ('our', 359),
 ('proximity', 8),
 ('allows', 3),
 ('take', 31),
 ('in', 449),
 ('some', 22),
 ('pacific', 42),
 ('northwest', 42),
 ('majestic', 4),
 ('scenery', 2),
 ('enjoy', 93),
 ('outdoor', 23),


In [11]:
words_freq = sorted(words_freq,key=lambda x:x[1],reverse=True)
words_freq

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129),
 ('we', 128),
 ('free', 123),
 ('as', 117),
 ('located', 108),
 ('rooms', 106),
 ('stay', 105),
 ('place', 102),
 ('all', 100),
 ('airport', 99),
 ('space', 97),
 ('market', 97),
 ('enjoy', 93),
 ('an', 91),
 ('pike', 90),
 ('inn', 89),
 ('business', 87),
 ('just', 82),
 ('city', 79),
 ('room', 77),
 ('one', 75),
 ('by', 71),
 ('breakfast', 68),
 ('needle', 68),
 ('suites', 67),
 ('washington', 67),
 ('that', 65),
 ('re', 64),
 ('this', 63),
 ('complimentary', 62),
 ('also', 62),
 ('amenities', 60),
 ('offer', 59),
 ('attractions', 59),
 ('away', 59),
 ('access', 59),
 ('home', 57),
 ('guest', 57),
 ('can', 55),
 ('it', 55),
 ('guests', 54),
 ('service', 53),
 ('experience', 52),

In [12]:
def get_top_n_words(corpus,n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq,key=lambda x:x[1],reverse=True)
    return words_freq[:n]

In [13]:
common_words=get_top_n_words(df['desc'],20)

In [14]:
common_words

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129)]

In [15]:
df1 = pd.DataFrame(common_words,columns=['desc','count'])

In [16]:
df1.head()

Unnamed: 0,desc,count
0,the,1258
1,and,1062
2,of,536
3,seattle,533
4,to,471


In [17]:
df1.groupby('desc').sum()['count'].sort_values().iplot(kind='barh',yTitle='Count',linecolor='black',title='top 20 before remove stopwords')

In [18]:
def get_top_n_words(corpus,n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq,key=lambda x:x[1],reverse=True)
    return words_freq[:n]

In [19]:
common_words=get_top_n_words(df['desc'],20)
df2 = pd.DataFrame(common_words,columns=['desc','count'])
df2.groupby('desc').sum()['count'].sort_values().iplot(kind='barh',yTitle='Count',linecolor='black',title='top 20 after remove stopwords')

In [20]:
def get_top_n_words(corpus,n=None):
    vec = CountVectorizer(stop_words='english',ngram_range=(1,3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq,key=lambda x:x[1],reverse=True)
    return words_freq[:n]

In [21]:
common_words=get_top_n_words(df['desc'],20)
df3 = pd.DataFrame(common_words,columns=['desc','count'])
df3.groupby('desc').sum()['count'].sort_values().iplot(kind='barh',yTitle='Count',linecolor='black',title='top 20 before remove stopwords-ngram_range=(2,2)')

In [22]:
df['word_count']=df['desc'].apply(lambda x:len(str(x).split()))

In [23]:
df.head()

Unnamed: 0,name,address,desc,word_count
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",184
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",152
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",147
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,150
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...,151


In [24]:
df['word_count'].iplot(kind='hist',bins=50)

### Natural Language Processing

In [25]:
sub_replace = re.compile('[^0-9a-z #+_]')
stopwords = set(stopwords.words('english'))

def clean_txt(text):
    text.lower()
    text = sub_replace.sub('',text)
    ' '.join(word for word in text.split() if word not in stopwords)
    return text
df['desc_clean'] = df['desc'].apply(clean_txt)

In [26]:
df.head()

Unnamed: 0,name,address,desc,word_count,desc_clean
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the...",184,ocated on the southern tip of ake nion the ilt...
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat...",152,ocated in the citys vibrant core the heraton r...
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ...",147,ocated in the heart of downtown eattle the awa...
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...,150,hats near our hotel downtown eattle location h...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...,151,ituated amid incredible shopping and iconic at...


In [27]:
df['desc'][0]

"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive.

In [28]:
df['desc_clean'][0]

'ocated on the southern tip of ake nion the ilton arden nn eattle owntown hotel is perfectly located for business and leisure he neighborhood is home to numerous major international companies including mazon oogle and the ill  elinda ates oundation  wealth of eclectic restaurants and bars make this area of eattle one of the most sought out by locals and visitors ur proximity to ake nion allows visitors to take in some of the acific orthwests majestic scenery and enjoy outdoor activities like kayaking and sailing over 2000 sq ft of versatile space and a complimentary business center tateoftheart  technology and our helpful staff will guarantee your conference cocktail reception or wedding is a success efresh in the sparkling saltwater pool or energize with the latest equipment in the 24hour fitness center astefully decorated and flooded with natural light our guest rooms and suites offer everything you need to relax and stay productive nwind in the bar and enjoy merican cuisine for brea

### Text Similarity

In [29]:
df.set_index('name',inplace = True)

In [30]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')

In [31]:
tfidf_matrix=tf.fit_transform(df['desc_clean'])

In [32]:
tfidf_matrix.shape

(152, 27976)

In [36]:
cosine_similarity =linear_kernel(tfidf_matrix,tfidf_matrix)

In [37]:
cosine_similarity.shape

(152, 152)

In [38]:
cosine_similarity[0]

array([1.00000000e+00, 1.07618507e-02, 2.39000494e-02, 5.46873017e-03,
       2.64161143e-02, 1.05158253e-02, 1.70265099e-02, 1.26932177e-02,
       6.55905011e-03, 1.89826340e-02, 1.01682769e-02, 5.81427763e-03,
       8.97164751e-03, 5.11332703e-03, 6.98081551e-03, 1.46651716e-02,
       1.01506328e-02, 3.48428336e-02, 1.05628890e-02, 2.03920044e-02,
       2.31715424e-02, 8.66803402e-03, 4.19927749e-03, 1.25464260e-02,
       1.35516385e-02, 1.90864472e-02, 2.92211862e-02, 5.29767659e-03,
       2.34027898e-02, 1.84009370e-02, 1.11063777e-02, 3.24877554e-02,
       1.59088468e-02, 2.03903610e-02, 3.34542421e-02, 2.08424726e-02,
       6.37061770e-03, 7.22769959e-03, 1.76879937e-02, 3.40610778e-02,
       1.39733856e-02, 7.16109150e-03, 1.40189178e-02, 3.08597799e-02,
       3.31898710e-02, 1.32485388e-02, 3.49498978e-02, 1.03401842e-02,
       2.91144195e-02, 1.41758154e-02, 2.22237640e-02, 1.64940308e-02,
       3.11683463e-02, 1.59544326e-02, 2.61636177e-02, 1.26140542e-02,
      

In [39]:
indices = pd.Series(df.index)
indices[:5]

0    Hilton Garden Seattle Downtown
1            Sheraton Grand Seattle
2     Crowne Plaza Seattle Downtown
3     Kimpton Hotel Monaco Seattle 
4                The Westin Seattle
Name: name, dtype: object

In [48]:
def recommendations(name,cosine_similarity):
    recommended_hotels = []
    idx = indices[indices == name].index[0]
    score_series = pd.Series(cosine_similarity[idx]).sort_values(ascending=False)
    top_10_indexes = list(score_series[1:11].index)
    for i in top_10_indexes:
        recommended_hotels.append(list(df.index)[i])
    return recommended_hotels

In [49]:
recommendations('Hilton Garden Seattle Downtown',cosine_similarity)

['Staybridge Suites Seattle Downtown - Lake Union',
 'Silver Cloud Inn - Seattle Lake Union',
 'Residence Inn by Marriott Seattle Downtown/Lake Union',
 'MarQueen Hotel',
 'Embassy Suites by Hilton Seattle Tacoma International Airport',
 'Silver Cloud Hotel - Seattle Broadway',
 'The Loyal Inn',
 'Homewood Suites by Hilton Seattle Downtown',
 'Inn at Queen Anne',
 'SpringHill Suites Seattle\xa0Downtown']