## Hotel Recommendation System Based on Service Descriptions

In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import random
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

In [None]:
# Load the hotel catalogue (name, address, free-text description).
# The first CSV column is a previously saved row index; reading it with
# index_col=0 avoids a meaningless "Unnamed: 0" data column.
# NOTE(review): descriptions show '?' where apostrophes/trademark signs are
# expected, which suggests the file may really be Windows-1252 — confirm
# before switching the encoding.
df = pd.read_csv('Hotels.csv', encoding="latin-1", index_col=0)

In [None]:
df.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


### Check the Data

In [None]:
# Bag-of-words term counts (one row per description, one column per term).
# Despite its name, `w2v` is a CountVectorizer, not a word2vec embedding.
w2v = CountVectorizer()
bow = w2v.fit_transform(df['desc'])

In [None]:
bow.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [None]:
bow.shape

(152, 3200)

In [None]:
# Total occurrences of each vocabulary term across all descriptions.
# Summing the sparse matrix over axis 0 yields a 2-D, single-row np.matrix
# (one column per term), so a single count is read as count_words[0, idx].
count_words = bow.sum(axis=0)
count_words

matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)

In [None]:
count_words.ndim

2

In [None]:
count_words.shape

(1, 3200)

In [None]:
# Term -> column-index mapping learned by the vectorizer.
# Only preview a few entries; the full dict has one item per column of `bow`.
dict(list(w2v.vocabulary_.items())[:10])

{'located': 1669,
 'on': 1956,
 'the': 2815,
 'southern': 2610,
 'tip': 2860,
 'of': 1940,
 'lake': 1594,
 'union': 2961,
 'hilton': 1391,
 'garden': 1236,
 'inn': 1492,
 'seattle': 2453,
 'downtown': 898,
 'hotel': 1422,
 'is': 1528,
 'perfectly': 2068,
 'for': 1177,
 'business': 487,
 'and': 215,
 'leisure': 1633,
 'neighborhood': 1888,
 'home': 1403,
 'to': 2863,
 'numerous': 1931,
 'major': 1719,
 'international': 1511,
 'companies': 661,
 'including': 1471,
 'amazon': 201,
 'google': 1280,
 'bill': 402,
 'melinda': 1777,
 'gates': 1242,
 'foundation': 1188,
 'wealth': 3081,
 'eclectic': 933,
 'restaurants': 2334,
 'bars': 338,
 'make': 1720,
 'this': 2837,
 'area': 256,
 'one': 1958,
 'most': 1842,
 'sought': 2601,
 'out': 1987,
 'by': 493,
 'locals': 1668,
 'visitors': 3039,
 'our': 1984,
 'proximity': 2194,
 'allows': 188,
 'take': 2775,
 'in': 1466,
 'some': 2593,
 'pacific': 2009,
 'northwest': 1923,
 'majestic': 1718,
 'scenery': 2429,
 'enjoy': 988,
 'outdoor': 1988,
 'activ

In [None]:
# (term, column index) pairs — preview only, to keep the notebook output short.
list(w2v.vocabulary_.items())[:10]

dict_items([('located', 1669), ('on', 1956), ('the', 2815), ('southern', 2610), ('tip', 2860), ('of', 1940), ('lake', 1594), ('union', 2961), ('hilton', 1391), ('garden', 1236), ('inn', 1492), ('seattle', 2453), ('downtown', 898), ('hotel', 1422), ('is', 1528), ('perfectly', 2068), ('for', 1177), ('business', 487), ('and', 215), ('leisure', 1633), ('neighborhood', 1888), ('home', 1403), ('to', 2863), ('numerous', 1931), ('major', 1719), ('international', 1511), ('companies', 661), ('including', 1471), ('amazon', 201), ('google', 1280), ('bill', 402), ('melinda', 1777), ('gates', 1242), ('foundation', 1188), ('wealth', 3081), ('eclectic', 933), ('restaurants', 2334), ('bars', 338), ('make', 1720), ('this', 2837), ('area', 256), ('one', 1958), ('most', 1842), ('sought', 2601), ('out', 1987), ('by', 493), ('locals', 1668), ('visitors', 3039), ('our', 1984), ('proximity', 2194), ('allows', 188), ('take', 2775), ('in', 1466), ('some', 2593), ('pacific', 2009), ('northwest', 1923), ('majesti

In [None]:
# Print a handful of (word, column index) pairs instead of the whole
# vocabulary, which would flood the notebook with thousands of lines.
for word, idx in list(w2v.vocabulary_.items())[:10]:
    print(word, idx)

located 1669
on 1956
the 2815
southern 2610
tip 2860
of 1940
lake 1594
union 2961
hilton 1391
garden 1236
inn 1492
seattle 2453
downtown 898
hotel 1422
is 1528
perfectly 2068
for 1177
business 487
and 215
leisure 1633
neighborhood 1888
home 1403
to 2863
numerous 1931
major 1719
international 1511
companies 661
including 1471
amazon 201
google 1280
bill 402
melinda 1777
gates 1242
foundation 1188
wealth 3081
eclectic 933
restaurants 2334
bars 338
make 1720
this 2837
area 256
one 1958
most 1842
sought 2601
out 1987
by 493
locals 1668
visitors 3039
our 1984
proximity 2194
allows 188
take 2775
in 1466
some 2593
pacific 2009
northwest 1923
majestic 1718
scenery 2429
enjoy 988
outdoor 1988
activities 142
like 1647
kayaking 1558
sailing 2401
over 1999
000 1
sq 2650
ft 1213
versatile 3019
space 2613
complimentary 672
center 550
state 2669
art 269
technology 2795
helpful 1372
staff 2656
will 3129
guarantee 1313
your 3188
conference 686
cocktail 621
reception 2253
or 1974
wedding 3083
success 27

activity 143
streets 2705
lined 1651
diversified 874
sophisticated 2598
chic 583
excursion 1042
afternoon 165
gasworks 1241
quiet 2217
beauty 361
have 1351
instant 1502
both 436
worlds 3168
travel 2910
pleasure 2112
trips 2930
few 1116
corporate 719
vacationers 2999
museums 1866
less 1636
landmark 1599
distinctly 867
charming 575
unmistakable 2972
sprawling 2645
system 2767
shows 2523
pristine 2163
outdoors 1989
performing 2069
cultural 777
thriving 2845
visitor 3038
metro 1794
attracts 298
deal 800
professional 2173
travelers 2912
booming 432
fortune 1185
500 94
costco 725
wholesale 3120
microsoft 1798
facebook 1067
furthermore 1228
fans 1082
athletic 288
three 2844
teams 2792
nestled 1892
embassy 967
sleeping 2560
queen 2212
size 2552
sofa 2588
bed 366
50 93
inch 1467
hdtv 1355
kitchenettes 1578
dine 850
institution 1504
13 16
coins 627
hand 1333
zephyr 3194
stop 2689
health 1363
includes 1470
heated 1367
hot 1421
sun 2734
deck 803
begin 376
free 1193
made 1711
order 1976
evening 102

arctic 254
doubletree 894
aaa 118
echoing 932
post 2137
700 106
polar 2122
sazerac 2426
juno 1554
freshly 1201
prepared 2148
menu 1785
influence 1484
celebrate 544
edward 941
curtis 785
dome 884
hdtvs 1356
french 1197
pressed 2154
bellevue 382
crisp 762
interior 1510
178 31
function 1220
levels 1639
preferred 2144
bottled 437
robe 2369
deluxe 825
purchase 2200
groups 1309
240 68
quickly 2216
transformed 2904
parties 2037
placed 2097
redmond 2264
kirkland 1575
issaquah 1530
budget 473
conscious 694
european 1023
tucked 2938
characteristics 571
coved 745
arched 248
entryways 1003
rare 2233
gem 1249
lower 1696
keyarena 1565
ballet 329
mccaw 1763
tourist 2885
nightspots 1909
bumbershoot 481
cornish 718
school 2434
aloft 192
headquarters 1362
spacex 2615
nintendo 1911
breezy 453
offices 1947
mingle 1810
xyz 3176
sweet 2763
savory 2421
healthy 1364
snack 2578
sm 2564
play 2106
shower 2521
bliss 411
connectivity 693
charges 573
electronics 950
casts 530
49 89
lcd 1621
under 2957
element 953
s

very 3020
creature 760
staypineapple 2678
delightful 818
balance 324
designer 836
definition 812
founded 1189
commitment 653
conservation 695
olive 1951
redefines 2262
ecological 935
footprint 1176
leed 1631
certified 560
comfortably 642
caring 523
environment 1005
governor 1285
impressive 1464
wineries 3133
highlighting 1382
eateries 931
partnering 2038
farmers 1087
purveyors 2205
sustainably 2756
source 2606
foods 1172
create 755
favorites 1093
twist 2948
urbane 2988
global 1269
taste 2781
coffees 625
microbrews 1797
anywhere 230
laptop 1605
offered 1943
shore 2514
fastest 1090
growing 1311
globally 1270
medicine 1771
streetcar 2704
seaplane 2445
uniquely 2963
below 386
bacon 318
mansion 1728
volunteer 3042
eleven 958
weddings 3084
far 1084
yesterday 3182
shields 2507
seen 2464
stained 2657
passes 2041
1909 41
cecil 540
edwardian 942
tudor 2939
trademark 2893
crest 761
woods 3155
chandelier 563
crystals 772
harvard 1347
belmont 384
retained 2340
1984 51
fire 1133
burned 483
necessita

In [None]:
# Pair every vocabulary term with its corpus-wide count.
# count_words is a single-row matrix, hence the [0, idx] lookup.
word_Freq = [(word, count_words[0,idx]) for word, idx in w2v.vocabulary_.items()]
# Display only the first entries; the full list stays available in word_Freq.
word_Freq[:20]

[('located', 108),
 ('on', 129),
 ('the', 1258),
 ('southern', 1),
 ('tip', 1),
 ('of', 536),
 ('lake', 41),
 ('union', 33),
 ('hilton', 12),
 ('garden', 11),
 ('inn', 89),
 ('seattle', 533),
 ('downtown', 133),
 ('hotel', 295),
 ('is', 271),
 ('perfectly', 6),
 ('for', 216),
 ('business', 87),
 ('and', 1062),
 ('leisure', 18),
 ('neighborhood', 35),
 ('home', 57),
 ('to', 471),
 ('numerous', 1),
 ('major', 12),
 ('international', 32),
 ('companies', 6),
 ('including', 47),
 ('amazon', 19),
 ('google', 6),
 ('bill', 4),
 ('melinda', 4),
 ('gates', 5),
 ('foundation', 4),
 ('wealth', 1),
 ('eclectic', 8),
 ('restaurants', 35),
 ('bars', 7),
 ('make', 43),
 ('this', 63),
 ('area', 51),
 ('one', 75),
 ('most', 40),
 ('sought', 1),
 ('out', 23),
 ('by', 71),
 ('locals', 5),
 ('visitors', 4),
 ('our', 359),
 ('proximity', 8),
 ('allows', 3),
 ('take', 31),
 ('in', 449),
 ('some', 22),
 ('pacific', 42),
 ('northwest', 42),
 ('majestic', 4),
 ('scenery', 2),
 ('enjoy', 93),
 ('outdoor', 23),


In [None]:
type(word_Freq)

list

In [None]:
# Rank terms from most to least frequent (re-running this cell is harmless:
# sorting an already sorted list gives the same list).
word_Freq = sorted(word_Freq, key= lambda x: x[1], reverse=True)
# Display only the top of the ranking rather than every term.
word_Freq[:20]

[('the', 1258),
 ('and', 1062),
 ('of', 536),
 ('seattle', 533),
 ('to', 471),
 ('in', 449),
 ('our', 359),
 ('you', 304),
 ('hotel', 295),
 ('with', 280),
 ('is', 271),
 ('at', 231),
 ('from', 224),
 ('for', 216),
 ('your', 186),
 ('or', 161),
 ('center', 151),
 ('are', 136),
 ('downtown', 133),
 ('on', 129),
 ('we', 128),
 ('free', 123),
 ('as', 117),
 ('located', 108),
 ('rooms', 106),
 ('stay', 105),
 ('place', 102),
 ('all', 100),
 ('airport', 99),
 ('space', 97),
 ('market', 97),
 ('enjoy', 93),
 ('an', 91),
 ('pike', 90),
 ('inn', 89),
 ('business', 87),
 ('just', 82),
 ('city', 79),
 ('room', 77),
 ('one', 75),
 ('by', 71),
 ('breakfast', 68),
 ('needle', 68),
 ('suites', 67),
 ('washington', 67),
 ('that', 65),
 ('re', 64),
 ('this', 63),
 ('complimentary', 62),
 ('also', 62),
 ('amenities', 60),
 ('offer', 59),
 ('attractions', 59),
 ('away', 59),
 ('access', 59),
 ('home', 57),
 ('guest', 57),
 ('can', 55),
 ('it', 55),
 ('guests', 54),
 ('service', 53),
 ('experience', 52),

### Create a function to get the Top N words

In [None]:
def get_topN_words(corpus, n=None):
    """Return the ``n`` most frequent terms in ``corpus`` with their counts.

    Terms are 1- to 3-word n-grams with English stop words removed, so an
    entry may be a phrase rather than a single word.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int or None
        Number of entries to keep; ``None`` returns the full ranking.

    Returns
    -------
    list of (term, count) tuples, sorted by count in descending order.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
    term_matrix = vectorizer.fit_transform(corpus)
    # Column sums give one total per term (single-row matrix).
    totals = term_matrix.sum(axis=0)
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
#Get the top n words
Top_words= get_topN_words(df['desc'], n=20)

In [None]:
# Build a (term, count) frame from the top terms and draw a horizontal bar chart.
DF = pd.DataFrame(Top_words, columns=['desc','count'])
# In a horizontal bar chart the counts run along the x-axis and the terms along
# the y-axis, so the axis titles are assigned accordingly. The number in the
# title is taken from the data so it stays correct if n changes.
DF.groupby('desc').sum()['count'].sort_values().iplot(
    kind='barh',
    xTitle='Count',
    yTitle='Key word',
    title='Top %d key words' % len(DF),
)

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
# quiet=True suppresses the download log, which otherwise writes the local
# user's data directory path into the saved notebook output.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Pre-processing for the 'desc' column.
# Build the English stop-word collection once as a set so that process_text
# can do repeated membership tests against it.
# NOTE(review): entries are presumably all lower-case — confirm against the corpus.
stopwords_set = set(stopwords.words('english'))

# remove stopwords from your input
def process_text(text, stop_words=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace and every token whose lower-cased form is
    in ``stop_words`` is dropped; the remaining tokens keep their original
    casing and are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw description. Non-string values (e.g. ``None`` or a NaN from a
        missing cell) are treated as an empty description.
    stop_words : collection of str, optional
        Lower-case words to remove. Defaults to the notebook-level
        ``stopwords_set``.

    Returns
    -------
    str
        The filtered text (``''`` for empty or non-string input).
    """
    if stop_words is None:
        stop_words = stopwords_set
    if not isinstance(text, str):
        return ''
    # Compare case-insensitively so capitalised stop words such as "The",
    # "Our" or "A" at the start of a sentence are removed as well.
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Create a new column without stopwords
df['desc_new'] = df['desc'].apply(process_text)

In [None]:
df['desc'][0]

"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive.

In [None]:
df['desc_new'][0]

"Located southern tip Lake Union, Hilton Garden Inn Seattle Downtown hotel perfectly located business leisure. The neighborhood home numerous major international companies including Amazon, Google Bill & Melinda Gates Foundation. A wealth eclectic restaurants bars make area Seattle one sought locals visitors. Our proximity Lake Union allows visitors take Pacific Northwest's majestic scenery enjoy outdoor activities like kayaking sailing. 2,000 sq. ft. versatile space complimentary business center. State-of-the-art A/V technology helpful staff guarantee conference, cocktail reception wedding success. Refresh sparkling saltwater pool, energize latest equipment 24-hour fitness center. Tastefully decorated flooded natural light, guest rooms suites offer everything need relax stay productive. Unwind bar, enjoy American cuisine breakfast, lunch dinner restaurant. The 24-hour Pavilion Pantry? stocks variety snacks, drinks sundries."

### Similarity calculation

In [None]:
# Index the frame by hotel name. The guard keeps the cell re-runnable: once
# 'name' has become the index it is no longer a column, and a second
# set_index('name') would raise KeyError.
if df.index.name != 'name':
    df = df.set_index('name')
# Some names carry stray surrounding whitespace (e.g. a trailing space), which
# would make exact-match lookups by hotel name fail.
df.index = df.index.str.strip().rename('name')

In [None]:
df.index

Index(['Hilton Garden Seattle Downtown', 'Sheraton Grand Seattle',
       'Crowne Plaza Seattle Downtown', 'Kimpton Hotel Monaco Seattle ',
       'The Westin Seattle', 'The Paramount Hotel Seattle', 'Hilton Seattle',
       'Motif Seattle', 'Warwick Seattle', 'Four Seasons Hotel Seattle',
       ...
       '11th Avenue Inn Bed and Breakfast', 'Oakwood Seattle South Lake Union',
       'Mildred's Bed and Breakfast', 'First Hill Apartments',
       'Hampton Inn Seattle/Southcenter', 'The Halcyon Suite Du Jour',
       'Vermont Inn', 'Stay Alfred on Wall Street',
       'Pike's Place Lux Suites by Barsala',
       'citizenM Seattle South Lake Union hotel'],
      dtype='object', name='name', length=152)

In [None]:
#Convert a collection of raw documents to a matrix of TF-IDF features.
TFid =  TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words = 'english')

In [None]:
TFid_Matrix = TFid.fit_transform(df['desc_new'])

In [None]:
TFid_Matrix.shape

(152, 26623)

In [None]:
# linear_kernel returns the plain dot product X @ Y.T.
# TFid was created without overriding `norm`, and TfidfVectorizer L2-normalises
# each row by default, so this dot product is the pairwise cosine similarity
# between hotel descriptions (rows and columns follow the row order of df).
cosine_similarity = linear_kernel(TFid_Matrix,TFid_Matrix)

In [None]:
cosine_similarity.shape

(152, 152)

In [None]:
cosine_similarity[0]

array([1.        , 0.01406466, 0.03391591, 0.00993816, 0.03297936,
       0.0150115 , 0.02084233, 0.0158099 , 0.00776991, 0.01999756,
       0.0182464 , 0.01231142, 0.01684615, 0.0119307 , 0.01085672,
       0.01791009, 0.01116566, 0.04070581, 0.00971403, 0.02608081,
       0.03035044, 0.00885341, 0.01056546, 0.02008606, 0.01868132,
       0.02816165, 0.0321467 , 0.00685534, 0.02548283, 0.01969646,
       0.01638717, 0.04434173, 0.0167791 , 0.02169556, 0.03742179,
       0.03900557, 0.0069193 , 0.01352541, 0.04098383, 0.03227337,
       0.0172481 , 0.01166389, 0.01520804, 0.03544255, 0.04699436,
       0.01317424, 0.03274589, 0.01625349, 0.03786155, 0.01421505,
       0.02656012, 0.01830098, 0.03764235, 0.01329187, 0.0274474 ,
       0.01444152, 0.02460523, 0.0309297 , 0.01229223, 0.02683908,
       0.03151467, 0.01008797, 0.045226  , 0.03114224, 0.0323932 ,
       0.01846074, 0.03120343, 0.01118123, 0.02208553, 0.01201834,
       0.02366127, 0.01679123, 0.02597236, 0.02219805, 0.02326

In [None]:
# Positional lookup table: value = hotel name, label = row position in df.
# df's row order is the order used to build TFid_Matrix, so position i here
# corresponds to row/column i of cosine_similarity.
indices= pd.Series(df.index)
# Sanity check: find the position of one hotel by exact name match.
indices[indices=='Moore Hotel'].index

Int64Index([123], dtype='int64')

In [None]:
def recommendationSYS(name, cosine_similarity, top_n=10, hotel_names=None):
    """Return the hotels whose descriptions are most similar to ``name``.

    Parameters
    ----------
    name : str
        Hotel to find neighbours for; matched exactly against ``hotel_names``.
    cosine_similarity : 2-D array
        Pairwise similarity matrix; row/column ``i`` must refer to the hotel
        at position ``i`` of ``hotel_names``.
    top_n : int, default 10
        Maximum number of recommendations to return.
    hotel_names : sequence of str, optional
        Hotel names in matrix order. Defaults to the notebook-level
        ``indices`` Series.

    Returns
    -------
    list
        Up to ``top_n`` hotel names, most similar first. The queried hotel
        itself is never included.

    Raises
    ------
    ValueError
        If ``name`` is not present, or is present more than once, in
        ``hotel_names``.
    """
    if hotel_names is None:
        hotel_names = indices
    # Re-wrap so labels are guaranteed to be positions 0..n-1.
    names = pd.Series(list(hotel_names))

    positions = names.index[names == name]
    if len(positions) == 0:
        raise ValueError('Unknown hotel name: %r' % (name,))
    if len(positions) > 1:
        raise ValueError('Hotel name %r is ambiguous (%d matches)' % (name, len(positions)))
    idx = int(positions[0])

    # Drop the queried hotel explicitly instead of assuming it sorts first:
    # another hotel with an identical description also scores 1.0 and could
    # otherwise push the query itself into the results.
    # A stable sort keeps tied scores in a deterministic order.
    score = (
        pd.Series(cosine_similarity[idx])
        .drop(idx)
        .sort_values(ascending=False, kind='mergesort')
    )
    top_positions = list(score.index[:top_n])
    recommendations = names.iloc[top_positions].tolist()

    print('Here are the top %d hotels we recommend that are most similar to %s for your consideration.' % (top_n, name))
    return recommendations

In [None]:
recommendationSYS('Moore Hotel', cosine_similarity)

Here are the top 10 hotels we recommend that are most similar to Moore Hotel for your consideration.


['Inn at the Market',
 'Hotel Theodore',
 'The State Hotel',
 'Hilton Seattle',
 'The Maxwell Hotel - A Staypineapple Hotel',
 'The Paramount Hotel Seattle',
 'Hotel Seattle',
 'Quality Inn & Suites Seattle Center',
 'Homewood Suites by Hilton Seattle Downtown',
 'Econo Lodge SeaTac Airport North']