## Recommendation system Based on service description of the Hotel

In [6]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import random
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()

In [8]:
df = pd.read_csv('Hotels.csv', encoding="latin-1")

In [9]:
df.head()

Unnamed: 0,name,address,desc
0,Hilton Garden Seattle Downtown,"1821 Boren Avenue, Seattle Washington 98101 USA","Located on the southern tip of Lake Union, the..."
1,Sheraton Grand Seattle,"1400 6th Avenue, Seattle, Washington 98101 USA","Located in the city's vibrant core, the Sherat..."
2,Crowne Plaza Seattle Downtown,"1113 6th Ave, Seattle, WA 98101","Located in the heart of downtown Seattle, the ..."
3,Kimpton Hotel Monaco Seattle,"1101 4th Ave, Seattle, WA98101",What?s near our hotel downtown Seattle locatio...
4,The Westin Seattle,"1900 5th Avenue, Seattle, Washington 98101 USA",Situated amid incredible shopping and iconic a...


### Check the Data

In [10]:
# word 2 vector
w2v = CountVectorizer().fit(df['desc'])
bow = w2v.transform(df['desc'])

In [11]:
bow.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [12]:
bow.shape

(152, 3200)

In [13]:
#count words by index
count_words = bow.sum(axis=0)
count_words

matrix([[ 1, 11, 11, ...,  2,  6,  2]], dtype=int64)

In [14]:
count_words.ndim

2

In [15]:
count_words.shape

(1, 3200)

In [24]:
# Show the frequency of each word
word_Freq = [(word, count_words[0,idx]) for word, idx in w2v.vocabulary_.items()]
type(word_Freq)

list

In [26]:
# ranking with descending order
word_Freq = sorted(word_Freq, key= lambda x: x[1], reverse=True)

### Create a function to get the Top N words

In [27]:
def get_topN_words(corpus, n=None):
    w2v = CountVectorizer(stop_words='english', ngram_range=(1,3)).fit(corpus)
    bow = w2v.transform(corpus)
    count_words = bow.sum(axis=0)
    word_Freq = [(word, count_words[0,idx]) for word, idx in w2v.vocabulary_.items()]
    word_Freq = sorted(word_Freq, key= lambda x: x[1], reverse=True)
    return word_Freq[:n]

In [28]:
#Get the top n words
Top_words= get_topN_words(df['desc'], n=20)

In [29]:
#Create a data frame for top words and draw a graph
DF = pd.DataFrame(Top_words, columns=['desc','count'])
DF.groupby('desc').sum()['count'].sort_values().iplot(kind='barh', yTitle='Count', title='Top 20 key words')

In [30]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZXF\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [31]:
# processing ['desc'] column
stopwords_set = set(stopwords.words('english'))

# remove stopwords from your input
def process_text(text):
    text = ' '.join(word for word in text.split() if word not in stopwords_set)
    return text

In [32]:
# Create a new column without stopwords
df['desc_new'] = df['desc'].apply(process_text)

In [33]:
df['desc'][0]

"Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive.

In [34]:
df['desc_new'][0]

"Located southern tip Lake Union, Hilton Garden Inn Seattle Downtown hotel perfectly located business leisure. The neighborhood home numerous major international companies including Amazon, Google Bill & Melinda Gates Foundation. A wealth eclectic restaurants bars make area Seattle one sought locals visitors. Our proximity Lake Union allows visitors take Pacific Northwest's majestic scenery enjoy outdoor activities like kayaking sailing. 2,000 sq. ft. versatile space complimentary business center. State-of-the-art A/V technology helpful staff guarantee conference, cocktail reception wedding success. Refresh sparkling saltwater pool, energize latest equipment 24-hour fitness center. Tastefully decorated flooded natural light, guest rooms suites offer everything need relax stay productive. Unwind bar, enjoy American cuisine breakfast, lunch dinner restaurant. The 24-hour Pavilion Pantry? stocks variety snacks, drinks sundries."

### Similarity calculation

In [35]:
df.set_index('name', inplace = True)

In [36]:
df.index

Index(['Hilton Garden Seattle Downtown', 'Sheraton Grand Seattle',
       'Crowne Plaza Seattle Downtown', 'Kimpton Hotel Monaco Seattle ',
       'The Westin Seattle', 'The Paramount Hotel Seattle', 'Hilton Seattle',
       'Motif Seattle', 'Warwick Seattle', 'Four Seasons Hotel Seattle',
       ...
       '11th Avenue Inn Bed and Breakfast', 'Oakwood Seattle South Lake Union',
       'Mildred's Bed and Breakfast', 'First Hill Apartments',
       'Hampton Inn Seattle/Southcenter', 'The Halcyon Suite Du Jour',
       'Vermont Inn', 'Stay Alfred on Wall Street',
       'Pike's Place Lux Suites by Barsala',
       'citizenM Seattle South Lake Union hotel'],
      dtype='object', name='name', length=152)

In [37]:
#Convert a collection of raw documents to a matrix of TF-IDF features.
TFid =  TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words = 'english')

In [38]:
TFid_Matrix = TFid.fit_transform(df['desc_new'])

In [39]:
TFid_Matrix.shape

(152, 26623)

In [40]:
# return X @ Y.T
cosine_similarity = linear_kernel(TFid_Matrix,TFid_Matrix)

In [41]:
cosine_similarity.shape

(152, 152)

In [43]:
indices= pd.Series(df.index)
indices[indices=='Moore Hotel'].index

Int64Index([123], dtype='int64')

In [44]:
def recommendationSYS(name, cosine_similarity):
    recommendations = []
    idx = indices[indices==name].index
    score = pd.Series(cosine_similarity[idx.item()]).sort_values(ascending=False)
    top_10s = list(score[1:11].index)
    for i in top_10s:
        recommendations.append(list(df.index)[i])
    print('Here are the top 10 hotels we recommend that are most similar to %s for your consideration.'%name)
    return recommendations

In [45]:
recommendationSYS('Moore Hotel', cosine_similarity)

Here are the top 10 hotels we recommend that are most similar to Moore Hotel for your consideration.


['Inn at the Market',
 'Hotel Theodore',
 'The State Hotel',
 'Hilton Seattle',
 'The Maxwell Hotel - A Staypineapple Hotel',
 'The Paramount Hotel Seattle',
 'Hotel Seattle',
 'Quality Inn & Suites Seattle Center',
 'Homewood Suites by Hilton Seattle Downtown',
 'Econo Lodge SeaTac Airport North']