# Importing the necessary libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from ast import literal_eval

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tatha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tatha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tatha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading the dataset

In [2]:
# Read the hotel-review records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="Hotel_Reviews.csv")
# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


# Pre-processing the data
### Replacing `"United Kingdom"` with `"UK"`

In [3]:
# Collapse the two-word country name into one token so every address ends
# with a single-word country.
data["Hotel_Address"] = data["Hotel_Address"].str.replace("United Kingdom", "UK")

### Getting the countries from the addresses

In [4]:
# The country is the final space-separated token of each address.
data["countries"] = data["Hotel_Address"].map(
    lambda address: address.rsplit(" ", 1)[-1]
)
print(data["countries"].unique())

['Netherlands' 'UK' 'France' 'Spain' 'Italy' 'Austria']


### We don't need the following columns

In [5]:
# Columns that the recommender never reads; everything else is kept.
unused_columns = [
    'Additional_Number_of_Scoring',
    'Review_Date', 'Reviewer_Nationality',
    'Negative_Review', 'Review_Total_Negative_Word_Counts',
    'Total_Number_of_Reviews', 'Positive_Review',
    'Review_Total_Positive_Word_Counts',
    'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
    'days_since_review', 'lat', 'lng',
]

# Name the axis through `columns=`: passing it positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
# Rebinding `data` (instead of `inplace=True`) keeps the cell a plain expression.
data = data.drop(columns=unused_columns)

data.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Tags,countries
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Duplex Double...",Netherlands
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Duplex Double...",Netherlands
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Family with young childre...",Netherlands
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",Netherlands
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",Netherlands


### Making a list of tags

We create a function that parses each stringified list of tags and joins its items into a single string, and then apply it to the `"Tags"` column in the dataset.

In [6]:
def impute(column):
    """Flatten the stringified tag list held in a one-element row into plain text.

    Args:
        column: A one-element row whose only item is the tag value, e.g. the
            Series that ``DataFrame.apply(..., axis=1)`` passes for a
            single-column frame. A plain list/tuple wrapper works as well.

    Returns:
        str: The individual tags, stripped of surrounding whitespace and
        joined by single spaces. A value that is already a ``list`` is
        returned unchanged.

    Raises:
        ValueError, SyntaxError: Propagated from ``literal_eval`` when the
            value is not a valid Python literal.
    """
    # Fetch the first item by position explicitly. Relying on `column[0]` for a
    # label-indexed Series is deprecated in pandas (FutureWarning since 2.1).
    value = column.iloc[0] if hasattr(column, "iloc") else column[0]

    if isinstance(value, list):
        return value

    # Strip every tag and join with a space so that neighbouring tags can never
    # be glued together, whether or not the raw tags are space-padded.
    return " ".join(tag.strip() for tag in literal_eval(value))

In [7]:
# Select "Tags" as a one-column frame so that `impute` receives each row as a
# one-element Series.
data["Tags"] = data.loc[:, ["Tags"]].apply(impute, axis="columns")
data.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Tags,countries
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,Leisure trip Couple Duplex Double Room Sta...,Netherlands
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,Leisure trip Couple Duplex Double Room Sta...,Netherlands
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,Leisure trip Family with young children Dup...,Netherlands
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,Leisure trip Solo traveler Duplex Double Ro...,Netherlands
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,Leisure trip Couple Suite Stayed 2 nights ...,Netherlands


### Writing all country names and tags in lowercase

In [8]:
# Lower-case both text columns that are later compared with lower-cased input.
for text_column in ("countries", "Tags"):
    data[text_column] = data[text_column].str.lower()
data.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Tags,countries
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,leisure trip couple duplex double room sta...,netherlands
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,leisure trip couple duplex double room sta...,netherlands
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,leisure trip family with young children dup...,netherlands
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,leisure trip solo traveler duplex double ro...,netherlands
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,leisure trip couple suite stayed 2 nights ...,netherlands


# Recommender function

In [9]:
def _extract_keywords(text, stop_words, lemmatizer):
    """Return the set of lemmatized tokens of ``text`` that are not stop words."""
    return {
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    }


def hotel_recommender(location, description, df=data):
    """Rank the hotels of one country by keyword overlap with a description.

    The description and every row's ``Tags`` text are tokenized, stripped of
    English stop words and lemmatized; a row's score is the number of keywords
    it shares with the description. For each hotel only its best-scoring row
    is kept.

    Args:
        location (str): country to search in; compared case-insensitively
            against the ``countries`` column.
        description (str): description of the hotel or the purpose of the trip
        df (pandas.DataFrame): frame providing the ``countries``, ``Tags``,
            ``Hotel_Name``, ``Average_Score`` and ``Hotel_Address`` columns.
            Defaults to the notebook's ``data`` frame as it was when this
            function was defined.

    Returns:
        pandas.DataFrame: at most 10 rows, highest score first, with the
        columns ``Hotel_Name``, ``Average_Score``, ``Hotel_Address``,
        ``Similarity Index`` (number of shared keywords) and ``matched``
        (the shared keywords as a set). Empty when no row matches
        ``location``.
    """
    # A set gives constant-time membership tests inside the per-row loop.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    wanted = _extract_keywords(description.lower(), stop_words, lemmatizer)

    # Filter the frame that was passed in (not the global one) and renumber the
    # rows so the result is an independent frame with a clean 0..n-1 index.
    country = df[df["countries"] == location.lower()].reset_index(drop=True)

    # Keywords each row shares with the description.
    matches = [
        _extract_keywords(tags, stop_words, lemmatizer) & wanted
        for tags in country["Tags"]
    ]

    country["Similarity Index"] = [len(shared) for shared in matches]
    country["matched"] = matches

    # Sort best-first, then keep only the first (best-scoring) row per hotel.
    country = (
        country
        .sort_values(by="Similarity Index", ascending=False)
        .drop_duplicates(subset="Hotel_Name", keep="first")
        .reset_index(drop=True)
    )

    return country[["Hotel_Name", "Average_Score", "Hotel_Address", "Similarity Index", "matched"]].head(10)

# Testing

In [10]:
# Example query: hotels in France for a short business trip.
hotel_recommender(location="france", description="business trip for two days")

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address,Similarity Index,matched
0,Villa Eugenie,6.8,167 rue de Rome 17th arr 75017 Paris France,3,"{trip, business, two}"
1,Phileas Hotel,8.7,24 Rue d Amsterdam 9th arr 75009 Paris France,3,"{trip, business, two}"
2,Hotel Saint Dominique,8.7,62 Rue Saint Dominique 7th arr 75007 Paris France,3,"{trip, business, two}"
3,Novotel Paris Les Halles,8.4,8 Place Marguerite de Navarre 1st arr 75001 Pa...,3,"{trip, business, two}"
4,H tel Gustave,8.8,34 Rue Viala 15th arr 75015 Paris France,3,"{trip, business, two}"
5,Grand H tel Du Palais Royal,9.2,4 Rue De Valois 1st arr 75001 Paris France,3,"{trip, business, two}"
6,Hotel OFF Paris Seine,8.4,86 Quai D Austerlitz 13th arr 75013 Paris France,3,"{trip, business, two}"
7,Le Metropolitan a Tribute Portfolio Hotel,8.3,10 Place De Mexico 16th arr 75016 Paris France,3,"{trip, business, two}"
8,H tel De Castiglione,7.2,38 Rue Du Faubourg Saint Honore 8th arr 75008 ...,3,"{trip, business, two}"
9,H tel Aiglon Esprit de France,8.9,232 Boulevard Raspail 14th arr 75014 Paris France,3,"{trip, business, two}"


In [11]:
# Example query: hotels in Italy for a family leisure trip.
hotel_recommender(location="italy", description="leisure trip for 3 days with family")

Unnamed: 0,Hotel_Name,Average_Score,Hotel_Address,Similarity Index,matched
0,Hotel Da Vinci,8.1,Via Senigallia 6 20161 Milan Italy,4,"{3, trip, family, leisure}"
1,UNA Hotel Century,8.6,Via Fabio Filzi 25 B Central Station 20124 Mil...,4,"{3, trip, family, leisure}"
2,Rosa Grand Milano Starhotels Collezione,8.6,Piazza Fontana 3 Milan City Center 20122 Milan...,4,"{3, trip, family, leisure}"
3,TownHouse Galleria,8.3,Via Silvio Pellico 8 Milan City Center 20121 M...,4,"{3, trip, family, leisure}"
4,TownHouse Duomo,8.4,Via Silvio Pellico 2 Milan City Center 20121 M...,4,"{3, trip, family, leisure}"
5,Carlyle Brera Hotel,8.2,Corso Garibaldi 84 Milan City Center 20121 Mil...,4,"{3, trip, family, leisure}"
6,Hotel Spadari Al Duomo,9.3,Via Spadari 11 Milan City Center 20123 Milan I...,4,"{3, trip, family, leisure}"
7,TownHouse 12,8.1,Piazza Gerusalemme 12 Sempione 20154 Milan Italy,4,"{3, trip, family, leisure}"
8,Starhotels Ritz,8.1,Via Spallanzani 40 Central Station 20129 Milan...,4,"{3, trip, family, leisure}"
9,Hotel Galileo,7.9,Corso Europa 9 Milan City Center 20122 Milan I...,4,"{3, trip, family, leisure}"
