In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('Updated_Hotel_Reviews.csv')


In [24]:
df.head(10)

Unnamed: 0,hotel_name,hotel_address,average_score,total_number_of_reviews,positive_review,negative_review,reviewer_score,lat,lng
0,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Only the park outside of the hotel was beauti...,I am so angry that i made this post available...,2.9,52.360576,4.915968
1,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,No real complaints the hotel was great great ...,No Negative,7.5,52.360576,4.915968
2,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Location was good and staff were ok It is cut...,Rooms are nice but for elderly a bit difficul...,7.1,52.360576,4.915968
3,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Great location in nice surroundings the bar a...,My room was dirty and I was afraid to walk ba...,3.8,52.360576,4.915968
4,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Amazing location and building Romantic setting,You When I booked with your company on line y...,6.7,52.360576,4.915968
5,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Good restaurant with modern design great chil...,Backyard of the hotel is total mess shouldn t...,6.7,52.360576,4.915968
6,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,The room is spacious and bright The hotel is ...,Cleaner did not change our sheet and duvet ev...,4.6,52.360576,4.915968
7,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,Good location Set in a lovely park friendly s...,Apart from the price for the brekfast Everyth...,10.0,52.360576,4.915968
8,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,No Positive,Even though the pictures show very clean room...,6.5,52.360576,4.915968
9,Hotel Arena,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,1403,The room was big enough and the bed is good T...,The aircondition makes so much noise and its ...,7.9,52.360576,4.915968


In [26]:
# Extract the country from the Hotel_Address
df['Country'] = df['hotel_address'].apply(lambda x: x.split(' ')[-1])

In [27]:
# Combine the positive and negative reviews into a single column
df['Review'] = df['negative_review'] + ' ' + df['positive_review']


In [28]:
# Encode the Country and Trip type (Purpose of trip)
le_country = LabelEncoder()
df['Country_Code'] = le_country.fit_transform(df['Country'])

In [29]:
# Preprocess the review text using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_reviews = tfidf.fit_transform(df['Review'])

In [31]:
# Prepare the rating
df['Rating'] = df['reviewer_score']


In [38]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def recommend_hotel_kmeans(country, trip_purpose, rating=8):
    # Filter by country
    country_code = le_country.transform([country])[0]
    country_df = df[df['Country_Code'] == country_code]
    
    # Fit KMeans
    kmeans = KMeans(n_clusters=5, random_state=42)
    country_df['Cluster'] = kmeans.fit_predict(X_reviews[country_df.index])
    
    # Find the cluster most similar to the trip purpose
    trip_vec = tfidf.transform([trip_purpose])
    similarity = cosine_similarity(trip_vec, kmeans.cluster_centers_)
    cluster_idx = np.argmax(similarity)
    
    # Recommend hotels in this cluster with similar rating
    recommendations = country_df[country_df['Cluster'] == cluster_idx]
    recommendations = recommendations[recommendations['Rating'] >= rating]
    recommendations = recommendations.sort_values(by='Rating', ascending=False)
    
    return recommendations[['hotel_name', 'hotel_address', 'Rating']].head(5)

In [39]:
# Example Recommendation
recommend_hotel_kmeans('Italy', 'I am going for a business trip', rating=8)

  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_df['Cluster'] = kmeans.fit_predict(X_reviews[country_df.index])


Unnamed: 0,hotel_name,hotel_address,Rating
335482,Four Points Sheraton Milan Center,Via Gerolamo Cardano 1 Central Station 20124 M...,10.0
342453,Hotel Berna,Via Napo Torriani 18 Central Station 20124 Mil...,10.0
284982,Hotel Michelangelo,Piazza Luigi di Savoia 6 Central Station 20124...,10.0
342495,Hotel Berna,Via Napo Torriani 18 Central Station 20124 Mil...,10.0
284986,Hotel Michelangelo,Piazza Luigi di Savoia 6 Central Station 20124...,10.0


In [45]:
from sklearn.linear_model import LinearRegression

def recommend_hotel_regression(country, trip_purpose, rating=8):
    # Filter by country
    country_code = le_country.transform([country])[0]
    country_df = df[df['Country_Code'] == country_code]
    
    # Train Linear Regression
    reg = LinearRegression()
    reg.fit(X_reviews[country_df.index], country_df['Rating'])
    
    # Predict ratings based on trip purpose
    trip_vec = tfidf.transform([trip_purpose])
    predicted_rating = reg.predict(trip_vec)
    
    # Recommend hotels with similar predicted rating
    recommendations = country_df[np.abs(country_df['Rating'] - predicted_rating) <= 1]
    recommendations = recommendations.sort_values(by='Rating', ascending=False)
    
    return recommendations[['hotel_name', 'hotel_address', 'Rating']].head(5)



In [46]:
# Example Recommendation
recommend_hotel_regression('Italy', 'I am going for a business trip', rating=8)

Unnamed: 0,hotel_name,hotel_address,Rating
341449,Hotel Carrobbio,Via Medici 3 Milan City Center 20123 Milan Italy,8.5
205333,Hotel Romana Residence,Corso di Porta Romana 64 Milan City Center 201...,8.5
343180,LHP Hotel Napoleon,Via Ozanam 12 Citt Studi 20129 Milan Italy,8.5
348979,AC Hotel Milano a Marriott Lifestyle Hotel,Via Tazzoli 2 Garibaldi Station 20154 Milan Italy,8.5
340852,Hotel Mediolanum,Via Mauro Macchi 1 Central Station 20124 Milan...,8.5


In [49]:
from sklearn.neighbors import NearestNeighbors

def recommend_hotel_knn(country, trip_purpose, rating=8):
    # Filter by country
    country_code = le_country.transform([country])[0]
    country_df = df[df['Country_Code'] == country_code]
    
    # Fit KNN
    knn = NearestNeighbors(n_neighbors=5, metric='cosine')
    knn.fit(X_reviews[country_df.index])
    
    # Find nearest neighbors based on trip purpose
    trip_vec = tfidf.transform([trip_purpose])
    distances, indices = knn.kneighbors(trip_vec)
    
    # Recommend hotels
    recommendations = country_df.iloc[indices[0]]
    recommendations = recommendations[recommendations['Rating'] >= rating]
    recommendations = recommendations.sort_values(by='Rating', ascending=False)
    
    return recommendations[['hotel_name', 'hotel_address', 'Rating']].head(5)



In [50]:
# Example Recommendation
recommend_hotel_knn('Italy', 'I am going for a business trip', rating=8)


Unnamed: 0,hotel_name,hotel_address,Rating
341307,Novotel Milano Linate Aeroporto,Via Mecenate 121 20138 Milan Italy,10.0
251977,NH Collection Milano President,Largo Augusto 10 Milan City Center 20122 Milan...,10.0
344050,Hotel Mercure Milano Solari,Via Pietro Orseolo 1 Navigli 20144 Milan Italy,10.0
349334,Nhow Milan,Via Tortona 35 Navigli 20144 Milan Italy,9.6


In [60]:
from sklearn.decomposition import NMF

def recommend_hotel_mf(country, trip_purpose, rating=8):
    # Filter by country
    country_code = le_country.transform([country])[0]
    country_df = df[df['Country_Code'] == country_code]
    
    # Apply NMF
    nmf = NMF(n_components=5, random_state=42)
    W = nmf.fit_transform(X_reviews[country_df.index])
    H = nmf.components_
    
    # Transform the trip purpose to the latent space using H (the components of NMF)
    trip_vec = tfidf.transform([trip_purpose])
    trip_vec_latent = nmf.transform(trip_vec)
    
    # Compute the cosine similarity between the transformed trip vector and the W matrix
    similarity = cosine_similarity(trip_vec_latent, W)
    hotel_indices = np.argsort(similarity[0])[-5:]  # Get the indices of top 5 similar hotels
    
    # Recommend hotels based on this similarity
    recommendations = country_df.iloc[hotel_indices]
    recommendations = recommendations[recommendations['Rating'] >= rating]
    recommendations = recommendations.sort_values(by='Rating', ascending=False)
    
    return recommendations[['hotel_name', 'hotel_address', 'Rating']].head(5)


In [61]:
# Example Recommendation
recommend_hotel_mf('Italy', 'I am going for a business trip', rating=8)


Unnamed: 0,hotel_name,hotel_address,Rating
282090,Excelsior Hotel Gallia Luxury Collection Hotel,Piazza Duca D Aosta 9 Central Station 20124 Mi...,10.0
330708,Brunelleschi Hotel,Via Baracchini 12 Milan City Center 20123 Mila...,9.6
349815,Radisson Blu Hotel Milan,Via Villapizzone 24 Certosa 20156 Milan Italy,9.6


In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import KBinsDiscretizer

def recommend_hotel_bayes(country, trip_purpose, rating=8):
    # Filter by country
    country_code = le_country.transform([country])[0]
    country_df = df[df['Country_Code'] == country_code]
    
    # Bin the ratings into discrete classes
    kbins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    binned_ratings = kbins.fit_transform(country_df[['Rating']])
    
    # Train Naive Bayes
    nb = MultinomialNB()
    nb.fit(X_reviews[country_df.index], binned_ratings.ravel())
    
    # Predict rating category based on trip purpose
    trip_vec = tfidf.transform([trip_purpose])
    predicted_rating_bin = nb.predict(trip_vec)
    
    # Convert predicted bin back to original rating scale
    predicted_rating = kbins.inverse_transform([[predicted_rating_bin[0]]])[0][0]
    
    # Recommend hotels with similar predicted rating
    recommendations = country_df[np.abs(country_df['Rating'] - predicted_rating) <= 1]
    recommendations = recommendations.sort_values(by='Rating', ascending=False)
    
    return recommendations[['hotel_name', 'hotel_address', 'Rating']].head(5)


In [64]:
# Example Recommendation
recommend_hotel_bayes('Italy', 'I am going for a business trip', rating=8)




Unnamed: 0,hotel_name,hotel_address,Rating
145257,Hotel VIU Milan,6 Via Aristotile Fioravanti Garibaldi Station ...,9.6
346632,Baglioni Hotel Carlton The Leading Hotels of t...,Via Senato 5 Milan City Center 20121 Milan Italy,9.6
329710,The Square Milano Duomo,Via Albricci 2 4 Milan City Center 20122 Milan...,9.6
346634,Baglioni Hotel Carlton The Leading Hotels of t...,Via Senato 5 Milan City Center 20121 Milan Italy,9.6
329703,The Square Milano Duomo,Via Albricci 2 4 Milan City Center 20122 Milan...,9.6
