In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('./data/google_hotel_data_clean_v2.csv')
print('File read')

File read


In [3]:
df.head(10)

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
0,Crowne Plaza Kochi,4.6,kochi,5-star hotel,Free breakfast,Free Wi-Fi,Free parking,Pool,Hot tub,Air conditioning,Fitness center,Spa,8854.0
1,Trident Hotel Cochin,4.5,kochi,5-star hotel,Free breakfast,Wi-Fi,Free parking,Pool,Air conditioning,Fitness center,Spa,Restaurant,6441.0
2,The Galaxy Suites,3.8,kochi,Apartment,Sleeps 10,Free parking,Free Wi-Fi,No air conditioning,No airport shuttle,No beach access,No elevator,No fireplace,831.0
3,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Pool,Air conditioning,Fitness center,Spa,Bar,2768.0
4,Ramada by Wyndham Kochi,4.5,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Pool,Air conditioning,Fitness center,Spa,Bar,8938.0
5,The Renai cochin,4.2,kochi,4-star hotel,Free breakfast,Free Wi-Fi,Free parking,Pool,Air conditioning,Fitness center,Spa,Bar,2768.0
6,"Radisson Blu Hotel, Kochi",4.3,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Pool,Hot tub,Air conditioning,Fitness center,Spa,6061.0
7,"Holiday Inn Cochin, an IHG Hotel",4.4,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Pool,Air conditioning,Fitness center,Bar,Restaurant,5689.0
8,OAK FIELD INN,3.8,kochi,Free breakfast,Wi-Fi,Free parking,Air conditioning,Restaurant,Kitchen,Full-service laundry,Kid-friendly,0,819.0
9,Grand Hyatt Kochi Bolgatty,4.7,kochi,5-star hotel,Breakfast,Free Wi-Fi,Free parking,Pool,Hot tub,Air conditioning,Fitness center,Spa,14282.0


In [4]:
# Find and print rows with repeated hotel names
duplicate_hotel_names = df[df.duplicated(subset=['Hotel_Name'], keep=False)]['Hotel_Name']
duplicate_hotel_names

0                        Crowne Plaza Kochi
1                      Trident Hotel Cochin
3                          The Renai cochin
5                          The Renai cochin
6                 Radisson Blu Hotel, Kochi
                       ...                 
1057    The Orchard Retreat & Spa, Srinagar
1081                  Clafouti Beach Resort
1084    The Lost Hostels, Varkala - Helipad
1099                  Clafouti Beach Resort
1100    The Lost Hostels, Varkala - Helipad
Name: Hotel_Name, Length: 201, dtype: object

In [5]:
df[df['Hotel_Name'] == 'The Lost Hostels, Varkala - Helipad']

Unnamed: 0,Hotel_Name,Hotel_Rating,City,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Hotel_Price
1084,"The Lost Hostels, Varkala - Helipad",4.5,varkala,Breakfast,Wi-Fi,Beach access,Kitchen,Airport shuttle,Full-service laundry,Kid-friendly,0,0,1113.0
1100,"The Lost Hostels, Varkala - Helipad",4.5,varkala,Breakfast,Wi-Fi,Beach access,Kitchen,Airport shuttle,Full-service laundry,Kid-friendly,0,0,1304.0


In [6]:
# Some hotels have multiple rows with different prices. We will keep the second occurring row for each hotel
df = df.drop_duplicates(subset=['Hotel_Name'], keep='last')
df.shape

(999, 13)

In [7]:
# Printing the unique features occurring in the dataset

hotel_features = ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']

In [8]:
feature_freq = {}

for column in hotel_features:
    for feature in df[column]:
        if feature in feature_freq:
            feature_freq[feature] += 1
        else:
            feature_freq[feature] = 1

# Printing the features and their frequencies in sorted manner
sorted_feature_freq = sorted(feature_freq.items(), key=lambda x: x[1], reverse=True)
for feature in sorted_feature_freq:
    print(feature)

('0', 939)
('Free Wi-Fi', 818)
('Air conditioning', 802)
('Restaurant', 601)
('Free parking', 587)
('Free breakfast', 568)
('Room service', 504)
('Pool', 475)
('Full-service laundry', 431)
('Fitness center', 315)
('3-star hotel', 297)
('Kitchen', 261)
('Paid parking', 253)
('Airport shuttle', 239)
('Spa', 230)
('Breakfast', 228)
('Bar', 185)
('Kid-friendly', 162)
('5-star hotel', 130)
('4-star hotel', 125)
('Wi-Fi', 89)
('Pet-friendly', 81)
('Hot tub', 67)
('Accessible', 62)
('Fireplace', 45)
('No elevator', 43)
('No crib', 42)
('2-star hotel', 38)
('Smoke-free', 37)
('Cable TV', 30)
('Beach access', 30)
('No fitness center', 24)
('2 bedrooms', 19)
('1 bedroom', 18)
('Business center', 17)
('3 bedrooms', 15)
('No air conditioning', 15)
('No beach access', 13)
('1-star hotel', 10)
('Sleeps 4', 9)
('Sleeps 6', 9)
('Sleeps 2', 9)
('Elevator', 9)
('No airport shuttle', 9)
('House', 7)
('Sleeps 10', 7)
('Apartment', 6)
('Sleeps 8', 6)
('Sleeps 9', 6)
('Sleeps 5', 5)
('Sleeps 3', 5)
('4 bedr

Choosing only those features that can contribute to the model, i.e., ignoring those features that occur very few times in the feature lists.

In [11]:
# Create a list of the desired feature names
selected_features = ['Free breakfast', 'Free Wi-Fi', 'Air conditioning', 'Restaurant', 'Free parking', 
                    'Room service', 'Pool', 'Full-service laundry', 'Fitness centre', 'Kitchen', 'Airport shuttle', 'Spa']

# Create the new DataFrame with the hotel name, rating, and the desired features
modified_df = df[['Hotel_Name', 'City', 'Hotel_Rating', 'Hotel_Price']].copy()

In [13]:
# Iterate through the desired features and set 1 or 0 based on their presence
for feature in selected_features:
    modified_df[feature] = df.apply(lambda row: 1 if feature in row[['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9']].values else 0, axis=1)

In [14]:
modified_df.head(10)

Unnamed: 0,Hotel_Name,City,Hotel_Rating,Hotel_Price,Free breakfast,Free Wi-Fi,Air conditioning,Restaurant,Free parking,Room service,Pool,Full-service laundry,Fitness centre,Kitchen,Airport shuttle,Spa
2,The Galaxy Suites,kochi,3.8,831.0,0,1,0,0,1,0,0,0,0,0,0,0
4,Ramada by Wyndham Kochi,kochi,4.5,8938.0,0,1,1,0,1,0,1,0,0,0,0,1
5,The Renai cochin,kochi,4.2,2768.0,1,1,1,0,1,0,1,0,0,0,0,1
7,"Holiday Inn Cochin, an IHG Hotel",kochi,4.4,5689.0,0,1,1,1,1,0,1,0,0,0,0,0
8,OAK FIELD INN,kochi,3.8,819.0,1,0,1,1,1,0,0,1,0,1,0,0
9,Grand Hyatt Kochi Bolgatty,kochi,4.7,14282.0,0,1,1,0,1,0,1,0,0,0,0,1
10,Hotel South Gate Residency,kochi,3.9,1051.0,0,1,1,1,1,0,0,1,0,0,1,0
11,Cherai Beach Resorts,kochi,3.9,3281.0,1,0,1,0,1,0,1,0,0,0,0,1
12,North Centre Hotel,kochi,4.7,1118.0,0,1,1,0,1,0,0,1,0,0,1,0
13,Boche Island - Kumbalangi,kochi,4.1,5855.0,1,1,1,1,1,0,1,0,0,0,0,1


In [34]:
def similarity_features_only(df, user_features, selected_features):

    # Calculate cosine similarity between the user input and the DataFrame
    similarity_scores = cosine_similarity([user_features], df[selected_features])

    # Add the similarity scores to the DataFrame
    df['Similarity'] = similarity_scores[0]

    # Sort the DataFrame by similarity score in descending order
    sorted_data = df.sort_values(by='Similarity', ascending=False)

    # Print the top recommended hotels
    return sorted_data.head(10)

In [36]:
# user input with just features as input
user_features = [0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 ,0]
recommendations = similarity_features_only(modified_df, user_features, selected_features)
recommendations

Unnamed: 0,Hotel_Name,City,Hotel_Rating,Hotel_Price,Free breakfast,Free Wi-Fi,Air conditioning,Restaurant,Free parking,Room service,Pool,Full-service laundry,Fitness centre,Kitchen,Airport shuttle,Spa,Similarity
753,Park Plaza Ludhiana,ludhiana,4.2,8057.0,0,1,1,0,0,0,1,0,0,0,0,1,0.866025
114,Radisson Blu Hotel New Delhi Paschim Vihar,delhi,4.4,7285.0,0,1,1,0,0,0,1,0,0,0,0,1,0.866025
692,Radhika Ex,aurangabad,3.9,2357.0,0,1,1,0,0,0,1,0,0,1,0,0,0.866025
203,Radisson Lucknow City Center,lucknow,4.3,4189.0,0,1,1,0,0,0,1,0,0,0,0,1,0.866025
197,Novotel Mumbai Juhu Beach,mumbai,4.3,11800.0,0,1,1,0,1,0,1,0,0,0,0,0,0.866025
179,Grand Hyatt Mumbai Hotel & Residences,mumbai,4.5,14025.0,0,1,1,0,1,0,1,0,0,0,0,0,0.866025
163,"The Hosteller Goa, Candolim",goa,4.7,1315.0,0,1,1,0,1,0,1,0,0,0,0,0,0.866025
157,Little India Beach Cottages,goa,4.0,6947.0,0,1,1,0,1,0,1,0,0,0,0,0,0.866025
135,Empires Hotel Bhubaneswar,bhubaneswar,4.1,3646.0,0,1,1,0,0,0,1,0,0,0,0,1,0.866025
330,Radisson Blu Hotel Nagpur,nagpur,4.5,9361.0,0,1,1,0,0,0,1,0,0,0,0,1,0.866025
