In [1]:
!pip install pandas numpy scikit-learn scipy

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import pandas as pd

# Load the dataset
file_path = 'Hotel_Goibibo_NoNull.csv'
hotel_data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(hotel_data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
hotel_data.info()


In [6]:
#importing various modules
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [7]:
# Read the hotel listings CSV into a DataFrame
file_path = 'Hotel_Goibibo_NoNull.csv'  # point this at the CSV if it lives elsewhere
hotel_data = pd.read_csv(filepath_or_buffer=file_path)


In [8]:
# Rich preview of the first five rows
hotel_data.head(n=5)

Unnamed: 0,property_name,state,city,area,address,hotel_star_rating,property_type,hotel_brand,hotel_description,hotel_facilities,...,room_type,room_count,point_of_interest,rating,additional_rating,review_count_by_category,site_review_count,guest_recommendation,latitude,longitude
0,Baragarh Regency,Himachal Pradesh,Manali,Others,"15th Mile, N.H.21,Manali, District Kullu,Himac...",2,Resort,no brand,The standard check-in time is 12:00 PM and the...,Doctor on Call|Dry Cleaning|Laundry Service Av...,...,Deluxe Room,17,Hadimba Temple|Naggar Village|Himalayan Nyingm...,4.0,Service Quality::3.9|Amenities::3.7|Food and D...,positive reviews::74|critical reviews::13|revi...,87.0,85.0,32.139387,77.15466
1,Asian Suites A- 585,Haryana,Gurgaon,Sushant Lok,"A-585, Sushant Lok-1 ,Near Iffco Chowk Metro S...",0,Guest House,no brand,The standard check-in time is 12:00 PM and the...,Airport Transfer Available / Surcharge|Banquet...,...,Deluxe Room With Free WIFI,18,Sushant Lok|Sahara Mall|Amity International Sc...,4.5,Service Quality::4.7|Amenities::4.7|Food and D...,positive reviews::8|critical reviews::0|review...,8.0,87.0,28.472097,77.072546
2,Bevvan Resort,Goa,Goa,Calangute Area,"Cobra Vaddo,Calungate Baga Road, Bardez, Calan...",0,Resort,no brand,The standard check-in time is 12:00 PM and the...,Swimming Pool|Bar / Lounge |Laundry Service Av...,...,Standard Room,15,"Anjuna Beach|Calangute Beach|Titos lane, baga|...",2.5,Service Quality::2.5|Amenities::2.5|Food and D...,positive reviews::1|critical reviews::1|review...,2.0,50.0,15.548398,73.757634
3,Apple Inn Cottage,Himachal Pradesh,Manali,Village Simsa,Simsa,2,Cottage,no brand,The standard check-in time is 12:00 PM and the...,Doctor on Call|Dry Cleaning|Laundry Service Av...,...,Deluxe Room,24,The Mall|Tibetian Monastery|Vashisht Hot Baths...,5.0,Service Quality::5.0|Amenities::5.0|Food and D...,positive reviews::1|critical reviews::0|review...,1.0,100.0,32.223603,77.1859
4,Anmol Hotel Pvt.Ltd,Delhi,Delhi,Paharganj,"8180 Street No.-6,Arakashan Road,Paharganj",2,Hotel,no brand,The standard check-in time is 12:00 PM and the...,Internet Access - Surcharge|Laundry Service Av...,...,Standard Room Non AC,20,Gaffar Market|YMCA Institute|Agrasen Boali|Cha...,2.8,Service Quality::2.7|Amenities::2.6|Food and D...,positive reviews::56|critical reviews::65|revi...,121.0,63.0,28.646777,77.212735


In [9]:
# Preprocessing
# 'review_count_by_category' packs several "label::count" entries separated by '|';
# spread them over one column per entry.
# NOTE(review): the label of the third entry is truncated in the preview output, so the
# name 'neutral_reviews' is an assumption - confirm against the raw data.
review_column_names = ['positive_reviews', 'critical_reviews', 'neutral_reviews']
review_categories = hotel_data['review_count_by_category'].str.split(pat='|', expand=True)
review_categories.columns = review_column_names


In [10]:
# Keep only the numeric count of each "label::count" entry.
# The pattern is a raw string: '\d' inside a normal string literal is an invalid escape
# sequence. expand=False makes str.extract return a Series for the single capture group,
# which is what a column assignment expects.
for column in review_categories.columns:
    review_categories[column] = (
        review_categories[column].str.extract(r'(\d+)', expand=False).astype(float)
    )


In [11]:
# Append the parsed review counts to the main dataset and discard the raw packed column
hotel_data = (
    pd.concat([hotel_data, review_categories], axis=1)
    .drop(columns=['review_count_by_category'])
)


In [12]:
# One independent vectorizer per text column, so hotel and room facilities each get
# their own vocabulary
hotel_vectorizer, room_vectorizer = CountVectorizer(), CountVectorizer()


In [13]:
# Fit each vectorizer on its column and build the corresponding token-count matrix
hotel_facilities_matrix = hotel_vectorizer.fit_transform(
    raw_documents=hotel_data['hotel_facilities']
)
room_facilities_matrix = room_vectorizer.fit_transform(
    raw_documents=hotel_data['room_facilities']
)

In [14]:
# Densify the count matrices into DataFrames, one column per vocabulary token
hotel_facilities_hotel_data = pd.DataFrame(
    data=hotel_facilities_matrix.toarray(),
    columns=hotel_vectorizer.get_feature_names_out(),
)
room_facilities_hotel_data = pd.DataFrame(
    data=room_facilities_matrix.toarray(),
    columns=room_vectorizer.get_feature_names_out(),
)

In [15]:
# Namespace the token columns so hotel and room vocabularies cannot collide with each
# other or with existing dataset columns
hotel_facilities_hotel_data = hotel_facilities_hotel_data.rename(
    columns=lambda token: 'hotel_facility_' + token
)
room_facilities_hotel_data = room_facilities_hotel_data.rename(
    columns=lambda token: 'room_facility_' + token
)

In [16]:
# Attach both facility frames to the main dataset, column-wise
hotel_data = pd.concat(
    objs=[hotel_data, hotel_facilities_hotel_data, room_facilities_hotel_data],
    axis='columns',
)


In [17]:
# Selecting relevant features for the model.
# 'rating' is the prediction target, so it must not also be a feature: with the target
# among the inputs every model can simply copy it and the reported errors are meaningless.
features = ['guest_recommendation', 'positive_reviews', 'critical_reviews', 'neutral_reviews']
features.extend(hotel_facilities_hotel_data.columns)
features.extend(room_facilities_hotel_data.columns)


In [18]:
# Feature matrix: the selected feature columns
X = hotel_data.loc[:, features]
# Target: predicting rating as a proxy for hotel recommendation
y = hotel_data.loc[:, 'rating']

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Standardize the features: the scaler is fitted on the training rows only and the same
# transformation is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
# Model 1: Support Vector Machine (SVM) regressor with a linear kernel
svm_model = SVR(kernel='linear').fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
# Root-mean-squared error on the held-out rows
svm_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=svm_predictions))

In [22]:
# Model 2: Linear Regression
lr_model = LinearRegression().fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
# Root-mean-squared error on the held-out rows
lr_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=lr_predictions))

In [23]:
# Model 3: Naive Bayes
# Naive Bayes is a classifier, so the rating is turned into a binary label
# ("above the mean rating of the split") and accuracy is reported instead of RMSE.
nb_model = GaussianNB().fit(X_train, y_train.gt(y_train.mean()))
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_true=y_test.gt(y_test.mean()), y_pred=nb_predictions)

In [None]:

# Model 4: Random Forest regressor (100 trees, fixed seed for reproducibility)
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
# Root-mean-squared error on the held-out rows
rf_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=rf_predictions))


In [24]:

# Recommendation: Predict scores and select top 5 hotels
def recommend_hotels(model, X, hotel_data, top_n=5):
    """Score every hotel with ``model`` and return the ``top_n`` highest-scoring rows.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that yields one score per row of ``X``.
    X : array-like
        Feature rows, one per row of ``hotel_data`` and in the same order.
    hotel_data : pandas.DataFrame
        Hotel records to rank. The frame is not modified.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A copy of ``hotel_data`` with an added 'predicted_score' column, sorted by that
        column in descending order and truncated to ``top_n`` rows.
    """
    # Work on a copy so the caller's DataFrame does not silently gain a column
    scored = hotel_data.copy()
    scored['predicted_score'] = model.predict(X)
    return scored.sort_values(by='predicted_score', ascending=False).head(top_n)

In [None]:
from sklearn.model_selection import train_test_split

# The train/test split and the fitted models from the earlier cells are reused as they
# are. Splitting X again here would overwrite the standardized X_train / X_test arrays
# with unscaled data, which the already-fitted models must not be given.

# Combine the regression predictions by averaging them. The Naive Bayes output is a
# boolean class label (rating above the mean or not), not a rating-scale score, so it
# is left out of the average.
combined_predictions = np.mean([svm_predictions, lr_predictions, rf_predictions], axis=0)

# Rows of the original DataFrame that belong to the test set. y_test keeps the original
# index labels, so label-based selection returns the rows in prediction order.
hotel_data_test = hotel_data.loc[y_test.index].copy()



In [None]:
# Collect the search criteria interactively
user_city = input("Enter the City: ").strip()
user_rating = float(input("Enter the minimum Rating (e.g., 4.0): ").strip())
amenities_text = input("Enter required Amenities (comma-separated, e.g., 'Free WiFi, Pool'): ")

# Lower-case the reply, split it on commas and trim every entry for consistent matching
user_amenities = [entry.strip() for entry in amenities_text.strip().lower().split(',')]

# Function to recommend hotels based on combined predictions and user inputs
def recommend_hotels_with_input(hotel_data, predictions, city, rating, amenities, top_n=5):
    """Return the best-scoring hotels that satisfy the user's city, rating and amenity filters.

    Parameters
    ----------
    hotel_data : pandas.DataFrame
        Must provide the columns 'property_name', 'address', 'rating', 'city' and
        'hotel_facilities'. The frame is not modified.
    predictions : array-like
        One score per row of ``hotel_data``; stored as 'combined_score'.
    city : str
        City to search in (exact match, case-insensitive).
    rating : float
        Minimum acceptable value of the 'rating' column.
    amenities : iterable of str
        Required amenities. Each one must occur, case-insensitively, as a substring of
        at least one facility of the hotel. Blank entries are ignored.
    top_n : int, default 5
        Maximum number of hotels to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'property_name', 'address', 'rating', 'city', 'combined_score', sorted by
        'combined_score' in descending order; empty when no hotel matches.
    """
    # Score a copy so the caller's DataFrame is left untouched
    scored = hotel_data.copy()
    scored['combined_score'] = predictions

    # Filter on the function's own arguments rather than on notebook-level globals
    in_city = scored['city'].str.lower() == city.strip().lower()
    well_rated = scored['rating'] >= rating
    candidates = scored.loc[in_city & well_rated]

    wanted = [amenity.strip().lower() for amenity in amenities if amenity and amenity.strip()]

    def has_amenities(hotel_amenities):
        # A hotel without a facility string can only satisfy an empty request
        if not isinstance(hotel_amenities, str):
            return not wanted
        # Facilities are '|'-separated in the dataset; commas are accepted as well
        offered = {
            facility.strip().lower()
            for facility in hotel_amenities.replace(',', '|').split('|')
        }
        return all(any(amenity in facility for facility in offered) for amenity in wanted)

    # astype(bool) keeps the mask boolean even when no candidate rows are left;
    # an empty object-dtype mask would be interpreted as a column selection.
    matches = candidates['hotel_facilities'].apply(has_amenities).astype(bool)
    candidates = candidates.loc[matches]

    # Sort by the combined score and select top_n hotels
    report_columns = ['property_name', 'address', 'rating', 'city', 'combined_score']
    return (
        candidates[report_columns]
        .sort_values(by='combined_score', ascending=False)
        .head(top_n)
    )

# Rank the test-set hotels for the criteria entered above (top 5 by combined score)
top_hotels_user_input = recommend_hotels_with_input(
    hotel_data=hotel_data_test,
    predictions=combined_predictions,
    city=user_city,
    rating=user_rating,
    amenities=user_amenities,
)

# Show the recommendations
print("\nTop 5 Hotel Recommendations Based on Your Input:")
print(top_hotels_user_input)

# Tell the user explicitly when the filters left nothing to recommend
if top_hotels_user_input.empty:
    print("No hotels match the criteria. Consider adjusting your inputs.")



In [None]:
from sklearn.metrics import mean_squared_error, accuracy_score
import numpy as np

# Function to calculate RMSE for regression models
def calculate_rmse(model, X_test, y_test):
    """Return the root-mean-squared error of ``model.predict(X_test)`` against ``y_test``."""
    squared_error = mean_squared_error(y_test, model.predict(X_test))
    return np.sqrt(squared_error)

# Function to calculate accuracy for classification models
def calculate_accuracy(model, X_test, y_test):
    """Return the accuracy of ``model`` at predicting whether a rating exceeds the mean.

    The reference labels are ``y_test > y_test.mean()``; they are compared with
    ``model.predict(X_test)``.
    """
    predicted_labels = model.predict(X_test)
    actual_labels = y_test > y_test.mean()
    return accuracy_score(actual_labels, predicted_labels)

# Test features rebuilt from X and standardized with the scaler fitted on the training
# data. The models were trained on standardized inputs, and building the matrix here
# keeps the evaluation independent of whatever X_test currently holds.
X_test_scaled = scaler.transform(X.loc[y_test.index])

# Calculate RMSE for SVM and Linear Regression models
svm_rmse = calculate_rmse(svm_model, X_test_scaled, y_test)
lr_rmse = calculate_rmse(lr_model, X_test_scaled, y_test)

# Calculate accuracy for Naive Bayes model
nb_accuracy = calculate_accuracy(nb_model, X_test_scaled, y_test)

# Print the RMSE and accuracy values
print(f"SVM RMSE: {svm_rmse:.2f}")
print(f"Linear Regression RMSE: {lr_rmse:.2f}")
print(f"Naive Bayes Accuracy: {nb_accuracy:.2%}")

# Hotels of the test set, in the same row order as the predictions. Passing only these
# rows keeps the number of hotels equal to the number of predicted scores.
hotel_data_eval = hotel_data.loc[y_test.index].copy()

# Get top 5 recommendations per model (top_n is left at its default of 5; y_test is
# not a valid value for it)
top_hotels_svm = recommend_hotels(svm_model, X_test_scaled, hotel_data_eval)
top_hotels_lr = recommend_hotels(lr_model, X_test_scaled, hotel_data_eval)
top_hotels_nb = recommend_hotels(nb_model, X_test_scaled, hotel_data_eval)

# Display the top 5 recommendations from the models
print("Top 5 Hotel Recommendations (SVM):")
print(top_hotels_svm)

print("\nTop 5 Hotel Recommendations (Linear Regression):")
print(top_hotels_lr)

print("\nTop 5 Hotel Recommendations (Naive Bayes):")
print(top_hotels_nb)
