# Natural Language Processing: First Project
## TripAdvisor Recommendation Challenge 
Beating BM25

### Data Preprocessing

In [None]:
import pandas as pd


# Load the two raw tables: hotel offerings first, then the user reviews.
data_hotels, data_reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('../offerings.csv', '../reviews.csv')
)

In [2]:
# Discard the review columns that are not used further down in this notebook.
data_reviews = data_reviews.drop(
    columns=[
        "id", "via_mobile", "author", "date",
        "date_stayed", "num_helpful_votes", "title",
    ]
)

In [3]:
import re
import json
def fix_json_format(address_string):
    """Return ``address_string`` with every single quote replaced by a double quote.

    This turns a Python-style dict literal such as ``{'a': 1}`` into text that
    ``json.loads`` accepts. No attempt is made to protect apostrophes that are
    part of a value.

    NOTE: a later cell of this notebook defines another function with the same
    name; whichever cell was executed last is the one in effect.
    """
    return re.sub(r"'", r'"', address_string)
# Turn one raw 'ratings' cell into a dictionary
def convert_review_string(address_string):
    """Parse a stringified ratings dict; pass non-string values through untouched.

    Strings are first normalised with ``fix_json_format`` and then decoded with
    ``json.loads``. Anything that is not a ``str`` (for example a missing value)
    is returned exactly as received.
    """
    if not isinstance(address_string, str):
        # Nothing to parse: hand the value back unchanged
        return address_string
    return json.loads(fix_json_format(address_string))

# Parse every stringified 'ratings' cell into a dictionary
data_reviews['ratings'] = data_reviews['ratings'].apply(convert_review_string)

# Expand the rating dictionaries so that each key becomes its own column
reviews_df = pd.json_normalize(data_reviews['ratings'])

# Attach the expanded columns, then discard the raw 'ratings' column together
# with the two rating fields that are not used afterwards
data_reviews = pd.concat([data_reviews, reviews_df], axis=1).drop(
    columns=['ratings', 'check_in_front_desk', 'business_service_(e_g_internet_access)']
)

In [4]:
# Discard the hotel columns that are not used further down in this notebook.
data_hotels = data_hotels.drop(
    columns=[
        "phone", "details", "region_id",
        "type", "url",
    ]
)

In [5]:
import json
import re

# Function to rewrite a Python-style dict string so json.loads can parse it.
# Note: apostrophes found after an opening double quote are REMOVED, not preserved.
def fix_json_format(address_string):
    """Apply three regex substitutions, in order, and return the rewritten string.

    1. Drop an apostrophe that appears between a double quote and the next
       double quote.
    2. Backslash-escape a double-quoted run of letters/spaces that is followed
       by whitespace and the word ``Street``.
    3. Replace every remaining single quote with a double quote.

    This definition replaces the ``fix_json_format`` defined in an earlier cell
    of the notebook once this cell has been executed.

    NOTE(review): step 1 removes one apostrophe per regex match, so a quoted
    span holding several apostrophes looks only partly cleaned, and the match
    may also begin at a closing double quote - verify against the real data.
    """


    # 1. Detect apostrophes inside double quotes and remove them
    # Group 1 is the text from a double quote up to the apostrophe, group 2 is the
    # text after it up to the next double quote; the replacement keeps both groups
    address_string = re.sub(r'("(?:[^"\\]|\\.)*?)\'(.*?")', r'\1\2', address_string)

    
    # 2. Handle double quotes inside a street address
    # A not-already-escaped "letters and spaces" span directly before " Street"
    # gets its two double quotes rewritten as backslash + double quote
    address_string = re.sub(r'(?<!\\)"([A-Za-z ]*)"(?=\s+Street)', r'\"\1\"', address_string)
    # 3. Every single quote still present becomes a double quote
    address_string = re.sub(r"'", r'"', address_string)
    
    return address_string

# Turn one raw 'address' cell into a dictionary
def convert_address_string(address_string):
    """Decode a stringified address dict; leave any non-string value as it is.

    String input is cleaned by ``fix_json_format`` and the result is handed to
    ``json.loads``. Values of any other type are returned unchanged.
    """
    is_text = isinstance(address_string, str)
    return json.loads(fix_json_format(address_string)) if is_text else address_string

# Replace each stringified address with its parsed dictionary
data_hotels = data_hotels.assign(
    address=data_hotels['address'].apply(convert_address_string)
)

In [6]:
# Expand the address dictionaries: every key becomes a column of its own
address_df = pd.json_normalize(data_hotels['address'])

# Swap the raw 'address' column for the expanded columns
data_hotels = pd.concat(
    [data_hotels.drop(columns=['address']), address_df],
    axis=1,
)

In [7]:
import numpy as np
# Collapse the individual scores into one mean rating per review, then remove
# the individual score columns
rating_columns = [
    "service", "cleanliness", "overall", "value",
    "location", "sleep_quality", "rooms",
]
data_reviews = (
    data_reviews
    .assign(rating=data_reviews[rating_columns].mean(axis=1))
    .drop(columns=rating_columns)
)


In [8]:
# Prefix every column with the table it comes from. The previous guards
# (`col != 'offering_id'` on the hotel table, `col != 'id'` on the review table)
# tested column names that belong to the *other* table, so they never excluded
# anything; they are dropped here without changing the resulting mappings.
hotel_columns = {col: f'hotel_{col}' for col in data_hotels.columns}
reviews_columns = {col: f'reviews_{col}' for col in data_reviews.columns}

# Rename BEFORE merging: the two sides can then never share a column name, so
# pandas never has to fall back on merge suffixes that the rename would miss.
data = pd.merge(
    data_reviews.rename(columns=reviews_columns),
    data_hotels.rename(columns=hotel_columns),
    left_on="reviews_offering_id",
    right_on="hotel_id",
    how="left",
)

# The review-side join key duplicates hotel_id for every matched row
data = data.drop(columns=["reviews_offering_id"])

# Keep only fully populated rows (a left join leaves missing values on reviews
# without a matching hotel)
data = data.dropna()

### Show data

In [9]:
# Preview the first rows of the merged reviews + hotels table
data.head()

Unnamed: 0,reviews_text,reviews_rating,hotel_hotel_class,hotel_id,hotel_name,hotel_region,hotel_street-address,hotel_postal-code,hotel_locality
0,Stayed in a king suite for 11 nights and yes i...,5.0,3.0,93338,Hotel Beacon,NY,2130 Broadway at 75th Street,10023,New York City
1,"On every visit to NYC, the Hotel Beacon is the...",5.0,3.0,93338,Hotel Beacon,NY,2130 Broadway at 75th Street,10023,New York City
2,This is a great property in Midtown. We two di...,4.285714,4.0,1762573,Andaz 5th Avenue,NY,485 5th Avenue,10017,New York City
3,The Andaz is a nice hotel in a central locatio...,4.857143,4.0,1762573,Andaz 5th Avenue,NY,485 5th Avenue,10017,New York City
4,I have stayed at each of the US Andaz properti...,4.428571,4.0,1762573,Andaz 5th Avenue,NY,485 5th Avenue,10017,New York City


In [10]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 841943 entries, 0 to 878560
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   reviews_text          841943 non-null  object 
 1   reviews_rating        841943 non-null  float64
 2   hotel_hotel_class     841943 non-null  float64
 3   hotel_id              841943 non-null  int64  
 4   hotel_name            841943 non-null  object 
 5   hotel_region          841943 non-null  object 
 6   hotel_street-address  841943 non-null  object 
 7   hotel_postal-code     841943 non-null  object 
 8   hotel_locality        841943 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 64.2+ MB
None


### BM25 vs. My model

In [11]:
from rank_bm25 import BM25Okapi

class Query:
    """A hotel search request: free text, a target rating and a location.

    Attributes:
        text: Free-text query; it is tokenised by splitting on single spaces.
        rating: Target rating that ``evaluate`` compares review ratings with.
        region: Value matched exactly against the ``hotel_region`` column.
        locality: Value matched exactly against the ``hotel_locality`` column.
    """

    def __init__(self, text, avg_rating, region, locality):
        self.text = text
        self.rating = avg_rating
        self.region = region
        self.locality = locality

    def _reviews_in_location(self, reviews):
        """Return the rows of ``reviews`` whose region and locality match the query."""
        in_location = (
            (reviews['hotel_region'] == self.region)
            & (reviews['hotel_locality'] == self.locality)
        )
        return reviews[in_location]

    def _best_bm25_match(self, filtered_reviews):
        """Score every review text with BM25 and return the best-scoring row.

        Raises:
            ValueError: if ``filtered_reviews`` is empty, because there is
                nothing to rank.
        """
        if filtered_reviews.empty:
            raise ValueError(
                f"No reviews found for region={self.region!r}, locality={self.locality!r}"
            )
        # Same whitespace tokenisation for documents and query
        tokenized_corpus = [
            review_text.split(" ") for review_text in filtered_reviews["reviews_text"]
        ]
        bm25 = BM25Okapi(tokenized_corpus)
        doc_scores = bm25.get_scores(self.text.split(" "))
        # argmax is positional, hence iloc; computed once and reused by callers
        return filtered_reviews.iloc[doc_scores.argmax()]

    def bm25_model(self, reviews):
        """Return the ``hotel_id`` of the review that BM25 ranks highest.

        Only reviews in the query's region and locality are ranked. The text of
        the winning review is printed.

        Raises:
            ValueError: if no review matches the query's region and locality.
        """
        best_review = self._best_bm25_match(self._reviews_in_location(reviews))
        print(best_review["reviews_text"])
        return best_review["hotel_id"]

    def my_model(self, reviews):
        """Return the ``hotel_id`` chosen by the custom model.

        The ranking currently performed is the same BM25 ranking as
        ``bm25_model``, preceded by two debugging prints.

        Raises:
            ValueError: if no review matches the query's region and locality.
        """
        filtered_reviews = self._reviews_in_location(reviews)

        # NOTE(review): debugging output (query token count and the first five
        # tokenised reviews) - kept to preserve the cell output; remove once the
        # custom model is implemented.
        print(len(self.text.split(" ")))
        print([
            review_text.split(" ")
            for review_text in filtered_reviews["reviews_text"][0:5]
        ])

        best_review = self._best_bm25_match(filtered_reviews)
        print(best_review["reviews_text"])
        return best_review["hotel_id"]

    def evaluate(self, selected_place, reviews):
        """Mean squared error between the target rating and a hotel's review ratings.

        Args:
            selected_place: ``hotel_id`` of the hotel to evaluate.
            reviews: Frame with ``hotel_id`` and ``reviews_rating`` columns.

        Returns:
            The mean of ``(self.rating - reviews_rating) ** 2`` over the hotel's
            reviews as a float, or NaN when the hotel has no reviews (this used
            to raise ``ZeroDivisionError``).
        """
        hotel_ratings = reviews.loc[reviews['hotel_id'] == selected_place, "reviews_rating"]
        if hotel_ratings.empty:
            return float("nan")
        squared_errors = (self.rating - hotel_ratings) ** 2
        # skipna=False keeps the plain-sum semantics: a missing rating yields NaN
        return float(squared_errors.mean(skipna=False))


query = Query(text="best breakfast",
              avg_rating=4,
              region="MA",
              locality="Boston")

# Run the BM25 baseline and the custom model through one shared report instead
# of two copy-pasted blocks.
for model_label, recommend in (("BM25", query.bm25_model), ("My model", query.my_model)):
    similates_id = recommend(data)

    # drop_duplicates keeps name / region / locality in their own columns;
    # np.unique flattened the frame and sorted all values alphabetically, which
    # scrambled the three fields in the printed result.
    selected_hotel = data.loc[
        data['hotel_id'] == similates_id,
        ["hotel_name", "hotel_region", "hotel_locality"],
    ].drop_duplicates()
    print("Selected hotel:", selected_hotel.to_dict("records"))

    mse = query.evaluate(similates_id, data)
    print(f"Mean Squared Error - {model_label}:", mse)

Really, really good. Despite being a huge city Hotel it still has a personal touch. Without a doubt it also has the best buffet breakfast in town. Also the very best open air roof top pool, you couldn't ask for any more. If you can stay - do stay.
Selected hotel: ['Boston' 'MA' 'Sheraton Boston Hotel']
Mean Squared Error - BM25: 0.7810000750680801
2
[['We', 'have', 'been', 'to', 'Boston', '5', 'times', 'now', 'around', 'Thanksgiving..and', 'now', 'only', 'stay', 'Kimpton.', 'Our', 'friends', 'who', 'have', 'never', 'been', 'to', 'US', 'before', 'were', 'amazed.', 'The', 'Onyx', 'did', 'not', 'let', 'us', 'down.', 'Smaller', 'than', 'other', 'hotels', 'in', 'reception', 'but', 'just', 'as', 'welcoming.', 'Restaurant', 'area', 'smaller', 'but', 'just', 'as', 'accommodating.', 'Staff', 'brilliant.', 'We', 'walked', 'everywhere', "'til", 'we', 'dropped.', 'Saw', 'sights', 'we', 'had', 'never', 'seen.', 'The', 'location', 'is', 'brilliant', 'for', 'north', 'end,', 'Little', 'Italy,', 'Cambr