In [64]:
!pip install langid
import numpy as np
import pandas as pd
import langid


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [65]:
# Load dataset
df_hotel = pd.read_csv("booking_hotel_details.csv")
df_review = pd.read_csv("booking_reviews_details.csv")

In [66]:
print(f"Currently, hotel dataset has {len(df_hotel)} rows.")
print(f"Currently, review dataset has {len(df_review)} rows.")

Currently, hotel dataset has 225 rows.
Currently, review dataset has 6091 rows.


## 1) converting the date to dd/mm/yyyy

In [67]:
df_review['Review_Date'] = pd.to_datetime(df_review['Review_Date'])
df_review['Scraped_At'] = pd.to_datetime(df_review['Scraped_At'])
df_review['Stay_Date'] = pd.to_datetime(df_review['Stay_Date'])
# dd/mm/yyyy
df_review['Review_Date'] = df_review['Review_Date'].dt.strftime('%d/%m/%Y')
df_review['Scraped_At'] = df_review['Scraped_At'].dt.strftime('%d/%m/%Y')
df_review['Stay_Date'] = df_review['Stay_Date'].dt.strftime('%d/%m/%Y')

# Save merged_df dataset as csv
df_review.to_csv("booking_reviews_details_final.csv", index=False, encoding="utf-8")
df_review = pd.read_csv("booking_reviews_details_final.csv")

## 2) remove the rows with unvalid reviews

In [68]:
# Remove rows of invalid review from df_review dataset
df_review= df_review[~df_review['Unhappy_Review'].str.contains("There are no comments available for this review", na=False)]

In [69]:
print(f"Currently, hotel dataset has {len(df_hotel)} rows.")
print(f"Currently, review dataset has {len(df_review)} rows.")

Currently, hotel dataset has 225 rows.
Currently, review dataset has 5537 rows.


## 3) remove rows from df_review dataset when tourists stayed before April 2022 

In [70]:
# Filter the dataset to only include stays on or after April 2022
df_review['Stay_Date'] = pd.to_datetime(df_review['Stay_Date'], format='%d/%m/%Y')
df_review = df_review[df_review['Stay_Date'] >= '2022-04-01']

In [71]:
print(f"Currently, hotel dataset has {len(df_hotel)} rows.")
print(f"Currently, review dataset has {len(df_review)} rows.")

Currently, hotel dataset has 225 rows.
Currently, review dataset has 5221 rows.


## 4) Remove rows when "review" is not in English

In [72]:
# Function to check whether text is English
def is_english(text):
    if pd.isna(text):
        return False  # if value is null, return False
    lang, _ = langid.classify(text)
    return lang == "en"

In [73]:
# Create a new column "is_english" to indicate whether the review is wrote in English

df_review.loc[:, "is_english"] = df_review["Unhappy_Review"].apply(is_english)
df_review.loc[:, "is_english"] = df_review["Happy_Review"].apply(is_english)

# Keep English reviews and save into df_review_english dataset
df_review_english = df_review[df_review["is_english"]]
df_review_english = df_review_english.reset_index(drop=True)

In [74]:
print(f"Currently, hotel dataset has {len(df_hotel)} rows.")
print(f"Currently, review dataset has {len(df_review_english)} rows, deleted {len(df_review) - len(df_review_english)} rows which are not in English.")

Currently, hotel dataset has 225 rows.
Currently, review dataset has 4670 rows, deleted 551 rows which are not in English.


In [75]:
df_review_english.head(2)

Unnamed: 0,hotel_name,User_Name,Country,Room_Info,No_of_nights_stayed,Stay_Date,Traveller_Type,Review_Title,Rating,Happy_Review,Unhappy_Review,Review_Date,Scraped_At,is_english
0,Ascott Raffles Place Singapore,Luis,Portugal,Cutler Suite,1.0,2022-12-01,Couple,Exceptional,10.0,"room very spacious and clean, great location",the night receptionist was unpleasant and not ...,12/12/2022,10/02/2024,True
1,Ascott Raffles Place Singapore,Thi,Australia,Cutler Suite,4.0,2024-01-01,Group,Excellent place,8.0,It was in a good location. The staff was very ...,,29/01/2024,10/02/2024,True


In [76]:
# Remove rows where both 'Happy_Review' and 'Unhappy_Review' are empty or NaN
df_review_english = df_review_english[~((df_review_english['Happy_Review'].isna() | df_review_english['Happy_Review'].eq('')) & 
                        (df_review_english['Unhappy_Review'].isna() | df_review_english['Unhappy_Review'].eq('')))]


In [77]:
print(f"Currently, hotel dataset has {len(df_hotel)} rows.")
print(f"Currently, review dataset has {len(df_review_english)} rows.")

Currently, hotel dataset has 225 rows.
Currently, review dataset has 4670 rows.


In [79]:
# Remove "is_english" columns
df_review_english = df_review_english.drop(["is_english"], axis=1)

In [81]:
df_review_english.head(2)

Unnamed: 0,hotel_name,User_Name,Country,Room_Info,No_of_nights_stayed,Stay_Date,Traveller_Type,Review_Title,Rating,Happy_Review,Unhappy_Review,Review_Date,Scraped_At
0,Ascott Raffles Place Singapore,Luis,Portugal,Cutler Suite,1.0,2022-12-01,Couple,Exceptional,10.0,"room very spacious and clean, great location",the night receptionist was unpleasant and not ...,12/12/2022,10/02/2024
1,Ascott Raffles Place Singapore,Thi,Australia,Cutler Suite,4.0,2024-01-01,Group,Excellent place,8.0,It was in a good location. The staff was very ...,,29/01/2024,10/02/2024


## 4) revise the column names

In [83]:
# Change all column names to lower case and replace Spaces with underscores
df_review_english.columns = df_review_english.columns.str.lower().str.replace(' ', '_')
df_hotel.columns = df_hotel.columns.str.lower().str.replace(' ', '_')

In [86]:
df_hotel.head(2)


Unnamed: 0,url,hotel_name,address,stars,reviews_count,reviews_grade,grades_staff,grades_facilities,grades_cleanliness,grades_comfort,grades_value_for_money,grades_location,sustainable_level,most_popular_facilities,highlights
0,https://www.booking.com/hotel/sg/marina-bay-sa...,Marina Bay Sands,"10 Bayfront Avenue, Marina Bay, 018956 Singapo...",5.0,12661,9.2,9.4,9.4,9.5,9.5,8.0,9.6,3+,"2 restaurants, Bar, Family rooms, Fitness cent...",Top location: Highly rated by recent guests (9...
1,https://www.booking.com/hotel/sg/carlton.en-gb...,Carlton Hotel Singapore,"76 Bras Basah Road, City Hall, 189558 Singapor...",5.0,7322,8.4,8.7,8.5,9.0,9.1,8.1,9.4,2,"4 restaurants, Family rooms, Fitness centre, F...","Situated in the real heart of Singapore, this ..."


In [87]:
df_review_english.head(2)

Unnamed: 0,hotel_name,user_name,country,room_info,no_of_nights_stayed,stay_date,traveller_type,review_title,rating,happy_review,unhappy_review,review_date,scraped_at
0,Ascott Raffles Place Singapore,Luis,Portugal,Cutler Suite,1.0,2022-12-01,Couple,Exceptional,10.0,"room very spacious and clean, great location",the night receptionist was unpleasant and not ...,12/12/2022,10/02/2024
1,Ascott Raffles Place Singapore,Thi,Australia,Cutler Suite,4.0,2024-01-01,Group,Excellent place,8.0,It was in a good location. The staff was very ...,,29/01/2024,10/02/2024


## 5) save latest dataset as csv file

In [88]:
# Save latest dataset as csv
df_hotel.to_csv("booking_hotels_details_valid.csv", index=False, encoding="utf-8")
df_review_english.to_csv("booking_reviews_details_valid.csv", index=False, encoding="utf-8")