In [1]:
import numpy as np
import pandas as pd
import langid

In [2]:
# Directory that holds the input CSV files and receives the cleaned output files.
# The path is relative, and the trailing slash is required because file names
# are appended to it with plain string concatenation.
SRC_PATH = "src/"

In [3]:
# Read the raw hotel and review exports from the source directory
hotel_csv_path = SRC_PATH + "agoda_hotels_details.csv"
review_csv_path = SRC_PATH + "agoda_reviews_details.csv"

df_hotel = pd.read_csv(hotel_csv_path)
df_review = pd.read_csv(review_csv_path)

In [4]:
# Report the row counts of both datasets right after loading
print(
    f"Currently, hotel dataset has {len(df_hotel)} rows.",
    f"Currently, review dataset has {len(df_review)} rows.",
    sep="\n",
)

Currently, hotel dataset has 361 rows.
Currently, review dataset has 55281 rows.


## 1) Remove invalid rows
### Invalid rows: Rows with NULL value in "Overall Rating" column of df_hotel

#### a) Remove invalid rows from df_review dataset

In [5]:
# Hotels with no "Overall Rating" are treated as invalid; collect their IDs
null_rating_mask = df_hotel["Overall Rating"].isnull()
missing_overall_rating_ids = df_hotel.loc[null_rating_mask, "Hotel ID"]

# Index labels of every review that belongs to one of those invalid hotels
invalid_review_mask = df_review["hotel_id"].isin(missing_overall_rating_ids)
matching_rows_index = df_review.loc[invalid_review_mask].index

# Drop those reviews and renumber the remaining rows from 0
df_review = df_review.drop(index=matching_rows_index).reset_index(drop=True)

#### b) Remove invalid rows from df_hotel dataset

In [6]:
# Keep only hotels that have an "Overall Rating", then renumber the rows from 0
has_overall_rating = df_hotel["Overall Rating"].notna()
df_hotel = df_hotel[has_overall_rating].reset_index(drop=True)

In [7]:
# Report the remaining row counts and how many invalid-hotel rows were removed
print(
    f"Currently, hotel dataset has {len(df_hotel)} rows, deleted {len(missing_overall_rating_ids)} rows of invalid hotels.",
    f"Currently, review dataset has {len(df_review)} rows, deleted {len(matching_rows_index)} rows of invalid hotels.",
    sep="\n",
)

Currently, hotel dataset has 361 rows, deleted 0 rows of invalid hotels.
Currently, review dataset has 55281 rows, deleted 0 rows of invalid hotels.


## 2) Remove duplicate rows
#### a) Remove duplicate reviews from df_review dataset

In [8]:
# Parse "review_date" into datetime values
df_review["review_date"] = pd.to_datetime(df_review["review_date"])

# Two reviews count as duplicates when all of these columns match
key_columns_review = [
    "hotel_id",
    "reviewer",
    "stay_details",
    "review_score",
    "review_date",
    "review_title",
    "review",
]

# Flag every repeat occurrence; the first occurrence of each review is not flagged
is_repeat_review = df_review.duplicated(subset=key_columns_review, keep="first")

# Keep the flagged rows for reporting
duplicated_rows_review = df_review[is_repeat_review]

# Retain only the first occurrences and renumber the rows from 0
df_review = df_review[~is_repeat_review].reset_index(drop=True)

#### b) Remove duplicate hotels from df_hotel dataset

In [9]:
# A hotel is a duplicate when its "Hotel ID" has already appeared
key_columns_hotel = ["Hotel ID"]

# Flag every repeat occurrence; the first occurrence of each hotel is not flagged
is_repeat_hotel = df_hotel.duplicated(subset=key_columns_hotel, keep="first")

# Keep the flagged rows for reporting
duplicated_rows_hotel = df_hotel[is_repeat_hotel]

# Retain only the first occurrences and renumber the rows from 0
df_hotel = df_hotel[~is_repeat_hotel].reset_index(drop=True)

In [10]:
# Report the remaining row counts and how many duplicate rows were removed
print(
    f"Currently, hotel dataset has {len(df_hotel)} rows, deleted {len(duplicated_rows_hotel)} duplicate rows.",
    f"Currently, review dataset has {len(df_review)} rows, deleted {len(duplicated_rows_review)} duplicate rows.",
    sep="\n",
)

Currently, hotel dataset has 361 rows, deleted 0 duplicate rows.
Currently, review dataset has 35686 rows, deleted 19595 duplicate rows.


## 3) Remove rows from df_review dataset where tourists stayed before April 2022

In [11]:
# Discard any "stay_date" column left from a previous run of this cell
df_review = df_review.drop(columns=["stay_date"], errors="ignore")

# Pull the "<Month> <Year>" text that follows "in " out of "stay_details"
# and parse it as a date (first day of that month)
stay_month_text = df_review["stay_details"].str.extract(r"in (\w+ \d+)", expand=False)
stay_month = pd.to_datetime(stay_month_text, format="%B %Y")

# Place the parsed date in a "stay_date" column right after "stay_details"
stay_date_position = df_review.columns.get_loc("stay_details") + 1
df_review.insert(stay_date_position, "stay_date", stay_month)

# Keep stays from April 2022 onwards and save them into df_review_after202204.
# Rows without a parsed stay date compare as False and are dropped as well.
from_april_2022 = df_review["stay_date"] >= "2022-04-01"
df_review_after202204 = df_review[from_april_2022].reset_index(drop=True)

# Store the stay date as "dd/mm/yyyy" text
df_review_after202204["stay_date"] = df_review_after202204["stay_date"].dt.strftime("%d/%m/%Y")

In [12]:
# Report the row counts after filtering on the stay date
print(
    f"Currently, hotel dataset has {len(df_hotel)} rows.",
    f"Currently, review dataset has {len(df_review_after202204)} rows, deleted {len(df_review) - len(df_review_after202204)} rows which tourists stayed before April 2022.",
    sep="\n",
)

Currently, hotel dataset has 361 rows.
Currently, review dataset has 32279 rows, deleted 3407 rows which tourists stayed before April 2022.


## 4) Remove rows from df_review_after202204 dataset when "review" is not in English

In [13]:
# Function to check whether text is English
def is_english(text):
    """Return True when langid classifies ``text`` as English.

    Parameters
    ----------
    text : object
        A single cell value, normally the review text.

    Returns
    -------
    bool
        False for missing values (None / NaN), for any other non-string
        value, and for empty or whitespace-only strings. Otherwise True
        exactly when the language code returned by ``langid.classify`` is "en".
    """
    # Missing values and other non-strings cannot be classified, and a blank
    # string carries no language signal, so none of them are sent to langid.
    if not isinstance(text, str) or not text.strip():
        return False
    lang, _ = langid.classify(text)
    return lang == "en"

In [14]:
# Flag each review as English or not and store the flag in "is_english"
english_flags = df_review_after202204["review"].apply(is_english)
df_review_after202204.loc[:, "is_english"] = english_flags

# Keep only the English reviews in df_review_english, renumbered from 0
english_rows = df_review_after202204[df_review_after202204["is_english"]]
df_review_english = english_rows.reset_index(drop=True)

In [15]:
# Report the row counts after removing non-English reviews
print(
    f"Currently, hotel dataset has {len(df_hotel)} rows.",
    f"Currently, review dataset has {len(df_review_english)} rows, deleted {len(df_review_after202204) - len(df_review_english)} rows which are not in English.",
    sep="\n",
)

Currently, hotel dataset has 361 rows.
Currently, review dataset has 31599 rows, deleted 680 rows which are not in English.


In [16]:
# Preview the first two rows of the English-only review dataset
df_review_english.head(2)

Unnamed: 0,hotel_id,hotel_name,rating_cat,reviewer,country,group_name,room_type,stay_details,stay_date,review_score,review_score_cat,review_date,review_title,review,is_english
0,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,ravi,India,Couple,Superior Corner Room with 1 King Bed,Stayed 6 nights in January 2024,01/01/2024,9.6,Exceptional,2024-01-28,Gr8,Awesome in all aspects,True
1,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,Pankaj,India,Couple,Superior Room with 1 King Bed,Stayed 3 nights in January 2024,01/01/2024,9.6,Exceptional,2024-01-23,Overall it was a good experience,Everything was good there except they dont hav...,True


## 5) Correct wrong "rating_cat" category

In [17]:
# Note:
# rating_cat = "9+ Exceptional": review scores > 9
# rating_cat = "8-9 Excellent": review scores > 8 and <= 9
# rating_cat = "7-8 Very Good": review scores > 7 and <= 8
# rating_cat = "6-7 Good": review scores > 6 and <= 7
# rating_cat = "<6 Below Expectation": review scores <= 6

In [18]:
# Preserve the original categories in "old_rating_cat" so that the rows
# changed by the correction can be counted afterwards
original_rating_cat = df_review_english["rating_cat"].copy()
df_review_english.loc[:, "old_rating_cat"] = original_rating_cat

In [19]:
# Re-derive "rating_cat" from "review_score" using the score bands in the note above.
# Rows whose score is missing match none of the rules and keep their current category.
review_score = df_review_english["review_score"]
rating_rules = [
    (review_score > 9, "9+ Exceptional"),
    ((review_score > 8) & (review_score <= 9), "8-9 Excellent"),
    ((review_score > 7) & (review_score <= 8), "7-8 Very Good"),
    ((review_score > 6) & (review_score <= 7), "6-7 Good"),
    (review_score <= 6, "<6 Below Expectation"),
]
for score_mask, category in rating_rules:
    df_review_english.loc[score_mask, "rating_cat"] = category

In [20]:
# Rows whose category differs from the backed-up "old_rating_cat" value
category_changed = df_review_english["rating_cat"] != df_review_english["old_rating_cat"]
incorrect_cat = df_review_english.loc[category_changed]

print(f"Total rows with incorrect rating category: {len(incorrect_cat)}")

Total rows with incorrect rating category: 4


In [21]:
# Preview the first two rows, now including the "old_rating_cat" column
df_review_english.head(2)

Unnamed: 0,hotel_id,hotel_name,rating_cat,reviewer,country,group_name,room_type,stay_details,stay_date,review_score,review_score_cat,review_date,review_title,review,is_english,old_rating_cat
0,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,ravi,India,Couple,Superior Corner Room with 1 King Bed,Stayed 6 nights in January 2024,01/01/2024,9.6,Exceptional,2024-01-28,Gr8,Awesome in all aspects,True,9+ Exceptional
1,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,Pankaj,India,Couple,Superior Room with 1 King Bed,Stayed 3 nights in January 2024,01/01/2024,9.6,Exceptional,2024-01-23,Overall it was a good experience,Everything was good there except they dont hav...,True,9+ Exceptional


## 6) Sort df_review_english data by the specified columns

In [22]:
# Rating categories listed from best to worst; the position in the list is the sort rank
rating_labels_ranked = [
    "9+ Exceptional",
    "8-9 Excellent",
    "7-8 Very Good",
    "6-7 Good",
    "<6 Below Expectation",
]
cat_order = {label: rank for rank, label in enumerate(rating_labels_ranked)}

# Helper column "cat_order" holds the numeric rank of each row's "rating_cat"
df_review_english.loc[:, "cat_order"] = df_review_english["rating_cat"].map(cat_order)

# Sort the reviews:
# 1) by "hotel_id" in ascending order
# 2) within each hotel, by "rating_cat" from best to worst (via "cat_order")
# 3) within each category, by "review_date" with the newest first
df_review_sorted = (
    df_review_english
    .sort_values(by=["hotel_id", "cat_order", "review_date"], ascending=[True, True, False])
    .reset_index(drop=True)
)

In [23]:
# Store "review_date" as "dd/mm/yyyy" text now that sorting by date is done
review_date_text = df_review_sorted["review_date"].dt.strftime("%d/%m/%Y")
df_review_sorted["review_date"] = review_date_text

## 7) Other

#### a) For df_review_sorted dataset

In [24]:
# Remove "stay_length" column if it already exists, so this cell can be re-run
if "stay_length" in df_review_sorted.columns:
    df_review_sorted = df_review_sorted.drop(columns=["stay_length"])

# Extract the first run of digits from "stay_details" as the stay length.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# expand=False returns a Series rather than a one-column DataFrame.
stay_length_values = (
    df_review_sorted["stay_details"]
    .str.extract(r"(\d+)", expand=False)
    .astype(float)  # float so that rows without a match can hold NaN
)

# Insert the "stay_length" column right after the "stay_details" column
df_review_sorted.insert(
    df_review_sorted.columns.get_loc("stay_details") + 1,
    "stay_length",
    stay_length_values,
)

In [25]:
# Drop reviews that are missing any of these columns, then renumber the rows from 0
required_text_columns = ["reviewer", "country", "review_title", "review"]
df_review_sorted = df_review_sorted.dropna(subset=required_text_columns).reset_index(drop=True)

In [26]:
# Drop the helper columns that were only needed during cleaning and sorting
helper_columns = ["is_english", "old_rating_cat", "cat_order"]
df_review_sorted = df_review_sorted.drop(columns=helper_columns)

In [27]:
# Preview the first two rows of the cleaned and sorted review dataset
df_review_sorted.head(2)

Unnamed: 0,hotel_id,hotel_name,rating_cat,reviewer,country,group_name,room_type,stay_details,stay_length,stay_date,review_score,review_score_cat,review_date,review_title,review
0,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,ravi,India,Couple,Superior Corner Room with 1 King Bed,Stayed 6 nights in January 2024,6.0,01/01/2024,9.6,Exceptional,28/01/2024,Gr8,Awesome in all aspects
1,1007,"PARKROYAL on Kitchener Road, Singapore",9+ Exceptional,Pankaj,India,Couple,Superior Room with 1 King Bed,Stayed 3 nights in January 2024,3.0,01/01/2024,9.6,Exceptional,23/01/2024,Overall it was a good experience,Everything was good there except they dont hav...


#### b) For df_hotel dataset

In [28]:
# Convert hotel column names to snake_case: lower case, spaces replaced by underscores
normalized_columns = df_hotel.columns.str.lower().str.replace(" ", "_")
df_hotel.columns = normalized_columns

In [29]:
# Preview the first two rows of the hotel dataset with the renamed columns
df_hotel.head(2)

Unnamed: 0,hotel_id,hotel_name,hotel_url,hotel_address,total_reviews,overall_rating,overall_rating_category,rating_location,rating_service,rating_cleanliness,rating_room_comfort_and_quality,rating_value_for_money,rating_facilities
0,408551,Dorsett Singapore,https://www.agoda.com/dorsett-singapore/hotel/...,"333 New Bridge Road, Chinatown, Singapore, Sin...",10337.0,8.2,Excellent,9.0,8.4,8.3,8.1,8.1,7.9
1,1635,M Hotel Singapore,https://www.agoda.com/m-hotel/hotel/singapore-...,"81 Anson Road, CBD, Singapore, Singapore, 079908",9034.0,8.1,Excellent,8.2,8.1,8.5,8.2,8.0,8.0


## 8) Save latest dataset as csv file

In [30]:
# Write the cleaned datasets next to the source files as UTF-8 CSV, without the index
hotel_output_path = SRC_PATH + "agoda_hotels_details_valid.csv"
df_hotel.to_csv(hotel_output_path, index=False, encoding="utf-8")

review_output_path = SRC_PATH + "agoda_reviews_details_valid.csv"
df_review_sorted.to_csv(review_output_path, index=False, encoding="utf-8")