In [1]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Workbook that is read as the raw input of this notebook.
Current_File = "Restaurants.xlsx"
# Workbook that receives the cleaned result at the end of this notebook.
Processed_File = "Data_Restaurants.xlsx"

In [3]:
# Load the input workbook into a DataFrame and report its (rows, columns) shape.
df = pd.read_excel(io=Current_File)
print("Current data shape : ", df.shape)

Current data shape :  (212589, 18)


In [4]:
# Show the header names exactly as they were read.
print("Current Column name : ")
print(df.columns.tolist())

# snake_case -> PascalCase: trim, turn underscores into spaces, title-case
# every word, then remove the spaces again.
df.columns = list(
    map(
        lambda colum: colum.strip().replace("_", " ").title().replace(" ", ""),
        df.columns,
    )
)

print("Column names are standardized. Below are all the new column names : ")
print(df.columns.tolist())

Current Column name : 
['res_id', 'name', 'establishment', 'url', 'address', 'city', 'city_id', 'locality', 'zipcode', 'cuisines', 'timings', 'average_cost_for_two', 'price_range', 'currency', 'highlights', 'aggregate_rating', 'rating_text', 'votes']
Column names are standardized. Below are all the new column names : 
['ResId', 'Name', 'Establishment', 'Url', 'Address', 'City', 'CityId', 'Locality', 'Zipcode', 'Cuisines', 'Timings', 'AverageCostForTwo', 'PriceRange', 'Currency', 'Highlights', 'AggregateRating', 'RatingText', 'Votes']


In [5]:
def Transform_List(x):
    """Convert a cell value into a list of trimmed, title-cased strings.

    Parameters
    ----------
    x : object
        A list, the textual form of a Python literal (e.g. "['a', 'b']"),
        any other string, a missing value (None / float NaN), or any other
        scalar.

    Returns
    -------
    list of str
        * list input, or a string that parses to a list: every string
          element stripped and title-cased (non-string elements are skipped);
        * string that parses to a non-list literal: one-element list holding
          the literal's text form;
        * string that is not a Python literal: one-element list holding the
          cleaned string itself;
        * None or float NaN: an empty list, so a missing cell does not become
          a fake "None"/"Nan" label;
        * anything else: one-element list holding ``str(x)`` cleaned.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return []

    if isinstance(x, list):
        return [element.strip().title() for element in x if isinstance(element, str)]

    if isinstance(x, str):
        try:
            val = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat the whole text as a single label.
            return [x.strip().title()]
        if isinstance(val, list):
            return [element.strip().title() for element in val if isinstance(element, str)]
        return [str(val).strip().title()]

    return [str(x).strip().title()]

# Turn the list-like text columns into real Python lists; a column that is
# absent from the frame is skipped silently.
for col_val in ("Establishment", "Highlights"):
    if col_val not in df.columns:
        continue
    df[col_val] = df[col_val].apply(Transform_List)
    print(f"The column '{col_val}' has been converted into a list element.")

The column 'Establishment' has been converted into a list element.
The column 'Highlights' has been converted into a list element.


In [None]:
# Rows that agree on id, name, address and city are treated as duplicates;
# the first occurrence of each is kept.
rows_before = len(df)
df = df.drop_duplicates(subset=["ResId", "Name", "Address", "City"], keep="first")
print(f"Removed {rows_before - len(df)} duplicate rows.")

Removed 156912 duplicate rows.


In [7]:
# A row is discarded when any one of these columns is null.
essential_columns = ["ResId", "Name", "Address", "City", "Cuisines", "AggregateRating"]
rows_before = len(df)
df = df.dropna(subset=essential_columns, how="any")
rows_dropped = rows_before - len(df)
print(f"Dropped {rows_dropped} rows with nulls in essential columns.")

Dropped 685 rows with nulls in essential columns.


In [8]:
# Cuisine text: lower-case and trim surrounding whitespace.
df["Cuisines"] = df["Cuisines"].str.lower().str.strip()

# Numeric fields: values that cannot be parsed become NaN and the affected
# rows are removed right after.
df["AggregateRating"] = pd.to_numeric(df["AggregateRating"], errors='coerce')
df["AverageCostForTwo"] = pd.to_numeric(df["AverageCostForTwo"], errors='coerce')
df["PriceRange"] = pd.to_numeric(df["PriceRange"], errors='coerce')
df = df.dropna(subset=["AggregateRating", "AverageCostForTwo", "PriceRange"])

# Location text: trim and title-case.
for col_val in ["Address", "City", "Locality"]:
    cleaned = df[col_val].astype(str).str.strip().str.title()
    # astype(str) renders a missing value as the text "nan" (title-cased to
    # "Nan"); mask those positions so missing stays missing instead of
    # becoming a fake place name.
    df[col_val] = cleaned.where(df[col_val].notna())
print("Normalized the text and numeric fields.")

Normalized the text and numeric fields.


In [None]:
# Parse vote counts; anything unparseable becomes NaN.
votes = pd.to_numeric(df["Votes"], errors='coerce')
# Keep values that satisfy ">= 0"; negatives and NaN fail the test and are
# replaced by 0.
votes = votes.where(votes >= 0, 0)
df["Votes"] = votes.fillna(0)
# log(1 + votes) as an additional column.
df["VotesLog"] = np.log1p(df["Votes"])
print("Normalized votes column. ")

Normalized votes column. 


In [10]:
df["AggregateRating"] = pd.to_numeric(df["AggregateRating"], errors='coerce')

# Keep only rows whose rating is present and strictly positive, then renumber
# the rows from 0.
usable_rating = df["AggregateRating"].notna() & (df["AggregateRating"] > 0)
df = df[usable_rating].reset_index(drop=True)

# Min-max scale the rating; when there is no spread the scaled value is fixed
# at 1.0 instead of dividing by zero.
rating_minimum = df["AggregateRating"].min()
rating_maximum = df["AggregateRating"].max()
if not (rating_maximum > rating_minimum):
    df["RatingNormalized"] = 1.0
    print("All ratings are identical, setted RatingNormalized to 1.0")
else:
    rating_span = rating_maximum - rating_minimum
    df["RatingNormalized"] = (df["AggregateRating"] - rating_minimum) / rating_span
    print("Normalized AggregateRating between 0–1.")


Normalized AggregateRating between 0–1.


In [11]:
# Split the comma-separated cuisine text into a list of trimmed, title-cased names.
df["CuisineList"] = df["Cuisines"].map(
    lambda cuisine_text: [name.strip().title() for name in cuisine_text.split(",")]
)
print("Converted Cuisines coulmn values into list.")

Converted Cuisines coulmn values into list.


In [12]:
# One indicator column per distinct highlight label.
change = MultiLabelBinarizer()
highlight_matrix = change.fit_transform(df["Highlights"])
highlight_encoded = pd.DataFrame(
    highlight_matrix,
    columns=[f"Highlight_{c}" for c in change.classes_],
    # Reuse df's own index: concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would only line up when df happens to carry exactly
    # that index; any other index would scatter NaN rows into the result.
    index=df.index,
)
df = pd.concat([df, highlight_encoded], axis=1)
print("One-hot encoded is applied on highlights column.")


One-hot encoded is applied on highlights column.


In [13]:
# Number of rows per city, attached to every row of that city.
city_counts = df["City"].value_counts()
city_frequency = city_counts.to_dict()
df["CityPopularity"] = df["City"].map(city_counts)
print("Added a column CityPopularity, having the score of how popular a city is. ")

Added a column CityPopularity, having the score of how popular a city is. 


In [14]:
def Open_days(text):
    """Count the distinct weekdays referred to in a timing description.

    Day names are recognised case-insensitively by their three-letter prefix
    (Mon ... Sun). A span written as "<day>-<day>" (hyphen, en dash or em
    dash) or "<day> to <day>" counts every day from the first to the last,
    wrapping past Sunday when needed, so "Mon-Sun" yields 7 rather than 2.

    Parameters
    ----------
    text : object
        Timing description; anything that is not a str yields 0.

    Returns
    -------
    int
        Number of distinct weekdays found (0-7).
    """
    if not isinstance(text, str):
        return 0

    week = ("mon", "tue", "wed", "thu", "fri", "sat", "sun")
    day_token = r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
    # A day name (optionally spelled out, e.g. "Monday"), a range separator,
    # then a second day name.
    range_pattern = (
        day_token + r"[a-z]*(?:\s*[-\u2013\u2014]\s*|\s+to\s+)" + day_token
    )

    found = set()
    for first, last in re.findall(range_pattern, text, flags=re.IGNORECASE):
        start = week.index(first.lower())
        end = week.index(last.lower())
        if start <= end:
            found.update(week[start:end + 1])
        else:
            # e.g. "Sat-Mon": run to the end of the week and wrap around.
            found.update(week[start:] + week[:end + 1])

    # Individually listed days (this also re-adds the range end points).
    days = re.findall(day_token, text, flags=re.IGNORECASE)
    found.update(day.lower() for day in days)
    return len(found)

def Is_open_all_week(days):
    """Return 1 when ``days`` equals 7, otherwise 0."""
    if days == 7:
        return 1
    return 0
def Is_weekend_open(text):
    """Return 1 if ``text`` mentions "Sat" or "Sun" (any letter case), else 0.

    Anything that is not a str yields 0.
    """
    if isinstance(text, str):
        weekend_match = re.search(r"(Sat|Sun)", text, flags=re.IGNORECASE)
        return 1 if weekend_match else 0
    return 0

# Derive the day-based features from the timing text, one column at a time.
timing_text = df["Timings"]
df["OpenDays"] = timing_text.map(Open_days)
df["IsOpenAllWeek"] = df["OpenDays"].map(Is_open_all_week)
df["IsWeekendOpen"] = timing_text.map(Is_weekend_open)
print("Extracted features about open days of restaurant, weekend opening and open all week. ")


Extracted features about open days of restaurant, weekend opening and open all week. 


In [15]:
def Open_Close_Time(text):
    """Pick the first and last clock times mentioned in a timing description.

    A clock time is one or two digits, optionally followed by ':' or '.' and
    up to two more digits, then optional whitespace and AM/PM (any case).

    Returns
    -------
    tuple
        ``(first_match, last_match)`` when at least two times are found;
        ``(None, None)`` when ``text`` is not a str or has fewer than two.
    """
    nothing_found = (None, None)
    if not isinstance(text, str):
        return nothing_found

    time_pattern = r"(\d{1,2}[:.]?\d{0,2}\s*(?:AM|PM))"
    matches = re.findall(time_pattern, text, flags=re.IGNORECASE)
    if len(matches) < 2:
        return nothing_found
    return (matches[0], matches[-1])

# Parse every timing string once into an (opening, closing) tuple, then fill
# the two columns from those tuples. This avoids building a pandas Series per
# row, and it still creates both columns when the frame is empty.
time_pairs = df["Timings"].map(Open_Close_Time).tolist()
df["OpeningTime"] = [opening for opening, _ in time_pairs]
df["ClosingTime"] = [closing for _, closing in time_pairs]
print("Extracted the OpeningTime and ClosingTime of the restaurant")

Extracted the OpeningTime and ClosingTime of the restaurant


In [None]:
# Keep only rows where both the rating and the cost for two are strictly positive.
rows_before = len(df)
positive_rating = df["AggregateRating"] > 0
positive_cost = df["AverageCostForTwo"] > 0
df = df[positive_rating & positive_cost]
print(f"Removed {rows_before - len(df)} rows with 0 rating or 0 cost")

Removed 191 rows with 0 rating or 0 cost


In [17]:
# Renumber the rows from 0, then write the frame (without the index column)
# to the output workbook.
df = df.reset_index(drop=True)
df.to_excel(excel_writer=Processed_File, index=False)

final_shape = df.shape
print(f"Final cleaned data saved to: {Processed_File}")
print("Final counts of processed file :", final_shape)

Final cleaned data saved to: Data_Restaurants.xlsx
Final counts of processed file : (45143, 130)
