In [13]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Read the raw Zomato CSV (UTF-8) into `df` and show its (rows, columns) shape.
df = pd.read_csv(
    '../data/zomato_dataset.csv',
    encoding='utf-8',
)
print(df.shape)

(44891, 7)


In [14]:
# Remove, in place, every row whose 'Cuisine' value is missing,
# then report the resulting (rows, columns) shape.
df.dropna(axis='index', subset=['Cuisine'], inplace=True)
shape_after_drop = df.shape
print(f"Shape after dropping missing cuisines: {shape_after_drop}")

Shape after dropping missing cuisines: (44872, 7)


In [15]:
# Rating is read as text. The placeholders 'New' and '-' are turned into NaN,
# and anything else that does not parse as a number is coerced to NaN as well.
rating_text = df['Rating'].replace(['New', '-'], np.nan)
df['Rating'] = pd.to_numeric(rating_text, errors='coerce')

lowest_rating = df['Rating'].min()
highest_rating = df['Rating'].max()
print(f"Rating range: {lowest_rating} to {highest_rating}")
print(f"Unrated restaurants: {df['Rating'].isnull().sum()}")

Rating range: 2.4 to 4.9
Unrated restaurants: 7054


In [16]:
# Turn the price text into a float column: cast to str, strip the literal
# fragments '₹', 'for one' and ',' (in that order), trim whitespace, then
# parse. Values that still fail to parse become NaN.
price_text = df['Average Price'].astype(str)
for fragment in ('₹', 'for one', ','):
    price_text = price_text.str.replace(fragment, '')
price_text = price_text.str.strip()
df['Average Price'] = pd.to_numeric(price_text, errors='coerce')

cheapest = df['Average Price'].min()
priciest = df['Average Price'].max()
print(f"Price range: ₹{cheapest} to ₹{priciest}")
print(f"Missing prices: {df['Average Price'].isnull().sum()}")

Price range: ₹10.0 to ₹2000.0
Missing prices: 19


In [17]:
# Converting delivery time from string to float.
# Cast to str before using the `.str` accessor, mirroring the price conversion:
# on a column that is already numeric (for example when this cell is re-run)
# `.str` raises AttributeError. Missing values become the text 'nan', which
# pd.to_numeric turns back into NaN, so results on text input are unchanged.
delivery_text = df['Average Delivery Time'].astype(str)
# 'min' is a literal suffix, not a pattern, so regex matching is switched off.
delivery_text = delivery_text.str.replace('min', '', regex=False).str.strip()
df['Average Delivery Time'] = pd.to_numeric(delivery_text, errors='coerce')
print(f"Delivery Time dtype: {df['Average Delivery Time'].dtype}")
print(f"Delivery time range: {df['Average Delivery Time'].min()} to {df['Average Delivery Time'].max()} mins")

Delivery Time dtype: float64
Delivery time range: 6.0 to 190.0 mins


In [18]:
# Binary-encode the safety-measure text: the "Max Safety" label becomes 1 and
# the "WHO protocol" label becomes 0. Series.map yields NaN for any value that
# is not a key of the mapping.
safety_codes = {
    'Follows all Max Safety measures to ensure your food is safe': 1,
    'Restaurant partner follows WHO protocol': 0,
}
df['Safety Measure'] = df['Safety Measure'].map(safety_codes)

# Show how many rows received each code.
safety_counts = df['Safety Measure'].value_counts()
print(safety_counts)

Safety Measure
1    25721
0    19151
Name: count, dtype: int64


In [19]:
# Count rows that repeat an earlier row across all columns, report the count,
# then drop those repeats in place (the first occurrence is kept).
duplicates = df.duplicated(keep='first').sum()
print(f"Duplicate rows: {duplicates}")

df.drop_duplicates(keep='first', inplace=True)
shape_after_dedup = df.shape
print(f"Shape after removing duplicates: {shape_after_dedup}")

Duplicate rows: 147
Shape after removing duplicates: (44725, 7)


In [21]:
# Final sanity report: frame shape, per-column missing-value counts, and dtypes.
print(f"Final shape: {df.shape}")

missing_per_column = df.isna().sum()
print("\nMissing values:", missing_per_column, sep="\n")

print("\nData types:", df.dtypes, sep="\n")

Final shape: (44725, 7)

Missing values:
Restaurant Name             0
Rating                   6960
Cuisine                     0
Average Price              18
Average Delivery Time    5143
Safety Measure              0
Location                    0
dtype: int64

Data types:
Restaurant Name           object
Rating                   float64
Cuisine                   object
Average Price            float64
Average Delivery Time    float64
Safety Measure             int64
Location                  object
dtype: object


In [22]:
# Write the cleaned frame to disk without the index column, then confirm.
df.to_csv(
    '../data/zomato_cleaned.csv',
    index=False,
)
print("Cleaned data saved!")

Cleaned data saved!
