# 📓 Zomato Dataset Cleaning and Preprocessing

In [None]:
import pandas as pd
import numpy as np

## 🔽 Step 1: Load the Dataset

In [None]:
# Read the raw Zomato CSV (relative path, so it is resolved against the
# current working directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="zomato.csv")
df.head()

## 🧹 Step 2: Drop Unnecessary Columns

In [None]:
# Columns removed before cleaning; the result is a new frame, `df` itself is
# left untouched.
columns_to_drop = [
    'url',
    'phone',
    'rest_type',
    'dish_liked',
    'reviews_list',
    'menu_item',
    'listed_in(city)',
]
df_cleaned = df.drop(labels=columns_to_drop, axis='columns')
df_cleaned.head()

## 🏷️ Step 3: Rename Columns for Better Readability

In [None]:
# Map the awkward source headers (parentheses, spaces) to short identifiers.
readable_names = {
    'approx_cost(for two people)': 'two_people_cost',
    'listed_in(type)': 'type_of_restaurant',
    'rate': 'rating',
}
df_cleaned = df_cleaned.rename(columns=readable_names)
df_cleaned.head()

## ❌ Step 4: Drop Rows with Missing Key Information

In [None]:
# A row is discarded when any one of these fields is missing.
required_fields = ['location', 'cuisines', 'two_people_cost']
df_cleaned = df_cleaned.dropna(axis='index', how='any', subset=required_fields)
df_cleaned.info()

## 💰 Step 5: Clean and Convert Cost Column

In [None]:
# Strip the comma separators from the cost text, parse it as integers, and
# keep only the per-person figure (half of the two-person cost).
cost_for_two = df_cleaned['two_people_cost'].str.replace(',', '').astype(int)
df_cleaned['cost_per_person'] = cost_for_two / 2
df_cleaned = df_cleaned.drop(columns=['two_people_cost'])
df_cleaned.head()

## 🌟 Step 6: Clean the Rating Column

In [None]:
def handle_rating(value):
    """Convert a raw rating cell such as ``'4.1/5'`` into a float.

    Args:
        value: Raw cell from the rating column: text like ``'4.1/5'`` or
            ``'3.9 /5'``, a placeholder (``'NEW'`` or ``'-'``), a number,
            ``None`` or NaN.

    Returns:
        The part before the first ``/`` as a float, or ``np.nan`` when the
        value is ``None``, NaN, empty, or a placeholder.

    Raises:
        ValueError: If the text before the ``/`` is not a valid number.
    """
    if value is None:
        return np.nan
    text = str(value).strip()
    # Compare after stripping so padded placeholders (e.g. ' NEW') and blank
    # cells are treated as missing instead of reaching float() and raising.
    if not text or text in ('NEW', '-'):
        return np.nan
    # str(nan) is 'nan', which float() parses back to NaN, so missing numeric
    # cells pass through unchanged.
    return float(text.split('/')[0])

# Parse every rating, then fill the missing ones with the mean of the
# ratings that did parse.
parsed_ratings = df_cleaned['rating'].apply(handle_rating)
df_cleaned['rating'] = parsed_ratings.fillna(parsed_ratings.mean())
df_cleaned.head()

## 💾 Step 7: Save Cleaned Data

In [None]:
# Write the cleaned frame next to the notebook; the row index is not exported.
df_cleaned.to_csv(path_or_buf="zomato_data_analysis.csv", index=False)

## 📊 Step 8: Overview of Final Cleaned Data

In [None]:
# Print the column names, dtypes, and non-null counts of the cleaned frame.
df_cleaned.info()