In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "Set1" palette as the default colour cycle for the plots below
sns.set_palette(palette="Set1")

In [None]:
# Load the Zomato CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where the file exists at exactly this location.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\LENOVO\Documents\My Files Practice\Zomato-data-.csv"
)
df.head(n=5)

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
 # 1. Count nulls per column and report only the columns that have any
missing_values = df.isna().sum()
print("Missing values per column:")
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# 2. Remove the denominator in the rate column to put data in cleaner format
def handleRate(value):
    """Return the numeric part of a rating written as ``"<score>/<max>"``.

    Parameters
    ----------
    value : object
        Raw rating; it is converted with ``str`` first, so strings such as
        ``"4.1/5"`` as well as plain numbers are accepted.

    Returns
    -------
    float
        The number found before the first ``/``. NaN is returned when that
        text is not a valid number (for example a placeholder such as
        ``"NEW"`` or ``"-"``, or ``None``), so one bad row does not abort a
        whole ``Series.apply`` call with a ``ValueError``.
    """
    score_text = str(value).split('/')[0].strip()
    try:
        return float(score_text)
    except ValueError:
        # No numeric score available: mark the rating as missing.
        return float("nan")

In [None]:
# Convert every entry of the rate column with handleRate, then preview the frame
df["rate"] = df["rate"].map(handleRate)
df.head()

In [None]:
# Re-inspect the dtypes after the rate conversion (handleRate returns floats)
df.info()

In [None]:
 # 3. Strip surrounding whitespace from the categorical text columns.
# Each entry maps a source column to the column that receives the stripped text.
# NOTE(review): the spelling "resturant_type" is kept because later cells
# reference the column under exactly this name.
for source_col, target_col in (
    ("online_order", "online_order"),
    ("book_table", "book_table"),
    ("listed_in(type)", "resturant_type"),
):
    df[target_col] = df[source_col].str.strip()

In [None]:
# Copy the cost column under a shorter, identifier-friendly name.
# NOTE(review): categorize_price later compares these values with integers,
# so the column is assumed to be numeric — confirm against the CSV.
df["cost_for_two"] = df["approx_cost(for two people)"]

In [None]:
# 4. Drop the original columns that were copied to resturant_type / cost_for_two.
# errors="ignore" keeps this cell re-runnable: when the columns are already gone,
# a second run is a no-op instead of raising KeyError.
# `columns=` already selects the column axis, so no separate axis argument is needed.
df = df.drop(
    columns=["approx_cost(for two people)", "listed_in(type)"],
    errors="ignore",
)

In [None]:
# Preview the frame after the column clean-up steps
df.head()

In [None]:
# Exploring online vs offline order: number of rows per online_order value
sns.countplot(data=df, x="online_order", hue="online_order")
plt.xlabel("Online Order");


In [None]:
# Order mode preferred by restaurant type.
# Count rows for every (restaurant type, online_order) pair; combinations that
# never occur are filled with 0 so the integer annotation format 'd' applies.
pivot_table = df.pivot_table(index="resturant_type", columns="online_order", aggfunc="size", fill_value=0)
sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='d')
# Descriptive title and correctly spelled axis label so the figure stands alone
plt.title("Restaurant Count by Type and Online Order")
plt.xlabel("Online Order")
plt.ylabel("Restaurant Type")
plt.show()

In [None]:
# Percentage of rows for each online_order value, rounded to two decimals
online_order_percentage = (
    df["online_order"].value_counts(normalize=True).mul(100).round(2)
)
print(online_order_percentage)

In [None]:
# Exploring the restaurant types by availability: number of rows per type.
# The column itself keeps its original spelling ("resturant_type"); only the
# label shown on the figure is corrected.
sns.countplot(data=df, x="resturant_type", hue="resturant_type")
plt.xlabel("Type of Restaurant");

In [None]:
# Percentage of rows for each resturant_type value, rounded to two decimals
percentage_of_resturant_type = (
    df["resturant_type"].value_counts(normalize=True).mul(100).round(2)
)
print(percentage_of_resturant_type)

In [None]:
# Exploring restaurant types by popularity: total votes per restaurant type
grouped_data = df.groupby("resturant_type")["votes"].sum()
plt.plot(grouped_data, marker = "o")
plt.xlabel("Restaurant Type")
plt.ylabel("Sum of Votes")
plt.title("Restaurant Type Popularity Chart")
# Render the figure explicitly, like the other plotting cells, so the cell does
# not echo the Text(...) object returned by plt.title.
plt.show()

In [None]:
# Price category function
def categorize_price(cost):
    """Map a cost-for-two amount to a labelled price bracket.

    Parameters
    ----------
    cost : number
        Cost value, compared against the bracket limits 300, 600 and 900.

    Returns
    -------
    str or float
        'Budget (≤300)', 'Mid-range (301-600)', 'Premium (601-900)' or
        'Luxury (>900)'. A missing cost (NaN or None) yields NaN instead of
        a label.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # cost would fall through to the last branch and be labelled "Luxury".
    if pd.isna(cost):
        return float("nan")
    if cost <= 300:
        return 'Budget (≤300)'
    if cost <= 600:
        return 'Mid-range (301-600)'
    if cost <= 900:
        return 'Premium (601-900)'
    return 'Luxury (>900)'
    
# Label every row with the price bracket of its cost_for_two value
df["price_category"] = df["cost_for_two"].map(categorize_price)
    

In [None]:
# Analyze preferences by votes (popularity proxy).
# Per price category: row count, total and mean votes, mean rating and mean
# cost, all rounded to two decimals and ordered by total votes (highest first)
# to see the most popular price ranges.
price_analysis = (
    df.groupby("price_category")
    .agg(
        num_restaurants=("votes", "count"),
        total_votes=("votes", "sum"),
        avg_votes_per_restaurant=("votes", "mean"),
        avg_rating=("rate", "mean"),
        avg_cost=("cost_for_two", "mean"),
    )
    .round(2)
    .sort_values("total_votes", ascending=False)
)

In [None]:
# Display the per-price-category summary table
price_analysis

In [None]:
# Exploring the cost for two: number of rows for each distinct cost value
sns.countplot(data=df, x="cost_for_two", hue="cost_for_two");

In [None]:
# Distribution of cost_for_two in 10 equal-width bins, labelled so the figure
# can be read on its own.
plt.hist(x=df["cost_for_two"], bins=10)
plt.xlabel("Cost for Two")
plt.ylabel("Number of Restaurants")
plt.title("Distribution of Cost for Two");