In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


In [None]:

# Load the raw Zomato export. Rows the parser cannot tokenize are skipped
# (on_bad_lines='skip') rather than aborting the whole read.
df = pd.read_csv(
    "zomato.csv",
    encoding='latin1',
    on_bad_lines='skip',
)


In [None]:

# One leading digit, an optional fractional part, then "/5" with optional
# whitespace on either side of the slash (so "4.1/5", "3.9 /5" and "4/5" all match).
_RATING_PATTERN = re.compile(r'(\d(?:\.\d+)?)\s*/\s*5')


def extract_rating(val):
    """Parse a rating string of the form "<score>/5" into a float.

    Parameters
    ----------
    val : object
        Raw cell value. Only ``str`` inputs are inspected; anything else
        (including NaN / None) is treated as missing.

    Returns
    -------
    float
        The numeric score before the "/5", or ``np.nan`` when ``val`` is not a
        string or contains no "<score>/5" pattern (e.g. "NEW", "-").
    """
    if not isinstance(val, str):
        return np.nan

    match = _RATING_PATTERN.search(val.strip())
    if match is None:
        return np.nan
    return float(match.group(1))

# Convert the raw rating strings to floats; unparseable entries become NaN.
df['rate'] = df['rate'].apply(extract_rating)

# Impute missing ratings with the column mean. The result is assigned back to
# the column: calling fillna(inplace=True) on `df['rate']` operates on an
# intermediate Series, which pandas warns about and which does not update `df`
# when Copy-on-Write is enabled.
df['rate'] = df['rate'].fillna(df['rate'].mean())


In [None]:

# Remove columns that are not used in the rest of the analysis.
unused_columns = ['address', 'phone', 'dish_liked']
df = df.drop(columns=unused_columns)


In [None]:

# Strip thousands separators, coerce to numeric (unparseable -> NaN), then
# treat missing vote counts as zero. The filled Series is assigned back to the
# column instead of using fillna(inplace=True) on `df['votes']`, which acts on
# an intermediate object and does not update `df` under Copy-on-Write.
votes_text = df['votes'].astype(str).str.replace(',', '', regex=False)
df['votes'] = pd.to_numeric(votes_text, errors='coerce').fillna(0)


In [None]:

# Normalise the cost column: drop thousands separators, then coerce to a
# numeric dtype. Values that cannot be parsed become NaN.
cost_text = df['approx_cost(for two people)'].astype(str)
cost_text = cost_text.str.replace(',', '', regex=False)
df['approx_cost(for two people)'] = pd.to_numeric(cost_text, errors='coerce')


In [None]:

# Keep only rows with no missing values in any column, then remove exact
# duplicate rows.
df = df.dropna().drop_duplicates()


In [None]:

# Bar chart of the ten locations that appear most often in the data.
sns.set_style("whitegrid")
top_locations = df['location'].value_counts().head(10)

plt.figure(figsize=(12, 6))
top_locations.plot(kind='bar', color='coral')
plt.title("Top 10 Restaurant Locations")
plt.xlabel("Location")
plt.ylabel("Number of Restaurants")
plt.xticks(rotation=45)  # tilt the tick labels so location names do not overlap
plt.tight_layout()
plt.show()


In [None]:

# Number of rows per value of the online_order column.
sns.countplot(
    x='online_order',
    data=df,
    palette='Set2',
)
plt.title("Online Ordering Availability")
plt.show()


In [None]:

# Histogram of ratings (20 bins) with a kernel density estimate overlaid.
ratings = df['rate']

plt.figure(figsize=(10, 5))
sns.histplot(ratings, kde=True, bins=20, color='skyblue')
plt.title("Rating Distribution")
plt.ylabel("Count")
plt.xlabel("Rating")
plt.show()


In [None]:

# Rating spread for the ten most frequent restaurant types.
rest_type_counts = df['rest_type'].value_counts()
top_types = rest_type_counts.nlargest(10).index

# Restrict the plot to rows whose rest_type is one of those ten.
top_type_rows = df[df['rest_type'].isin(top_types)]

plt.figure(figsize=(12, 6))
sns.boxplot(x='rest_type', y='rate', data=top_type_rows)
plt.title("Restaurant Type vs Rating")
plt.xticks(rotation=45)
plt.show()


In [None]:

# Scatter of rating against approximate cost for two; points are drawn
# semi-transparent (alpha=0.6) so overlapping markers remain visible.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='approx_cost(for two people)',
    y='rate',
    alpha=0.6,
)
plt.title('Cost vs Rating')
plt.ylabel('Rating')
plt.xlabel('Approximate Cost for Two People')
plt.show()


In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Goal: predict whether a restaurant offers online ordering from its rating,
# vote count, approximate cost and restaurant type.

# Target: 'Yes' -> 1, 'No' -> 0. This is derived into a new Series rather than
# written back into `df`, so re-running the cell does not map already-encoded
# 1/0 values to NaN and earlier plotting cells still see the original labels.
y = df['online_order'].map({'Yes': 1, 'No': 0})

# Features: rest_type is a nominal category, so it is one-hot encoded instead
# of being passed as integer category codes (which would impose an arbitrary
# ordering on a linear model). `df` itself is left untouched.
X = pd.get_dummies(
    df[['rate', 'votes', 'approx_cost(for two people)', 'rest_type']],
    columns=['rest_type'],
    dtype=float,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features before the logistic regression: the inputs are on very
# different scales, which slows solver convergence. The scaler is fitted on
# the training split only because it lives inside the pipeline. max_iter is
# raised from the default as an additional safeguard against early stopping.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔍 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
