<div style="text-align: center;">
  <h1 style="font-size: 36px; font-weight: bold; text-decoration: underline;">Zomato - Classification</h1>
</div>

![Zomato](Zomato.jpg)

# Import libraries

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

In [4]:
# Read the raw CSV into `data` and leave it untouched; every later step works on the copy `df`.
data = pd.read_csv('zomato.csv')
df = data.copy()
# Preview the first rows.
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [5]:
# Column dtypes and non-null counts of the freshly loaded copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

# Cleaning Data

In [9]:
# Number of missing values in each column.
df.isna().sum()

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

In [11]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [12]:
# Dtype summary again, right before the text columns are converted to numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

In [13]:
# Turn ratings written like "4.1/5" into floats.
# Casting to text first keeps this cell re-runnable: once `rate` is numeric the `.str`
# accessor would raise, whereas "nan" text simply parses back to NaN.
rate_text = df['rate'].astype(str).str.replace('/5', '', regex=False).str.strip()
# Anything that is not a number after stripping the suffix becomes NaN.
df['rate'] = pd.to_numeric(rate_text, errors='coerce')

In [17]:
# The cost column is read as text. Remove thousands separators (e.g. "1,200") before
# parsing; otherwise such values are coerced to NaN and end up as 0 below.
# Casting to text first also keeps the cell re-runnable after the column is already int.
cost_text = df['approx_cost(for two people)'].astype(str).str.replace(',', '', regex=False).str.strip()
# Values that still cannot be parsed (including originally missing ones) are stored as 0.
df['approx_cost(for two people)'] = pd.to_numeric(cost_text, errors='coerce').fillna(0).astype(int)

In [19]:
# Remove the url, address and phone columns from the working frame.
df = df.drop(columns=['url', 'address', 'phone'])

In [20]:
# List the columns that remain in the working frame.
df.columns

Index(['name', 'online_order', 'book_table', 'rate', 'votes', 'location',
       'rest_type', 'dish_liked', 'cuisines', 'approx_cost(for two people)',
       'reviews_list', 'menu_item', 'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [22]:
# Give the three awkwardly named columns short identifiers.
column_renames = {
    'approx_cost(for two people)': 'ave_cost_for_2',
    'listed_in(type)': 'restaurant_category',
    'listed_in(city)': 'restaurant_region',
}
df = df.rename(columns=column_renames)

In [24]:
# Which ratings appear on rows that have no votes at all?
df.loc[df['votes'].eq(0), 'rate'].value_counts()

rate
3.7    5
3.6    3
3.9    2
3.8    2
3.4    2
3.3    2
4.0    1
4.3    1
4.1    1
Name: count, dtype: int64

In [25]:
def change_rate(row):
    """Return 0 for a row that has no votes, otherwise the row's existing rate.

    Kept as the row-level statement of the rule; the column update below applies
    the same rule to the whole frame at once.
    """
    return 0 if row['votes'] == 0 else row['rate']

# Vectorized equivalent of df.apply(change_rate, axis=1): wherever votes == 0 the rate
# is replaced by 0, every other rate (including NaN) is left as it is.
df['rate'] = df['rate'].mask(df['votes'] == 0, 0)

In [26]:
# Re-check the ratings on rows without votes after the adjustment.
df.loc[df['votes'].eq(0), 'rate'].value_counts()

rate
0.0    10027
Name: count, dtype: int64

In [29]:
# Binary label column 'target': 1 when rate is strictly above 3.75, otherwise 0.
# A missing rate compares as False, so those rows receive 0 as well.
rated_above_cutoff = df['rate'] > 3.75
df['target'] = np.where(rated_above_cutoff, 1, 0)

In [30]:
# The label has been derived from `rate`, so the raw rating column is removed.
df = df.drop(columns=['rate'])

In [31]:
# Preview the first rows of the working frame, now including `target`.
df.head()

Unnamed: 0,name,online_order,book_table,votes,location,rest_type,dish_liked,cuisines,ave_cost_for_2,reviews_list,menu_item,restaurant_category,restaurant_region,target
0,Jalsa,Yes,Yes,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,1
1,Spice Elephant,Yes,No,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,1
2,San Churro Cafe,Yes,No,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,1
3,Addhuri Udupi Bhojana,No,No,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,0
4,Grand Village,No,No,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,1


In [32]:
# Dtypes and non-null counts of the working frame after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 51717 non-null  object
 1   online_order         51717 non-null  object
 2   book_table           51717 non-null  object
 3   votes                51717 non-null  int64 
 4   location             51696 non-null  object
 5   rest_type            51490 non-null  object
 6   dish_liked           23639 non-null  object
 7   cuisines             51672 non-null  object
 8   ave_cost_for_2       51717 non-null  int32 
 9   reviews_list         51717 non-null  object
 10  menu_item            51717 non-null  object
 11  restaurant_category  51717 non-null  object
 12  restaurant_region    51717 non-null  object
 13  target               51717 non-null  int32 
dtypes: int32(2), int64(1), object(11)
memory usage: 5.1+ MB


## Analysis

#### Which is more popular: online ordering or table booking?

In [None]:
# Tally the Yes/No answers for each service.
online_order_data = df['online_order'].value_counts().reset_index()
book_table_data = df['book_table'].value_counts().reset_index()

# One standalone bar figure per service; only their traces are reused below.
bar_style = dict(y='count', text_auto=True, color_discrete_sequence=px.colors.qualitative.Bold)
online_order_bar = px.bar(online_order_data, x='online_order', color='online_order', **bar_style)
book_table_bar = px.bar(book_table_data, x='book_table', color='book_table', **bar_style)

# Copy the traces into a 1x2 grid: online orders on the left, table booking on the right.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Online Orders', 'Booking Table'))
for column_index, bar_figure in enumerate((online_order_bar, book_table_bar), start=1):
    for trace in bar_figure['data']:
        fig.add_trace(trace, row=1, col=column_index)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(
    title_text="Online Orders and Table Booking",
    title_font=title_font,
    title_x=0.5,
    showlegend=False,
)

fig.show()

#### Number of restaurants per category

In [None]:
# How many rows fall into each restaurant category.
category_counts = df['restaurant_category'].value_counts().reset_index()

fig = px.bar(
    category_counts,
    x='restaurant_category',
    y='count',
    color='restaurant_category',
    color_discrete_sequence=px.colors.qualitative.Bold,
    text_auto=True,
)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(title_text="Restaurant Category", title_font=title_font, title_x=0.5, showlegend=False)

fig.show()

#### Restaurants with branches in the most regions

In [None]:
# Number of distinct restaurant names.
df['name'].nunique()

In [None]:
# Number of distinct values in `restaurant_region`.
df['restaurant_region'].nunique()

In [None]:
# For every restaurant name, count the distinct regions it is listed in,
# then keep the 20 names with the highest counts.
regions_per_name = (
    df.groupby('name')['restaurant_region']
    .nunique()
    .rename('count regions')
    .reset_index()
)
top_names = regions_per_name.sort_values(by='count regions', ascending=False).head(20)

fig = px.bar(top_names, x='name', y='count regions', color='name', text_auto=True)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(
    title_text="The most restaurants that have branches in all region",
    title_font=title_font,
    title_x=0.5,
    showlegend=False,
)

fig.show()

#### Regions with the most restaurants

In [None]:
# Count the distinct restaurant names in each region and keep the 10 largest regions.
names_per_region = (
    df.groupby('restaurant_region')['name']
    .nunique()
    .rename('count names')
    .rename_axis('region')
    .reset_index()
)
top_regions = names_per_region.sort_values(by='count names', ascending=False).head(10)

fig = px.bar(top_regions, x='region', y='count names', color='region', text_auto=True)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(
    title_text="Regions have the most restaurants",
    title_font=title_font,
    title_x=0.5,
    showlegend=False,
)

fig.show()

#### Locations with the most restaurants

In [None]:
# Count the distinct restaurant names at each location and keep the 10 largest locations.
names_per_location = (
    df.groupby('location')['name']
    .nunique()
    .rename('count names')
    .reset_index()
)
top_locations = names_per_location.sort_values(by='count names', ascending=False).head(10)

fig = px.bar(top_locations, x='location', y='count names', color='location', text_auto=True)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(
    title_text="Locations have the most restaurants",
    title_font=title_font,
    title_x=0.5,
    showlegend=False,
)

fig.show()

#### Most common restaurant types (`rest_type`)

In [None]:
# Frequency of each rest_type value, limited to the 10 most frequent.
rest_type_counts = df['rest_type'].value_counts().reset_index()
top_rest_types = rest_type_counts.sort_values(by='count', ascending=False).head(10)

fig = px.bar(top_rest_types, x='rest_type', y='count', color='rest_type', text_auto=True)

title_font = dict(size=20, family='Arial', color='black')
fig.update_layout(
    title_text="The most famous rest_type in all restaurants",
    title_font=title_font,
    title_x=0.5,
    showlegend=False,
)

fig.show()

#### Most frequently liked dishes

In [None]:
# Number of distinct `dish_liked` strings.
df['dish_liked'].nunique()

In [None]:
# Ten most frequent `dish_liked` values.
# value_counts() already leaves out missing values, so the first row is a real entry;
# the previous `[1:]` slice discarded the single most frequent value from the chart.
# NOTE(review): each value is the full comma-separated string, not an individual dish —
# confirm that counting whole strings is what is wanted here.
dish_counts = df['dish_liked'].value_counts().reset_index()
top_dishes = dish_counts.sort_values(by='count', ascending=False).head(10)

fig = px.bar(top_dishes, x='dish_liked', y='count', text_auto=True, color='dish_liked')

fig.update_layout(
    title_text="The most famous dish liked",
    title_font=dict(size=20, family='Arial', color='black'),
    title_x=0.5,
    showlegend=False)

fig.show()

#### Most common cuisines

In [None]:
# Ten most frequent `cuisines` values.
# value_counts() already leaves out missing values, so the first row is a real entry;
# the previous `[1:]` slice discarded the single most frequent value from the chart.
# NOTE(review): each value is the full comma-separated string, not an individual cuisine —
# confirm that counting whole strings is what is wanted here.
cuisine_counts = df['cuisines'].value_counts().reset_index()
top_cuisines = cuisine_counts.sort_values(by='count', ascending=False).head(10)

fig = px.bar(top_cuisines, x='cuisines', y='count', text_auto=True, color='cuisines')

fig.update_layout(
    title_text="The most famous cuisines",
    title_font=dict(size=20, family='Arial', color='black'),
    title_x=0.5,
    showlegend=False)

fig.show()

#### Distribution of average cost for two people

In [None]:
# Highest value of the cost-for-two column.
df['ave_cost_for_2'].max()

In [None]:
# Lowest value of the cost-for-two column.
# Costs that were missing or unparseable were stored as 0 during cleaning, so a minimum of 0
# reflects those rows rather than a real price.
df['ave_cost_for_2'].min()

In [None]:
# Histogram of the cost for two people.
# A scatter with only `x` plots each cost against its row index, which does not show how
# the values are distributed; a histogram shows the frequency of each cost range.
fig = px.histogram(df, x='ave_cost_for_2')

fig.update_layout(
    title_text="Distribution of average cost for two people",
    title_font=dict(size=20, family='Arial', color='black'),
    title_x=0.5,
    showlegend=False)

fig.show()

#### Correlations

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap on a fixed [-1, 1] scale.
numeric_dtypes = ['float64', 'int64', 'int32']
numeric_df = df.select_dtypes(include=numeric_dtypes)
correlation_matrix = numeric_df.corr()

heatmap_options = dict(
    title="Correlation Matrix (Numeric Features Only)",
    labels=dict(x="Features", y="Features", color="Correlation"),
    color_continuous_scale='RdBu',
    zmin=-1,
    zmax=1,
)
fig = px.imshow(correlation_matrix, **heatmap_options)

fig.show()

#### Classification

In [None]:
# Feature matrix: everything except the label, the name/review/menu text columns and `votes`.
X = df.drop(['target', 'reviews_list', 'name', 'menu_item','votes'], axis=1)
y = df['target']

# Hold out 20% of the rows for scoring; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = ['online_order', 'location', 'book_table', 'rest_type', 'dish_liked', 'cuisines', 'restaurant_category', 'restaurant_region']
numeric_features = ['ave_cost_for_2']

# One-hot encode the categorical columns. Categories not seen during fit are ignored
# (encoded as all zeros) instead of raising. The step keeps its historical name
# 'label_encoder' although it holds a OneHotEncoder.
categorical_transformer = Pipeline(steps=[('label_encoder', OneHotEncoder(handle_unknown='ignore'))])

# Mean-impute, then standardize the numeric column.
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('normalizer', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])

# Candidate classifiers. The random forest is seeded so that its score — and therefore the
# selected model and the name of the saved file — does not change from run to run.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVC': SVC()}

# Hold-out accuracy per model name.
results = {}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

# Pick the model name with the highest hold-out accuracy.
best_model = max(results, key=results.get)
print(f"Best model is : {best_model} accuracy : {results[best_model]:.2f}")

# Refit the winning model on ALL rows and persist it as "<model name>.pkl".
# Because this fit includes X_test, scoring the saved pipeline on X_test is not an
# unbiased estimate; `results` holds the honest hold-out numbers.
best_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', models[best_model])])
best_model_pipeline.fit(X, y)
joblib.dump(best_model_pipeline, f"{best_model}.pkl")

#### Plot each model's accuracy and compare them

In [None]:
# One row per model with its hold-out accuracy, drawn as a labelled bar chart.
results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': list(results.values()),
})

bar_options = dict(x='Model', y='Accuracy', text='Accuracy', color='Model', title="Model Accuracy Comparison")
fig = px.bar(results_df, **bar_options)

fig.show()

#### Model Test

In [None]:
# Load the pipeline that the training cell saved as f"{best_model}.pkl".
# The file name is derived from `best_model` rather than hard-coded, because the winning
# model is not guaranteed to be the random forest.
model = joblib.load(f"{best_model}.pkl")

y_pred_new = model.predict(X_test)

accuracy_new = accuracy_score(y_test, y_pred_new)

# NOTE: the saved pipeline was fit on all of X, which contains X_test, so this number
# checks the save/load round trip and is not an unbiased accuracy estimate.
print(f"Accuracy on original test data: {accuracy_new:.2f}")

In [None]:
# Load the pipeline that the training cell saved as f"{best_model}.pkl"; the winning
# model is not guaranteed to be the random forest, so the name is not hard-coded.
model = joblib.load(f"{best_model}.pkl")

def get_user_input():
    """Prompt for one restaurant's attributes and return them as a one-row DataFrame.

    Text answers are stripped of surrounding whitespace: the pipeline's encoder ignores
    categories it has not seen, so a stray space would silently drop the feature.
    The cost prompt is repeated until the answer parses as a number.

    Returns:
        pandas.DataFrame with a single row and the columns online_order, location,
        book_table, rest_type, dish_liked, cuisines, restaurant_category,
        restaurant_region and ave_cost_for_2.
    """
    online_order = input("Is the order online? (Yes/No): ").strip()
    location = input("location: ").strip()
    book_table = input("Do you want to book a table? (Yes/No): ").strip()
    rest_type = input("Restaurant type: ").strip()
    dish_liked = input("Favorite dishes: ").strip()
    cuisines = input("Food: ").strip()
    restaurant_category = input("Restaurant Category: ").strip()
    restaurant_region = input("Restaurant area: ").strip()

    # Re-prompt instead of crashing with ValueError on a non-numeric answer.
    while True:
        try:
            ave_cost_for_2 = float(input("Average cost for two people: "))
            break
        except ValueError:
            print("Please enter a number, for example 800.")

    # Named `row` so it cannot be confused with the notebook-level raw frame `data`.
    row = {
        'online_order': [online_order],
        'location': [location],
        'book_table': [book_table],
        'rest_type': [rest_type],
        'dish_liked': [dish_liked],
        'cuisines': [cuisines],
        'restaurant_category': [restaurant_category],
        'restaurant_region': [restaurant_region],
        'ave_cost_for_2': [ave_cost_for_2]
    }

    return pd.DataFrame(row)

user_input_df = get_user_input()

# Predicted label for the entered restaurant (1 or 0, as defined by `target`).
y_pred_new = model.predict(user_input_df)

print("predict:", y_pred_new)