## Zomato Bangalore Restaurants Prediction
The goal of analyzing the Zomato dataset is to understand the factors that affect the establishment
of different types of restaurants in different parts of Bengaluru.

In [20]:
# Input: processed Zomato dataset consumed by this notebook.
DF_PATH = "../data/processed/ML_zomato_processed.csv"

# Outputs: the fitted pipeline and the feature column names it was trained on.
MODEL_SAVE_PATH = "../models/last_model.pkl"
INPUTS_SAVE_PATH = "../models/last_inputs.pkl"

### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Read dataframe

In [3]:
# Load the processed dataset from DF_PATH, then preview three random rows.
df = pd.read_csv(filepath_or_buffer = DF_PATH)
df.sample(n = 3)

Unnamed: 0,online_order,book_table,location,approx_cost(for two people),listed_in(type),listed_in(city),rest_type_counts,cuisines_counts,success
7531,No,No,Frazer Town,250.0,Delivery,Church Street,1,2,0
13382,Yes,No,BTM,400.0,Delivery,JP Nagar,1,1,0
7103,Yes,No,Vasanth Nagar,500.0,Cafes,Church Street,1,3,1


In [4]:
# Target is the 'success' column; every other column is used as a feature.
y = df['success']
X = df.drop(columns = 'success')

### Create final pipeline with best parameters

In [5]:
# Show the column names of the loaded frame; the encoders below select columns by these names.
df.columns

Index(['online_order', 'book_table', 'location', 'approx_cost(for two people)',
       'listed_in(type)', 'listed_in(city)', 'rest_type_counts',
       'cuisines_counts', 'success'],
      dtype='object')

In [6]:
# Columns that get one-hot encoding (first level dropped) and columns that
# get binary encoding.
onehot_columns = ['online_order', 'book_table']
binary_columns = ['location', 'listed_in(type)', 'listed_in(city)']

Encoders = ColumnTransformer(
    transformers = [
        ('OHE', OneHotEncoder(sparse_output = False, drop = 'first'), onehot_columns),
        ('BE', BinaryEncoder(), binary_columns),
    ],
    # Columns not listed above are forwarded unchanged.
    remainder = 'passthrough',
)

In [7]:
# Full modelling pipeline: encode categoricals -> scale -> random forest.
# The forest is seeded so that its bootstrap samples and per-split feature
# subsampling are repeatable; without a seed the grid-search ranking and the
# saved model differ on every run.
steps = [
    ('Encoders', Encoders),
    ('Scaler', RobustScaler()),
    ('RF', RandomForestClassifier(random_state = 42)),
]
pipline = Pipeline(steps = steps)

In [8]:
# Hyper-parameter grid for the 'RF' pipeline step; the 'RF__' prefix routes
# each entry to that step's estimator.
params = dict(
    RF__n_estimators = [50, 100, 200],
    RF__max_depth = [5, 10, 15, 20],
    RF__min_samples_split = [2, 5, 10],
    RF__min_samples_leaf = [1, 2, 5],
    RF__max_features = [0.5, 'sqrt', 'log2'],
    RF__ccp_alpha = [0.0001, 0.001, 0.01],
)

In [9]:
# Exhaustive 5-fold search over every combination in `params`, scored by
# accuracy. Train scores are kept so over-fitting can be inspected afterwards.
# n_jobs = -1 spreads the candidate fits over all available cores; it does not
# change how candidates are scored.
grid = GridSearchCV(
    estimator = pipline,
    param_grid = params,
    cv = 5,
    scoring = 'accuracy',
    return_train_score = True,
    n_jobs = -1,
)

In [10]:
# Run the cross-validated search on the full feature matrix and target.
grid.fit(X = X, y = y)

In [11]:
# Parameter combination that achieved the best mean cross-validated accuracy.
grid.best_params_

{'RF__ccp_alpha': 0.0001,
 'RF__max_depth': 20,
 'RF__max_features': 0.5,
 'RF__min_samples_leaf': 1,
 'RF__min_samples_split': 5,
 'RF__n_estimators': 200}

In [12]:
# Position of the winning candidate inside the grid.cv_results_ arrays.
grid.best_index_

248

In [13]:
# Mean accuracy of the winning candidate on the training folds; compare with
# the test-fold score to gauge over-fitting.
best_row = grid.best_index_
grid.cv_results_['mean_train_score'][best_row]

0.8357676001983366

In [14]:
# Mean accuracy of the winning candidate on the held-out folds; best_score_ is
# the 'mean_test_score' entry at best_index_.
grid.best_score_

0.7656662764663438

In [15]:
# Keep the pipeline built with the best parameters found by the search.
final_model = grid.best_estimator_

In [16]:
# GridSearchCV (refit = True by default) has already refitted best_estimator_
# on all of X and y, so no second fit is needed. Fitting again would only
# repeat that work and, when the forest has no random_state, replace the
# selected model with a different one. Display the fitted pipeline instead.
final_model

### Save our model into a pkl file

In [17]:
import joblib

In [19]:
# Persist the fitted pipeline to MODEL_SAVE_PATH with compression level 2.
joblib.dump(value = final_model, filename = MODEL_SAVE_PATH, compress = 2)

['../models/last_model.pkl']

In [22]:
# Persist the feature column names (in training order) to INPUTS_SAVE_PATH so
# the saved model's expected inputs are recorded alongside it.
joblib.dump(value = X.columns, filename = INPUTS_SAVE_PATH, compress = 2)

['../models/last_inputs.pkl']