# _Predictive Modeling of Dining Costs in Bengaluru Restaurants Using Zomato Data_

In [1]:
# Import all the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb

_**Data Cleaning and Pre-processing**_

In [2]:
# Load the Datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Drop columns which are not required
to_drop = ['reviews_list', 'menu_item', 'location']
train_data = train_data.drop(columns = to_drop)
test_data = test_data.drop(columns = to_drop)

# Converting all elements in cost column to hundreds
train_data['cost'] = train_data['cost'].apply(lambda x: x * 1000 if x < 10 else x)

# One- hot encoding 'onine_order' and 'reservations' 
train_data['online_order'] = train_data['online_order'].map({'Yes': 1, 'No': 0})
test_data['online_order'] = test_data['online_order'].map({'Yes': 1, 'No': 0})
train_data['reservations'] = train_data['reservations'].map({'Available': 1, 'Not available': 0})
test_data['reservations'] = test_data['reservations'].map({'Available': 1, 'Not available': 0})

# Create a LabelEncoder object
label_encoder = LabelEncoder()
requires_encoding = ['rest_type', 'cuisines', 'type', 'locality', 'restaurant_id']

# Iterate through all columns in the requires_encoding and encode elements
for column in requires_encoding:
    train_data[column] = label_encoder.fit_transform(train_data[column])
    test_data[column] = label_encoder.fit_transform(test_data[column])

# Rescaling ratings and votes
train_data['rating'] = train_data['rating']/(train_data['rating'].max())
test_data['rating'] = test_data['rating']/(test_data['rating'].max())

train_data['votes'] = train_data['votes']/(train_data['votes'].max())
test_data['votes'] = test_data['votes']/(test_data['votes'].max())

_**ML Model- XGBoost**_

In [4]:
# Extreme Gradient Boosting (XGBoost) algorithm
X_train, y_train = train_data.to_numpy()[:, :-1], train_data.to_numpy()[:, -1]
xgbr = xgb.XGBRegressor()
xgbr.fit(X_train, y_train)
train_data['XGBRPredictions'] = xgbr.predict(X_train) #Testing on the training dataset
train_data

Unnamed: 0,restaurant_id,online_order,reservations,rating,votes,rest_type,cuisines,type,locality,cost,XGBRPredictions
0,16823,0,0,0.938776,0.058163,49,2526,0,11,3000.0,2935.339355
1,1251,0,1,0.857143,0.185124,59,951,4,5,2500.0,2429.441650
2,7693,1,1,0.918367,0.024834,27,204,4,14,1200.0,1252.510742
3,30228,1,0,0.795918,0.014318,77,2193,4,16,600.0,538.671021
4,2795,1,0,0.877551,0.039092,23,655,1,27,750.0,730.998413
...,...,...,...,...,...,...,...,...,...,...,...
39511,21505,0,0,0.734694,0.001248,27,2082,4,12,600.0,593.372437
39512,23059,0,0,0.877551,0.027567,40,1412,3,0,400.0,353.619019
39513,4893,0,0,0.795918,0.000000,77,1713,4,23,300.0,300.549469
39514,21219,0,0,0.653061,0.000238,77,2316,2,23,350.0,243.267914


In [5]:
#Calculating the Root-Mean-Squared-Error (RMSE) value
mse = mean_squared_error(train_data['cost'], train_data['XGBRPredictions'])
mse**0.5

120.74616288907971

In [6]:
# Implementing it on actual test data
X_test = test_data.to_numpy()
test_data['PredictedCost'] = xgbr.predict(X_test) #Testing on the test dataset
test_data

Unnamed: 0,restaurant_id,online_order,reservations,rating,votes,rest_type,cuisines,type,locality,PredictedCost
0,6558,0,0,0.795918,0.000000,67,1239,4,29,409.593872
1,9121,1,0,0.673469,0.008024,33,690,2,28,969.656433
2,2983,0,0,0.857143,0.056365,16,511,4,29,598.779175
3,9742,0,0,0.612245,0.001605,67,330,4,28,513.686584
4,2277,0,0,0.795918,0.000000,67,181,3,19,417.142609
...,...,...,...,...,...,...,...,...,...,...
9874,1003,1,0,0.755102,0.001672,79,180,2,0,336.707947
9875,9115,1,0,0.857143,0.118347,23,681,4,20,829.819031
9876,2195,0,0,0.795918,0.000267,67,1089,4,13,628.173035
9877,2832,1,0,0.775510,0.020794,23,766,0,14,561.921631


In [7]:
test_data[['restaurant_id', 'PredictedCost']].to_csv('submissions.csv', index = False) # RMSE has not been calculated as the Actual values were hidden