# _Predictive Modeling of Dining Costs in Bengaluru Restaurants Using Zomato Data_

In [19]:
# Import all the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb

_**Data Cleaning and Pre-processing**_

In [20]:
# Load the Datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [21]:
# Drop columns which are not required
to_drop = ['reviews_list', 'menu_item', 'location']
train_data = train_data.drop(columns = to_drop)
test_data = test_data.drop(columns = to_drop)

# Converting all elements in cost column to hundreds
train_data['cost'] = train_data['cost'].apply(lambda x: x * 1000 if x < 10 else x)

# Create a LabelEncoder object
label_encoder = LabelEncoder()
requires_encoding = ['rest_type', 'cuisines', 'type', 'locality', 'restaurant_id', 'reservations', 'online_order']

# Iterate through all columns in the requires_encoding and encode elements
for column in requires_encoding:
    train_data[column] = label_encoder.fit_transform(train_data[column])
    test_data[column] = label_encoder.fit_transform(test_data[column])

# Rescaling ratings and votes
train_data['rating'] = train_data['rating']/(train_data['rating'].max())
test_data['rating'] = test_data['rating']/(test_data['rating'].max())

train_data['votes'] = train_data['votes']/(train_data['votes'].max())
test_data['votes'] = test_data['votes']/(test_data['votes'].max())

# Remove outliers
z_threshold = 3
def remove_outliers(data, columns, z_threshold):
    z_scores = (data[columns] - data[columns].mean()) / data[columns].std()
    data_no_outliers = data[(np.abs(z_scores) < z_threshold).all(axis=1)]
    return data_no_outliers
train_data_sub = remove_outliers(train_data, ['cost'], z_threshold)
train_data_sub

Unnamed: 0,restaurant_id,online_order,reservations,rating,votes,rest_type,cuisines,type,locality,cost
2,7693,1,0,0.918367,0.024834,27,204,4,14,1200.0
3,30228,1,1,0.795918,0.014318,77,2193,4,16,600.0
4,2795,1,1,0.877551,0.039092,23,655,1,27,750.0
5,3543,0,0,0.795918,0.002139,27,1050,4,5,1500.0
6,35659,1,1,0.795918,0.038082,77,1785,2,18,400.0
...,...,...,...,...,...,...,...,...,...,...
39511,21505,0,1,0.734694,0.001248,27,2082,4,12,600.0
39512,23059,0,1,0.877551,0.027567,40,1412,3,0,400.0
39513,4893,0,1,0.795918,0.000000,77,1713,4,23,300.0
39514,21219,0,1,0.653061,0.000238,77,2316,2,23,350.0


_**ML Model- XGBoost**_

In [22]:
# Extreme Gradient Boosting (XGBoost) algorithm
X_train, y_train = train_data_sub.to_numpy()[:, :-1], train_data_sub.to_numpy()[:, -1]
xgbr = xgb.XGBRegressor()
xgbr.fit(X_train, y_train)

train_data_copy = train_data_sub.copy()
train_data_copy['XGBRPredictions'] = xgbr.predict(X_train)  #Testing on the training dataset
train_data_copy

Unnamed: 0,restaurant_id,online_order,reservations,rating,votes,rest_type,cuisines,type,locality,cost,XGBRPredictions
2,7693,1,0,0.918367,0.024834,27,204,4,14,1200.0,1321.019897
3,30228,1,1,0.795918,0.014318,77,2193,4,16,600.0,592.016602
4,2795,1,1,0.877551,0.039092,23,655,1,27,750.0,745.360657
5,3543,0,0,0.795918,0.002139,27,1050,4,5,1500.0,1265.505859
6,35659,1,1,0.795918,0.038082,77,1785,2,18,400.0,351.472107
...,...,...,...,...,...,...,...,...,...,...,...
39511,21505,0,1,0.734694,0.001248,27,2082,4,12,600.0,630.736450
39512,23059,0,1,0.877551,0.027567,40,1412,3,0,400.0,290.635315
39513,4893,0,1,0.795918,0.000000,77,1713,4,23,300.0,296.157562
39514,21219,0,1,0.653061,0.000238,77,2316,2,23,350.0,242.585815


In [15]:
#Calculating the Root-Mean-Squared-Error (RMSE) value
mse = mean_squared_error(train_data_sub['cost'], train_data_copy['XGBRPredictions'])
mse**0.5

99.55687451679584

In [23]:
# Implementing it on actual test data
X_test = test_data.to_numpy()
test_data['PredictedCost'] = xgbr.predict(X_test) #Testing on the test dataset
test_data

Unnamed: 0,restaurant_id,online_order,reservations,rating,votes,rest_type,cuisines,type,locality,PredictedCost
0,6558,0,1,0.795918,0.000000,67,1239,4,29,845.064819
1,9121,1,1,0.673469,0.008024,33,690,2,28,1169.118530
2,2983,0,1,0.857143,0.056365,16,511,4,29,556.432800
3,9742,0,1,0.612245,0.001605,67,330,4,28,887.205688
4,2277,0,1,0.795918,0.000000,67,181,3,19,705.727722
...,...,...,...,...,...,...,...,...,...,...
9874,1003,1,1,0.755102,0.001672,79,180,2,0,371.543671
9875,9115,1,1,0.857143,0.118347,23,681,4,20,818.739563
9876,2195,0,1,0.795918,0.000267,67,1089,4,13,1170.757202
9877,2832,1,1,0.775510,0.020794,23,766,0,14,731.650024


In [24]:
test_data[['restaurant_id', 'PredictedCost']].to_csv('submissions.csv', index = False) # RMSE has not been calculated as the Actual values were hidden