In [398]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import requests
from io import BytesIO

In [399]:
# URL of the Excel file
train_url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Restaurant%20Food%20Cost/Data_Train.xlsx"
test_url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Restaurant%20Food%20Cost/Data_Test.xlsx"

# Download the Excel files
def download_excel(url):
    response = requests.get(url)
    content = response.content
    return BytesIO(content)

# Load the training dataset
train = pd.read_excel(download_excel(train_url))

# Load the test dataset
test = pd.read_excel(download_excel(test_url))


# Exploratory Data Analysis (EDA)

In [400]:
train.head()

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300


In [401]:
test.head()

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES
0,CASUAL DINING,4085,"North Indian, Chinese, Mughlai, Kebab",12noon – 12midnight (Mon-Sun),Noida,Sector 18,4.3,564 votes
1,QUICK BITES,12680,"South Indian, Fast Food, Pizza, North Indian",7am – 12:30AM (Mon-Sun),Mumbai,Grant Road,4.2,61 votes
2,CASUAL DINING,1411,"North Indian, Seafood, Biryani, Chinese",11am – 11:30pm (Mon-Sun),Mumbai,Marine Lines,3.8,350 votes
3,,204,Biryani,"9am – 10pm (Mon, Wed, Thu, Fri, Sat, Sun), 10:...",Faridabad,NIT,3.8,1445 votes
4,QUICK BITES,13453,"South Indian, Kerala",11am – 10pm (Mon-Sun),Kochi,Kaloor,3.6,23 votes


In [402]:
train.shape, test.shape

((12690, 9), (4231, 8))

In [403]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12690 entries, 0 to 12689
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TITLE          12690 non-null  object
 1   RESTAURANT_ID  12690 non-null  int64 
 2   CUISINES       12690 non-null  object
 3   TIME           12690 non-null  object
 4   CITY           12578 non-null  object
 5   LOCALITY       12592 non-null  object
 6   RATING         12688 non-null  object
 7   VOTES          11486 non-null  object
 8   COST           12690 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 892.4+ KB


In [404]:
for i in train.columns:
    print("Unique values in", i, train[i].nunique())

Unique values in TITLE 113
Unique values in RESTAURANT_ID 11892
Unique values in CUISINES 4155
Unique values in TIME 2689
Unique values in CITY 359
Unique values in LOCALITY 1416
Unique values in RATING 32
Unique values in VOTES 1847
Unique values in COST 86


In [405]:
train.isnull().sum()

TITLE               0
RESTAURANT_ID       0
CUISINES            0
TIME                0
CITY              112
LOCALITY           98
RATING              2
VOTES            1204
COST                0
dtype: int64

## Data pre-processing

In [406]:
# merge train and test
df = train.append(test,ignore_index=True)

In [407]:
df = df[['TITLE', 'CUISINES', 'TIME', 'CITY', 'LOCALITY', 'RATING', 'VOTES', 'COST']]

In [408]:
import re

def extract_closed(time):
    a = re.findall('Closed \(.*?\)', time)
    if a != []:
        return a[0]
    else:
        return 'NA'

df['CLOSED'] = df['TIME'].apply(extract_closed)

In [409]:
df['TIME'] = df['TIME'].str.replace(r'Closed \(.*?\)','')
#df['TIME'] = df['TIME'].str.replace(r'Closed...','')

In [410]:
df['RATING'] = df['RATING'].str.replace('NEW', '1')
df['RATING'] = df['RATING'].str.replace('-', '1').astype(float)

In [411]:
df['VOTES'] = df['VOTES'].str.replace(' votes', '').astype(float)

In [412]:
df['CITY'].fillna('Missing', inplace=True)  
df['LOCALITY'].fillna('Missing', inplace=True)  
df['RATING'].fillna(3.8, inplace=True)  
df['VOTES'].fillna(0.0, inplace=True)

In [413]:
df['COST'] = df['COST'].astype(float)

In [414]:
df.head(2)

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED
0,CASUAL DINING,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49.0,1200.0,
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30.0,1500.0,


In [415]:
df['TITLE'].nunique(), df['CUISINES'].nunique()

(123, 5183)

In [416]:
calc_mean = df.groupby(['CITY'], axis=0).agg({'RATING': 'mean'}).reset_index()
calc_mean.columns = ['CITY','CITY_MEAN_RATING']
df = df.merge(calc_mean, on=['CITY'],how='left')

calc_mean = df.groupby(['LOCALITY'], axis=0).agg({'RATING': 'mean'}).reset_index()
calc_mean.columns = ['LOCALITY','LOCALITY_MEAN_RATING']
df = df.merge(calc_mean, on=['LOCALITY'],how='left')

In [417]:
df.head(2)

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED,CITY_MEAN_RATING,LOCALITY_MEAN_RATING
0,CASUAL DINING,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49.0,1200.0,,3.376271,3.388889
1,"CASUAL DINING,BAR","Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30.0,1500.0,,3.584588,3.472222


In [418]:
df.isnull().sum()

TITLE                      0
CUISINES                   0
TIME                       0
CITY                       0
LOCALITY                   0
RATING                     0
VOTES                      0
COST                    4231
CLOSED                     0
CITY_MEAN_RATING           0
LOCALITY_MEAN_RATING       0
dtype: int64

In [419]:
# dropping extra cols
#df.drop(['TIME', 'CITY','LOCALITY'], axis = 1, inplace = True)

In [420]:
#Encoding categorical columns using OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
for i in df.columns:
    if df[i].dtype=="object":
        df[i]=OE.fit_transform(df[i].values.reshape(-1, 1))
df

Unnamed: 0,TITLE,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST,CLOSED,CITY_MEAN_RATING,LOCALITY_MEAN_RATING
0,34.0,3294.0,1125.0,401.0,328.0,3.6,49.0,1200.0,23.0,3.376271,3.388889
1,36.0,364.0,2528.0,75.0,1215.0,4.2,30.0,1500.0,23.0,3.584588,3.472222
2,34.0,3729.0,1033.0,75.0,1273.0,3.8,221.0,800.0,23.0,3.584588,3.550000
3,104.0,5156.0,567.0,278.0,161.0,4.1,24.0,800.0,23.0,3.697880,3.721622
4,55.0,2218.0,977.0,278.0,709.0,3.8,165.0,300.0,23.0,3.697880,3.986420
...,...,...,...,...,...,...,...,...,...,...,...
16916,34.0,4259.0,517.0,317.0,1185.0,3.9,287.0,,23.0,3.713113,3.920000
16917,95.0,785.0,977.0,42.0,461.0,4.3,469.0,,23.0,3.692381,3.670000
16918,104.0,2089.0,3123.0,128.0,1371.0,3.7,53.0,,23.0,3.522018,2.966667
16919,104.0,4580.0,832.0,216.0,642.0,1.0,0.0,,23.0,2.455968,2.884615


In [421]:
train_df = df[df['COST'].isnull()!=True]
test_df = df[df['COST'].isnull()==True]
test_df.drop('COST', axis=1, inplace=True)

In [422]:
train_df.shape, test_df.shape

((12690, 11), (4231, 10))

In [423]:
train_df['COST'] = np.log1p(train_df['COST'])

## Train test split

In [424]:
X = train_df.drop(labels=['COST'], axis=1)
y = train_df['COST'].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [425]:
X_train.shape, y_train.shape, X_cv.shape, y_cv.shape

((9517, 10), (9517,), (3173, 5), (3173,))

## Build the model

## Linear Regression

In [426]:
from sklearn.linear_model import LinearRegression
lrReg = LinearRegression()
model=lrReg.fit(X_train, y_train)

In [427]:
lrReg.score(X_test, y_test)

0.2788805564502609

In [428]:
y_pred = model.predict(X_test)

In [429]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.3686707917136279
Root Mean Squared Error: 0.6071826675009984
R-squared: 0.2788805564502609


## Decision Tree Regressor

In [432]:
from sklearn.tree import DecisionTreeRegressor
DTReg = DecisionTreeRegressor()
model=DTReg.fit(X_train, y_train)

In [433]:
DTReg.score(X_test, y_test)

0.4112794194507077

In [434]:
y_pred = model.predict(X_test)

In [435]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.30098215277736257
Root Mean Squared Error: 0.5486184036079746
R-squared: 0.4112794194507077
