In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
%matplotlib widget

In [2]:
dataset = pd.read_csv('2020-XTern-DS.csv')
dataset.head()

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
0,ID_6321,39.262605,-85.837372,"Fast Food, Rolls, Burger, Salad, Wraps",$20.00,$50.00,3.5,12,4,30 minutes
1,ID_2882,39.775933,-85.740581,"Ice Cream, Desserts",$10.00,$50.00,3.5,11,4,30 minutes
2,ID_1595,39.253436,-85.123779,"Italian, Street Food, Fast Food",$15.00,$50.00,3.6,99,30,65 minutes
3,ID_5929,39.029841,-85.33205,"Mughlai, North Indian, Chinese",$25.00,$99.00,3.7,176,95,30 minutes
4,ID_6123,39.882284,-85.517407,"Cafe, Beverages",$20.00,$99.00,3.2,521,235,65 minutes


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Restaurant     2019 non-null   object 
 1   Latitude       2019 non-null   float64
 2   Longitude      2019 non-null   float64
 3   Cuisines       2019 non-null   object 
 4   Average_Cost   2019 non-null   object 
 5   Minimum_Order  2019 non-null   object 
 6   Rating         2019 non-null   object 
 7   Votes          2019 non-null   object 
 8   Reviews        2019 non-null   object 
 9   Cook_Time      2019 non-null   object 
dtypes: float64(2), object(8)
memory usage: 157.9+ KB


In [4]:
dataset.shape

(2019, 10)

In [5]:
restaurant = dataset['Restaurant']
dataset[restaurant.isin(restaurant[restaurant.duplicated()])].sort_values('Restaurant')

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
261,ID_1004,39.417004,-85.148521,"Fast Food, Sandwich",$10.00,$50.00,3.0,4,1,30 minutes
814,ID_1004,39.704981,-85.426350,Street Food,$5.00,$50.00,-,-,-,30 minutes
669,ID_1074,39.871351,-85.479279,"Mithai, Fast Food",$15.00,$50.00,3.5,6,2,30 minutes
1176,ID_1074,39.255173,-85.367516,"Mithai, Street Food",$15.00,$50.00,3.7,6,-,30 minutes
399,ID_1140,39.550107,-85.426366,"American, Fast Food",$20.00,$50.00,3.8,80,27,30 minutes
...,...,...,...,...,...,...,...,...,...,...
1859,ID_861,39.689933,-85.487838,Biryani,$15.00,$50.00,2.8,8,3,30 minutes
1231,ID_861,39.521080,-85.053231,Biryani,$15.00,$50.00,2.8,8,3,30 minutes
255,ID_911,39.017742,-85.690338,Biryani,$10.00,$50.00,NEW,-,-,30 minutes
932,ID_911,39.211634,-85.808586,"North Indian, Biryani",$15.00,$50.00,3.5,28,5,30 minutes


## Data cleaning

Some rows are missing ratings and prices. We can replace the columns of restaurants that have a - with a 0 since a lack of number of ratings and average rating means no one has reviewed them yet. The same applies for NEW.

`Average_Cost` and `Minimum_Order` are each preceded with $, so they need to be converted into numbers. The same applies for `Cook_Time`.

We have some non-unique restaurant rows. We can merge the rows that have the same restaurant id, summing their numbers of reviews and averaging their non-zero ratings. We also need to take care of restaurant cuisines. Assuming longer cuisine names describe the same cuisines as the shorter cuisine names, we can just take those longer cuisine names.

Further, we need to standardize variant numerical values so that they don't produce any unwanted bias. Further, the Cuisine column introduces many values at once. We can add columns for each category with a one-hot encoding to see if there are any trends with Cuisine.

In [6]:
func = lambda x: max(x)
def rating_mean(series):
    # non_zeroes = list(series.filter(lambda x: x != 0))
    arr = []
    for s in series:
        if s != 0:
            arr.append(s)
    mean = sum(arr)/len(arr) if len(arr) != 0 else 0
    return mean

dataset = dataset.replace('-', 0)
dataset = dataset.replace(['-', 'NEW', 'Opening Soon'], 0)
dataset[dataset.columns[4:6]] = dataset[dataset.columns[4:6]].replace('[\$,]', '', regex=True).astype(float)
dataset[dataset.columns[9:]] = dataset[dataset.columns[9:]].replace('[minutes,]', '', regex=True).astype(float)

to_numeric = ['Latitude', 'Longitude', 'Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews', 'Cook_Time']
dataset[to_numeric] = dataset[to_numeric].apply(pd.to_numeric)

dataset = dataset.groupby('Restaurant').aggregate({
    'Latitude': 'max',
    'Longitude': 'max',
    'Cuisines': max,
    'Average_Cost': 'mean',
    'Minimum_Order': 'mean',
    'Rating': rating_mean, 
    'Votes': 'sum', 
    'Reviews': 'sum', 
    'Cook_Time': 'mean',
    }).reset_index()

cuisine_columns = dataset['Cuisines'].str.get_dummies(",")
dataset = dataset.drop('Cuisines', axis=1)
dataset = dataset.join(cuisine_columns)
dataset[['Average_Cost', 'Votes', 'Reviews', 'Cook_Time']] = MinMaxScaler().fit_transform(dataset[['Average_Cost','Votes', 'Reviews', 'Cook_Time']])

In [7]:
dataset

Unnamed: 0,Restaurant,Latitude,Longitude,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time,Afghan,...,Salad,Sandwich,Seafood,South Indian,Street Food,Tamil,Tea,Thai,Tibetan,Wraps
0,ID_1000,39.829351,-85.249351,0.173913,50.0,3.7,0.010603,0.008456,0.181818,0,...,0,0,0,0,0,0,0,0,0,0
1,ID_1004,39.704981,-85.148521,0.021739,50.0,3.0,0.000442,0.000154,0.181818,0,...,0,0,0,0,1,0,0,0,0,0
2,ID_1005,39.444192,-85.842006,0.086957,50.0,4.3,0.027943,0.018143,0.181818,0,...,0,0,0,0,1,0,0,0,0,0
3,ID_1007,39.620668,-85.114727,0.173913,50.0,0.0,0.000000,0.000000,0.181818,0,...,0,0,0,0,0,0,0,0,0,0
4,ID_1013,39.615411,-85.273855,0.043478,50.0,3.3,0.000552,0.000000,0.181818,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1802,ID_986,39.063546,-85.028933,0.130435,50.0,0.0,0.000000,0.000000,0.181818,0,...,0,0,0,0,0,0,0,0,0,0
1803,ID_988,39.697670,-85.024031,0.260870,50.0,3.9,0.305721,0.210486,0.181818,0,...,0,0,0,0,0,0,0,0,0,0
1804,ID_989,39.029861,-85.078761,0.217391,50.0,4.1,0.129998,0.107011,0.318182,0,...,0,0,0,0,0,0,0,0,0,0
1805,ID_99,39.689777,-85.088266,0.043478,50.0,3.8,0.011155,0.005843,0.181818,0,...,0,0,0,1,0,0,0,0,0,0


## Locations of Restaurants

In [8]:
fig = plt.figure()
plt.scatter(x="Latitude", y="Longitude", data=dataset)
plt.title("Locations of restaurants")
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

It doesn't look like there's any trends regarding how the restaurants are located. It looks pretty evenly distributed in this section of land. Maybe there's some trends if we look at rating versus location.

In [None]:
fig2 = plt.figure()
ax = Axes3D(fig2)
lat = dataset["Latitude"]
long = dataset["Longitude"]
rating = dataset["Cook_Time"]

ax.scatter(lat, long, rating)
plt.show()

It doesn't look like location has much to do with the rating of a restaurant.

## Clustering Analysis
We can try to find insights from clustering.

In [10]:
features = dataset.columns[1:]
features_ = dataset[features]

error_loss =[] 
for k in range(1,10): 
    kmeans = KMeans(n_clusters=k, random_state=0) 
    kmeans.fit(features_) 
    error_loss.append(kmeans.inertia_) 

fig3 = plt.figure()
plt.plot(range(1,10), error_loss) 
plt.xticks(np.arange(1, 10, step=1))
plt.title("Number of clusters (K) versus Error loss")
plt.xlabel("K") 
plt.ylabel("Error Loss")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Error Loss')

It looks like we can take k=4 clusters.

In [41]:
k_clusters = KMeans(n_clusters=4, random_state=0).fit(features_)
clusters = k_clusters.predict(features_)

clustered_features = pd.DataFrame(features_, index=features_.index, columns=features_.columns)
clustered_features["cluster"] = clusters
clustered_features = pd.concat([clustered_features.iloc[:,0:8], clustered_features.iloc[:, -1:]],axis=1)

pca = PCA(n_components=2)
pca.fit(features_)
reduced_features = pca.transform(features_)

fig_2 = plt.figure()
plt.title("Parallel Coordinates Plot for each Cluster")
pd.plotting.parallel_coordinates(clustered_features, 'cluster', color=('#FF0000', '#00FF00', '#0000FF', '#FFFF00'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Parallel Coordinates Plot for each Cluster'}>

**Insight 1:** It's hard to see in the above graph, but it seems that the clusters found are distinguishing between restaurants with minimum orders of varying degrees. This likely because of the discrete steps of 10 or so in which minimum order cost varies between restaurants.

## Linear Regression
Maybe we can find some insights concerning how each of the features contributes to an accurate model of high ratings.

In [12]:
ratings = dataset['Rating']
lin_reg_features = features_.drop('Rating', axis=1)
lin_reg = LinearRegression().fit(lin_reg_features, ratings)
coefficients = lin_reg.coef_

food_types = []
for i, coefficient in enumerate(coefficients):
    if i > 6:
        food_types.append([lin_reg_features.columns[i], coefficient])
    print(f"{lin_reg_features.columns[i]}: {coefficient}")

Latitude: 0.027261647917442716
Longitude: 0.03866380126354707
Average_Cost: 2.5241726803290656
Minimum_Order: -0.00016913941500129592
Votes: 9.50090638893329
Reviews: -7.37212442152066
Cook_Time: 1.967541272133367
 Afghan: 0.18337903325931842
 American: 0.25598772720502394
 Andhra: -0.15846604295099143
 Arabian: -0.05509703119927373
 Asian: 0.19462179443787309
 Assamese: -2.106340390075248
 Awadhi: -1.2696194764790119
 BBQ: 0.4204330987109851
 Bakery: -0.055055638028809306
 Bangladeshi: 1.019235531006366
 Belgian: 0.6651106024625535
 Bengali: -0.23907425447909741
 Beverages: -0.12472949916814086
 Bihari: -3.426049674340643
 Biryani: 0.17611926333598094
 Brazilian: 0.17451013271940377
 Bubble Tea: -13024788440.905233
 Burger: 0.24924054482536695
 Burmese: 13024788439.384022
 Cafe: 0.11107981630224628
 Cantonese: 0.369239237966599
 Charcoal Chicken: 0.6903303332902848
 Chettinad: -3.1079124455878513
 Chinese: -0.03688072495511108
 Coffee: -0.5415921709868327
 Continental: -0.158481450342

In [13]:
food_types

[[' Afghan', 0.18337903325931842],
 [' American', 0.25598772720502394],
 [' Andhra', -0.15846604295099143],
 [' Arabian', -0.05509703119927373],
 [' Asian', 0.19462179443787309],
 [' Assamese', -2.106340390075248],
 [' Awadhi', -1.2696194764790119],
 [' BBQ', 0.4204330987109851],
 [' Bakery', -0.055055638028809306],
 [' Bangladeshi', 1.019235531006366],
 [' Belgian', 0.6651106024625535],
 [' Bengali', -0.23907425447909741],
 [' Beverages', -0.12472949916814086],
 [' Bihari', -3.426049674340643],
 [' Biryani', 0.17611926333598094],
 [' Brazilian', 0.17451013271940377],
 [' Bubble Tea', -13024788440.905233],
 [' Burger', 0.24924054482536695],
 [' Burmese', 13024788439.384022],
 [' Cafe', 0.11107981630224628],
 [' Cantonese', 0.369239237966599],
 [' Charcoal Chicken', 0.6903303332902848],
 [' Chettinad', -3.1079124455878513],
 [' Chinese', -0.03688072495511108],
 [' Coffee', -0.5415921709868327],
 [' Continental', -0.15848145034295547],
 [' Desserts', 0.09128692836324342],
 [' European', 

**Insight 2:** We can see that Votes and Average_Cost are more correlated with high ratings. Cook_Time is also correlated with high ratings. Likely, a good restaurant will receive more votes and delivers food in a timely manner, and the higher the average cost, the more likely the food is to be high quality.

In terms of types of cuisine, it appears Mishti, Malaysian, Paan, and Tea cuisine categories are slightly correlated with high ratings. This is not to say that cuisines of other types are disliked; many are likely underrepresented which the regression can't account for.

## Correlations

In [66]:
corr = pd.concat([dataset.iloc[:,8:9], dataset.iloc[:, 9:]], axis=1).corr()
corr

Unnamed: 0,Cook_Time,Afghan,American,Andhra,Arabian,Asian,Assamese,Awadhi,BBQ,Bakery,...,Salad,Sandwich,Seafood,South Indian,Street Food,Tamil,Tea,Thai,Tibetan,Wraps
Cook_Time,1.000000,-0.019440,0.053471,0.009482,0.007123,0.008782,-0.013742,-0.019440,0.013428,0.025886,...,0.024082,0.007095,-0.003933,-0.051396,-0.020440,-0.019440,-0.027508,0.051204,-0.017348,0.036189
Afghan,-0.019440,1.000000,-0.002722,-0.001753,-0.001921,-0.003244,-0.000783,-0.001108,-0.002483,-0.003780,...,-0.002076,-0.002483,-0.001753,-0.008597,-0.005730,-0.001108,-0.001568,-0.001108,-0.001753,-0.001357
American,0.053471,-0.002722,1.000000,-0.004307,-0.004719,-0.007968,-0.001924,-0.002722,-0.006099,0.112280,...,-0.005099,-0.006099,-0.004307,-0.021117,-0.014074,-0.002722,-0.003851,-0.002722,-0.004307,-0.003334
Andhra,0.009482,-0.001753,-0.004307,1.000000,-0.003040,-0.005133,-0.001240,-0.001753,-0.003929,-0.005981,...,-0.003285,-0.003929,-0.002775,0.029906,-0.009067,-0.001753,-0.002481,-0.001753,-0.002775,-0.002148
Arabian,0.007123,-0.001921,-0.004719,-0.003040,1.000000,-0.005625,-0.001358,-0.001921,-0.004306,-0.006554,...,-0.003599,-0.004306,-0.003040,-0.014907,-0.009935,-0.001921,-0.002719,-0.001921,-0.003040,-0.002354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tamil,-0.019440,-0.001108,-0.002722,-0.001753,-0.001921,-0.003244,-0.000783,-0.001108,-0.002483,-0.003780,...,-0.002076,-0.002483,-0.001753,-0.008597,-0.005730,1.000000,-0.001568,-0.001108,-0.001753,-0.001357
Tea,-0.027508,-0.001568,-0.003851,-0.002481,-0.002719,-0.004590,-0.001108,-0.001568,-0.003514,-0.005348,...,-0.002937,-0.003514,-0.002481,-0.012165,-0.008108,-0.001568,1.000000,-0.001568,-0.002481,-0.001921
Thai,0.051204,-0.001108,-0.002722,-0.001753,-0.001921,-0.003244,-0.000783,-0.001108,-0.002483,-0.003780,...,-0.002076,-0.002483,-0.001753,-0.008597,-0.005730,-0.001108,-0.001568,1.000000,-0.001753,-0.001357
Tibetan,-0.017348,-0.001753,-0.004307,-0.002775,-0.003040,-0.005133,-0.001240,-0.001753,-0.003929,-0.005981,...,-0.003285,-0.003929,-0.002775,-0.013605,-0.009067,-0.001753,-0.002481,-0.001753,1.000000,-0.002148


In [93]:
print(corr['Cook_Time'][corr['Cook_Time'].isin([max(corr['Cook_Time'][1:])])])
print(corr['Cook_Time'][corr['Cook_Time'].isin([min(corr['Cook_Time'][1:])])])

Bakery    0.115256
Name: Cook_Time, dtype: float64
Fast Food   -0.073005
Name: Cook_Time, dtype: float64


**Insight 3:** While it is of no surprise to most people, it is fitting that bakeries are more correlated with longer cook times, and fast food is negatively correlated with cook times. It is surprising that fast food is not more negative, although some of the non-typical fast food restaurants are categorized as fast food.