In [285]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [286]:
dataset = pd.read_csv('2020-XTern-DS.csv')
dataset.head()

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
0,ID_6321,39.262605,-85.837372,"Fast Food, Rolls, Burger, Salad, Wraps",$20.00,$50.00,3.5,12,4,30 minutes
1,ID_2882,39.775933,-85.740581,"Ice Cream, Desserts",$10.00,$50.00,3.5,11,4,30 minutes
2,ID_1595,39.253436,-85.123779,"Italian, Street Food, Fast Food",$15.00,$50.00,3.6,99,30,65 minutes
3,ID_5929,39.029841,-85.33205,"Mughlai, North Indian, Chinese",$25.00,$99.00,3.7,176,95,30 minutes
4,ID_6123,39.882284,-85.517407,"Cafe, Beverages",$20.00,$99.00,3.2,521,235,65 minutes


In [287]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Restaurant     2019 non-null   object 
 1   Latitude       2019 non-null   float64
 2   Longitude      2019 non-null   float64
 3   Cuisines       2019 non-null   object 
 4   Average_Cost   2019 non-null   object 
 5   Minimum_Order  2019 non-null   object 
 6   Rating         2019 non-null   object 
 7   Votes          2019 non-null   object 
 8   Reviews        2019 non-null   object 
 9   Cook_Time      2019 non-null   object 
dtypes: float64(2), object(8)
memory usage: 157.9+ KB


In [288]:
dataset.shape

(2019, 10)

In [289]:
restaurant = dataset['Restaurant']
dataset[restaurant.isin(restaurant[restaurant.duplicated()])].sort_values('Restaurant')

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
261,ID_1004,39.417004,-85.148521,"Fast Food, Sandwich",$10.00,$50.00,3.0,4,1,30 minutes
814,ID_1004,39.704981,-85.426350,Street Food,$5.00,$50.00,-,-,-,30 minutes
669,ID_1074,39.871351,-85.479279,"Mithai, Fast Food",$15.00,$50.00,3.5,6,2,30 minutes
1176,ID_1074,39.255173,-85.367516,"Mithai, Street Food",$15.00,$50.00,3.7,6,-,30 minutes
399,ID_1140,39.550107,-85.426366,"American, Fast Food",$20.00,$50.00,3.8,80,27,30 minutes
...,...,...,...,...,...,...,...,...,...,...
1859,ID_861,39.689933,-85.487838,Biryani,$15.00,$50.00,2.8,8,3,30 minutes
1231,ID_861,39.521080,-85.053231,Biryani,$15.00,$50.00,2.8,8,3,30 minutes
255,ID_911,39.017742,-85.690338,Biryani,$10.00,$50.00,NEW,-,-,30 minutes
932,ID_911,39.211634,-85.808586,"North Indian, Biryani",$15.00,$50.00,3.5,28,5,30 minutes


In [290]:
dataset[dataset['Restaurant'].duplicated()].sort_values('Restaurant')

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
814,ID_1004,39.704981,-85.426350,Street Food,$5.00,$50.00,-,-,-,30 minutes
1176,ID_1074,39.255173,-85.367516,"Mithai, Street Food",$15.00,$50.00,3.7,6,-,30 minutes
589,ID_1140,39.714757,-85.682464,"Burger, Fast Food, Beverages",$20.00,$50.00,4.0,2037,1062,65 minutes
719,ID_1140,39.567018,-85.452531,"American, Fast Food",$20.00,$50.00,3.9,570,127,45 minutes
537,ID_1156,39.068996,-85.823280,"Burger, Fast Food, Desserts, Beverages",$15.00,$50.00,-,-,-,30 minutes
...,...,...,...,...,...,...,...,...,...,...
1409,ID_8506,39.452312,-85.713132,"Fast Food, Street Food",$10.00,$50.00,3.3,24,10,45 minutes
1055,ID_8606,39.577662,-85.442455,"South Indian, Beverages",$25.00,$50.00,3.2,14,13,30 minutes
1859,ID_861,39.689933,-85.487838,Biryani,$15.00,$50.00,2.8,8,3,30 minutes
932,ID_911,39.211634,-85.808586,"North Indian, Biryani",$15.00,$50.00,3.5,28,5,30 minutes


## Data cleaning

Some data is also missing some ratings and prices. We can replace the columns of restaurants that have a - with a 0 since a lack of number of ratings and average rating means no one has reviewed them yet. The same applies for NEW.

`Average_Cost` and `Minimum_Order` are each preceded with $, so they need to be converted into numbers. The same applies for `Cook_Time`.

We have some non-unique restaurant rows. We can merge the rows that have the same restaurant id, summing their numbers of reviews and averaging their ratings.

In [291]:
func = lambda x: max(x)
def rating_mean(series):
    # non_zeroes = list(series.filter(lambda x: x != 0))
    arr = []
    for s in series:
        if s != 0:
            arr.append(s)
    mean = sum(arr)/len(arr) if len(arr) != 0 else 0
    return mean

dataset = dataset.replace('-', 0)
dataset = dataset.replace(['-', 'NEW', 'Opening Soon'], 0)
dataset[dataset.columns[4:6]] = dataset[dataset.columns[4:6]].replace('[\$,]', '', regex=True).astype(float)
dataset[dataset.columns[9:]] = dataset[dataset.columns[9:]].replace('[minutes,]', '', regex=True).astype(float)

to_numeric = ['Latitude', 'Longitude', 'Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews', 'Cook_Time']
dataset[to_numeric] = dataset[to_numeric].apply(pd.to_numeric)

restaurant = dataset['Restaurant']
dataset = dataset.groupby('Restaurant').aggregate({
    'Latitude': 'max',
    'Longitude': 'max',
    'Cuisines': max,
    'Average_Cost': 'mean',
    'Minimum_Order': 'mean',
    'Rating': rating_mean, 
    'Votes': 'sum', 
    'Reviews': 'sum', 
    'Cook_Time': 'mean',
    }).reset_index()


In [292]:
dataset

Unnamed: 0,Restaurant,Latitude,Longitude,Cuisines,Average_Cost,Minimum_Order,Rating,Votes,Reviews,Cook_Time
0,ID_1000,39.829351,-85.249351,"Bohri, Biryani",25.0,50.0,3.7,96,55,30.0
1,ID_1004,39.704981,-85.148521,Street Food,7.5,50.0,3.0,4,1,30.0
2,ID_1005,39.444192,-85.842006,"Street Food, Fast Food",15.0,50.0,4.3,253,118,30.0
3,ID_1007,39.620668,-85.114727,"North Indian, Continental, Italian",25.0,50.0,0.0,0,0,30.0
4,ID_1013,39.615411,-85.273855,"Desserts, Ice Cream",10.0,50.0,3.3,5,0,30.0
...,...,...,...,...,...,...,...,...,...,...
1802,ID_986,39.063546,-85.028933,"Coffee, Beverages, Sandwich",20.0,50.0,0.0,0,0,30.0
1803,ID_988,39.697670,-85.024031,"Chinese, North Indian, Mughlai",35.0,50.0,3.9,2768,1369,30.0
1804,ID_989,39.029861,-85.078761,"Cafe, Chinese, Italian, North Indian, Juices, ...",30.0,50.0,4.1,1177,696,45.0
1805,ID_99,39.689777,-85.088266,"South Indian, Street Food, Fast Food",10.0,50.0,3.8,101,38,30.0
