# Prediction of restaurant ratings: EDA Notebook 

In [1]:
# install folium
!pip install folium -q

In [2]:
import pandas as pd
import folium
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import re
import seaborn as sns
sns.set_style('white')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
df = pd.read_csv("../data/data_clean_new.csv", 
                          encoding='utf_8', 
                          dtype = 'unicode',
                          parse_dates = True,
                          infer_datetime_format = True,
                          low_memory=False)
df = df.drop("Unnamed: 0", axis = 1)

We need to convert all the type of cuisines to booleans since they are stored as string for now.

In [4]:
for column in df.columns :
    if df[column][0] == 'True' or df[column][0] =='False':
    #This tests if the column is a boolean by using the first row for efficiency    
        df[column] = df[column]=='True'
        
    #for some columns we have NaN, in that case we test if we find a True or False value
    elif "True" in df[column].values :
        df[column] = df[column]=='True'
    elif "False" in df[column].values :
        df[column] = df[column]=='True'
    

In [5]:
df["review_count"].astype(int).plot(kind="hist", logx=True, logy=True)

ValueError: invalid literal for int() with base 10: '0.014979029358897545'

In [None]:
df["review_count"]= df["review_count"].astype(int)
df["name_length"]= df["name_length"].astype(int)
df['stars']=df['stars'].astype(float)
df.Price = pd.to_numeric(df.Price, errors='coerce')
df = df[np.isfinite(df['Price'])]

In [None]:
df["American (New)"].dtypes

Let's take a look at the first 5 rows

In [None]:
df["American (New)"].value_counts()


We can see that the cuisine type did not convert to boolean because they have a value of 0 or 1

In [None]:
cuisine_type = ["American (New)","American (Traditional)","Arts & Entertainment","Asian Fusion","Bakeries","Barbeque","Bars",
"Beer","Breakfast & Brunch","Buffets","Burgers","Cafes","Canadian (New)","Caribbean","Caterers","Chicken Wings",
"Chinese","Cocktail Bars","Coffee & Tea","Comfort Food","Delis","Desserts","Diners","Ethnic Food",
"Event Planning & Services","Fast Food","Food","Food Delivery Services","French","Gastropubs","Gluten-Free",
"Greek","Grocery","Halal","Hot Dogs","Ice Cream & Frozen Yogurt","Indian","Italian","Japanese","Juice Bars & Smoothies",
"Korean","Latin American","Lounges","Mediterranean","Mexican","Middle Eastern","Nightlife","Pizza","Pubs",
"Salad","Sandwiches","Seafood","Soup","Specialty Food","Sports Bars","Steakhouses","Sushi Bars","Tex-Mex",
"Thai","Vegan","Vegetarian","Vietnamese","Wine & Spirits","Wine Bars"]

ambiance = ["romantic","intimate","classy","hipster","divey","touristy","trendy","upscale","casual"]



Change data type to boolean for cuisine_type

In [None]:
for column in df[cuisine_type] :
    df[column] = df[column]=="1"

In [None]:
df[cuisine_type].dtypes

## Heat map
Let's start by visualizing where the restaurants in our dataset are located

In [None]:
from folium import plugins
from folium.plugins import HeatMap


# Make an empty map
m = folium.Map(location=[20,0], tiles="Stamen Toner", zoom_start=2)

# Ensure you're handing it floats
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)
 
# Filter the DF for rows, then columns, then remove NaNs
heat_df = df[['latitude', 'longitude']]
heat_df = heat_df.dropna(axis=0, subset=['latitude','longitude'])

# List comprehension to make out list of lists
heat_data = [[row['latitude'],row['longitude']] for index, row in heat_df.iterrows()]

# Plot it on the map
HeatMap(heat_data).add_to(m)


# show the map
m

## Description of the ratings

In [None]:
df['stars']=df['stars'].astype(float)


box_plot_data = df['stars']
plt.boxplot(box_plot_data)
plt.show()
    

print(df.stars.describe())


# Predicting of restaurants ratings: EDA Notebook

In [None]:
df["stars"].hist(bins=8,
  range=(1,5), # The lowest is 1 star
  density=False,  # show raw counts
  figsize=(15,5), 
  alpha = 0.8 # make the plot 20% transparent
 )

In [None]:
df["stars"].plot(
    kind='kde', 
    color='Black', 
    xlim=(1,5), 
    figsize=(15,5)
)

In [None]:
r1 = df[["business_id","stars"]].groupby(["stars"]).count()
r1.plot.bar(x=None, y=None)

In [None]:
np.log10(df["review_count"]).hist()

In [None]:
df.Price.hist(bins=9)

In [None]:
x = df["Price"].values
plt.hist(x)

In [None]:
ax = df.hist(column="Price", by='stars',bins=4, grid=False, figsize=(10,12), layout=(3,3), sharex=True, zorder=2, rwidth=0.9)

In [None]:
plt.subplot(9,1,1)
1star_plot = plt.bar()

In [None]:
r2 = df[ambiance].sum()
r2.plot.bar(x=None, y=None)
r2.describe()




In [None]:
r3 = df[cuisine_type].sum()
r3.plot.bar(x=None, y=None, figsize = (15, 5))





In [None]:

#sns.jointplot(x=df["city"], y=df.stars)

TODO:

Top 10 cuisines by avg. ratings


In [None]:
top10_cuisines = list(df[cuisine_type].sum().sort_values(ascending=False).index[3:13])
top10_cuisines

In [None]:
df2 = df.copy()
st = df2.loc[:, ['American (Traditional)',
 'Sandwiches',
 'Fast Food',
 'Pizza',
 'Breakfast & Brunch',
 'Burgers',
 'American (New)',
 'Italian',
 'Mexican',
 'Chinese']].stack()

all_ids = pd.Series(st.index.get_level_values(1), 
                          st.index.get_level_values(0),
                          name='top cuisines')[st.values]

df2 = df2.join(all_ids, how='left').dropna()

In [None]:
avg_ratings_cuisine = pd.DataFrame(df2.groupby("top cuisines")["stars"].mean())
avg_ratings_cuisine

In [None]:
import seaborn as sns
sns.barplot(data= avg_ratings_cuisine, x = "stars", y= avg_ratings_cuisine.index )

In [None]:
sns.boxplot(data = df2[["top cuisines", "stars"]], x= "stars", y="top cuisines")

In [None]:
r4 = df[test]
r4.plot.bar(x=None, y="stars", figsize = (15, 5))

In [None]:
df.Price = pd.to_numeric(df.Price, errors='coerce')

In [None]:
grouped = df[["city", "Price"]].groupby('city').mean().reset_index()
print(grouped.sort_values('Price', ascending=False))

In [None]:
grouped = df.sort_values(['Price'],ascending=False).groupby('city').mean()
print(grouped.Price)
#grouped = df.groupby("city")
#grouped.nlargest("Price")
#print(grouped.Price.agg([np.mean, np.std]))

Back to the <a href="http://localhost:8888/notebooks/Documents/GitHub/DMML2019_Team_Tissot/code/Main.ipynb#EDA">Main Notebook</a>