# Exploring NYC restaurant data with Python
## New York City Food Inspections
by Olivia Limone

In [None]:
#importing libraries

import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt


## Data Source
I got my data from NYC Open Data. The data set I used was [DOHMH New York City Restaurant Inspection Results](https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j). 

In [None]:
#the whole API request is kept as one plain string
#only Manhattan rows are needed, hence the boro = 'Manhattan' condition inside the query
api_call = "https://data.cityofnewyork.us/resource/43nn-pn8j.csv?$query=SELECT * WHERE boro = 'Manhattan' LIMIT 200000"

#a URL cannot carry raw spaces, so every space is swapped for %20
formatted_api_call = "%20".join(api_call.split(" "))

#reading the CSV response straight into a DataFrame and showing it
food_inspections = pd.read_csv(formatted_api_call)
food_inspections

## Data Analysis

### Question 1: What is the distribution of letter grades among restaurants in Manhattan?




Step 1: Count up the number of letter grades by each type.

In [None]:
#tallying the rows that fall under each value of the 'grade' column;
#count() tallies the non-null 'camis' entries ("camis" is the restaurant identifier) in each group
#NOTE(review): this is a row count, so a restaurant that appears in several rows is
#counted once per row -- confirm whether distinct restaurants (nunique) was intended
grade_totals = food_inspections.groupby('grade')['camis'].count()

grade_totals

Step 2: Create a bar graph of these counts, with each grade on the x-axis and the number of restaurants with that grade on the y-axis.

In [None]:
#making a bar chart portraying this information using matplotlib
#the grade labels are read from grade_totals itself (instead of a hard-coded tuple) so that
#every bar is paired with its own grade, whichever grades are present in the data
objects = tuple(grade_totals.index)
y_pos = np.arange(len(objects))
performance = grade_totals

plt.bar(y_pos, performance, align='center', alpha=0.5, color=('#e2a7d9', '#ed7117', '#fef8b1', '#000000', '#ed7117', '#ec9707', '#fadadd' ),  edgecolor='blue')
plt.xticks(y_pos, objects)
plt.ylabel('Number of Restaurants')
plt.xlabel('Grade')
plt.title('Grade Distribution of Restaurants in Manhattan')

#adding text labels to each bar
#each label is centred on its bar and sits on top of it, so it stays aligned
#no matter how many digits the count has
for i, v in enumerate(performance):
  plt.text(i, v, str(v), ha='center', va='bottom')

#displaying graph 
plt.show()

#source for help with matplotlib code: https://pythonspot.com/matplotlib-bar-chart/ and https://matplotlib.org/api/pyplot_api.html and https://stackoverflow.com/questions/30228069/how-to-display-the-value-of-the-bar-on-each-bar-with-pyplot-barh
#help with colors: https://python-graph-gallery.com/3-control-color-of-barplots/


### Question 2: What are the most common health violations among restaurants in Manhattan?

Step 1: Count the number of health violations by type, and then keep only the top five.

In [None]:
#tallying how many rows carry each violation code, most frequent code first
violation_totals = (
  food_inspections[['violation_code', 'camis']]
    .groupby('violation_code', as_index=False)
    .count()
    .sort_values(by='camis', ascending=False)
)

#keeping only the 5 violation codes with the largest counts
top_5 = violation_totals.nlargest(n=5, columns='camis')

In [None]:
#displaying the five violation codes with the largest counts
top_5

Step 2: Create a bar graph portraying each violation code (on the x-axis) and the frequency with which it occurred (on the y-axis).

In [None]:
#making a bar chart portraying this information using matplotlib
x = top_5['violation_code']
y = top_5['camis']

plt.bar(x,y, color=['pink', 'red', 'green', 'yellow', 'cyan'])
plt.ylabel('Number of violations')
plt.xlabel('Violation Code')
plt.title('Top 5 Health Violations in Manhattan')

#adding text labels to each bar 
for i, v in enumerate(y):
  plt.text(i - .3 , v + .6, str(v))

#displaying graph 
plt.show()

### Question 3: What is the grade breakdown by cuisine?

Step 1: Count the frequency of grades by cuisine type.

In [None]:
#counting the rows for every (cuisine, grade) pair and listing the largest counts first
grade_BD_by_cuisine = (
  food_inspections
    .groupby(['cuisine_description', 'grade'])
    .agg(n_gradeBD=('camis', 'count'))
    .sort_values(by='n_gradeBD', ascending=False)
    .reset_index()
)

grade_BD_by_cuisine


Step 2: Keep only a select few types of cuisine and the three main grades (A, B, C) in order to create a bar graph that is meaningful and easy to read.

In [None]:
#First, I keep only A,B,C grades in the dataset
new_grade_BD_by_cuisine = grade_BD_by_cuisine[grade_BD_by_cuisine['grade'].isin(['A','B','C'])]
#Then I select six types of popular cuisines as you can see below.
new_grade_BD_by_selected_cuisine = new_grade_BD_by_cuisine[new_grade_BD_by_cuisine['cuisine_description'].isin(['American', 'Café/Coffee/Tea', 'Italian', 'Chinese', 'Japanese', 'Pizza'])]

new_grade_BD_by_selected_cuisine

Step 3: Pivot the table so that A, B, and C each have their own column and each selected cuisine type has its own row.

In [None]:
#reshaping to one row per cuisine with a separate count column for each grade;
#reset_index turns cuisine_description back into an ordinary column
selected_cuisine_pivoted = (
  new_grade_BD_by_selected_cuisine
    .pivot(index='cuisine_description', columns='grade', values='n_gradeBD')
    .reset_index()
)

selected_cuisine_pivoted

Step 4: Create a bar graph with cuisine types on the x-axis, where each grade's frequency is represented by a bar of a different color.

In [None]:
#Making the graph where types of cuisine is on the x-axis and each grade is a bar

#one Series of counts per grade, plus the cuisine names used as tick labels
y1 = selected_cuisine_pivoted['A']
y2 = selected_cuisine_pivoted['B']
y3 = selected_cuisine_pivoted['C']
x_labels = selected_cuisine_pivoted['cuisine_description']
#one x position per cuisine row actually present in the pivoted table, so the
#positions always match the number of bar heights (previously hard-coded to 6)
X = np.arange(len(x_labels))

#the three grade bars of a cuisine sit side by side: A to the left of the tick, B on it, C to the right
bar_width = 0.20
plt.bar(X - bar_width, y1, label="A", color = '#FFB6C1', width = bar_width)
plt.bar(X, y2, label="B", color = '#CCCCFF', width = bar_width)
plt.bar(X + bar_width, y3, label="C", color = '#FCEEA7', width = bar_width)
plt.xticks(X, x_labels, rotation = 90) #rotating because the cuisine names are too long
plt.xlabel("Cuisine")
plt.ylabel("Number of Grades")
plt.title("Grade breakdown by cuisine in Manhattan")
plt.legend()
plt.show()
