### Import NYC Restaurant Inpsection Dataset

We retrieved the latest NYC Restaurant Inspection Data from NYC Open Data

In [1]:
import pandas as pd
import numpy as np
df_rest = pd.read_csv('DOHMH_New_York_City_Restaurant_Inspection_Results.csv')
df_rest.head()

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,50014069,RED LOBSTER,Manhattan,261,W 125TH ST,10027.0,2122801930,Seafood,03/26/2018,Violations were cited in the following area(s).,...,11/26/2019,Cycle Inspection / Initial Inspection,40.809827,-73.950336,110.0,9.0,22200.0,1089941.0,1019310000.0,MN11
1,50000419,NATUREWORKS,Manhattan,43-45,W 55 STREET,10019.0,2123333020,American,08/05/2019,Violations were cited in the following area(s).,...,11/26/2019,Cycle Inspection / Initial Inspection,40.762279,-73.976363,105.0,4.0,10400.0,1034820.0,1012710000.0,MN17
2,50052197,TERRACE WEST,Brooklyn,3052-3078,W 21ST ST,11224.0,9173049069,American,08/18/2017,Violations were cited in the following area(s).,...,11/26/2019,Pre-permit (Operational) / Initial Inspection,40.573493,-73.987441,313.0,47.0,35200.0,3189655.0,3070710000.0,BK21
3,50063522,CECCONI'S,Brooklyn,55,WATER ST,11201.0,6462152136,Italian,07/10/2017,Violations were cited in the following area(s).,...,11/26/2019,Pre-permit (Operational) / Initial Inspection,40.703294,-73.992047,302.0,33.0,2100.0,3000019.0,3000260000.0,BK38
4,50087131,VICTORIA G'S PIZZERIA,Queens,8905,METROPOLITAN AVE,11374.0,7186852250,Pizza,05/28/2019,Violations were cited in the following area(s).,...,11/26/2019,Trans Fat / Initial Inspection,40.712156,-73.861772,405.0,30.0,63700.0,4437647.0,4031760000.0,QN19


In [2]:
df_rest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396950 entries, 0 to 396949
Data columns (total 26 columns):
CAMIS                    396950 non-null int64
DBA                      396282 non-null object
BORO                     396950 non-null object
BUILDING                 396697 non-null object
STREET                   396939 non-null object
ZIPCODE                  391384 non-null float64
PHONE                    396933 non-null object
CUISINE DESCRIPTION      396950 non-null object
INSPECTION DATE          396950 non-null object
ACTION                   395234 non-null object
VIOLATION CODE           390845 non-null object
VIOLATION DESCRIPTION    387585 non-null object
CRITICAL FLAG            387585 non-null object
SCORE                    379729 non-null float64
GRADE                    200533 non-null object
GRADE DATE               198964 non-null object
RECORD DATE              396950 non-null object
INSPECTION TYPE          395234 non-null object
Latitude                

### Let's clean the deduped dataset a bit more...

In [4]:
# format date fields as datetime
date_cols = ['RECORD DATE', 'INSPECTION DATE']

for col in date_cols:
    df_rest[col] = pd.to_datetime(df_rest[col])

In [5]:
# drop restaurants that don't have restaurant grade ABC 
df_rest = df_rest[df_rest['GRADE'].isin(['A','B','C'])]

In [6]:
# drop records where lat/lng are nulls
df_rest = df_rest[(df_rest['Latitude'].notnull() & df_rest['Longitude'].notnull())]

In [7]:
# drop restaurants where DBA (name of restaurant) is null
df_rest = df_rest[df_rest['DBA'].notnull()]

In [8]:
# check for number of unique restaurants
df_rest['CAMIS'].nunique()

24946

In [9]:
df_rest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191555 entries, 2 to 396949
Data columns (total 26 columns):
CAMIS                    191555 non-null int64
DBA                      191555 non-null object
BORO                     191555 non-null object
BUILDING                 191550 non-null object
STREET                   191555 non-null object
ZIPCODE                  188672 non-null float64
PHONE                    191545 non-null object
CUISINE DESCRIPTION      191555 non-null object
INSPECTION DATE          191555 non-null datetime64[ns]
ACTION                   191555 non-null object
VIOLATION CODE           190808 non-null object
VIOLATION DESCRIPTION    190011 non-null object
CRITICAL FLAG            190011 non-null object
SCORE                    191555 non-null float64
GRADE                    191555 non-null object
GRADE DATE               191555 non-null object
RECORD DATE              191555 non-null datetime64[ns]
INSPECTION TYPE          191555 non-null object
Latitude

### Let's visualize the number of restaurants by the Inspection Grade and Borough using Altair

In [12]:
import altair as alt
from vega_datasets import data

In [75]:
df_rest.groupby(['BORO', 'GRADE'])['CAMIS'].nunique().reset_index()

Unnamed: 0,BORO,GRADE,CAMIS
0,Bronx,A,2210
1,Bronx,B,557
2,Bronx,C,194
3,Brooklyn,A,6103
4,Brooklyn,B,1291
5,Brooklyn,C,510
6,Manhattan,A,9738
7,Manhattan,B,1888
8,Manhattan,C,735
9,Queens,A,5569


In [77]:
grade_count = df_rest.groupby(['BORO', 'GRADE'])['CAMIS'].nunique().reset_index()

alt.Chart(grade_count).mark_bar(cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3).encode(
    x='GRADE',
    y='CAMIS',
    color='GRADE',
    column='BORO')

### Let's visualize the number of restaurants by grade over time by Inspection Date


In [15]:
df_dates = df_rest.copy()
df_dates.index = df_dates['INSPECTION DATE']
grades_ts = df_dates.groupby('GRADE').resample('MS')['CAMIS'].count().reset_index()

In [16]:
grades_ts

Unnamed: 0,GRADE,INSPECTION DATE,CAMIS
0,A,2013-06-01,1
1,A,2013-07-01,0
2,A,2013-08-01,0
3,A,2013-09-01,0
4,A,2013-10-01,0
5,A,2013-11-01,0
6,A,2013-12-01,0
7,A,2014-01-01,0
8,A,2014-02-01,0
9,A,2014-03-01,0


In [17]:
highlight = alt.selection(type='single', on='mouseover',
                          fields=['GRADE'], nearest=True)

base = alt.Chart(grades_ts).encode(
    x='INSPECTION DATE:T',
    y='CAMIS:Q',
    color='GRADE:N'
)

points = base.mark_circle().encode(
    opacity=alt.value(0)
).add_selection(
    highlight
).properties(
    width=600
)

lines = base.mark_line().encode(
    size=alt.condition(~highlight, alt.value(1), alt.value(3))
)

points + lines

### Create a map to visualize the restaurants with NYC neighborhood boundaries (NTA)

In [18]:
import altair as alt
from vega_datasets import data

nyc = alt.topo_feature('https://raw.githubusercontent.com/grantpezeshki/NYC-topojson/master/NTA.topojson', 
                       feature='objects')

In [19]:
alt.topo_feature?

In [20]:
airports = data.airports.url

### Count by Cuisine Description (Type of Restaurant)

In [26]:
#Count Types of Resturaunts
df_rest.groupby(['CUISINE DESCRIPTION']).count().reset_index().head()

Unnamed: 0,CUISINE DESCRIPTION,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,Afghan,103,103,103,103,103,103,103,103,103,...,103,103,103,103,103,103,103,103,103,103
1,African,626,626,626,626,626,604,626,626,626,...,626,626,626,626,604,604,604,604,626,604
2,American,42627,42627,42627,42627,42627,41539,42627,42627,42627,...,42627,42627,42627,42627,41539,41539,41539,41336,42627,41539
3,Armenian,162,162,162,162,162,162,162,162,162,...,162,162,162,162,162,162,162,162,162,162
4,Asian,2779,2779,2779,2779,2779,2754,2779,2779,2779,...,2779,2779,2779,2779,2754,2754,2754,2734,2779,2754


### Group by Cuisine & Boro

In [28]:
#Group by Cuisine & Boro
df_rest.groupby(['CUISINE DESCRIPTION','BORO']).count().reset_index().sort_values('CUISINE DESCRIPTION')

Unnamed: 0,CUISINE DESCRIPTION,BORO,CAMIS,DBA,BUILDING,STREET,ZIPCODE,PHONE,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,Afghan,Brooklyn,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
1,Afghan,Manhattan,33,33,33,33,33,33,33,33,...,33,33,33,33,33,33,33,33,33,33
2,Afghan,Queens,50,50,50,50,50,50,50,50,...,50,50,50,50,50,50,50,50,50,50
3,African,Bronx,206,206,206,206,206,206,206,206,...,206,206,206,206,206,206,206,206,206,206
4,African,Brooklyn,201,201,201,201,197,201,201,201,...,201,201,201,201,197,197,197,197,201,197
5,African,Manhattan,185,185,185,185,167,185,185,185,...,185,185,185,185,167,167,167,167,185,167
6,African,Queens,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
7,African,Staten Island,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
12,American,Staten Island,1559,1559,1559,1559,1531,1559,1559,1559,...,1559,1559,1559,1559,1531,1531,1531,1522,1559,1531
11,American,Queens,7092,7092,7092,7092,6712,7092,7092,7092,...,7092,7092,7092,7092,6712,6712,6712,6625,7092,6712


### Count Cuisine by Grade

In [30]:
#count cuisine by grade
df_rest.groupby(['CUISINE DESCRIPTION'])['GRADE'].count().sort_values(ascending = False)

CUISINE DESCRIPTION
American                                                            42627
Chinese                                                             19396
Café/Coffee/Tea                                                     10656
Pizza                                                                8528
Italian                                                              7745
Mexican                                                              7575
Latin (Cuban, Dominican, Puerto Rican, South & Central American)     7459
Japanese                                                             6619
Bakery                                                               6127
Caribbean                                                            6016
Spanish                                                              5251
Pizza/Italian                                                        3894
Chicken                                                              3691
Donuts            

In [93]:
#count cuisine by grade
top_cuisine = df_rest.groupby(['CUISINE DESCRIPTION'])['GRADE'].count().sort_values(ascending = False).head(10)
top_cuisine

CUISINE DESCRIPTION
American                                                            42627
Chinese                                                             19396
Café/Coffee/Tea                                                     10656
Pizza                                                                8528
Italian                                                              7745
Mexican                                                              7575
Latin (Cuban, Dominican, Puerto Rican, South & Central American)     7459
Japanese                                                             6619
Bakery                                                               6127
Caribbean                                                            6016
Name: GRADE, dtype: int64

In [94]:
alt.Chart(df_rest).mark_bar(color ='blue').encode(
    x=alt.X('sum(GRADE):Q', 
    axis=alt.Axis(title ='Number of Inspection Score')),

    y=alt.Y('CUISINE DESCRIPTION:N', 
        axis=alt.Axis(title ='Cuisine Description'),
        sort=alt.EncodingSortField(
            field="GRADE",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="descending")  # The order to sort in
    )

)

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

### C Grade by Boro

In [64]:
#Filter to C Grades by Boro
c_grade = df_rest.query("GRADE == 'C'")

c_borough = (
    c_grade
      .groupby('BORO')
      .count()
      .reset_index()
      )
c_borough

Unnamed: 0,BORO,CAMIS,DBA,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,Bronx,1003,1003,1003,1003,995,1003,1003,1003,1003,...,1003,1003,1003,1003,995,995,995,995,1003,995
1,Brooklyn,2262,2262,2262,2262,2259,2262,2262,2262,2262,...,2262,2262,2262,2262,2259,2259,2259,2257,2262,2259
2,Manhattan,3725,3725,3725,3725,3689,3725,3725,3725,3725,...,3725,3725,3725,3725,3689,3689,3689,3685,3725,3689
3,Queens,2075,2075,2075,2075,2044,2075,2075,2075,2075,...,2075,2075,2075,2075,2044,2044,2044,2019,2075,2044
4,Staten Island,237,237,237,237,237,237,237,237,237,...,237,237,237,237,237,237,237,237,237,237


In [82]:
#Graph C Grades by Boro


alt.Chart(c_borough).mark_bar().encode(
    y=alt.Y('GRADE',
    axis=alt.Axis(title ='Number of C Grades')
    ),

  x=alt.X('BORO',
    axis=alt.Axis(title='Borough')
  ),
)


From this graph, we see that Manhattan has significantly more C Grades than the other boroughs, but Manhattan also has 
the most restaurants compared to other boroughs

### Filter to C grade by Zipcode in Manhattan

In [67]:
#Filter to C Grades by zipcode in Manhattan
# c_grade = df_rest.query("GRADE == 'C'")
c_grade_manhattan = c_grade.query("BORO == 'Manhattan'")
# c_grade_manhattan

c_grade_zip_manhattan = (
    c_grade_manhattan
        .groupby("ZIPCODE")
        .count()
        .reset_index()
        )

c_grade_zip_manhattan

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
91,40805827,LAHORE DELICATESSEN,Manhattan,132,CROSBY STREET,10012.0,2129651777,Bangladeshi,2019-07-08,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.724777,-73.996212,102.0,1.0,4300.0,1007945.0,1.005110e+09,MN24
290,41462643,OBAO,Manhattan,222,EAST 53 STREET,10022.0,2123085588,Thai,2019-07-22,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.757400,-73.968712,106.0,4.0,9800.0,1038542.0,1.013260e+09,MN19
326,40394392,PICCOLO ANGOLO,Manhattan,621,HUDSON STREET,10014.0,2122299177,Italian,2019-07-24,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.738048,-74.005618,102.0,3.0,7900.0,1011419.0,1.006250e+09,MN23
363,40388818,YAMA JAPANESE RESTAURANT',Manhattan,122,EAST 17 STREET,10003.0,2124750969,Japanese,2017-12-06,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.736000,-73.987721,105.0,2.0,5000.0,1017830.0,1.008720e+09,MN21
397,50003242,ROUTE 66 AMERICAN BBQ,Manhattan,79,PEARL ST,10004.0,2129431602,American,2019-07-24,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.703980,-74.010250,101.0,1.0,900.0,1000837.0,1.000290e+09,MN25
442,41298790,SERAFINA AT TIME HOTEL,Manhattan,224,WEST 49 STREET,10019.0,2122471000,American,2018-06-26,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.761007,-73.985052,105.0,3.0,12500.0,1024782.0,1.010200e+09,MN17
523,50080949,CAFE CINQ,Manhattan,5,PENN PLZ,10001.0,2122375036,Delicatessen,2018-12-18,Violations were cited in the following area(s).,...,2019-11-26,Pre-permit (Operational) / Re-inspection,40.751854,-73.993716,104.0,3.0,10300.0,1013547.0,1.007570e+09,MN13
537,50049228,LENWICH,Manhattan,60,W 48TH ST,10036.0,2128716677,Soups & Sandwiches,2018-09-05,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.758199,-73.980320,105.0,4.0,9600.0,1034500.0,1.012630e+09,MN17
686,41703394,PIONEERS BAR & LOUNGE,Manhattan,134,WEST 29 STREET,10001.0,2127142222,American,2019-02-04,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.747114,-73.991190,105.0,3.0,9500.0,1015129.0,1.008040e+09,MN17
702,50051274,LUDLOW COFFEE SUPPLY,Manhattan,176,LUDLOW ST,10002.0,6465259421,Café/Coffee/Tea,2017-08-11,Violations were cited in the following area(s).,...,2019-11-26,Cycle Inspection / Re-inspection,40.721714,-73.987514,103.0,1.0,3001.0,1005417.0,1.004120e+09,MN27


In [99]:
c_grade_manhattan

c_manhattan_cuisine = (
    c_grade_manhattan
      .groupby('CUISINE DESCRIPTION')
      .count()
      .reset_index()
      )
c_manhattan_cuisine

Unnamed: 0,CUISINE DESCRIPTION,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,INSPECTION DATE,ACTION,...,RECORD DATE,INSPECTION TYPE,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,African,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
1,American,846,846,846,846,846,832,846,846,846,...,846,846,846,846,832,832,832,829,846,832
2,Armenian,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
3,Asian,118,118,118,118,118,118,118,118,118,...,118,118,118,118,118,118,118,118,118,118
4,Australian,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
5,Bagels/Pretzels,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
6,Bakery,96,96,96,96,96,96,96,96,96,...,96,96,96,96,96,96,96,96,96,96
7,Bangladeshi,13,13,13,13,13,13,13,13,13,...,13,13,13,13,13,13,13,13,13,13
8,Barbecue,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
9,Café/Coffee/Tea,127,127,127,127,127,127,127,127,127,...,127,127,127,127,127,127,127,127,127,127


In [107]:
#Grade C Grades by cuisine type in manhattan
bars = alt.Chart(c_manhattan_cuisine).mark_bar(color ='blue').encode(
    x=alt.X('sum(GRADE):Q', 
    axis=alt.Axis(title ='# of C Grades')),

    y=alt.Y('CUISINE DESCRIPTION:N', 
        axis=alt.Axis(title ='Cuisine'),
        sort=alt.EncodingSortField(
            field="GRADE",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="descending")  # The order to sort in
    )

)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='sum(GRADE):Q'
)

(bars + text).properties(height=900)



In [72]:
#Grade C Grades by Zipcode in Manhattan
alt.Chart(c_grade_zip_manhattan).mark_bar(color ='blue').encode(
    x=alt.X('sum(GRADE):Q', 
    axis=alt.Axis(title ='Number of C Grades by Zipcode in Manhattan')),

    y=alt.Y('ZIPCODE:N', 
        axis=alt.Axis(title ='Zipcode'),
        sort=alt.EncodingSortField(
            field="GRADE",  # The field to use for the sort
            op="sum",  # The operation to run on the field prior to sorting
            order="descending")  # The order to sort in
    )

)