In [40]:
#Importing Libraries

import pandas as pd
import numpy as np

from chart_studio import plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True)
import plotly.graph_objs as go
import plotly.express as px

In [41]:
# Read the four CSV files into DataFrames, one per table, in a single pass.

(
    inspections_fact,
    establishments_dimension,
    inspection_point_dimension,
    violations_dimension,
) = map(pd.read_csv, (
    'inspections_fact.csv',
    'establishments_dimension.csv',
    'inspection_point_dimension.csv',
    'violations_dimension.csv',
))

In [None]:
# Preview the first rows of the violations table to see which columns it carries.
violations_dimension.head()

In [42]:
### Analysis of the most frequent inspector comments, to understand the most common problems.

# Number of distinct inspector comments. dropna=False keeps a missing value
# as one distinct entry, which is what len(<column>.unique()) reports too.
violations_dimension['Inspector_Comments'].nunique(dropna=False)

# The inspector comments are very specific: there are 97358 distinct comments.

97358

In [43]:
# The full inspector comments are almost all distinct, so keep only the first
# 200 characters of each: easier to read and still enough to get the gist.
COMMENT_PREVIEW_CHARS = 200

# Guarded so the cell can be re-run: after the first run the raw column has
# already been replaced, and indexing it again would raise a KeyError.
if 'Inspector_Comments' in violations_dimension.columns:
    violations_dimension = (
        violations_dimension
        .assign(
            Resized_Inspector_Comments=violations_dimension['Inspector_Comments'].str[:COMMENT_PREVIEW_CHARS]
        )
        # `columns=` replaces the positional axis argument (`drop(label, 1)`),
        # which pandas deprecated and no longer accepts in current releases.
        .drop(columns='Inspector_Comments')
    )

# One row per truncated comment with the number of violations carrying it,
# sorted ascending so the most frequent comments end up in the last rows.
comments_data = (
    violations_dimension
    .groupby('Resized_Inspector_Comments')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=True)
)


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



In [44]:
# Horizontal bar chart built from the last ten rows of comments_data.
fig1 = px.bar(
    comments_data.tail(10),
    x='Counts',
    y='Resized_Inspector_Comments',
    orientation='h',
    height=500,
    width=1500,
)
fig1.update_layout(
    title="Common Health Violations in Chicago",
    xaxis_title="Number of Violations",
    yaxis_title="Inspection Comments",
)
fig1.update_traces(marker_color='darkcyan')
fig1.show()

In [45]:
# Analysis of the facility types
#
# Count every facility type and plot the shares as a donut chart. Types whose
# count does not exceed 1% of the establishment rows are lumped into "Others".

facility_counts = establishments_dimension['Facility_Type'].value_counts()
facility_types = facility_counts.index.tolist()
facility_count = facility_counts.tolist()

# Threshold: 1% of the number of rows in the establishments table.
one_percent = 0.01 * len(establishments_dimension)

# Types above the threshold keep their own slice, in value_counts order.
major = [
    (name, n)
    for name, n in zip(facility_types, facility_count)
    if n > one_percent
]
# Everything else is summed into a single trailing "Others" slice.
others_count = sum(n for n in facility_count if not n > one_percent)

final_types = [name for name, _ in major] + ['Others']
final_count = [n for _, n in major] + [others_count]

# Donut chart: a pie trace with a hole in the middle.
donut_trace = dict(
    type="pie",
    labels=final_types,
    values=final_count,
    hoverinfo="label+percent",
    hole=0.5,
)
fig = dict(
    data=[donut_trace],
    layout=dict(title="Types of facilities", width=800, height=800),
)

iplot(fig)

In [46]:
# Analysis of the number of facilities that passed the inspection and the ones that did not.

# Count each inspection outcome once and reuse it for both axes.
result_counts = inspections_fact['Results'].value_counts()

PASS_COLOR = 'rgb(0,100, 0)'
FAIL_COLOR = 'rgb(255, 0, 0)'

# Colour every bar from its own label instead of from its position: the bars
# are ordered by frequency, so a fixed positional colour list paints whichever
# outcomes happen to be most common green, and its length need not match the
# number of outcomes.
# NOTE(review): assumes passing outcomes are labelled with a leading "Pass"
# (case-insensitive) - TODO confirm against the values in the Results column.
bar_colors = [
    PASS_COLOR if str(result).strip().lower().startswith('pass') else FAIL_COLOR
    for result in result_counts.index
]

data = [
    go.Bar(
        x = result_counts.index.tolist(),
        y = result_counts.tolist(),
        marker = dict(color = bar_colors)
    )
]

layout = go.Layout(
    title = 'Inspection Results',
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'inspections')

In [47]:
# Analysis of the facility types most often categorised as high risk

# Keep only the establishments whose Risk is 'Risk 1 (High)'.
high_risk = establishments_dimension[establishments_dimension.Risk=='Risk 1 (High)']

# Count facility types among the high-risk establishments only. The chart
# previously counted all establishments, so the high-risk filter was ignored.
high_risk_counts = high_risk['Facility_Type'].value_counts().head(10)

data = [
    go.Bar(
        x = high_risk_counts.index.tolist(),
        y = high_risk_counts.tolist(),
        # Every bar is a high-risk count, so use one colour for all of them
        # rather than a five-entry green/red list spread over ten bars.
        marker = dict(color = 'rgb(255, 0, 0)')
    )
]

layout = go.Layout(
    title = 'Top 10 Facility Type with High risk',
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'highrisk')



In [48]:
# Analysis of the top 20 facility types and their inspection results.
#
# NOTE(review): this chart does not yet show what its title promises. x holds
# the 20 most common facility-type labels from establishments_dimension, while
# y is the raw Results column of inspections_fact, so the two sequences are
# unrelated in both length and meaning. Showing results per facility type needs
# the two tables joined and aggregated first; the join key is not visible in
# this notebook - TODO confirm it and build the aggregate before plotting.

# Not used by the chart below; kept so the name stays defined as before.
high_risk = establishments_dimension[establishments_dimension.Risk=='Risk 1 (High)']

data = [
    go.Bar(
        x = establishments_dimension['Facility_Type'].value_counts()[:20].keys().tolist(),
        y = inspections_fact['Results'],
        marker = dict(
            # NOTE(review): five colours for twenty bars, assigned by position.
            color = [
                'rgb(0,100, 0)', 
                'rgb(0,100, 0)',
                'rgb(255, 0, 0)',
                'rgb(255, 0, 0)',
                'rgb(255, 0, 0)'
            ]
        )
    )
]

layout = go.Layout(
    # The slice above takes the top 20 facility types, so the title says 20.
    title = 'Top 20 Facility Type and their corresponding inspection results',
)

fig = go.Figure(data = data, layout = layout)
iplot(fig, filename = 'highrisk')
