In [1]:
import os
import pandas as pd
import pandas_profiling
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [2]:
pollution_file = 'pollution.csv'

In [3]:
pollution_df = pd.read_csv(pollution_file)

In [4]:
pollution_df.head()

Unnamed: 0,facility_name,doc_ctrl_num,reporting_year,chemical_name,release_estimate_amount,carcinogen_chem_ind,clean_air_act_chem_ind,chem_ind_3350,street_address,city,county,county_code,state,zip,region,total_release
0,CCI MANUFACTURING IL CORP,1305203000000.0,2005,ETHYLENE GLYCOL,3930.0,N,Y,N,15550 CANAL BANK RD,LEMONT,COOK,17031,IL,60439,5,3930.0
1,GE MATHIS CO,1305203000000.0,2005,CHROMIUM,0.0,N,Y,Y,6100 S OAK PARK AVE,CHICAGO,COOK,17031,IL,60638,5,0.0
2,ARMACELL LLC,1305203000000.0,2005,DECABROMODIPHENYL OXIDE,0.0,N,N,N,16800 S CANAL ST,SOUTH HOLLAND,COOK,17031,IL,60473,5,0.0
3,WEC CO,1305203000000.0,2005,COPPER,0.99,N,N,N,2606 RT 2 S,OREGON,OGLE,17141,IL,61061,5,0.99
4,MICKEY TRUCK BODIES INC,1305203000000.0,2005,"SULFURIC ACID (1994 AND AFTER ""ACID AEROSOLS"" ...",10.0,N,N,N,14661 OLD COLONIAL RD,BLOOMINGTON,MCLEAN,17113,IL,61704,5,10.0


In [5]:
# report = pollution_df.profile_report(title='Pandas Profiling Report')
# report.to_file(output_file='output.html')

# Grouped by Facility

In [66]:
facility_grouped = pollution_df.groupby(by='facility_name')

In [67]:
facility_grouped = facility_grouped.sum().dropna(how='any')


In [68]:
facility_grouped = facility_grouped.sort_values(by = ['total_release'], ascending = False)

In [69]:
facility_grouped_gtz = facility_grouped.loc[facility_grouped['total_release'] > 0]

In [70]:
facility_grouped_gtz= facility_grouped_gtz[['total_release']]

In [71]:
facility_grouped_gtz = facility_grouped_gtz.head(10)
x = np.array(facility_grouped_gtz.index)
y = np.array(facility_grouped_gtz['total_release'])

In [76]:
fig = px.bar(facility_grouped_gtz, x = x, y = y)
fig.update_layout(title = 'Top 10 Facilities by Release Amount',
                 xaxis=dict(title= 'Facility', titlefont_size = 16),
                 yaxis=dict(title = 'Amount Released (lbs)', titlefont_size = 16, tickfont_size=14))

# Grouped  by Chemical

In [78]:
chemical_grouped = pollution_df.groupby(by = 'chemical_name')

In [79]:
chemical_grouped = chemical_grouped.sum().dropna(how='any')

In [80]:
chemical_grouped = chemical_grouped.sort_values(by = ['total_release'], ascending = False)

In [81]:
chemical_grouped_gtz = chemical_grouped.loc[chemical_grouped['total_release'] > 0]

In [82]:
chemical_grouped_gtz = chemical_grouped_gtz[['total_release']]

In [83]:
chemical_grouped_gtz = chemical_grouped_gtz.head(10)
x = np.array(chemical_grouped_gtz.index)
y = np.array(chemical_grouped_gtz['total_release'])

In [84]:
fig = px.bar(chemical_grouped_gtz, x = x, y = y)
fig.update_layout(title = 'Top 10 Chemicals Released by Amount',
                 xaxis=dict(title= 'Chemical', titlefont_size = 16),
                 yaxis=dict(title = 'Amount Released (lbs)', titlefont_size = 16, tickfont_size=14))
fig.show()

# Grouped by Year

In [86]:
year_grouped = pollution_df.groupby(by = 'reporting_year')

In [87]:
year_grouped = year_grouped.sum().dropna(how='any')

In [88]:
year_grouped = year_grouped.sort_values(by = ['total_release'], ascending = False)

In [89]:
year_grouped_gtz = year_grouped.loc[year_grouped['total_release'] > 0]

In [90]:
year_grouped_gtz = year_grouped_gtz[['total_release']]

In [91]:
year_grouped_gtz.head()
x = np.array(year_grouped_gtz.index)
y = np.array(year_grouped_gtz['total_release'])

In [93]:
fig = px.bar(year_grouped_gtz, x = x, y= y)
fig.update_layout(title_text = 'Total Chemicals Released by Year',
                  xaxis=dict(title= 'Year', titlefont_size = 16),
                 yaxis=dict(title = 'Amount Released (lbs)', titlefont_size = 16, tickfont_size=14))
fig.show()

# Grouped by Carcinogen

In [94]:
carc_grouped = pollution_df.groupby(by = 'carcinogen_chem_ind')

In [95]:
carc_grouped = carc_grouped.sum().dropna(how= 'any')

In [96]:
carc_grouped = carc_grouped.sort_values(by = ['total_release'], ascending = False)

In [97]:
carc_grouped_gtz = carc_grouped.loc[carc_grouped['total_release'] > 0]

In [98]:
carc_grouped_gtz = carc_grouped_gtz[['total_release']]

In [99]:
carc_grouped_gtz.head()

Unnamed: 0_level_0,total_release
carcinogen_chem_ind,Unnamed: 1_level_1
N,404722400.0
Y,36912010.0


In [100]:
labels = carc_grouped_gtz.index
values = carc_grouped_gtz['total_release']

In [101]:
fig = go.Figure(data=[go.Pie(labels =labels, values = values, hole =.5, pull = .1)])
fig.update_layout(title_text = 'Total Release of Chemicals known to be Carcinogens')
fig.show()

# Grouped by Clean Air

In [35]:
clean_air_grouped = pollution_df.groupby(by = 'clean_air_act_chem_ind')

In [36]:
clean_air_grouped = clean_air_grouped.sum().dropna(how='any')

In [37]:
clean_air_grouped = clean_air_grouped.sort_values(by = ['total_release'], ascending = False)

In [38]:
clean_air_grouped_gtz = clean_air_grouped.loc[clean_air_grouped['total_release'] > 0]

In [39]:
clean_air_grouped_gtz = clean_air_grouped_gtz[['total_release']]

In [40]:
clean_air_grouped_gtz.head()

Unnamed: 0_level_0,total_release
clean_air_act_chem_ind,Unnamed: 1_level_1
Y,307459200.0
N,134175100.0


In [41]:
labels = clean_air_grouped_gtz.index
values = clean_air_grouped_gtz['total_release']

In [42]:
fig = go.Figure(data=[go.Pie(labels =labels, values = values, hole =.5, pull = .1)])
fig.update_layout(title_text = 'Total Release of Chemicals monitored by the Clean Air Act')
fig.show()

# Grouped by 3350 Chemical

In [43]:
chem_3350_grouped = pollution_df.groupby(by ='chem_ind_3350')

In [44]:
chem_3350_grouped = chem_3350_grouped.sum().dropna(how = 'any')

In [45]:
chem_3350_grouped = chem_3350_grouped.sort_values(by = ['total_release'], ascending = False)

In [46]:
chem_3350_grouped_gtz = chem_3350_grouped.loc[chem_3350_grouped['total_release'] > 0]

In [47]:
chem_3350_grouped_gtz = chem_3350_grouped_gtz[['total_release']]

In [48]:
chem_3350_grouped_gtz.head()

Unnamed: 0_level_0,total_release
chem_ind_3350,Unnamed: 1_level_1
N,421023800.0
Y,20610570.0


In [49]:
labels = chem_3350_grouped_gtz.index
values = chem_3350_grouped_gtz['total_release']

In [50]:
fig = go.Figure(data=[go.Pie(labels =labels, values = values, hole =.5, pull = .1)])
fig.update_layout(title_text = 'Total Release of 3350 Chemicals')
fig.show()

# Grouped by County

In [102]:
county_grouped = pollution_df.groupby(by = 'county')

In [103]:
county_grouped = county_grouped.sum().dropna(how='any')

In [104]:
county_grouped = county_grouped.sort_values(by = ['total_release'], ascending = False)

In [105]:
county_grouped_gtz = county_grouped.loc[county_grouped['total_release'] > 0]

In [106]:
county_grouped_gtz = county_grouped_gtz[['total_release']]

In [107]:
county_grouped_gtz =county_grouped_gtz.head(10)

In [108]:
values = np.array(county_grouped_gtz['total_release'])
labels = np.array(county_grouped_gtz.index)
size = county_grouped_gtz['total_release']

In [109]:
# fig = px.scatter(x = labels, y = values)
# fig.show()

In [112]:
fig = px.bar(county_grouped_gtz, x = labels, y = values)
fig.update_layout(title_text = 'Top 10 Counties and Total Chemicals Released',
                  xaxis=dict(title= 'County', titlefont_size = 16),
                 yaxis=dict(title = 'Amount Released (lbs)', titlefont_size = 16, tickfont_size=14))
fig.show()