In [159]:
import os
import pandas as pd
import pandas_profiling
import plotly
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [2]:
# Relative path of the CSV file that the next cell loads with pandas.
pollution_file = "pollution.csv"

In [3]:
# Load the whole CSV into a DataFrame; every later section groups this frame.
pollution_df = pd.read_csv(filepath_or_buffer=pollution_file)

In [4]:
# Preview the first five rows to check the columns that were loaded.
pollution_df.head(5)

Unnamed: 0,facility_name,doc_ctrl_num,reporting_year,chemical_name,release_estimate_amount,carcinogen_chem_ind,clean_air_act_chem_ind,chem_ind_3350,street_address,city,county,county_code,state,zip,region,total_release
0,CCI MANUFACTURING IL CORP,1305203000000.0,2005,ETHYLENE GLYCOL,3930.0,N,Y,N,15550 CANAL BANK RD,LEMONT,COOK,17031,IL,60439,5,3930.0
1,GE MATHIS CO,1305203000000.0,2005,CHROMIUM,0.0,N,Y,Y,6100 S OAK PARK AVE,CHICAGO,COOK,17031,IL,60638,5,0.0
2,ARMACELL LLC,1305203000000.0,2005,DECABROMODIPHENYL OXIDE,0.0,N,N,N,16800 S CANAL ST,SOUTH HOLLAND,COOK,17031,IL,60473,5,0.0
3,WEC CO,1305203000000.0,2005,COPPER,0.99,N,N,N,2606 RT 2 S,OREGON,OGLE,17141,IL,61061,5,0.99
4,MICKEY TRUCK BODIES INC,1305203000000.0,2005,"SULFURIC ACID (1994 AND AFTER ""ACID AEROSOLS"" ...",10.0,N,N,N,14661 OLD COLONIAL RD,BLOOMINGTON,MCLEAN,17113,IL,61704,5,10.0


In [5]:
# report = pollution_df.profile_report(title='Pandas Profiling Report')
# report.to_file(output_file='output.html')

# Grouped by Facility

In [66]:
# One group per distinct facility name.
facility_grouped = pollution_df.groupby('facility_name')

In [67]:
# Sum only the numeric columns per facility. Without numeric_only=True, pandas >= 2.0
# also tries to "sum" the string columns (names, addresses, ...), which concatenates
# text or raises; older pandas dropped those columns silently.
facility_grouped = facility_grouped.sum(numeric_only=True).dropna(how='any')


In [68]:
# Largest total release first.
facility_grouped = facility_grouped.sort_values('total_release', ascending=False)

In [69]:
# Keep only facilities whose summed release is strictly greater than zero ("gtz").
facility_grouped_gtz = facility_grouped[facility_grouped['total_release'].gt(0)]

In [70]:
# Reduce the frame to the single column that gets plotted.
facility_grouped_gtz = facility_grouped_gtz.loc[:, ['total_release']]

In [71]:
# The frame is already sorted descending, so the first ten rows are the top ten.
facility_grouped_gtz = facility_grouped_gtz.iloc[:10]
# Facility names (the index) go on the x axis, summed releases on the y axis.
x = facility_grouped_gtz.index.to_numpy()
y = facility_grouped_gtz['total_release'].to_numpy()

In [76]:
# Bar chart of the ten facilities with the largest summed release.
fig = px.bar(facility_grouped_gtz, x=x, y=y)
# Axis title fonts are set through title.font; the legacy `titlefont` shortcut is
# deprecated in plotly and rejected by newer releases.
fig.update_layout(
    title_text='Top 10 Facilities by Release Amount',
    xaxis=dict(title=dict(text='Facility', font=dict(size=16))),
    yaxis=dict(
        title=dict(text='Amount Released (lbs)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)

# Grouped by Chemical

In [204]:
# One group per distinct chemical name.
chemical_grouped = pollution_df.groupby('chemical_name')

In [205]:
# Sum only numeric columns per chemical; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
chemical_grouped = chemical_grouped.sum(numeric_only=True).dropna(how='any')

In [206]:
# Order chemicals from the largest summed release to the smallest.
chemical_grouped = chemical_grouped.sort_values('total_release', ascending=False)

In [207]:
# Drop chemicals whose summed release is not strictly positive.
chemical_grouped_gtz = chemical_grouped[chemical_grouped['total_release'].gt(0)]

In [208]:
# Keep just the plotted column (still as a one-column DataFrame).
chemical_grouped_gtz = chemical_grouped_gtz.loc[:, ['total_release']]

In [209]:
# First ten rows of the descending-sorted frame = ten largest chemical totals.
chemical_grouped_gtz = chemical_grouped_gtz.iloc[:10]
# Chemical names (index) for the x axis, summed releases for the y axis.
x = chemical_grouped_gtz.index.to_numpy()
y = chemical_grouped_gtz['total_release'].to_numpy()

In [210]:
# Bar chart of the ten chemicals with the largest summed release.
fig = px.bar(chemical_grouped_gtz, x=x, y=y)
# Use title.font for the axis title size; the legacy `titlefont` shortcut is
# deprecated in plotly and rejected by newer releases.
fig.update_layout(
    title_text='Top 10 Chemicals Released by Amount',
    xaxis=dict(title=dict(text='Chemical', font=dict(size=16))),
    yaxis=dict(
        title=dict(text='Amount Released (lbs)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='top_10_chemicals_amount.html', auto_open=False)

'top_10_chemicals_amount.html'

# Grouped by Year

In [197]:
# One group per reporting year.
year_grouped = pollution_df.groupby('reporting_year')

In [198]:
# Sum only numeric columns per year; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
year_grouped = year_grouped.sum(numeric_only=True).dropna(how='any')

In [199]:
# Years with the largest summed release come first.
year_grouped = year_grouped.sort_values('total_release', ascending=False)

In [200]:
# Keep only years with a strictly positive summed release.
year_grouped_gtz = year_grouped[year_grouped['total_release'].gt(0)]

In [201]:
# Narrow the frame to the column used for plotting.
year_grouped_gtz = year_grouped_gtz.loc[:, ['total_release']]

In [202]:
# All years are plotted (no top-N cut). The original cell started with a bare
# `year_grouped_gtz.head()` whose result was neither displayed nor assigned, so
# that dead statement is dropped.
x = np.array(year_grouped_gtz.index)
y = np.array(year_grouped_gtz['total_release'])

In [203]:
# Bar chart of the summed release for each reporting year.
fig = px.bar(year_grouped_gtz, x=x, y=y)
# Use title.font for the axis title size; the legacy `titlefont` shortcut is
# deprecated in plotly and rejected by newer releases.
fig.update_layout(
    title_text='Total Chemicals Released by Year',
    xaxis=dict(title=dict(text='Year', font=dict(size=16))),
    yaxis=dict(
        title=dict(text='Amount Released (lbs)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='top_chemicals_by_year.html', auto_open=False)

'top_chemicals_by_year.html'

# Grouped by Carcinogen

In [189]:
# Split rows by the carcinogen indicator column.
carc_grouped = pollution_df.groupby('carcinogen_chem_ind')

In [190]:
# Sum only numeric columns per indicator value; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
carc_grouped = carc_grouped.sum(numeric_only=True).dropna(how='any')

In [191]:
# Larger summed release first.
carc_grouped = carc_grouped.sort_values('total_release', ascending=False)

In [192]:
# Keep indicator values whose summed release is strictly positive.
carc_grouped_gtz = carc_grouped[carc_grouped['total_release'].gt(0)]

In [193]:
# Only the summed release column is needed for the pie chart.
carc_grouped_gtz = carc_grouped_gtz.loc[:, ['total_release']]

In [194]:
# Show the per-indicator totals (at most five rows).
carc_grouped_gtz.head(5)

Unnamed: 0_level_0,total_release
carcinogen_chem_ind,Unnamed: 1_level_1
N,404722400.0
Y,36912010.0


In [195]:
# Pie inputs: indicator values as slice labels, summed releases as slice sizes.
labels, values = carc_grouped_gtz.index, carc_grouped_gtz['total_release']

In [196]:
# Donut chart (hole=0.5) with every slice pulled out slightly (pull=0.1).
carcinogen_pie = go.Pie(labels=labels, values=values, hole=0.5, pull=0.1)
fig = go.Figure(data=[carcinogen_pie])
fig.update_layout(title_text='Total Release of Chemicals known to be Carcinogens')
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='carcinogen_amount.html', auto_open=False)

'carcinogen_amount.html'

# Grouped by Clean Air

In [181]:
# Split rows by the Clean Air Act indicator column.
clean_air_grouped = pollution_df.groupby('clean_air_act_chem_ind')

In [182]:
# Sum only numeric columns per indicator value; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
clean_air_grouped = clean_air_grouped.sum(numeric_only=True).dropna(how='any')

In [183]:
# Larger summed release first.
clean_air_grouped = clean_air_grouped.sort_values('total_release', ascending=False)

In [184]:
# Keep indicator values whose summed release is strictly positive.
clean_air_grouped_gtz = clean_air_grouped[clean_air_grouped['total_release'].gt(0)]

In [185]:
# Only the summed release column is needed for the pie chart.
clean_air_grouped_gtz = clean_air_grouped_gtz.loc[:, ['total_release']]

In [186]:
# Show the per-indicator totals (at most five rows).
clean_air_grouped_gtz.head(5)

Unnamed: 0_level_0,total_release
clean_air_act_chem_ind,Unnamed: 1_level_1
Y,307459200.0
N,134175100.0


In [187]:
# Pie inputs: indicator values as slice labels, summed releases as slice sizes.
labels, values = clean_air_grouped_gtz.index, clean_air_grouped_gtz['total_release']

In [188]:
# Donut chart (hole=0.5) with every slice pulled out slightly (pull=0.1).
clean_air_pie = go.Pie(labels=labels, values=values, hole=0.5, pull=0.1)
fig = go.Figure(data=[clean_air_pie])
fig.update_layout(title_text='Total Release of Chemicals monitored by the Clean Air Act')
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='clean_air_act_amount.html', auto_open=False)

'clean_air_act_amount.html'

# Grouped by 3350 Chemical

In [173]:
# Split rows by the chem_ind_3350 indicator column.
chem_3350_grouped = pollution_df.groupby('chem_ind_3350')

In [174]:
# Sum only numeric columns per indicator value; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
chem_3350_grouped = chem_3350_grouped.sum(numeric_only=True).dropna(how='any')

In [175]:
# Larger summed release first.
chem_3350_grouped = chem_3350_grouped.sort_values('total_release', ascending=False)

In [176]:
# Keep indicator values whose summed release is strictly positive.
chem_3350_grouped_gtz = chem_3350_grouped[chem_3350_grouped['total_release'].gt(0)]

In [177]:
# Only the summed release column is needed for the pie chart.
chem_3350_grouped_gtz = chem_3350_grouped_gtz.loc[:, ['total_release']]

In [178]:
# Show the per-indicator totals (at most five rows).
chem_3350_grouped_gtz.head(5)

Unnamed: 0_level_0,total_release
chem_ind_3350,Unnamed: 1_level_1
N,421023800.0
Y,20610570.0


In [179]:
# Pie inputs: indicator values as slice labels, summed releases as slice sizes.
labels, values = chem_3350_grouped_gtz.index, chem_3350_grouped_gtz['total_release']

In [180]:
# Donut chart (hole=0.5) with every slice pulled out slightly (pull=0.1).
chem_3350_pie = go.Pie(labels=labels, values=values, hole=0.5, pull=0.1)
fig = go.Figure(data=[chem_3350_pie])
fig.update_layout(title_text='Total Release of 3350 Chemicals')
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='chemicals_3350.html', auto_open=False)

'chemicals_3350.html'

# Grouped by County

In [164]:
# One group per county.
county_grouped = pollution_df.groupby('county')

In [165]:
# Sum only numeric columns per county; summing the string columns either
# concatenates text or raises on pandas >= 2.0.
county_grouped = county_grouped.sum(numeric_only=True).dropna(how='any')

In [166]:
# Counties with the largest summed release come first.
county_grouped = county_grouped.sort_values('total_release', ascending=False)

In [167]:
# Keep only counties with a strictly positive summed release.
county_grouped_gtz = county_grouped[county_grouped['total_release'].gt(0)]

In [168]:
# Narrow the frame to the column used for plotting.
county_grouped_gtz = county_grouped_gtz.loc[:, ['total_release']]

In [169]:
# First ten rows of the descending-sorted frame = ten largest county totals.
county_grouped_gtz = county_grouped_gtz.iloc[:10]

In [170]:
# County names (index) for the x axis, summed releases for the y axis.
x = county_grouped_gtz.index.to_numpy()
y = county_grouped_gtz['total_release'].to_numpy()
# Same column kept as a Series; the bar-chart cell below does not use it.
size = county_grouped_gtz['total_release']
# Echo the y values as the cell output.
y

array([56953026.73335309, 47034546.05228688, 46327526.87668534,
       31341142.37987931, 18254106.32648428, 13883288.91586049,
       13810589.879     , 13209686.894     , 12007121.6207    ,
       11340300.87214009])

In [171]:
# fig = px.scatter(x = labels, y = values)
# fig.show()

In [172]:
# Bar chart of the ten counties with the largest summed release.
fig = px.bar(county_grouped_gtz, x=x, y=y)
# Use title.font for the axis title size; the legacy `titlefont` shortcut is
# deprecated in plotly and rejected by newer releases.
fig.update_layout(
    title_text='Top 10 Counties and Total Chemicals Released',
    xaxis=dict(title=dict(text='County', font=dict(size=16))),
    yaxis=dict(
        title=dict(text='Amount Released (lbs)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='top_10_counties_amount.html', auto_open=False)

'top_10_counties_amount.html'

# In Depth Carcinogens

In [160]:
# Rows whose carcinogen indicator equals 'Y'.
carc_df = pollution_df[pollution_df['carcinogen_chem_ind'].eq('Y')]
carc_df.head(5)

Unnamed: 0,facility_name,doc_ctrl_num,reporting_year,chemical_name,release_estimate_amount,carcinogen_chem_ind,clean_air_act_chem_ind,chem_ind_3350,street_address,city,county,county_code,state,zip,region,total_release
5,STREATOR DEPENDABLE,1305203000000.0,2005,NICKEL,0.0,Y,Y,Y,410 W BROADWAY AVE,STREATOR,LASALLE,17099,IL,61364,5,
6,VAUGHAN & BUSHNELL MANUFACTURING CO,1305203000000.0,2005,LEAD,0.0,Y,Y,Y,201 W MAIN ST,BUSHNELL,MCDONOUGH,17109,IL,61422,5,
8,ENGIS CORP,1305203000000.0,2005,LEAD,0.75,Y,Y,Y,105 W. HINTZ ROAD,WHEELING,COOK,17031,IL,60090,5,0.75
15,ELECTRIC ENERGY INC,1305203000000.0,2005,DIOXIN AND DIOXIN-LIKE COMPOUNDS,0.0,Y,Y,N,2100 PORTLAND RD,JOPPA,MASSAC,17127,IL,62953,5,
17,PEORIA DISPOSAL CO #1,1305203000000.0,2005,POLYCHLORINATED BIPHENYLS,0.0,Y,Y,N,4349 W SOUTHPORT RD,PEORIA,PEORIA,17143,IL,61615,5,0.0


In [161]:
# Ten facilities with the largest summed carcinogen release.
# Select `total_release` before summing: a bare `.sum()` over every column tries to
# add up the string columns too, which concatenates text or raises on pandas >= 2.0.
facility_carc_grouped = (
    carc_df.groupby('facility_name')[['total_release']]
    .sum()
    .sort_values(by='total_release', ascending=False)
    .head(10)
)
facility_carc_grouped

Unnamed: 0_level_0,total_release
facility_name,Unnamed: 1_level_1
SABIC INNOVATIVE PLASTICS US LLC,3738250.0
TATE & LYLE DECATUR,2132662.0
BRP US INC,1553367.0
ADM DECATUR COMPLEX,1522089.0
IMPERIAL MARBLE CORP,1060922.0
INGREDION INC ARGO PLANT,958851.6
ADM,917209.8
FORD MOTOR CO CHICAGO ASSEMBLY,886879.6
WOOD RIVER REFINERY,846729.6
CROWNLINE BOATS,817611.3


In [162]:
# Facility names (index) for the x axis, summed carcinogen releases for the y axis.
x = facility_carc_grouped.index.to_numpy()
y = facility_carc_grouped['total_release'].to_numpy()

In [163]:
# Bar chart of the ten facilities with the largest summed carcinogen release.
fig = px.bar(facility_carc_grouped, x=x, y=y)
# The title now names carcinogens: the previous text duplicated the all-chemicals
# facility chart although this figure is built from carcinogen-only rows.
# Axis title sizes go through title.font; the legacy `titlefont` shortcut is
# deprecated in plotly and rejected by newer releases.
fig.update_layout(
    title_text='Top 10 Facilities by Carcinogen Release Amount',
    xaxis=dict(title=dict(text='Facility Name', font=dict(size=16))),
    yaxis=dict(
        title=dict(text='Amount Released (lbs)', font=dict(size=16)),
        tickfont=dict(size=14),
    ),
)
fig.show()
# Also write a standalone HTML copy; the returned file name is the cell output.
plotly.offline.plot(fig, filename='top_10_facilities_carcinogens.html', auto_open=False)

'top_10_facilities_carcinogens.html'