In [2]:
import pandas as pd

# About Programs

In [2]:
df = pd.read_csv('ICIS-AIR_PROGRAMS.csv')

In [3]:
df.head()

Unnamed: 0,PGM_SYS_ID,PROGRAM_CODE,PROGRAM_DESC,AIR_OPERATING_STATUS_CODE,AIR_OPERATING_STATUS_DESC
0,NH0000003300190201,CAASIP,State Implementation Plan for National Primary...,OPR,Operating
1,NH0000003300700012,CAASIP,State Implementation Plan for National Primary...,OPR,Operating
2,NH0000003300700013,CAASIP,State Implementation Plan for National Primary...,CLS,Permanently Closed
3,NH0000003300700013,CAANSPS,New Source Performance Standards,CLS,Permanently Closed
4,NH0000003300100001,CAAGACTM,40 CFR Part 63 Area Sources,OPR,Operating


In [3]:
def get_groupby_counts(df, col_name):
    tot_count = df[col_name].count()
    count_per_group = df[col_name].value_counts().to_frame()
    count_per_group['Proportion'] = count_per_group[col_name] / tot_count
    
    count_per_count = count_per_group[col_name].value_counts().to_frame()
    count_per_count['Proportion'] = count_per_count[col_name] / count_per_count[col_name].sum()
    return tot_count, count_per_group, count_per_count

In [59]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'PGM_SYS_ID')

In [60]:
tot_count

388371

In [62]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,148827,0.631793
2,44944,0.190794
3,27565,0.117018
4,7919,0.033617
5,3989,0.016934
6,1421,0.006032
7,520,0.002207
8,259,0.001099
9,90,0.000382
10,27,0.000115


In [35]:
# Over 60% of facilities are under 1 program program, over 80% are under 1-2 programs

In [63]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'PROGRAM_DESC')

In [64]:
count_per_group

Unnamed: 0,PROGRAM_DESC,Proportion
State Implementation Plan for National Primary and Secondary Ambient Air Quality Standards,213347,0.549338
MACT Standards (40 CFR Part 63),50259,0.12941
New Source Performance Standards,45041,0.115974
Title V Permits,28005,0.072109
Federally-Enforceable State Operating Permit - Non Title V,9510,0.024487
Stratospheric Ozone Protection,8245,0.02123
New Source Review Permit Requirements,8159,0.021008
40 CFR Part 63 Area Sources,7194,0.018524
Prevention of Significant Deterioration of Air Quality,7094,0.018266
National Emission Standards for Hazardous Air Pollutants (40 CFR Part 61),4926,0.012684


In [None]:
# Over 50% are under SIP

In [66]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'AIR_OPERATING_STATUS_DESC')

In [67]:
count_per_group

Unnamed: 0,AIR_OPERATING_STATUS_DESC,Proportion
Operating,277659,0.714932
Permanently Closed,101206,0.260591
Temporarily Closed,5444,0.014018
Planned Facility,2286,0.005886
Under Construction,1455,0.003746
Seasonal,321,0.000827


In [69]:
# Over 70% are operating, over a quarter are permanently closed!

# Program subparts

In [4]:
df3 = pd.read_csv('ICIS-AIR_PROGRAM_SUBPARTS.csv')

In [8]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df3, 'PGM_SYS_ID')

In [15]:
tot_count

154813

In [14]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,40540,0.55673
2,16223,0.222788
3,5827,0.080021
7,3168,0.043506
4,2924,0.040155
5,1701,0.02336
8,851,0.011687
6,797,0.010945
9,246,0.003378
10,111,0.001524


In [None]:
# Not all facilities are in a sub-program
# Over half the facilities are in 1, but some are in 20+

In [16]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df3, 'AIR_PROGRAM_SUBPART_DESC')

In [17]:
count_per_group

Unnamed: 0,AIR_PROGRAM_SUBPART_DESC,Proportion
MACT Part 63 - Subpart ZZZZ - STATIONARY RECIPROCATING INTERNAL COMBUSTION ENGINES (RICE),20916,0.135105
NSPS Part 60 - Subpart JJJJ - STATIONARY SPARK IGNITION INTERNAL COMBUSTION ENGINES,11485,0.074186
NSPS Part 60 - Subpart A - GENERAL PROVISIONS,8919,0.057611
NSPS Part 60 - Subpart IIII - STATIONARY COMPRESSION IGNITION INTERNAL COMBUSTION ENGINES,8842,0.057114
MACT Part 63 - Subpart M - DRY CLEANERS PERCHLOROETHYLENE,8744,0.056481
NSPS Part 60 - Subpart Dc - SMALL INDUS-COMMER-INSTITUTL STEAM GENERATING UNITS,6286,0.040604
NSPS Part 60 - Subpart OOO - NONMETALLIC MINERAL PROCESSING PLANTS,6264,0.040462
MACT Part 63 - Subpart A - GENERAL PROVISIONS,5922,0.038253
MACT Part 63 - Subpart HH - OIL AND NATURAL GAS PRODUCTION FACILITIES,5896,0.038085
"NSPS Part 60 - Subpart KKK - EQUIPT LEAK VOC ONSHORE NATURAL GAS PROC PLNT CONSTRUCT, RECONSTRUCT, MOD 01/20/1984 - 08/23/2011",3903,0.025211


# About Facilities

In [3]:
df2 = pd.read_csv('ICIS-AIR_FACILITIES.csv')

In [7]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'PGM_SYS_ID')

In [8]:
tot_count

236895

In [12]:
count_per_group.shape

(236895, 2)

In [13]:
# Each facility is only listed once

In [14]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'STATE')

In [17]:
count_per_group

Unnamed: 0,STATE,Proportion
CO,29552,0.124747
IL,20532,0.086671
LA,12850,0.054243
NY,12548,0.052969
OK,11347,0.047899
MD,11092,0.046822
PA,10000,0.042213
VA,8924,0.037671
OH,8883,0.037498
KS,7240,0.030562


In [None]:
# Colorado and Illinois have 12%, 8% each

In [19]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'FACILITY_TYPE_CODE')

In [20]:
count_per_group

Unnamed: 0,FACILITY_TYPE_CODE,Proportion
POF,161843,0.770773
NON,34655,0.165043
CNG,3566,0.016983
COR,3257,0.015511
CTG,2992,0.014249
STF,1350,0.006429
FDF,1338,0.006372
DIS,588,0.0028
TRB,244,0.001162
MWD,118,0.000562


In [21]:
# What do these mean?

In [22]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'AIR_POLLUTANT_CLASS_DESC')

In [23]:
count_per_group

Unnamed: 0,AIR_POLLUTANT_CLASS_DESC,Proportion
Minor Emissions,173366,0.738133
Synthetic Minor Emissions,33415,0.14227
Major Emissions,19235,0.081896
Emissions classification unknown,7449,0.031715
Not applicable,1312,0.005586
Other,94,0.0004


In [24]:
# The majority are minor emissions

In [25]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'CURRENT_HPV')

In [26]:
count_per_group

Unnamed: 0,CURRENT_HPV,Proportion
No Viol,234442,0.989645
Viol,1333,0.005627
Addrs-State,366,0.001545
Unaddr-State,325,0.001372
Addrs-EPA,178,0.000751
Addrs-Local,166,0.000701
Unaddr-Local,55,0.000232
Unaddr-EPA,30,0.000127


In [None]:
# Vast majority no viols!

# Air Pollutants

In [27]:
df4 = pd.read_csv('ICIS-AIR_POLLUTANTS.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df4, 'PGM_SYS_ID')

In [29]:
tot_count 

890517

In [31]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,89348,0.384468
2,48094,0.206950
3,22434,0.096534
6,12854,0.055311
4,11839,0.050944
5,10453,0.044980
7,7545,0.032466
8,7128,0.030672
9,4188,0.018021
10,3246,0.013968


In [None]:
# About 40% of facilities in the df have 1 pollutant, some have as many as 200
# Does not capture facilities with no pollutants

In [35]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df4,'POLLUTANT_DESC')

In [36]:
count_per_group

Unnamed: 0,POLLUTANT_DESC,Proportion
FACIL,124305,0.139587
VOLATILE ORGANIC COMPOUNDS (VOCS),115456,0.129651
TOTAL PARTICULATE MATTER,90744,0.101900
NITROGEN OXIDES NO2,70202,0.078833
Carbon monoxide,65150,0.073160
PARTICULATE MATTER < 10 UM,57478,0.064545
Sulfur dioxide,52570,0.059033
TOTAL HAZARDOUS AIR POLLUTANTS (HAPS),38393,0.043113
Formaldehyde,20166,0.022645
OTHER,13130,0.014744


In [43]:
#What are these pollutants?

In [37]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df4,'SRS_ID')

In [38]:
count_per_group

Unnamed: 0,SRS_ID,Proportion
761346,67850,0.091346
1647643.0,55940,0.075312
761346.0,47801,0.064354
1647619,39778,0.053553
167924,36013,0.048484
65052,35280,0.047497
1647643,34804,0.046857
167924.0,34188,0.046027
65052.0,29870,0.040214
150367.0,28605,0.038511


In [44]:
#What are these?

In [39]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df4,'AIR_POLLUTANT_CLASS_DESC')

In [40]:
count_per_group

Unnamed: 0,AIR_POLLUTANT_CLASS_DESC,Proportion
Minor Emissions,589355,0.661812
Synthetic Minor Emissions,112809,0.126678
Major Emissions,97786,0.109808
Emissions classification unknown,77775,0.087337
Not applicable,12712,0.014275
Other,80,9e-05


In [45]:
#Why are these different from the one under facilities? Because by pollutant?

# Full Compliance Evaluations (FCEs) and Partial Compliance Evaluations (PCEs) 

In [46]:
df5 = pd.read_csv('ICIS-AIR_FCES_PCES.csv')

In [50]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'PGM_SYS_ID')

In [51]:
tot_count

1371977

In [52]:
#Over a million evaluations! 

In [49]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,89348,0.384468
2,48094,0.206950
3,22434,0.096534
6,12854,0.055311
4,11839,0.050944
5,10453,0.044980
7,7545,0.032466
8,7128,0.030672
9,4188,0.018021
10,3246,0.013968


In [53]:
# 40% evaluated 1 time (do we know if any were not evaluated?)

In [54]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'ACTIVITY_ID')

In [57]:
count_per_group

Unnamed: 0,ACTIVITY_ID,Proportion
3600489543,5,3.644376e-06
3600489535,5,3.644376e-06
3600489539,5,3.644376e-06
3600489591,5,3.644376e-06
3600489588,5,3.644376e-06
3600489545,5,3.644376e-06
3600489583,4,2.915501e-06
3600491997,4,2.915501e-06
3600489586,4,2.915501e-06
3600491998,4,2.915501e-06


In [58]:
# What is an activity id? why are there duplicates?

In [59]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'STATE_EPA_FLAG')

In [60]:
count_per_group

Unnamed: 0,STATE_EPA_FLAG,Proportion
S,1209680,0.881706
L,120125,0.087556
E,42172,0.030738


In [61]:
# What does this mean?

In [64]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'ACTIVITY_TYPE_DESC')

In [65]:
count_per_group  #Everything is inspection/evaluation

Unnamed: 0,ACTIVITY_TYPE_DESC,Proportion
Inspection/Evaluation,1371977,1.0


In [66]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'COMP_MONITOR_TYPE_DESC')

In [67]:
count_per_group

Unnamed: 0,COMP_MONITOR_TYPE_DESC,Proportion
FCE On-Site,496876,0.3621606
PCE Off-Site,434206,0.316482
PCE On-Site,313063,0.2281839
PCE On-Site Record/Report Review,79186,0.05771671
PCE On-Site Monitoring/Sampling,31875,0.0232329
FCE Off-Site,11564,0.008428713
PCE On-Site Visible Emission Observation,3260,0.002376133
PCE On-Site Interview,1419,0.001034274
PCE On-Site CEMS/CMS Audit,527,0.0003841172
PCE On-Site Fenceline/Ambient Monitoring,1,7.288752e-07


In [68]:
#What do these mean?

In [69]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'PROGRAM_CODES')

In [70]:
count_per_group   #All the different combinations? 833 of them!

Unnamed: 0,PROGRAM_CODES,Proportion
CAASIP,594528,4.333370e-01
CAATVP,134001,9.767008e-02
"CAANSPS, CAASIP",106359,7.752249e-02
"CAASIP, CAATVP",97541,7.109527e-02
"CAAMACT, CAASIP",55538,4.048030e-02
"CAAMACT, CAANSPS, CAASIP, CAATVP",39589,2.885546e-02
"CAAMACT, CAASIP, CAATVP",34616,2.523076e-02
"CAANSPS, CAASIP, CAATVP",33788,2.462725e-02
"CAAMACT, CAANSPS, CAASIP",29059,2.118040e-02
CAANSPS,24001,1.749375e-02


In [71]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df5,'ACTUAL_END_DATE')

In [72]:
count_per_group  # How can we make this useful for us?

Unnamed: 0,ACTUAL_END_DATE,Proportion
09-30-2003,2979,2.171319e-03
09-30-2005,2567,1.871023e-03
09-30-2004,2114,1.540842e-03
09-30-2002,1992,1.451919e-03
09-30-2008,1703,1.241274e-03
09-30-2009,1685,1.228155e-03
09-30-2013,1523,1.110077e-03
09-30-2010,1502,1.094771e-03
09-30-2011,1479,1.078006e-03
09-30-2014,1354,9.868970e-04


# Stack Tests

In [73]:
df6 = pd.read_csv('ICIS-AIR_STACK_TESTS.csv')

In [75]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'PGM_SYS_ID')

In [76]:
tot_count

358508

In [78]:
count_per_count  #A quarter have 1 air-stacks test, some with over 600 (but are all facilities represented?) 

Unnamed: 0,PGM_SYS_ID,Proportion
1,7664,0.273734
2,4206,0.150225
3,3320,0.118580
4,1843,0.065826
6,1205,0.043039
5,1162,0.041503
7,792,0.028288
8,685,0.024466
9,558,0.019930
10,493,0.017608


In [79]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'ACTIVITY_ID')

In [81]:
# Activity IDs are unique

In [82]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'COMP_MONITOR_TYPE_DESC')

In [84]:
# All are Stack Tests

In [85]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'STATE_EPA_FLAG')

In [86]:
count_per_group

Unnamed: 0,STATE_EPA_FLAG,Proportion
S,296275,0.826411
L,59412,0.16572
E,2821,0.007869


In [88]:
#What does this mean?

In [89]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'ACTUAL_END_DATE')

In [90]:
count_per_group #How to make this useful?

Unnamed: 0,ACTUAL_END_DATE,Proportion
05/06/2008,359,0.001001
09/01/2006,343,0.000957
07/08/2014,291,0.000812
06/10/2014,287,0.000801
04/16/2014,278,0.000775
12/15/2009,278,0.000775
06/17/2014,274,0.000764
09/24/2013,270,0.000753
05/21/2014,257,0.000717
04/10/2014,253,0.000706


In [91]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'POLLUTANT_CODES')

In [92]:
count_per_group  #How are these different from the air pollutants test?

Unnamed: 0,POLLUTANT_CODES,Proportion
TOTAL PARTICULATE MATTER,37678,0.160473
Carbon monoxide,35349,0.150553
NITROGEN OXIDES NO2,34777,0.148117
VOLATILE ORGANIC COMPOUNDS (VOCS),25734,0.109602
FACIL,15353,0.065389
PARTICULATE MATTER < 10 UM,11846,0.050453
Sulfur dioxide,11451,0.048770
NITROGEN OXIDES,10607,0.045176
VISIBLE EMISSIONS,9110,0.038800
NITROGEN OXIDES NO,3591,0.015294


In [93]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df6,'AIR_STACK_TEST_STATUS_DESC')

In [94]:
count_per_group  #Majority are pass

Unnamed: 0,AIR_STACK_TEST_STATUS_DESC,Proportion
Pass,300388,0.951471
Fail,12438,0.039397
Pending,2852,0.009034
Incomplete,31,9.8e-05


# TITLE V

In [95]:
df7 = pd.read_csv('ICIS-AIR_TITLEV_CERTS.csv')

In [99]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df7,'PGM_SYS_ID')

In [102]:
tot_count

1720800

In [103]:
count_per_count # facilities have more than one title v certification? why?

Unnamed: 0,PGM_SYS_ID,Proportion
5,1173,0.053946
10,1100,0.050589
80,1061,0.048795
85,958,0.044058
30,901,0.041437
20,876,0.040287
75,772,0.035504
90,763,0.035090
25,762,0.035044
95,733,0.033710


In [104]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df7,'ACTIVITY_ID')

In [105]:
count_per_group  #why duplicate activity ids?

Unnamed: 0,ACTIVITY_ID,Proportion
3600014564,35,0.000020
3600014562,35,0.000020
3600014568,30,0.000017
3600014560,30,0.000017
3600014567,30,0.000017
3600014563,30,0.000017
3600014565,30,0.000017
3600014559,30,0.000017
3600014566,25,0.000015
3600014573,25,0.000015


In [106]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df7,'COMP_MONITOR_TYPE_DESC')

In [107]:
 count_per_group

Unnamed: 0,COMP_MONITOR_TYPE_DESC,Proportion
TV ACC Receipt/Review,1720800,1.0


In [108]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df7,'FACILITY_RPT_DEVIATION_FLAG')

In [109]:
count_per_group

Unnamed: 0,FACILITY_RPT_DEVIATION_FLAG,Proportion
N,420835,0.665383
Y,211635,0.334617


In [110]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df7,'ACTUAL_END_DATE')

In [111]:
count_per_group

Unnamed: 0,ACTUAL_END_DATE,Proportion
01/30/2012,4915,0.002907
01/30/2013,4755,0.002812
01/30/2014,3935,0.002327
01/31/2011,3790,0.002242
01/30/2017,3690,0.002182
01/30/2008,3275,0.001937
01/30/2006,3245,0.001919
01/31/2012,3240,0.001916
01/30/2009,3120,0.001845
03/31/2008,3110,0.001839


# Formal actions

In [117]:
df8 = pd.read_csv('ICIS-AIR_FORMAL_ACTIONS.csv')

In [118]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'PGM_SYS_ID')

In [119]:
tot_count

77839

In [124]:
count_per_group.head()

Unnamed: 0,PGM_SYS_ID,Proportion
CASJV00006029S0037,273,0.003507
CASJV00006029S0033,264,0.003392
CASJV00006029S1547,226,0.002903
NJ0000003401500002,198,0.002544
CASJV00006029S1128,190,0.002441


In [121]:
count_per_count  #majority have one, but DOES NOT CAPTURE FACILITIES WITH NO ACTION

Unnamed: 0,PGM_SYS_ID,Proportion
1,16329,0.546340
2,6032,0.201820
3,2874,0.096159
4,1450,0.048514
5,892,0.029845
6,565,0.018904
7,355,0.011878
8,275,0.009201
9,194,0.006491
10,157,0.005253


In [125]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'ACTIVITY_ID')

In [126]:
count_per_group #why id not unique?

Unnamed: 0,ACTIVITY_ID,Proportion
3600354879,115,0.001477
3400279927,47,0.000604
33331,30,0.000385
3600139186,28,0.000360
130208,24,0.000308
3600900873,19,0.000244
32040,16,0.000206
111364,15,0.000193
31941,14,0.000180
176184,14,0.000180


In [127]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'ACTIVITY_TYPE_DESC')

In [128]:
count_per_group   #marjority are not judicial

Unnamed: 0,ACTIVITY_TYPE_DESC,Proportion
Administrative - Formal,73612,0.945696
Judicial,4227,0.054304


In [131]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'ENF_TYPE_DESC')

In [132]:
count_per_group

Unnamed: 0,ENF_TYPE_DESC,Proportion
Administrative Order,62742,0.806048
CAA 113D1 Action For Penalty,5821,0.074783
Civil Judicial Action,4201,0.05397
CAA 113A Admin Compliance Order (Non-Penalty),4076,0.052364
CAA 113D Withdrawn,591,0.007593
CAA 120 AO For Noncompliance Penalty,99,0.001272
CAA 113D1 Action For Penalty - 112(r) Expedited Settlement Program,49,0.00063
EPCRA 325 Action For Penalty,48,0.000617
CAA 113D Delayed Comp Ord (Old),34,0.000437
RCRA 3008A AO For Comp And/Or Penalty,32,0.000411


In [133]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'SETTLEMENT_ENTERED_DATE')

In [134]:
count_per_group

Unnamed: 0,SETTLEMENT_ENTERED_DATE,Proportion
10/20/2001,251,0.003225
08/16/2013,189,0.002429
03/07/2006,180,0.002313
12/29/2005,177,0.002275
09/27/2005,116,0.001491
03/21/2003,98,0.001259
07/01/2004,92,0.001182
11/22/2013,81,0.001041
04/29/2003,80,0.001028
04/01/1994,72,0.000925


In [135]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df8, 'PENALTY_AMOUNT')

In [136]:
count_per_group  #this would be useful as discrete variables

Unnamed: 0,PENALTY_AMOUNT,Proportion
0.00,28448,0.365472
1000.00,2046,0.026285
2000.00,1809,0.023240
500.00,1748,0.022457
5000.00,1561,0.020054
3000.00,1350,0.017343
10000.00,1219,0.015661
1500.00,1100,0.014132
4000.00,1033,0.013271
2500.00,866,0.011126


# Informal actions

In [139]:
df9 = pd.read_csv('ICIS-AIR_INFORMAL_ACTIONS.csv')

In [140]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df9, 'PGM_SYS_ID')

In [142]:
count_per_count  # majority have just 1 informal action against it (but does not capture those with none)

Unnamed: 0,PGM_SYS_ID,Proportion
1,25187,0.515282
2,10231,0.209309
3,4855,0.099325
4,2685,0.054930
5,1670,0.034165
6,1048,0.021440
7,767,0.015691
8,508,0.010393
9,367,0.007508
10,281,0.005749


In [143]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df9, 'ACTIVITY_ID')

In [144]:
count_per_group  #why not unique?

Unnamed: 0,ACTIVITY_ID,Proportion
3601009859,5,0.000039
3600537782,3,0.000023
3600751076,3,0.000023
2200014091,3,0.000023
3600609880,3,0.000023
3600579401,2,0.000015
3600068799,2,0.000015
3600609814,2,0.000015
3600609768,2,0.000015
3600499594,2,0.000015


In [145]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df9, 'ACTIVITY_TYPE_DESC')

In [146]:
count_per_group

Unnamed: 0,ACTIVITY_TYPE_DESC,Proportion
Administrative - Informal,129310,1.0


In [147]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df9, 'ENF_TYPE_DESC')

In [148]:
count_per_group

Unnamed: 0,ENF_TYPE_DESC,Proportion
Notice of Violation,124049,0.959315
Warning Letter,5189,0.040128
Letter to Regulated Entity,31,0.00024
Notice of Determination,18,0.000139
Notice of Noncompliance Issued,11,8.5e-05
Letter of Violation/ Warning Letter,6,4.6e-05
Show Cause Letter,2,1.5e-05
Information Request Letter,2,1.5e-05
Oral Notification of Violation,1,8e-06
Agency Enforcement Review,1,8e-06


In [149]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df9, 'ACHIEVED_DATE')

In [150]:
count_per_group

Unnamed: 0,ACHIEVED_DATE,Proportion
05/22/2012,288,0.002234
06/20/1997,164,0.001272
02/03/2006,133,0.001031
07/15/1994,118,0.000915
06/21/2000,111,0.000861
12/06/2006,107,0.000830
10/26/2009,106,0.000822
11/27/2007,97,0.000752
05/26/2015,94,0.000729
06/25/2014,92,0.000714


# Air violation history

In [151]:
df10 = pd.read_csv('ICIS-AIR_VIOLATION_HISTORY.csv')

In [152]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10, 'PGM_SYS_ID')

In [154]:
count_per_count  #majority of facilities just one violation

Unnamed: 0,PGM_SYS_ID,Proportion
1,14600,0.604755
2,4369,0.180971
3,1882,0.077955
4,1049,0.043451
5,616,0.025516
6,350,0.014498
7,267,0.011060
8,179,0.007414
9,122,0.005053
10,96,0.003976


In [155]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'ACTIVITY_ID')

In [157]:
# activity ids are unique

In [158]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'AGENCY_TYPE_DESC')

In [159]:
count_per_group

Unnamed: 0,AGENCY_TYPE_DESC,Proportion
State,43116,0.727623
Local,11323,0.191086
U.S. EPA,4810,0.081173
Tribal,7,0.000118


In [160]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'STATE_CODE')

In [161]:
count_per_group

Unnamed: 0,STATE_CODE,Proportion
CA,8956,0.164487
TX,4318,0.079305
PA,3915,0.071903
MI,3352,0.061563
NY,2812,0.051646
IL,2229,0.040938
OH,2220,0.040773
NC,2166,0.039781
CO,1988,0.036512
NJ,1687,0.030984


In [162]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'AIR_LCON_CODE')

In [163]:
count_per_group  #what is this?

Unnamed: 0,AIR_LCON_CODE,Proportion
SJV,4823,0.427041
SCA,1807,0.159996
BAA,1132,0.10023
ACH,597,0.05286
PAM,576,0.051001
PSC,218,0.019302
VCA,218,0.019302
SAC,150,0.013281
MDA,149,0.013193
SDS,134,0.011865


In [164]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'COMP_DETERMINATION_UID')

In [166]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'ENF_RESPONSE_POLICY_CODE')

In [167]:
count_per_group

Unnamed: 0,ENF_RESPONSE_POLICY_CODE,Proportion
HPV,35489,0.59891
FRV,23767,0.40109


In [168]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'PROGRAM_CODES')

In [169]:
count_per_group

Unnamed: 0,PROGRAM_CODES,Proportion
CAASIP,24544,0.414203
CAATVP,14081,0.237630
CAASIP CAATVP,4380,0.073917
CAANSPS,2630,0.044384
CAAMACT,2506,0.042291
CAANSPS CAASIP,1124,0.018969
CAAPSD,850,0.014345
CAAMACT CAASIP CAATVP,694,0.011712
CAANSR,690,0.011644
CAANSPS CAASIP CAATVP,643,0.010851


In [173]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df10,'POLLUTANT_DESCS')

In [174]:
count_per_group

Unnamed: 0,POLLUTANT_DESCS,Proportion
FACIL,18500,0.348268
VOLATILE ORGANIC COMPOUNDS (VOCS),6865,0.129236
TOTAL PARTICULATE MATTER,3604,0.067846
NITROGEN OXIDES NO2,2540,0.047816
PARTICULATE MATTER < 10 UM,2064,0.038855
ADMIN,1929,0.036314
Sulfur dioxide,1858,0.034977
TOTAL HAZARDOUS AIR POLLUTANTS (HAPS),1621,0.030516
Carbon monoxide,1413,0.026600
Chromium,977,0.018392


In [None]:
#There are dates in this file -- super useful