In [2]:
import pandas as pd

# About Programs

In [2]:
df = pd.read_csv('ICIS-AIR_PROGRAMS.csv')

In [3]:
df.head()

Unnamed: 0,PGM_SYS_ID,PROGRAM_CODE,PROGRAM_DESC,AIR_OPERATING_STATUS_CODE,AIR_OPERATING_STATUS_DESC
0,NH0000003300190201,CAASIP,State Implementation Plan for National Primary...,OPR,Operating
1,NH0000003300700012,CAASIP,State Implementation Plan for National Primary...,OPR,Operating
2,NH0000003300700013,CAASIP,State Implementation Plan for National Primary...,CLS,Permanently Closed
3,NH0000003300700013,CAANSPS,New Source Performance Standards,CLS,Permanently Closed
4,NH0000003300100001,CAAGACTM,40 CFR Part 63 Area Sources,OPR,Operating


In [3]:
def get_groupby_counts(df, col_name):
    """Summarise how often each value of ``col_name`` occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarise.
    col_name : str
        Name of the column whose values are counted.

    Returns
    -------
    tot_count : int
        Number of non-null entries in ``df[col_name]``.
    count_per_group : pandas.DataFrame
        Indexed by distinct value, most frequent first. Column ``col_name``
        holds the number of rows with that value; ``'Proportion'`` is that
        number divided by ``tot_count``.
    count_per_count : pandas.DataFrame
        Indexed by a per-value row count. Column ``col_name`` holds how many
        distinct values have exactly that many rows; ``'Proportion'`` is that
        number divided by the number of distinct values.
    """
    values = df[col_name]
    tot_count = values.count()

    # Name the count column explicitly: pandas >= 2.0 calls the result of
    # value_counts() "count" (and moves the original name onto the index),
    # so relying on the implicit column name raises KeyError there.
    count_per_group = values.value_counts().to_frame(name=col_name)
    # Keep the index unnamed so it never shares a label with the count column.
    count_per_group.index.name = None
    count_per_group['Proportion'] = count_per_group[col_name] / tot_count

    count_per_count = count_per_group[col_name].value_counts().to_frame(name=col_name)
    count_per_count.index.name = None
    count_per_count['Proportion'] = count_per_count[col_name] / count_per_count[col_name].sum()
    return tot_count, count_per_group, count_per_count

In [59]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'PGM_SYS_ID')

In [60]:
tot_count

388371

In [62]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,148827,0.631793
2,44944,0.190794
3,27565,0.117018
4,7919,0.033617
5,3989,0.016934
6,1421,0.006032
7,520,0.002207
8,259,0.001099
9,90,0.000382
10,27,0.000115


In [35]:
# Over 60% of facilities are under 1 program, over 80% are under 1-2 programs

In [63]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'PROGRAM_DESC')

In [64]:
count_per_group

Unnamed: 0,PROGRAM_DESC,Proportion
State Implementation Plan for National Primary and Secondary Ambient Air Quality Standards,213347,0.549338
MACT Standards (40 CFR Part 63),50259,0.12941
New Source Performance Standards,45041,0.115974
Title V Permits,28005,0.072109
Federally-Enforceable State Operating Permit - Non Title V,9510,0.024487
Stratospheric Ozone Protection,8245,0.02123
New Source Review Permit Requirements,8159,0.021008
40 CFR Part 63 Area Sources,7194,0.018524
Prevention of Significant Deterioration of Air Quality,7094,0.018266
National Emission Standards for Hazardous Air Pollutants (40 CFR Part 61),4926,0.012684


In [None]:
# Over 50% are under SIP

In [66]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df, 'AIR_OPERATING_STATUS_DESC')

In [67]:
count_per_group

Unnamed: 0,AIR_OPERATING_STATUS_DESC,Proportion
Operating,277659,0.714932
Permanently Closed,101206,0.260591
Temporarily Closed,5444,0.014018
Planned Facility,2286,0.005886
Under Construction,1455,0.003746
Seasonal,321,0.000827


In [69]:
# Over 70% are operating, over a quarter are permanently closed!

# Program subparts

In [4]:
df3 = pd.read_csv('ICIS-AIR_PROGRAM_SUBPARTS.csv')

In [8]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df3, 'PGM_SYS_ID')

In [15]:
tot_count

154813

In [14]:
count_per_count

Unnamed: 0,PGM_SYS_ID,Proportion
1,40540,0.55673
2,16223,0.222788
3,5827,0.080021
7,3168,0.043506
4,2924,0.040155
5,1701,0.02336
8,851,0.011687
6,797,0.010945
9,246,0.003378
10,111,0.001524


In [None]:
# Not all facilities are in a sub-program
# Over half the facilities are in 1, but some are in 20+

In [16]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df3, 'AIR_PROGRAM_SUBPART_DESC')

In [17]:
count_per_group

Unnamed: 0,AIR_PROGRAM_SUBPART_DESC,Proportion
MACT Part 63 - Subpart ZZZZ - STATIONARY RECIPROCATING INTERNAL COMBUSTION ENGINES (RICE),20916,0.135105
NSPS Part 60 - Subpart JJJJ - STATIONARY SPARK IGNITION INTERNAL COMBUSTION ENGINES,11485,0.074186
NSPS Part 60 - Subpart A - GENERAL PROVISIONS,8919,0.057611
NSPS Part 60 - Subpart IIII - STATIONARY COMPRESSION IGNITION INTERNAL COMBUSTION ENGINES,8842,0.057114
MACT Part 63 - Subpart M - DRY CLEANERS PERCHLOROETHYLENE,8744,0.056481
NSPS Part 60 - Subpart Dc - SMALL INDUS-COMMER-INSTITUTL STEAM GENERATING UNITS,6286,0.040604
NSPS Part 60 - Subpart OOO - NONMETALLIC MINERAL PROCESSING PLANTS,6264,0.040462
MACT Part 63 - Subpart A - GENERAL PROVISIONS,5922,0.038253
MACT Part 63 - Subpart HH - OIL AND NATURAL GAS PRODUCTION FACILITIES,5896,0.038085
"NSPS Part 60 - Subpart KKK - EQUIPT LEAK VOC ONSHORE NATURAL GAS PROC PLNT CONSTRUCT, RECONSTRUCT, MOD 01/20/1984 - 08/23/2011",3903,0.025211


# About Facilities

In [3]:
df2 = pd.read_csv('ICIS-AIR_FACILITIES.csv')

In [7]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'PGM_SYS_ID')

In [8]:
tot_count

236895

In [12]:
count_per_group.shape

(236895, 2)

In [13]:
# Each facility is only listed once

In [14]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'STATE')

In [17]:
count_per_group

Unnamed: 0,STATE,Proportion
CO,29552,0.124747
IL,20532,0.086671
LA,12850,0.054243
NY,12548,0.052969
OK,11347,0.047899
MD,11092,0.046822
PA,10000,0.042213
VA,8924,0.037671
OH,8883,0.037498
KS,7240,0.030562


In [None]:
# Colorado and Illinois account for about 12% and 9% of facilities, respectively

In [19]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'FACILITY_TYPE_CODE')

In [20]:
count_per_group

Unnamed: 0,FACILITY_TYPE_CODE,Proportion
POF,161843,0.770773
NON,34655,0.165043
CNG,3566,0.016983
COR,3257,0.015511
CTG,2992,0.014249
STF,1350,0.006429
FDF,1338,0.006372
DIS,588,0.0028
TRB,244,0.001162
MWD,118,0.000562


In [21]:
# What do these mean?

In [22]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'AIR_POLLUTANT_CLASS_DESC')

In [23]:
count_per_group

Unnamed: 0,AIR_POLLUTANT_CLASS_DESC,Proportion
Minor Emissions,173366,0.738133
Synthetic Minor Emissions,33415,0.14227
Major Emissions,19235,0.081896
Emissions classification unknown,7449,0.031715
Not applicable,1312,0.005586
Other,94,0.0004


In [24]:
# The majority are minor emissions

In [25]:
tot_count, count_per_group, count_per_count = get_groupby_counts(df2, 'CURRENT_HPV')

In [26]:
count_per_group

Unnamed: 0,CURRENT_HPV,Proportion
No Viol,234442,0.989645
Viol,1333,0.005627
Addrs-State,366,0.001545
Unaddr-State,325,0.001372
Addrs-EPA,178,0.000751
Addrs-Local,166,0.000701
Unaddr-Local,55,0.000232
Unaddr-EPA,30,0.000127


In [None]:
# The vast majority (~99%) have no violations!