# Generate feature: count of previous years' HPVs

In [178]:
import csv
import pandas as pd
import numpy as np
import processing 
from datetime import datetime as dt
import pylab as pl
import matplotlib.pyplot as plt
%matplotlib inline

In [179]:
def convert_to_datetime(series_row, date_format):
    """Parse a single date-column value into a ``datetime``.

    Args:
        series_row: One cell of the date column. A value whose string form is
            ``'nan'`` is treated as missing.
        date_format: ``strptime`` format string describing ``series_row``.

    Returns:
        ``float('nan')`` for a missing value, otherwise the result of
        ``datetime.strptime(series_row, date_format)``.
    """
    is_missing = str(series_row) == 'nan'
    return float('nan') if is_missing else dt.strptime(series_row, date_format)
    

def convert_to_month(series_row):
    """Return the month of a datetime-like value as a string.

    Values whose string form is ``'NaT'`` or ``'nan'`` are treated as missing
    and produce ``float('nan')``; any other value must expose a ``.month``
    attribute, which is returned via ``str()`` (no zero padding).
    """
    if str(series_row) in ('NaT', 'nan'):
        return float('nan')
    return str(series_row.month)

def convert_to_year(series_row):
    """Return the year of a datetime-like value as a string.

    Values whose string form is ``'NaT'`` or ``'nan'`` are treated as missing
    and produce ``float('nan')``; any other value must expose a ``.year``
    attribute, which is returned via ``str()``.
    """
    missing_markers = ('NaT', 'nan')
    is_missing = str(series_row) in missing_markers
    return float('nan') if is_missing else str(series_row.year)

def get_month_year_col(df, date_column, date_format):
    """Add parsed-date, month and year columns derived from ``date_column``.

    Three columns are written onto ``df`` itself (the frame is modified in
    place and also returned):

    * ``<date_column>_datetime`` - each value run through ``convert_to_datetime``
    * ``<date_column>_month``    - ``convert_to_month`` of the parsed value
    * ``<date_column>_year``     - ``convert_to_year`` of the parsed value

    Args:
        df: DataFrame containing ``date_column``.
        date_column: Name of the column holding the raw date values.
        date_format: ``strptime`` format string for the raw values.

    Returns:
        The same ``df`` object with the three columns added.
    """
    datetime_col = date_column + '_datetime'
    df[datetime_col] = df[date_column].apply(convert_to_datetime, date_format=date_format)

    # Month and year are both derived from the freshly parsed column.
    parsed = df[datetime_col]
    df[date_column + '_month'] = parsed.apply(convert_to_month)
    df[date_column + '_year'] = parsed.apply(convert_to_year)
    return df

In [180]:
# Read the two input CSVs (violation history and FCE/PCE records) from the working directory
violation_history_csv = 'ICIS-AIR_VIOLATION_HISTORY.csv'
fce_pce_csv = 'ICIS-AIR_FCES_PCES.csv'
violhist = pd.read_csv(violation_history_csv)
fce = pd.read_csv(fce_pce_csv)

In [181]:
# List the column names of the violation-history frame
violhist.columns

Index(['PGM_SYS_ID', 'ACTIVITY_ID', 'AGENCY_TYPE_DESC', 'STATE_CODE',
       'AIR_LCON_CODE', 'COMP_DETERMINATION_UID', 'ENF_RESPONSE_POLICY_CODE',
       'PROGRAM_CODES', 'PROGRAM_DESCS', 'POLLUTANT_CODES', 'POLLUTANT_DESCS',
       'EARLIEST_FRV_DETERM_DATE', 'HPV_DAYZERO_DATE', 'HPV_RESOLVED_DATE'],
      dtype='object')

In [182]:
# List the column names of the FCE/PCE frame
fce.columns

Index(['PGM_SYS_ID', 'ACTIVITY_ID', 'STATE_EPA_FLAG', 'ACTIVITY_TYPE_CODE',
       'ACTIVITY_TYPE_DESC', 'COMP_MONITOR_TYPE_CODE',
       'COMP_MONITOR_TYPE_DESC', 'ACTUAL_END_DATE', 'PROGRAM_CODES'],
      dtype='object')

In [183]:
# Add <col>_datetime, <col>_month and <col>_year for each date field of the violation history
for date_col in ('EARLIEST_FRV_DETERM_DATE', 'HPV_DAYZERO_DATE', 'HPV_RESOLVED_DATE'):
    violhist = get_month_year_col(violhist, date_col, '%m-%d-%Y')

In [184]:
# Remove FRVs: keep only rows whose ENF_RESPONSE_POLICY_CODE is not 'FRV'
is_frv = violhist['ENF_RESPONSE_POLICY_CODE'] == 'FRV'
violhist = violhist[~is_frv]

In [185]:
# Steps before merge
# `violhist` is the result of a boolean-mask filter at this point, so the new
# column is built with .assign(), which returns a fresh frame. Plain item
# assignment on a filtered slice triggers pandas' SettingWithCopyWarning.
# 'year' is the year of HPV_DAYZERO_DATE (a string, or NaN when the date is missing).
violhist = violhist.assign(year=violhist['HPV_DAYZERO_DATE_year'])

# Add ACTUAL_END_DATE_datetime / _month / _year to the FCE/PCE frame
fce = get_month_year_col(fce, 'ACTUAL_END_DATE', '%m-%d-%Y')

In [186]:
# Left merge into the violation history: every violation row is kept, and the
# FCE/PCE columns are filled only where the same PGM_SYS_ID has an FCE/PCE row
# whose ACTUAL_END_DATE equals the violation's HPV_DAYZERO_DATE (NaN otherwise).
merged_hpv_fce = violhist.merge(
    fce,
    how='left',
    left_on=['PGM_SYS_ID', 'HPV_DAYZERO_DATE'],
    right_on=['PGM_SYS_ID', 'ACTUAL_END_DATE'],
)

In [187]:
# Preview the first three rows of the merged frame
merged_hpv_fce.head(3)

Unnamed: 0,PGM_SYS_ID,ACTIVITY_ID_x,AGENCY_TYPE_DESC,STATE_CODE,AIR_LCON_CODE,COMP_DETERMINATION_UID,ENF_RESPONSE_POLICY_CODE,PROGRAM_CODES_x,PROGRAM_DESCS,POLLUTANT_CODES,...,STATE_EPA_FLAG,ACTIVITY_TYPE_CODE,ACTIVITY_TYPE_DESC,COMP_MONITOR_TYPE_CODE,COMP_MONITOR_TYPE_DESC,ACTUAL_END_DATE,PROGRAM_CODES_y,ACTUAL_END_DATE_datetime,ACTUAL_END_DATE_month,ACTUAL_END_DATE_year
0,CT0000000900700108,3400302038,State,CT,,CT000A0000090070010800028,HPV,CAASIP,State Implementation Plan for National Primary...,300000005 300000323,...,,,,,,,,,,
1,CT0000000900900110,3400302044,State,CT,,CT000A0000090090011000026,HPV,CAASIP,State Implementation Plan for National Primary...,300000005 300000323,...,,,,,,,,,,
2,CT0000000900900110,3400302045,State,CT,,CT000A0000090090011000037,HPV,CAASIP,State Implementation Plan for National Primary...,10461 300000005,...,,,,,,,,,,


In [188]:
# Find violations that resulted from something other than an inspection.
# Violations with no matching FCE/PCE row come out of the left merge with NaN
# in COMP_MONITOR_TYPE_CODE; those are marked with 0 and then selected.
#
# The fill is written back with explicit column assignment. Calling
# fillna(..., inplace=True) on an attribute-accessed column is a chained
# in-place update: pandas deprecates it, and under copy-on-write it leaves the
# frame untouched, which would make the `== 0` filter below return no rows.
merged_hpv_fce['COMP_MONITOR_TYPE_CODE'] = merged_hpv_fce['COMP_MONITOR_TYPE_CODE'].fillna(0)
viol_by_other = merged_hpv_fce[merged_hpv_fce['COMP_MONITOR_TYPE_CODE'] == 0]  # violations not resulting from inspections

In [190]:
# Keep only the facility/year keys, then count non-inspection HPV rows per key
key_cols = ['PGM_SYS_ID', 'year']
viol_by_other = viol_by_other[key_cols]
hpv_counts = viol_by_other.groupby(key_cols).size()
viol_other_year = hpv_counts.reset_index()  # the count lands in a column named 0
violhist2 = violhist[key_cols]

In [192]:
# Outer merge of every (facility, year) violation row with the per-key
# non-inspection HPV counts.
merged_viols = pd.merge(violhist2, viol_other_year, how='outer', on=['PGM_SYS_ID', 'year'])

# reset_index() left the count in a column named 0; give both columns their final names.
merged_viols = merged_viols.rename(columns={'year': 'Year', 0: 'NonInspection_HPV_Count'})

# Keys with no count row get 0. Assigned back explicitly rather than through
# `merged_viols.NonInspection_HPV_Count.fillna(0, inplace=True)`: that chained
# in-place form is deprecated and does not update the frame under copy-on-write.
merged_viols['NonInspection_HPV_Count'] = merged_viols['NonInspection_HPV_Count'].fillna(0)

In [197]:
# Compare the (rows, columns) shape of the merged frame with the violation history
print(merged_viols.shape, violhist.shape)

(35489, 3) (35489, 24)


In [199]:
merged_viols.isnull().sum()  # nulls per column; 34 rows have no Year (derived from HPV_DAYZERO_DATE - presumably blank in the source file)

PGM_SYS_ID                  0
Year                       34
NonInspection_HPV_Count     0
dtype: int64

In [202]:
# Discard every row that contains a null in any column
merged_viols = merged_viols.dropna(axis='index', how='any')

In [203]:
# Shape after dropping rows with nulls
merged_viols.shape

(35455, 3)

In [204]:
# Final result: count of non-inspection HPVs by facility by year.
# Show a preview only; `merged_viols` itself is unchanged and still holds every row.
merged_viols.head(10)

Unnamed: 0,PGM_SYS_ID,Year,NonInspection_HPV_Count
0,CT0000000900700108,1996,1.0
1,CT0000000900900110,1996,1.0
2,CT0000000900900110,1998,2.0
3,CT0000000900900110,1998,2.0
4,CT0000000900300125,2005,0.0
5,CT0000000900508907,2010,0.0
6,CT0000000900100078,2007,1.0
7,CT0000000901501125,2013,1.0
8,CT0000000900300171,1998,2.0
9,CT0000000900300171,1998,2.0
