# Notebook 1: Data Processing & Reduction

In [1]:
import pandas as pd
import datetime as dt
import numpy as np

### Read in and process full dataset

#### This _will_ take over an hour to run. The processed data is already available, so re-running it is not recommended.

In [None]:
# Columns retained from the raw EMS dispatch export; every other column is dropped.
KEEP_COLUMNS = [
    'CAD_INCIDENT_ID', 'INCIDENT_DATETIME', 'INITIAL_CALL_TYPE',
    'INITIAL_SEVERITY_LEVEL_CODE', 'FINAL_CALL_TYPE',
    'FINAL_SEVERITY_LEVEL_CODE', 'FIRST_ASSIGNMENT_DATETIME',
    'VALID_DISPATCH_RSPNS_TIME_INDC', 'DISPATCH_RESPONSE_SECONDS_QY',
    'FIRST_ACTIVATION_DATETIME', 'FIRST_ON_SCENE_DATETIME',
    'VALID_INCIDENT_RSPNS_TIME_INDC', 'INCIDENT_RESPONSE_SECONDS_QY',
    'INCIDENT_TRAVEL_TM_SECONDS_QY', 'FIRST_TO_HOSP_DATETIME',
    'FIRST_HOSP_ARRIVAL_DATETIME', 'INCIDENT_CLOSE_DATETIME',
    'HELD_INDICATOR', 'INCIDENT_DISPOSITION_CODE', 'BOROUGH',
    'INCIDENT_DISPATCH_AREA', 'ZIPCODE',
]

# Timestamp columns that are converted from strings to datetime values.
DATETIME_COLUMNS = [
    'INCIDENT_DATETIME', 'FIRST_ASSIGNMENT_DATETIME',
    'FIRST_ACTIVATION_DATETIME', 'FIRST_ON_SCENE_DATETIME',
    'FIRST_TO_HOSP_DATETIME', 'FIRST_HOSP_ARRIVAL_DATETIME',
    'INCIDENT_CLOSE_DATETIME',
]


def quick_clean(data, evening_start_hour=19):
    """Reduce one chunk of raw EMS dispatch data to clean evening incidents.

    Steps:
      1. Keep only the columns listed in ``KEEP_COLUMNS``.
      2. Parse every column in ``DATETIME_COLUMNS`` with ``pd.to_datetime``.
      3. Keep rows whose ``INCIDENT_DATETIME`` hour is at or after
         ``evening_start_hour`` and whose ``ZIPCODE`` and
         ``INCIDENT_DISPOSITION_CODE`` are both non-null.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw chunk; must contain every column in ``KEEP_COLUMNS``.
    evening_start_hour : int, default 19
        First hour of the day (0-23) that counts as "evening".

    Returns
    -------
    pandas.DataFrame
        The filtered rows, restricted to ``KEEP_COLUMNS``. ``data`` itself
        is left unmodified.

    Raises
    ------
    KeyError
        If ``data`` is missing any of the required columns.
    """
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a selection of `data` (avoids SettingWithCopyWarning).
    chunk = data[KEEP_COLUMNS].copy()

    # Convert timestamp strings to datetime format for processing ease.
    for column in DATETIME_COLUMNS:
        chunk[column] = pd.to_datetime(chunk[column])

    # Rows with an unparseable/missing INCIDENT_DATETIME have a NaN hour,
    # which compares False and is therefore dropped.
    keep = (
        (chunk['INCIDENT_DATETIME'].dt.hour >= evening_start_hour)  # evenings only
        & chunk['ZIPCODE'].notnull()                                # not-null zipcodes
        & chunk['INCIDENT_DISPOSITION_CODE'].notnull()              # not-null disposition codes
    )
    return chunk[keep]

**Notes:** The function above is run on 100,000-row chunks of the raw data. Each cleaned chunk is appended to a list, the list is then concatenated into a single pandas DataFrame, and the result is saved to a CSV file. The data files are not stored in the GitHub repository due to their large size.

In [None]:
# Chunked-reading approach adapted from
# https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c

chunk_list = []  # cleaned chunks, in file order
no_chunk = 0     # running count of chunks processed

# Stream the raw file in 100000-row pieces instead of loading it all at once.
chunk_reader = pd.read_csv('./data/EMS_Incident_Dispatch_Data.csv', iterator=True, chunksize=100000)

for no_chunk, chunk in enumerate(chunk_reader, start=1):
    chunk_clean = quick_clean(chunk)
    chunk_list.append(chunk_clean)
    print(no_chunk)  # progress indicator: number of chunks done so far

# Stitch the cleaned chunks together into one dataframe
df_concat = pd.concat(chunk_list)

# Write the reduced dataset to disk (row index omitted)
df_concat.to_csv('./data/EMS_dispatch_evenings.csv', index=False)