# Data Retreival and Aggregation

Dependent Variable (ADP): We will use boto3 package to get the historical daily inmates in custody data from our secure s3 bucket. *Note: you will not be able to retrieve this data unless you are a part of the DCJ organization and have configured your AWS credentials for your compputer.*

Independent Variables: We will also use the SoSQL language to grab exogenous variables from NYC Open Data Portal. 

Both datasets will then be aggregated to 30-day rolling averages or counts to be used in the analysis.

In [1]:
#import packages
import pandas as pd
import numpy as np
from functions import *
%matplotlib inline

### Step 1: Get Target Variable

In [2]:
#specify s3 bucket and file locations
bucket_name = 'doc-daily-inmates-in-custody'
folder_name = 'data/merged_files/'
file_name = 'agg_daily_pop.csv'

#get the daily merged file from s3 bucket
adp_df = get_file(bucket_name, folder_name, file_name)
adp_df.head()

Trying to get object from bucket
Got file contents from AWS S3
Saved file contents as dataframe


Unnamed: 0,snapshot_date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_CSP,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,Total Population,snapshot_month,snapshot_year
0,2016-06-02,158,5603,9,2706,108,1252,614,16,9206,...,63,6892,821,98,573,43,163,9836,6,2016
1,2016-06-03,157,5584,9,2676,106,1248,610,13,9157,...,63,6860,807,91,577,43,180,9780,6,2016
2,2016-06-04,159,5581,9,2661,112,1243,616,17,9132,...,63,6829,792,100,592,44,179,9765,6,2016
3,2016-06-05,161,5661,9,2692,108,1263,629,13,9252,...,64,6928,796,100,591,44,179,9894,6,2016
4,2016-06-06,158,5674,9,2702,107,1254,633,14,9257,...,65,6983,798,85,572,47,178,9904,6,2016


In [3]:
#aggregate to 30 day intervals
#set data type
adp_df['snapshot_date'] = pd.to_datetime(adp_df['snapshot_date'])
#get latest snapshot date
max_date = adp_df['snapshot_date'].max()
#aggregate to 30 day periods
_30day_interval_data = adp_df.resample('30D', on='snapshot_date',origin = max_date,closed='right',label = 'right').mean().round().fillna(0).reset_index()
_30day_interval_data = _30day_interval_data.rename(columns = {'snapshot_date':'End Date','Total Population':'ADP'})
_30day_interval_data['Start Date'] = _30day_interval_data['End Date'] - pd.to_timedelta(29, unit='D')

_30day_interval_data.head()

Unnamed: 0,End Date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,ADP,snapshot_month,snapshot_year,Start Date
0,2016-06-27,154.0,5614.0,8.0,2685.0,112.0,1239.0,625.0,14.0,9174.0,...,6896.0,800.0,95.0,564.0,42.0,160.0,9812.0,6.0,2016.0,2016-05-29
1,2016-07-27,150.0,5567.0,8.0,2706.0,99.0,1222.0,611.0,12.0,9128.0,...,6820.0,800.0,105.0,571.0,44.0,154.0,9752.0,7.0,2016.0,2016-06-28
2,2016-08-26,155.0,5543.0,10.0,2750.0,99.0,1207.0,643.0,14.0,9106.0,...,6838.0,803.0,106.0,572.0,39.0,124.0,9764.0,8.0,2016.0,2016-07-28
3,2016-09-25,159.0,5626.0,7.0,2814.0,93.0,1177.0,660.0,11.0,9205.0,...,6962.0,844.0,96.0,582.0,30.0,108.0,9877.0,9.0,2016.0,2016-08-27
4,2016-10-25,167.0,5576.0,5.0,2857.0,92.0,1117.0,644.0,7.0,9163.0,...,6862.0,825.0,104.0,529.0,40.0,169.0,9814.0,10.0,2016.0,2016-09-26


In [4]:
#also do 7 day rolling averages ~ weekly
#aggregate to 30 day periods
_7day_interval_data = adp_df.resample('7D', on='snapshot_date',origin = max_date,closed='right',label = 'right').mean().round().fillna(0).reset_index()
_7day_interval_data = _7day_interval_data.rename(columns = {'snapshot_date':'End Date','Total Population':'ADP'})
_7day_interval_data['Start Date'] = _7day_interval_data['End Date'] - pd.to_timedelta(6, unit='D')

_7day_interval_data.head()

Unnamed: 0,End Date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,ADP,snapshot_month,snapshot_year,Start Date
0,2016-06-06,159.0,5621.0,9.0,2687.0,108.0,1252.0,620.0,15.0,9201.0,...,6898.0,803.0,95.0,581.0,44.0,176.0,9836.0,6.0,2016.0,2016-05-31
1,2016-06-13,153.0,5608.0,9.0,2694.0,108.0,1236.0,630.0,13.0,9165.0,...,6934.0,798.0,87.0,552.0,39.0,160.0,9808.0,6.0,2016.0,2016-06-07
2,2016-06-20,149.0,5604.0,9.0,2694.0,110.0,1232.0,623.0,12.0,9162.0,...,6881.0,802.0,84.0,550.0,41.0,176.0,9798.0,6.0,2016.0,2016-06-14
3,2016-06-27,157.0,5624.0,7.0,2666.0,120.0,1239.0,624.0,14.0,9174.0,...,6870.0,797.0,112.0,577.0,45.0,134.0,9813.0,6.0,2016.0,2016-06-21
4,2016-07-04,154.0,5604.0,7.0,2694.0,111.0,1229.0,629.0,12.0,9159.0,...,6852.0,793.0,110.0,583.0,49.0,149.0,9800.0,7.0,2016.0,2016-06-28


In [5]:
_7day_pop_tot = _7day_interval_data[['Start Date','End Date','ADP']]
# for the weekly data, we are going to double check for any missing values and do
# a linear interpolation just in case. 
_7day_pop_tot = _7day_pop_tot.replace(0,np.nan)
_7day_pop_tot["ADP"] = _7day_pop_tot["ADP"].interpolate(method='linear', axis=0).ffill().bfill()


_7day_pop_tot.tail()

Unnamed: 0,Start Date,End Date,ADP
419,2024-06-11,2024-06-17,6363.0
420,2024-06-18,2024-06-24,6387.0
421,2024-06-25,2024-07-01,6424.0
422,2024-07-02,2024-07-08,6389.0
423,2024-07-09,2024-07-15,6378.0


In [6]:
#lets only look at the total population
_30day_pop_tot = _30day_interval_data[['Start Date','End Date','ADP']]
daily_pop = adp_df[['snapshot_date','Total Population']]
#save to csv to use in other scripts
daily_pop.to_csv("../Data/daily_pop.csv")
_7day_pop_tot.to_csv("../Data/_7_day_adp.csv")
_30day_pop_tot.to_csv("../Data/_30_day_adp.csv")

In [7]:
daily_pop.head()

Unnamed: 0,snapshot_date,Total Population
0,2016-06-02,9836
1,2016-06-03,9780
2,2016-06-04,9765
3,2016-06-05,9894
4,2016-06-06,9904


In [8]:
_30day_pop_tot.dtypes

Start Date    datetime64[ns]
End Date      datetime64[ns]
ADP                  float64
dtype: object

In [9]:
_30day_pop_tot.tail()

Unnamed: 0,Start Date,End Date,ADP
94,2024-02-17,2024-03-17,6250.0
95,2024-03-18,2024-04-16,6302.0
96,2024-04-17,2024-05-16,6323.0
97,2024-05-17,2024-06-15,6337.0
98,2024-06-16,2024-07-15,6394.0


### Step 2: Get IVs

* length of stay

* 30-day admission counts

* 30-day discharge counts

In [10]:
admit_url = 'https://data.cityofnewyork.us/resource/6teu-xtgp.json'
dis_url = 'https://data.cityofnewyork.us/resource/94ri-3ium.json'
first_st_date_adp = _30day_pop_tot.iloc[0]['Start Date'].date()
_30day_admit_df = get_agg_admit_dis_data(first_st_date_adp, admit_url, 'ADMITTED_DT',30)
_30day_dis_df = get_agg_admit_dis_data(first_st_date_adp, dis_url, 'DISCHARGED_DT',30)

In [11]:
_30day_admit_df.head()

Unnamed: 0,Start Date,admission_count,End Date,Month,Year,Days to Max Date
0,2016-05-29,4805,2016-06-27,5,2016,2954
1,2016-06-28,4765,2016-07-27,6,2016,2924
2,2016-07-28,4808,2016-08-26,7,2016,2894
3,2016-08-27,4695,2016-09-25,8,2016,2864
4,2016-09-26,4530,2016-10-25,9,2016,2834


In [12]:
_30day_admit_df.tail()

Unnamed: 0,Start Date,admission_count,End Date,Month,Year,Days to Max Date
93,2024-01-18,1942,2024-02-16,1,2024,164
94,2024-02-17,1966,2024-03-17,2,2024,134
95,2024-03-18,1925,2024-04-16,3,2024,104
96,2024-04-17,2004,2024-05-16,4,2024,74
97,2024-05-17,1985,2024-06-15,5,2024,44


In [13]:
_30day_dis_df.head()

Unnamed: 0,Start Date,discharge_count,End Date,Month,Year,Days to Max Date
0,2016-05-29,4676,2016-06-27,5,2016,2954
1,2016-06-28,4871,2016-07-27,6,2016,2924
2,2016-07-28,4773,2016-08-26,7,2016,2894
3,2016-08-27,4499,2016-09-25,8,2016,2864
4,2016-09-26,4661,2016-10-25,9,2016,2834


In [14]:
_30day_dis_df.tail()

Unnamed: 0,Start Date,discharge_count,End Date,Month,Year,Days to Max Date
93,2024-01-18,1972,2024-02-16,1,2024,164
94,2024-02-17,1770,2024-03-17,2,2024,134
95,2024-03-18,1960,2024-04-16,3,2024,104
96,2024-04-17,2014,2024-05-16,4,2024,74
97,2024-05-17,1903,2024-06-15,5,2024,44


In [15]:
#test los function
_30_day_los_df = get_los_data(first_st_date_adp,30)
_30_day_los_df.head()

Unnamed: 0,Start Date,Avg LOS Days,End Date,Discharge Month,Discharge Year,Days to Max Date
0,2016-05-29,60.455648,2016-06-27,5,2016,2954
1,2016-06-28,61.881628,2016-07-27,6,2016,2924
2,2016-07-28,59.610809,2016-08-26,7,2016,2894
3,2016-08-27,56.659513,2016-09-25,8,2016,2864
4,2016-09-26,64.245121,2016-10-25,9,2016,2834


In [16]:
_30_day_los_df.tail()

Unnamed: 0,Start Date,Avg LOS Days,End Date,Discharge Month,Discharge Year,Days to Max Date
93,2024-01-18,101.62443,2024-02-16,1,2024,164
94,2024-02-17,97.592655,2024-03-17,2,2024,134
95,2024-03-18,99.336053,2024-04-16,3,2024,104
96,2024-04-17,106.061417,2024-05-16,4,2024,74
97,2024-05-17,95.809148,2024-06-15,5,2024,44


In [17]:
#join to one dataframe and store as csv
_30day_joined_ivs_df = _30day_admit_df[['Start Date','End Date','admission_count']].merge(_30day_dis_df[['Start Date','End Date','discharge_count']],
                                                                                left_on = ['Start Date','End Date'], right_on = ['Start Date','End Date'])
_30day_joined_ivs_df = _30day_joined_ivs_df.merge(_30_day_los_df[['Start Date','End Date','Avg LOS Days']],
                                    left_on = ['Start Date','End Date'], right_on = ['Start Date','End Date'])
_30day_joined_ivs_df.head()

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
0,2016-05-29,2016-06-27,4805,4676,60.455648
1,2016-06-28,2016-07-27,4765,4871,61.881628
2,2016-07-28,2016-08-26,4808,4773,59.610809
3,2016-08-27,2016-09-25,4695,4499,56.659513
4,2016-09-26,2016-10-25,4530,4661,64.245121


In [18]:
_30day_joined_ivs_df.tail()

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
93,2024-01-18,2024-02-16,1942,1972,101.62443
94,2024-02-17,2024-03-17,1966,1770,97.592655
95,2024-03-18,2024-04-16,1925,1960,99.336053
96,2024-04-17,2024-05-16,2004,2014,106.061417
97,2024-05-17,2024-06-15,1985,1903,95.809148


In [19]:
#save to csv
_30day_joined_ivs_df.to_csv("../Data/_30_day_IVs.csv")

In [20]:
#repeat for 7 day
_7day_admit_df = get_agg_admit_dis_data(first_st_date_adp, admit_url, 'ADMITTED_DT',7)
_7day_dis_df = get_agg_admit_dis_data(first_st_date_adp, dis_url, 'DISCHARGED_DT',7)
_7day_los_df = get_los_data(first_st_date_adp,7)
#merge
_7day_joined_ivs_df = _7day_admit_df[['Start Date','End Date','admission_count']].merge(_7day_dis_df[['Start Date','End Date','discharge_count']],
                                                                                left_on = ['Start Date','End Date'], right_on = ['Start Date','End Date'])
_7day_joined_ivs_df = _7day_joined_ivs_df.merge(_7day_los_df[['Start Date','End Date','Avg LOS Days']],
                                    left_on = ['Start Date','End Date'], right_on = ['Start Date','End Date'])
_7day_joined_ivs_df.head()


Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
0,2016-05-29,2016-06-04,1105,1051,56.14611
1,2016-06-05,2016-06-11,1102,1124,53.335993
2,2016-06-12,2016-06-18,1157,1155,59.375758
3,2016-06-19,2016-06-25,1164,1128,72.144504
4,2016-06-26,2016-07-02,1210,1251,61.297686


In [21]:
#save to csv
_7day_joined_ivs_df.to_csv("../Data/_7_day_IVs.csv")

In [22]:
#test complaint function
crime_data = get_crime_data(first_st_date_adp,30)
crime_data.head()

In [None]:
crime_data.tail()

Unnamed: 0,Start Date,total_felony_crimes,violent_felony_crimes,total_misdemeanor_crimes,murder_homicide_count,robbery_count,assault_count,burglary_count,rape_count,grand_larceny_count,grand_larceny_vehicle_count,weapons_count,End Date,Days to Max Date
90,2023-10-20,14995,3827,23489,24,1468,2260,1089,75,3843,1248,465,2023-11-18,163
91,2023-11-19,13591,3545,21881,38,1430,1996,1036,81,3644,1190,352,2023-12-18,133
92,2023-12-19,13326,3620,20622,16,1446,2074,1004,84,3269,1077,390,2024-01-17,103
93,2024-01-18,14287,3422,23758,25,1287,2026,1014,84,3673,1048,463,2024-02-16,73
94,2024-02-17,13473,3446,22956,24,1237,2102,1038,83,3202,943,464,2024-03-17,43


In [None]:
_30day_pop_tot.head(1)

Unnamed: 0,Start Date,End Date,ADP
0,2016-05-29,2016-06-27,9812.0


In [None]:
_30day_pop_tot.tail(1)

Unnamed: 0,Start Date,End Date,ADP
98,2024-06-16,2024-07-15,6394.0


In [None]:
crime_data.columns[1:-2].to_list()

['total_felony_crimes',
 'violent_felony_crimes',
 'total_misdemeanor_crimes',
 'murder_homicide_count',
 'robbery_count',
 'assault_count',
 'burglary_count',
 'rape_count',
 'grand_larceny_count',
 'grand_larceny_vehicle_count',
 'weapons_count']

In [None]:
crime_data = crime_data[['Start Date', 'End Date']+crime_data.columns[1:-2]]
crime_data['nonviolent_felony_crimes'] = crime_data['total_felony_crimes'] - crime_data['violent_felony_crimes']
print(crime_data.columns)

In [27]:
#save to csv
crime_data.to_csv("../Data/_30_day_crime_counts.csv")

In [28]:
arrest_data = get_arrest_data(first_st_date_adp, 30)
arrest_data.head()

Unnamed: 0,Start Date,total_felony_arrest,violent_felony_arrest,total_misdemeanor_arrest,arrest_murder_homicide_count,arrest_robbery_count,arrest_assault_count,arrest_burglary_count,arrest_rape_count,arrest_grand_larceny_count,arrest_grand_larceny_vehicle_count,arrest_weapons_count,End Date,Days to Max Date
0,2016-05-29,7523,2257,17027,76,804,1299,317,78,758,94,567,2016-06-27,2863
1,2016-06-28,7813,2457,15706,78,826,1480,366,73,884,99,544,2016-07-27,2833
2,2016-07-28,8021,2469,16830,107,868,1415,404,79,849,103,566,2016-08-26,2803
3,2016-08-27,7514,2184,16382,79,821,1225,409,59,753,110,624,2016-09-25,2773
4,2016-09-26,7955,2227,17433,75,876,1212,422,64,827,97,603,2016-10-25,2743


In [29]:
arrest_data.tail(1)

Unnamed: 0,Start Date,total_felony_arrest,violent_felony_arrest,total_misdemeanor_arrest,arrest_murder_homicide_count,arrest_robbery_count,arrest_assault_count,arrest_burglary_count,arrest_rape_count,arrest_grand_larceny_count,arrest_grand_larceny_vehicle_count,arrest_weapons_count,End Date,Days to Max Date
94,2024-02-17,8782,2763,11746,9,922,1775,584,57,978,176,563,2024-03-17,43


In [30]:
#save to csv
arrest_data[['Start Date', 'End Date']+arrest_data.columns[1:-2].to_list()].to_csv("../Data/_30_day_arrest_counts.csv")

In [31]:
#repeat for 7 day
_7day_crime_data = get_crime_data(first_st_date_adp,7)
_7day_arrest_data = get_arrest_data(first_st_date_adp,7)

#save to csv
_7day_crime_data[['Start Date', 'End Date']+crime_data.columns[1:-2].to_list()].to_csv("../Data/_7_day_crime_counts.csv")
_7day_arrest_data[['Start Date', 'End Date']+arrest_data.columns[1:-2].to_list()].to_csv("../Data/_7_day_arrest_counts.csv")