# Data Retrieval and Aggregation

Dependent Variable (ADP): We will use the boto3 package to get the historical daily inmates-in-custody data from our secure S3 bucket. *Note: you will not be able to retrieve this data unless you are part of the DCJ organization and have configured your AWS credentials on your computer.*

Independent Variables: We will also use SoQL (the Socrata Query Language) to grab exogenous variables from the NYC Open Data Portal. 

Both datasets will then be aggregated into consecutive, non-overlapping 30-day windows (averages or counts) to be used in the analysis; the same aggregation is repeated at 7-day granularity.

In [32]:
#import packages
import pandas as pd
import numpy as np
from functions import *
%matplotlib inline

### Step 1: Get Target Variable

In [33]:
# Where the merged daily-population extract lives in S3
bucket_name = "doc-daily-inmates-in-custody"
folder_name = "data/merged_files/"
file_name = "agg_daily_pop.csv"

# Pull the merged daily file from the bucket and preview the first rows
adp_df = get_file(bucket_name, folder_name, file_name)
adp_df.head()

Trying to get object from bucket
Got file contents from AWS S3
Saved file contents as dataframe


Unnamed: 0,snapshot_date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_CSP,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,Total Population,snapshot_month,snapshot_year
0,2016-06-02,158,5603,9,2706,108,1252,614,16,9206,...,63,6892,821,98,573,43,163,9836,6,2016
1,2016-06-03,157,5584,9,2676,106,1248,610,13,9157,...,63,6860,807,91,577,43,180,9780,6,2016
2,2016-06-04,159,5581,9,2661,112,1243,616,17,9132,...,63,6829,792,100,592,44,179,9765,6,2016
3,2016-06-05,161,5661,9,2692,108,1263,629,13,9252,...,64,6928,796,100,591,44,179,9894,6,2016
4,2016-06-06,158,5674,9,2702,107,1254,633,14,9257,...,65,6983,798,85,572,47,178,9904,6,2016


In [34]:
# Collapse the daily snapshots into 30-day windows.

# resample(on=...) needs a datetime column to bin on
adp_df['snapshot_date'] = pd.to_datetime(adp_df['snapshot_date'])

# the most recent snapshot anchors the bin edges
max_date = adp_df['snapshot_date'].max()

# Right-closed, right-labelled bins: every window is labelled by its last day.
# Each column is averaged over the window and rounded; NaN results are set to 0.
_30day_interval_data = (
    adp_df
    .resample('30D', on='snapshot_date', origin=max_date, closed='right', label='right')
    .mean()
    .round()
    .fillna(0)
    .reset_index()
    .rename(columns={'snapshot_date': 'End Date', 'Total Population': 'ADP'})
)
# a window spans (End Date - 30 days, End Date], so its first day is End Date - 29 days
_30day_interval_data['Start Date'] = _30day_interval_data['End Date'] - pd.Timedelta(days=29)

_30day_interval_data.head()

Unnamed: 0,End Date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,ADP,snapshot_month,snapshot_year,Start Date
0,2016-06-28,154.0,5614.0,8.0,2685.0,112.0,1239.0,625.0,13.0,9174.0,...,6894.0,800.0,95.0,564.0,43.0,160.0,9813.0,6.0,2016.0,2016-05-30
1,2016-07-28,149.0,5565.0,8.0,2706.0,98.0,1221.0,611.0,13.0,9124.0,...,6816.0,800.0,106.0,570.0,43.0,153.0,9748.0,7.0,2016.0,2016-06-29
2,2016-08-27,156.0,5543.0,9.0,2754.0,99.0,1206.0,644.0,14.0,9108.0,...,6842.0,803.0,104.0,573.0,39.0,124.0,9767.0,8.0,2016.0,2016-07-29
3,2016-09-26,160.0,5629.0,7.0,2817.0,93.0,1177.0,661.0,11.0,9210.0,...,6967.0,845.0,96.0,581.0,30.0,108.0,9882.0,9.0,2016.0,2016-08-28
4,2016-10-26,167.0,5573.0,5.0,2858.0,92.0,1115.0,643.0,7.0,9160.0,...,6857.0,824.0,103.0,528.0,40.0,173.0,9810.0,10.0,2016.0,2016-09-27


In [35]:
# Same aggregation at weekly granularity: 7-day windows anchored on max_date
# (max_date is the latest snapshot date computed in the 30-day cell).
# Right-closed, right-labelled bins; column means are rounded and NaN results set to 0.
_7day_interval_data = (
    adp_df
    .resample('7D', on='snapshot_date', origin=max_date, closed='right', label='right')
    .mean()
    .round()
    .fillna(0)
    .reset_index()
    .rename(columns={'snapshot_date': 'End Date', 'Total Population': 'ADP'})
)
# a window spans (End Date - 7 days, End Date], so its first day is End Date - 6 days
_7day_interval_data['Start Date'] = _7day_interval_data['End Date'] - pd.Timedelta(days=6)

_7day_interval_data.head()

Unnamed: 0,End Date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,ADP,snapshot_month,snapshot_year,Start Date
0,2016-06-05,159.0,5607.0,9.0,2684.0,108.0,1252.0,617.0,15.0,9187.0,...,6877.0,804.0,97.0,583.0,44.0,175.0,9819.0,6.0,2016.0,2016-05-30
1,2016-06-12,154.0,5610.0,9.0,2692.0,108.0,1238.0,629.0,13.0,9168.0,...,6929.0,798.0,88.0,556.0,40.0,164.0,9811.0,6.0,2016.0,2016-06-06
2,2016-06-19,149.0,5607.0,9.0,2695.0,109.0,1231.0,624.0,12.0,9165.0,...,6893.0,801.0,86.0,548.0,41.0,172.0,9801.0,6.0,2016.0,2016-06-13
3,2016-06-26,156.0,5617.0,7.0,2668.0,118.0,1240.0,624.0,14.0,9168.0,...,6864.0,799.0,105.0,572.0,44.0,142.0,9805.0,6.0,2016.0,2016-06-20
4,2016-07-03,154.0,5610.0,7.0,2689.0,113.0,1231.0,629.0,12.0,9162.0,...,6858.0,794.0,110.0,584.0,49.0,145.0,9804.0,6.0,2016.0,2016-06-27


In [36]:
# Weekly total-population series.
# The resampling step wrote 0 where a window had no value, so zeros are turned
# back into NaN and the ADP gaps are filled by linear interpolation; ffill/bfill
# cover any gap left at either end of the series.
_7day_pop_tot = (
    _7day_interval_data[['Start Date', 'End Date', 'ADP']]
    .replace(0, np.nan)
    .assign(ADP=lambda weekly: weekly['ADP'].interpolate(method='linear', axis=0).ffill().bfill())
)

_7day_pop_tot.tail()

Unnamed: 0,Start Date,End Date,ADP
415,2024-05-13,2024-05-19,6301.0
416,2024-05-20,2024-05-26,6299.0
417,2024-05-27,2024-06-02,6358.0
418,2024-06-03,2024-06-09,6354.0
419,2024-06-10,2024-06-16,6366.0


In [37]:
# Keep only the total-population series at daily and 30-day granularity
daily_pop = adp_df[['snapshot_date', 'Total Population']]
_30day_pop_tot = _30day_interval_data[['Start Date', 'End Date', 'ADP']]

# Persist the daily, 7-day and 30-day series for use in other scripts.
# to_csv keeps its default index=True, so each file also carries the row index.
daily_pop.to_csv(path_or_buf="../Data/daily_pop.csv")
_7day_pop_tot.to_csv(path_or_buf="../Data/_7_day_adp.csv")
_30day_pop_tot.to_csv(path_or_buf="../Data/_30_day_adp.csv")

In [38]:
# preview the first rows of the daily total-population frame
daily_pop.head()

Unnamed: 0,snapshot_date,Total Population
0,2016-06-02,9836
1,2016-06-03,9780
2,2016-06-04,9765
3,2016-06-05,9894
4,2016-06-06,9904


In [39]:
# check the column dtypes of the 30-day ADP frame
_30day_pop_tot.dtypes

Start Date    datetime64[ns]
End Date      datetime64[ns]
ADP                  float64
dtype: object

In [40]:
# show the last rows (most recent windows) of the 30-day ADP frame
_30day_pop_tot.tail()

Unnamed: 0,Start Date,End Date,ADP
93,2024-01-19,2024-02-17,6181.0
94,2024-02-18,2024-03-18,6256.0
95,2024-03-19,2024-04-17,6302.0
96,2024-04-18,2024-05-17,6321.0
97,2024-05-18,2024-06-16,6341.0


### Step 2: Get IVs

* length of stay

* 30-day admission counts

* 30-day discharge counts

In [41]:
# NYC Open Data endpoints used for the admission and discharge pulls
admit_url = "https://data.cityofnewyork.us/resource/6teu-xtgp.json"
dis_url = "https://data.cityofnewyork.us/resource/94ri-3ium.json"

# first day of the earliest 30-day ADP window
first_st_date_adp = _30day_pop_tot['Start Date'].iloc[0].date()

# 30-day admission and discharge aggregates, starting from that date
_30day_admit_df = get_agg_admit_dis_data(first_st_date_adp, admit_url, 'ADMITTED_DT', 30)
_30day_dis_df = get_agg_admit_dis_data(first_st_date_adp, dis_url, 'DISCHARGED_DT', 30)

In [42]:
# preview the first rows of the 30-day admissions frame
_30day_admit_df.head()

Unnamed: 0,Start Date,admission_count,End Date,Month,Year,Days to Max Date
0,2016-05-30,4825,2016-06-28,5,2016,2923
1,2016-06-29,4774,2016-07-28,6,2016,2893
2,2016-07-29,4783,2016-08-27,7,2016,2863
3,2016-08-28,4672,2016-09-26,8,2016,2833
4,2016-09-27,4619,2016-10-26,9,2016,2803


In [43]:
# show the last rows of the 30-day admissions frame
_30day_admit_df.tail()

Unnamed: 0,Start Date,admission_count,End Date,Month,Year,Days to Max Date
92,2023-12-20,1677,2024-01-18,12,2023,163
93,2024-01-19,1921,2024-02-17,1,2024,133
94,2024-02-18,1961,2024-03-18,2,2024,103
95,2024-03-19,1962,2024-04-17,3,2024,73
96,2024-04-18,1982,2024-05-17,4,2024,43


In [44]:
# preview the first rows of the 30-day discharges frame
_30day_dis_df.head()

Unnamed: 0,Start Date,discharge_count,End Date,Month,Year,Days to Max Date
0,2016-05-30,4851,2016-06-28,5,2016,2923
1,2016-06-29,4847,2016-07-28,6,2016,2893
2,2016-07-29,4640,2016-08-27,7,2016,2863
3,2016-08-28,4617,2016-09-26,8,2016,2833
4,2016-09-27,4694,2016-10-26,9,2016,2803


In [45]:
# show the last rows of the 30-day discharges frame
_30day_dis_df.tail()

Unnamed: 0,Start Date,discharge_count,End Date,Month,Year,Days to Max Date
92,2023-12-20,1573,2024-01-18,12,2023,163
93,2024-01-19,1912,2024-02-17,1,2024,133
94,2024-02-18,1820,2024-03-18,2,2024,103
95,2024-03-19,1946,2024-04-17,3,2024,73
96,2024-04-18,2052,2024-05-17,4,2024,43


In [46]:
# Pull length-of-stay data at 30-day granularity, starting from the first
# 30-day ADP window, and preview the first rows.
_30_day_los_df = get_los_data(first_st_date_adp,30)
_30_day_los_df.head()

Unnamed: 0,Start Date,Avg LOS Days,End Date,Discharge Month,Discharge Year,Days to Max Date
0,2016-05-30,61.091824,2016-06-28,5,2016,2923
1,2016-06-29,61.526544,2016-07-28,6,2016,2893
2,2016-07-29,59.230968,2016-08-27,7,2016,2863
3,2016-08-28,57.64491,2016-09-26,8,2016,2833
4,2016-09-27,63.246173,2016-10-26,9,2016,2803


In [47]:
# show the last rows of the 30-day length-of-stay frame
_30_day_los_df.tail()

Unnamed: 0,Start Date,Avg LOS Days,End Date,Discharge Month,Discharge Year,Days to Max Date
92,2023-12-20,96.071202,2024-01-18,12,2023,163
93,2024-01-19,99.113434,2024-02-17,1,2024,133
94,2024-02-18,100.065898,2024-03-18,2,2024,103
95,2024-03-19,97.841294,2024-04-17,3,2024,73
96,2024-04-18,105.554798,2024-05-17,4,2024,43


In [48]:
# Combine the three 30-day IV tables into one frame keyed on the window dates.
# merge() uses its default inner join, so only windows present in every table remain.
_30day_joined_ivs_df = (
    _30day_admit_df[['Start Date', 'End Date', 'admission_count']]
    .merge(
        _30day_dis_df[['Start Date', 'End Date', 'discharge_count']],
        on=['Start Date', 'End Date'],
    )
    .merge(
        _30_day_los_df[['Start Date', 'End Date', 'Avg LOS Days']],
        on=['Start Date', 'End Date'],
    )
)
_30day_joined_ivs_df.head()

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
0,2016-05-30,2016-06-28,4825,4851,61.091824
1,2016-06-29,2016-07-28,4774,4847,61.526544
2,2016-07-29,2016-08-27,4783,4640,59.230968
3,2016-08-28,2016-09-26,4672,4617,57.64491
4,2016-09-27,2016-10-26,4619,4694,63.246173


In [49]:
# show the last rows of the joined 30-day IV frame
_30day_joined_ivs_df.tail()

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
92,2023-12-20,2024-01-18,1677,1573,96.071202
93,2024-01-19,2024-02-17,1921,1912,99.113434
94,2024-02-18,2024-03-18,1961,1820,100.065898
95,2024-03-19,2024-04-17,1962,1946,97.841294
96,2024-04-18,2024-05-17,1982,2052,105.554798


In [50]:
# Save the joined 30-day IV frame to CSV (default index=True, so the row index is written too)
_30day_joined_ivs_df.to_csv("../Data/_30_day_IVs.csv")

In [51]:
# Repeat the IV pull at weekly (7-day) granularity.

# Anchor on the first 7-day ADP window, not the 30-day one. The two first-window
# start dates only coincide while the span back from the latest snapshot happens
# to be a multiple of both 7 and 30 days; once new daily data arrives they differ,
# and IV windows built from the 30-day start would no longer line up with the
# 7-day ADP windows.
first_st_date_7day_adp = _7day_pop_tot['Start Date'].iloc[0].date()

_7day_admit_df = get_agg_admit_dis_data(first_st_date_7day_adp, admit_url, 'ADMITTED_DT', 7)
_7day_dis_df = get_agg_admit_dis_data(first_st_date_7day_adp, dis_url, 'DISCHARGED_DT', 7)
_7day_los_df = get_los_data(first_st_date_7day_adp, 7)

# Join admissions, discharges and length of stay on the window dates.
# merge() uses its default inner join, so only windows present in every table remain.
_7day_joined_ivs_df = (
    _7day_admit_df[['Start Date', 'End Date', 'admission_count']]
    .merge(
        _7day_dis_df[['Start Date', 'End Date', 'discharge_count']],
        on=['Start Date', 'End Date'],
    )
    .merge(
        _7day_los_df[['Start Date', 'End Date', 'Avg LOS Days']],
        on=['Start Date', 'End Date'],
    )
)
_7day_joined_ivs_df.head()


Unnamed: 0,Start Date,End Date,admission_count,discharge_count,Avg LOS Days
0,2016-05-30,2016-06-05,1097,1058,56.356742
1,2016-06-06,2016-06-12,1101,1105,53.759928
2,2016-06-13,2016-06-19,1151,1166,59.657216
3,2016-06-20,2016-06-26,1175,1133,71.740512
4,2016-06-27,2016-07-03,1199,1244,61.011281


In [52]:
# Save the joined 7-day IV frame to CSV (default index=True, so the row index is written too)
_7day_joined_ivs_df.to_csv("../Data/_7_day_IVs.csv")

In [53]:
# Pull crime (complaint) data at 30-day granularity, starting from the first
# 30-day ADP window, and preview the first rows.
crime_data = get_crime_data(first_st_date_adp,30)
crime_data.head()

Unnamed: 0,Start Date,total_felony_crimes,murder_homicide_count,robbery_count,assault_count,burglary_count,rape_count,grand_larceny_count,grand_larceny_vehicle_count,weapons_count,End Date,Days to Max Date
0,2016-05-30,12730,33,1272,1865,1094,154,3777,577,438,2016-06-28,2862
1,2016-06-29,13042,40,1342,2070,1095,150,3834,592,399,2016-07-28,2832
2,2016-07-29,13173,34,1379,1997,1123,137,3785,652,446,2016-08-27,2802
3,2016-08-28,12678,34,1326,1875,1063,114,3711,605,468,2016-09-26,2772
4,2016-09-27,12661,30,1309,1625,1120,122,3783,549,504,2016-10-26,2742


In [54]:
# show the last rows of the 30-day crime frame
crime_data.tail()

Unnamed: 0,Start Date,total_felony_crimes,murder_homicide_count,robbery_count,assault_count,burglary_count,rape_count,grand_larceny_count,grand_larceny_vehicle_count,weapons_count,End Date,Days to Max Date
90,2023-10-21,14908,24,1465,2263,1088,74,3798,1240,459,2023-11-19,162
91,2023-11-20,13611,37,1433,1992,1031,83,3653,1178,344,2023-12-19,132
92,2023-12-20,13355,16,1431,2053,1003,83,3282,1082,410,2024-01-18,102
93,2024-01-19,14236,26,1298,2042,1022,84,3638,1041,451,2024-02-17,72
94,2024-02-18,13481,23,1232,2116,1022,83,3212,952,460,2024-03-18,42


In [55]:
# first 30-day ADP window (reference for the start of the date range)
_30day_pop_tot.head(1)

Unnamed: 0,Start Date,End Date,ADP
0,2016-05-30,2016-06-28,9813.0


In [56]:
# last 30-day ADP window (reference for the end of the date range)
_30day_pop_tot.tail(1)

Unnamed: 0,Start Date,End Date,ADP
97,2024-05-18,2024-06-16,6341.0


In [57]:
# list the crime columns, selected by position: everything except the first column and the last two
crime_data.columns[1:-2].to_list()

['total_felony_crimes',
 'murder_homicide_count',
 'robbery_count',
 'assault_count',
 'burglary_count',
 'rape_count',
 'grand_larceny_count',
 'grand_larceny_vehicle_count',
 'weapons_count']

In [58]:
# Write the 30-day crime counts to CSV with the window dates first, followed by
# the count columns (picked by position: all but the first and the last two).
crime_data.loc[:, ['Start Date', 'End Date', *crime_data.columns[1:-2]]].to_csv("../Data/_30_day_crime_counts.csv")

In [59]:
# Pull arrest data at 30-day granularity, starting from the first 30-day ADP
# window, and preview the first rows.
arrest_data = get_arrest_data(first_st_date_adp, 30)
arrest_data.head()

Unnamed: 0,Start Date,total_felony_arrest,arrest_murder_homicide_count,arrest_robbery_count,arrest_assault_count,arrest_burglary_count,arrest_rape_count,arrest_grand_larceny_count,arrest_grand_larceny_vehicle_count,arrest_weapons_count,End Date,Days to Max Date
0,2016-05-30,7630,86,809,1320,331,81,779,93,570,2016-06-28,2862
1,2016-06-29,7866,82,819,1479,364,70,879,96,557,2016-07-28,2832
2,2016-07-29,7918,110,872,1402,396,79,841,105,554,2016-08-27,2802
3,2016-08-28,7505,77,820,1225,413,58,753,109,617,2016-09-26,2772
4,2016-09-27,8091,75,892,1216,424,67,855,94,617,2016-10-26,2742


In [60]:
# show the last row of the 30-day arrest frame
arrest_data.tail(1)

Unnamed: 0,Start Date,total_felony_arrest,arrest_murder_homicide_count,arrest_robbery_count,arrest_assault_count,arrest_burglary_count,arrest_rape_count,arrest_grand_larceny_count,arrest_grand_larceny_vehicle_count,arrest_weapons_count,End Date,Days to Max Date
94,2024-02-18,8837,130,923,1806,583,62,987,173,562,2024-03-18,42


In [61]:
# Write the 30-day arrest counts to CSV with the window dates first, followed by
# the count columns (picked by position: all but the first and the last two).
arrest_data.loc[:, ['Start Date', 'End Date', *arrest_data.columns[1:-2]]].to_csv("../Data/_30_day_arrest_counts.csv")

In [62]:
# Repeat the crime and arrest pulls at weekly (7-day) granularity.

# Anchor on the first 7-day ADP window, not the 30-day one. The two first-window
# start dates only coincide while the span back from the latest snapshot happens
# to be a multiple of both 7 and 30 days, so reusing the 30-day start would
# misalign these windows with the 7-day ADP windows once new daily data arrives.
first_st_date_7day_adp = _7day_pop_tot['Start Date'].iloc[0].date()
_7day_crime_data = get_crime_data(first_st_date_7day_adp, 7)
_7day_arrest_data = get_arrest_data(first_st_date_7day_adp, 7)

# Save to CSV: window dates first, then the count columns.
# Each frame's own columns are used for the selection, so this cell no longer
# depends on the 30-day `crime_data` / `arrest_data` frames.
# NOTE(review): the positional slice [1:-2] assumes the 7-day frames share the
# 30-day column layout ('Start Date' first, two trailing non-count columns) - confirm.
_7day_crime_data[['Start Date', 'End Date'] + _7day_crime_data.columns[1:-2].to_list()].to_csv("../Data/_7_day_crime_counts.csv")
_7day_arrest_data[['Start Date', 'End Date'] + _7day_arrest_data.columns[1:-2].to_list()].to_csv("../Data/_7_day_arrest_counts.csv")