# Data Retrieval and Aggregation

Dependent Variable (ADP): We will use the boto3 package to get the historical daily inmates-in-custody data from our secure S3 bucket. *Note: you will not be able to retrieve this data unless you are a part of the DCJ organization and have configured your AWS credentials for your computer.*

Independent Variables: We will also use the SoQL query language to grab exogenous variables from the NYC Open Data Portal. 

Both datasets will then be aggregated to 30-day rolling averages or counts to be used in the analysis.

In [1]:
#import packages
import pandas as pd
from functions import *
%matplotlib inline

### Step 1: Get Target Variable

In [2]:
# Where the merged daily-population file lives in S3: bucket, key prefix, object name
file_name = "agg_daily_pop.csv"
folder_name = "data/merged_files/"
bucket_name = "doc-daily-inmates-in-custody"

# Pull the merged daily file from the bucket via the project helper.
# The result is used as a DataFrame below (one row per snapshot_date).
adp_df = get_file(bucket_name, folder_name, file_name)

# Preview the first rows
adp_df.head(n=5)

Trying to get object from bucket
Got file contents from AWS S3
Saved file contents as dataframe


Unnamed: 0,snapshot_date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_CSP,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,Total Population,snapshot_month,snapshot_year
0,2021-07-26,105,3484,9,1648,33,648,295,20,5612,...,11,4456,778,79,264,4,110,5927,7,2021
1,2021-07-27,106,3469,9,1642,31,643,296,18,5586,...,12,4437,771,84,267,4,101,5900,7,2021
2,2021-07-28,106,3478,9,1648,31,640,292,18,5602,...,11,4437,769,84,269,4,110,5912,7,2021
3,2021-07-29,108,3470,9,1649,34,638,291,22,5595,...,10,4436,772,91,264,4,99,5908,7,2021
4,2021-07-30,108,3455,9,1630,33,638,285,21,5567,...,10,4412,761,83,265,4,107,5873,7,2021


In [3]:
# Roll the daily snapshots up into 30-day periods.

# Parse snapshot dates so they can drive the resample
adp_df['snapshot_date'] = pd.to_datetime(adp_df['snapshot_date'])

# Most recent snapshot; used as the resample origin so the bin edges are
# anchored on it and the final period ends on the latest available date
max_date = adp_df['snapshot_date'].max()

# Right-closed, right-labelled 30-day bins: each row is stamped with the last
# day of its period. Column means are rounded, and any NaN means are set to 0.
interval_data = (
    adp_df
    .resample('30D', on='snapshot_date', origin=max_date, closed='right', label='right')
    .mean()
    .round()
    .fillna(0)
    .reset_index()
    .rename(columns={'snapshot_date': 'End Date', 'Total Population': 'ADP'})
)

# A period spans 30 days inclusive of its end date, so it begins 29 days earlier
interval_data['Start Date'] = interval_data['End Date'] - pd.to_timedelta(29, unit='D')

interval_data.head(n=5)

Unnamed: 0,End Date,race_A,race_B,race_I,race_O,race_U,race_W,gender_F,gender_Gender Unknown,gender_M,...,inmate_status_code_DE,inmate_status_code_DEP,inmate_status_code_DNS,inmate_status_code_DPV,inmate_status_code_SCO,inmate_status_code_SSR,ADP,snapshot_month,snapshot_year,Start Date
0,2021-08-15,106.0,3474.0,9.0,1653.0,28.0,644.0,289.0,17.0,5609.0,...,4463.0,756.0,83.0,267.0,3.0,100.0,5914.0,8.0,2021.0,2021-07-17
1,2021-09-14,103.0,3538.0,11.0,1698.0,28.0,644.0,297.0,17.0,5710.0,...,4592.0,757.0,67.0,272.0,3.0,92.0,6023.0,8.0,2021.0,2021-08-16
2,2021-10-14,100.0,3418.0,11.0,1653.0,29.0,597.0,277.0,14.0,5517.0,...,4625.0,669.0,58.0,183.0,3.0,112.0,5808.0,9.0,2021.0,2021-09-15
3,2021-11-13,91.0,3293.0,11.0,1557.0,24.0,550.0,257.0,13.0,5255.0,...,4537.0,529.0,72.0,138.0,2.0,134.0,5526.0,10.0,2021.0,2021-10-15
4,2021-12-13,100.0,3230.0,13.0,1533.0,22.0,512.0,243.0,10.0,5157.0,...,4487.0,455.0,91.0,103.0,2.0,137.0,5410.0,12.0,2021.0,2021-11-14


In [4]:
# Keep just the period bounds and the total average daily population (ADP)
_30day_pop_tot = interval_data.loc[:, ['Start Date', 'End Date', 'ADP']]

# Write the result to disk so other scripts can read it
# (the row index is written as the first, unnamed column)
_30day_pop_tot.to_csv(path_or_buf="../Data/_30_day_adp.csv")

In [5]:
# Sanity check: confirm both date columns are datetimes and ADP is numeric
_30day_pop_tot.dtypes

Start Date    datetime64[ns]
End Date      datetime64[ns]
ADP                  float64
dtype: object

In [6]:
# Show the most recent periods to check where the series ends
_30day_pop_tot.tail(n=5)

Unnamed: 0,Start Date,End Date,ADP
28,2023-11-04,2023-12-03,6152.0
29,2023-12-04,2024-01-02,6061.0
30,2024-01-03,2024-02-01,6124.0
31,2024-02-02,2024-03-02,6217.0
32,2024-03-03,2024-04-01,6290.0


### Step 2: Get IVs

* length of stay

* 30-day admission counts

* 30-day discharge counts

In [7]:
# NYC Open Data JSON endpoints for the admission and discharge datasets
admit_url = "https://data.cityofnewyork.us/resource/6teu-xtgp.json"
dis_url = "https://data.cityofnewyork.us/resource/94ri-3ium.json"

# Start date of the first ADP period, passed to the helper as a plain date
first_st_date_adp = _30day_pop_tot['Start Date'].iloc[0].date()

# Aggregated admissions and discharges; the third argument names the dataset's date field
admit_df = get_agg_admit_dis_data(first_st_date_adp, admit_url, "ADMITTED_DT")
dis_df = get_agg_admit_dis_data(first_st_date_adp, dis_url, "DISCHARGED_DT")

In [8]:
# First admission periods; the Start/End dates should match the ADP periods
admit_df.head(n=5)

Unnamed: 0,Start Date,End Date,Month,Year,Days to Max Date,Adjusted Admission Count
0,2021-07-17,2021-08-15,7,2021,988,1524
1,2021-08-16,2021-09-14,8,2021,958,1396
2,2021-09-15,2021-10-14,9,2021,928,1329
3,2021-10-15,2021-11-13,10,2021,898,1263
4,2021-11-14,2021-12-13,11,2021,868,1273


In [9]:
# Latest admission periods
admit_df.tail(n=5)

Unnamed: 0,Start Date,End Date,Month,Year,Days to Max Date,Adjusted Admission Count
28,2023-11-04,2023-12-03,11,2023,148,1741
29,2023-12-04,2024-01-02,12,2023,118,1518
30,2024-01-03,2024-02-01,1,2024,88,1930
31,2024-02-02,2024-03-02,2,2024,58,1930
32,2024-03-03,2024-04-01,3,2024,28,1930


In [10]:
# First discharge periods; the Start/End dates should match the ADP periods
dis_df.head(n=5)

Unnamed: 0,Start Date,End Date,Month,Year,Days to Max Date,Adjusted Discharge Count
0,2021-07-17,2021-08-15,7,2021,988,1343
1,2021-08-16,2021-09-14,8,2021,958,1282
2,2021-09-15,2021-10-14,9,2021,928,1780
3,2021-10-15,2021-11-13,10,2021,898,1351
4,2021-11-14,2021-12-13,11,2021,868,1271


In [11]:
# Length-of-stay data from the project helper, starting at the first ADP period's start date
los_df = get_los_data(first_st_date_adp)

# Preview the first periods
los_df.head(n=5)

Unnamed: 0,Start Date,End Date,Discharge Month,Discharge Year,Adjusted Avg LOS Days
0,2021-07-17,2021-08-15,7,2021,123
1,2021-08-16,2021-09-14,8,2021,110
2,2021-09-15,2021-10-14,9,2021,114
3,2021-10-15,2021-11-13,10,2021,104
4,2021-11-14,2021-12-13,11,2021,120


In [12]:
# Reduce each IV frame to its period keys ('Start Date', 'End Date') plus one
# snake_case measure column.
#
# The wanted columns are selected rather than the unwanted ones dropped, so the
# cell is safe to re-run: rename() ignores labels that are already renamed, and
# the selection then picks the same three columns again. The earlier
# drop(columns=[...]) version raised KeyError on a second run because the
# helper columns were already gone.
admit_df = admit_df.rename(columns={'Adjusted Admission Count': 'admission_count'})
admit_df = admit_df[['Start Date', 'End Date', 'admission_count']]

dis_df = dis_df.rename(columns={'Adjusted Discharge Count': 'discharge_count'})
dis_df = dis_df[['Start Date', 'End Date', 'discharge_count']]

los_df = los_df.rename(columns={'Adjusted Avg LOS Days': 'los_days'})
los_df = los_df[['Start Date', 'End Date', 'los_days']]

# Confirm the three frames now share the same key columns
print(admit_df.columns)
print(dis_df.columns)
print(los_df.columns)

Index(['Start Date', 'End Date', 'admission_count'], dtype='object')
Index(['Start Date', 'End Date', 'discharge_count'], dtype='object')
Index(['Start Date', 'End Date', 'los_days'], dtype='object')


In [13]:
# Combine admissions, discharges and length of stay into one frame,
# matching rows on the shared period keys (default inner join)
joined_ivs_df = (
    admit_df
    .merge(dis_df, on=['Start Date', 'End Date'])
    .merge(los_df, on=['Start Date', 'End Date'])
)
joined_ivs_df.head(n=5)

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,los_days
0,2021-07-17,2021-08-15,1524,1343,123
1,2021-08-16,2021-09-14,1396,1282,110
2,2021-09-15,2021-10-14,1329,1780,114
3,2021-10-15,2021-11-13,1263,1351,104
4,2021-11-14,2021-12-13,1273,1271,120


In [14]:
# Latest periods of the joined IV table
joined_ivs_df.tail(n=5)

Unnamed: 0,Start Date,End Date,admission_count,discharge_count,los_days
28,2023-11-04,2023-12-03,1741,1691,96
29,2023-12-04,2024-01-02,1518,1707,106
30,2024-01-03,2024-02-01,1930,1781,100
31,2024-02-02,2024-03-02,1930,1829,96
32,2024-03-03,2024-04-01,1930,1805,98


In [15]:
# Write the joined IV table to disk (the row index is written as the first, unnamed column)
joined_ivs_df.to_csv(path_or_buf="../Data/_30_day_IVs.csv")