In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from scipy.stats import norm

import urllib.request, json
import requests
import urllib.parse
import datetime

# Retrieve Covariates Data from NYC Open Data Portal

Aggregate the following datasets to monthly counts. The question would be whether or not we want to normalize these based on either city population (crime) and inmate population. The issue with normalizing on inmate population for the inmate/staff related incidents is that we do not have aggregated monthly jail population, just change in population. We should also consider using scaling laws to normalize crime counts rather than a per capita metric.


### Socrata API urls:

* Crime Rates: `https://data.cityofnewyork.us/resource/qgea-i56i.json`


* Inmate Assault on Staff: `https://data.cityofnewyork.us/resource/erra-pzy8.json`


* Inmate Incidents - Inmate Fights: `https://data.cityofnewyork.us/resource/k548-32d3.json`


* Inmate Incidents - Slashing and Stabbing: `https://data.cityofnewyork.us/resource/gakf-suji.json`

#### Covariate 1: Crime Counts

In [19]:
#retrieve monthly crime counts from Jan 1, 2014 through most recent month
crime_historical = 'https://data.cityofnewyork.us/resource/qgea-i56i.json'
crime_yeartodate = 'https://data.cityofnewyork.us/resource/5uac-w243.json'

In [20]:
query = (crime_historical+'?'
            "$select=min(CMPLNT_FR_DT),max(CMPLNT_FR_DT)"
            "&$where=CMPLNT_FR_DT >= '2014-01-01T00:00:00'")
query = query.replace(" ", "%20")
print(query)
response = urllib.request.urlopen(query)
data = json.loads(response.read())

https://data.cityofnewyork.us/resource/qgea-i56i.json?$select=min(CMPLNT_FR_DT),max(CMPLNT_FR_DT)&$where=CMPLNT_FR_DT%20>=%20'2014-01-01T00:00:00'


In [21]:
data

[{'min_CMPLNT_FR_DT': '2014-01-01T00:00:00.000',
  'max_CMPLNT_FR_DT': '2022-12-31T00:00:00.000'}]

In [22]:
query = (crime_yeartodate+'?'
            "$select=min(CMPLNT_FR_DT),max(CMPLNT_FR_DT)"
            "&$where=CMPLNT_FR_DT >= '2023-01-01T00:00:00'")
query = query.replace(" ", "%20")
print(query)
response = urllib.request.urlopen(query)
data = json.loads(response.read())

https://data.cityofnewyork.us/resource/5uac-w243.json?$select=min(CMPLNT_FR_DT),max(CMPLNT_FR_DT)&$where=CMPLNT_FR_DT%20>=%20'2023-01-01T00:00:00'


In [23]:
data

[{'min_CMPLNT_FR_DT': '2023-01-01T00:00:00.000',
  'max_CMPLNT_FR_DT': '2023-03-31T00:00:00.000'}]

In [24]:
def get_crime_data(url,min_date_str):
    query = (url+'?'
            "$select=LAW_CAT_CD,CMPLNT_FR_DT,COUNT(DISTINCT CMPLNT_NUM) as CMPLNT_COUNT"
            "&$where=CMPLNT_FR_DT >='{}'".format(min_date_str)+
            "&$group=LAW_CAT_CD, CMPLNT_FR_DT" 
            "&$limit=100000")
    query = query.replace(" ", "%20")
    print(query)
    response = urllib.request.urlopen(query)
    data = json.loads(response.read())
    
    #store in dataframe
    df = pd.DataFrame(data,columns = data[0].keys())
    
    return df

In [25]:
historic = get_crime_data(crime_historical,'2014-01-01T00:00:00')
historic.head()

https://data.cityofnewyork.us/resource/qgea-i56i.json?$select=LAW_CAT_CD,CMPLNT_FR_DT,COUNT(DISTINCT%20CMPLNT_NUM)%20as%20CMPLNT_COUNT&$where=CMPLNT_FR_DT%20>='2014-01-01T00:00:00'&$group=LAW_CAT_CD,%20CMPLNT_FR_DT&$limit=100000


Unnamed: 0,LAW_CAT_CD,CMPLNT_FR_DT,CMPLNT_COUNT
0,FELONY,2014-01-01T00:00:00.000,748
1,FELONY,2014-01-02T00:00:00.000,378
2,FELONY,2014-01-03T00:00:00.000,338
3,FELONY,2014-01-04T00:00:00.000,302
4,FELONY,2014-01-05T00:00:00.000,296


In [26]:
yr2date = get_crime_data(crime_yeartodate,str(historic.CMPLNT_FR_DT.max()))
yr2date.head()

https://data.cityofnewyork.us/resource/5uac-w243.json?$select=LAW_CAT_CD,CMPLNT_FR_DT,COUNT(DISTINCT%20CMPLNT_NUM)%20as%20CMPLNT_COUNT&$where=CMPLNT_FR_DT%20>='2022-12-31T00:00:00.000'&$group=LAW_CAT_CD,%20CMPLNT_FR_DT&$limit=100000


Unnamed: 0,LAW_CAT_CD,CMPLNT_FR_DT,CMPLNT_COUNT
0,FELONY,2022-12-31T00:00:00.000,182
1,FELONY,2023-01-01T00:00:00.000,535
2,FELONY,2023-01-02T00:00:00.000,419
3,FELONY,2023-01-03T00:00:00.000,509
4,FELONY,2023-01-04T00:00:00.000,534


In [27]:
daily_crime = pd.concat([historic,yr2date])
daily_crime.head()

Unnamed: 0,LAW_CAT_CD,CMPLNT_FR_DT,CMPLNT_COUNT
0,FELONY,2014-01-01T00:00:00.000,748
1,FELONY,2014-01-02T00:00:00.000,378
2,FELONY,2014-01-03T00:00:00.000,338
3,FELONY,2014-01-04T00:00:00.000,302
4,FELONY,2014-01-05T00:00:00.000,296


In [28]:
daily_crime.tail()

Unnamed: 0,LAW_CAT_CD,CMPLNT_FR_DT,CMPLNT_COUNT
268,VIOLATION,2023-03-27T00:00:00.000,214
269,VIOLATION,2023-03-28T00:00:00.000,220
270,VIOLATION,2023-03-29T00:00:00.000,202
271,VIOLATION,2023-03-30T00:00:00.000,207
272,VIOLATION,2023-03-31T00:00:00.000,153


In [29]:
daily_crime.dtypes

LAW_CAT_CD      object
CMPLNT_FR_DT    object
CMPLNT_COUNT    object
dtype: object

In [30]:
#clean up date column, add year and month columns for aggregation
daily_crime['LAW_CAT_CD'] = daily_crime['LAW_CAT_CD'].astype(str) 
daily_crime['CMPLNT_FR_DT'] = pd.to_datetime(daily_crime['CMPLNT_FR_DT'])
daily_crime['CMPLNT_COUNT'] = daily_crime['CMPLNT_COUNT'].astype(int)

#new columns for month and year
daily_crime['CMPLNT_YR'] = daily_crime['CMPLNT_FR_DT'].dt.year
daily_crime['CMPLNT_MO'] = daily_crime['CMPLNT_FR_DT'].dt.month
daily_crime['CMPLNT_FR_DT'] = daily_crime['CMPLNT_FR_DT'].dt.date
daily_crime.head()

Unnamed: 0,LAW_CAT_CD,CMPLNT_FR_DT,CMPLNT_COUNT,CMPLNT_YR,CMPLNT_MO
0,FELONY,2014-01-01,748,2014,1
1,FELONY,2014-01-02,378,2014,1
2,FELONY,2014-01-03,338,2014,1
3,FELONY,2014-01-04,302,2014,1
4,FELONY,2014-01-05,296,2014,1


In [31]:
#save daily to csv
daily_crime.to_csv('../Data/daily_crime_count_bytype.csv')

In [43]:
#aggregate to monthly
monthly_crime = daily_crime.groupby(by = ['CMPLNT_YR','CMPLNT_MO','LAW_CAT_CD']).agg({'CMPLNT_COUNT':sum}).reset_index()
monthly_crime.head()

Unnamed: 0,CMPLNT_YR,CMPLNT_MO,LAW_CAT_CD,CMPLNT_COUNT
0,2014,1,FELONY,12443
1,2014,1,MISDEMEANOR,22457
2,2014,1,VIOLATION,4525
3,2014,2,FELONY,10547
4,2014,2,MISDEMEANOR,19948


In [44]:
monthly_crime = monthly_crime.pivot(index = ['CMPLNT_YR','CMPLNT_MO'],columns = 'LAW_CAT_CD',values = 'CMPLNT_COUNT').reset_index().rename_axis(None, axis=1)
monthly_crime.head()

Unnamed: 0,CMPLNT_YR,CMPLNT_MO,FELONY,MISDEMEANOR,VIOLATION
0,2014,1,12443,22457,4525
1,2014,2,10547,19948,4069
2,2014,3,11953,23873,4860
3,2014,4,11847,23104,4859
4,2014,5,13133,25445,5945


In [46]:
monthly_crime['Total_Crimes'] = monthly_crime[['FELONY','MISDEMEANOR','VIOLATION']].sum(axis = 1)
monthly_crime = monthly_crime.rename(columns = {'CMPLNT_YR':'year',
                                      'CMPLNT_MO':'month',
                                      'FELONY':'Felony_Crimes',
                                      'MISDEMEANOR':'Misdemeanor_Crimes',
                                      'VIOLATION':'Violation_Crimes'})
monthly_crime.head()

Unnamed: 0,year,month,Felony_Crimes,Misdemeanor_Crimes,Violation_Crimes,Total_Crimes
0,2014,1,12443,22457,4525,39425
1,2014,2,10547,19948,4069,34564
2,2014,3,11953,23873,4860,40686
3,2014,4,11847,23104,4859,39810
4,2014,5,13133,25445,5945,44523


In [47]:
monthly_crime.to_csv('../Data/monthly_crime_counts_by_type.csv')

#### Covariate 2: Inmate Assaults on Staff

In [48]:
staff_assualts = 'https://data.cityofnewyork.us/resource/erra-pzy8.json'

In [49]:
query = (staff_assualts+'?'
        "$select=*"
        "&$limit=10000")
query = query.replace(" ", "%20")
response = urllib.request.urlopen(query)
data = json.loads(response.read())

#store in dataframe
df = pd.DataFrame(data,columns = data[0].keys())

In [50]:
df.dtypes

incident_id      object
reported_dt      object
incident_type    object
reason           object
dtype: object

In [51]:
#convert to appropriate data types
df['incident_id'] = df['incident_id'].astype(int)
df['reported_dt'] = pd.to_datetime(df['reported_dt']).dt.date

In [52]:
#aggregate to daily & monthly counts
daily_staff_assaults = df.groupby(by = 'reported_dt').agg({'incident_id':'nunique'}).reset_index()
daily_staff_assaults.head()

Unnamed: 0,reported_dt,incident_id
0,2016-01-06,1
1,2016-01-08,1
2,2016-01-14,1
3,2016-01-15,1
4,2016-01-25,1


In [53]:
daily_staff_assaults.dtypes

reported_dt    object
incident_id     int64
dtype: object

In [54]:
daily_staff_assaults['reported_dt'] = pd.to_datetime(df['reported_dt'])
daily_staff_assaults['reported_yr'] = daily_staff_assaults['reported_dt'].dt.year
daily_staff_assaults['reported_mo'] = daily_staff_assaults['reported_dt'].dt.month
daily_staff_assaults['reported_dt'] = daily_staff_assaults['reported_dt'].dt.date

In [56]:
monthly_staff_assaults = daily_staff_assaults.groupby(by = ['reported_yr','reported_mo']).agg({'incident_id':sum}).reset_index()
monthly_staff_assaults = monthly_staff_assaults.rename(columns = {'reported_yr':'year',
                                                        'reported_mo':'month',
                                                        'incident_id':'Staff_Assaults'})
monthly_staff_assaults.head()

Unnamed: 0,year,month,Staff_Assaults
0,2016,1,13
1,2016,2,139
2,2016,3,9
3,2016,4,178
4,2016,5,167


In [57]:
#save to csv
monthly_staff_assaults.to_csv('../Data/monthly_assaults_on_DOC_staff.csv')

#### Covariate 3 & 4: Inmate Incidents

In [58]:
#stabbings and slashings
ss_incidents = 'https://data.cityofnewyork.us/resource/gakf-suji.json'
#fights
fights = 'https://data.cityofnewyork.us/resource/k548-32d3.json'

In [59]:
query = (ss_incidents+'?'
        "$select=*"
        "&$limit=10000")
query = query.replace(" ", "%20")
response = urllib.request.urlopen(query)
data = json.loads(response.read())

#store in dataframe
ss_df = pd.DataFrame(data,columns = data[0].keys())
ss_df = ss_df.rename(columns = {'reported_dt':'date'})
ss_df.head()

Unnamed: 0,incident_id,date,incident_type
0,76075,2016-01-31T21:17:00.000,Slashing
1,76130,2016-02-02T09:43:00.000,Slashing
2,76177,2016-02-03T15:56:00.000,Slashing
3,76348,2016-02-10T02:49:00.000,Stabbing
4,76488,2016-02-14T19:34:00.000,Slashing


In [60]:
query = (fights+'?'
        "$select=*"
        "&$limit=10000")
query = query.replace(" ", "%20")
response = urllib.request.urlopen(query)
data = json.loads(response.read())

#store in dataframe
fights_df = pd.DataFrame(data,columns = data[0].keys())
fights_df['incident_type'] = 'Inmate_Fight'
fights_df = fights_df.rename(columns = {'incident_dt':'date'})
fights_df.head()

Unnamed: 0,incident_id,date,incident_type
0,2970,2016-02-01T20:55:00.000,Inmate_Fight
1,2971,2016-02-01T11:35:00.000,Inmate_Fight
2,2972,2016-02-01T17:30:00.000,Inmate_Fight
3,2973,2016-02-01T20:25:00.000,Inmate_Fight
4,2974,2016-02-01T05:10:00.000,Inmate_Fight


In [61]:
inmate_incidents = pd.concat([ss_df,fights_df])

In [62]:
inmate_incidents.dtypes

incident_id      object
date             object
incident_type    object
dtype: object

In [63]:
inmate_incidents['incident_id'] = inmate_incidents['incident_id'].astype(int)
inmate_incidents['date'] = pd.to_datetime(inmate_incidents['date'])

#add month and year columns to aggregate
inmate_incidents['year'] = inmate_incidents['date'].dt.year
inmate_incidents['month'] = inmate_incidents['date'].dt.month
inmate_incidents.head()

Unnamed: 0,incident_id,date,incident_type,year,month
0,76075,2016-01-31 21:17:00,Slashing,2016,1
1,76130,2016-02-02 09:43:00,Slashing,2016,2
2,76177,2016-02-03 15:56:00,Slashing,2016,2
3,76348,2016-02-10 02:49:00,Stabbing,2016,2
4,76488,2016-02-14 19:34:00,Slashing,2016,2


In [65]:
#aggregate to monthly
monthly_inmate_incidents = inmate_incidents.groupby(by = ['year','month','incident_type']).agg({'incident_id':'nunique'}).reset_index()
monthly_inmate_incidents = monthly_inmate_incidents.pivot(index = ['year','month'],columns = 'incident_type',values = 'incident_id').reset_index().rename_axis(None, axis=1)
monthly_inmate_incidents['Total_Inmate_Incidents'] = monthly_inmate_incidents[['Slashing','Stabbing','Inmate_Fight']].sum(axis = 1)
monthly_inmate_incidents = monthly_inmate_incidents.rename(columns = {'Slashing': 'Inmate_Slashing','Stabbing':'Inmate_Stabbing'})

monthly_inmate_incidents.head()

Unnamed: 0,year,month,Inmate_Fight,Inmate_Slashing,Inmate_Stabbing,Total_Inmate_Incidents
0,2016,1,,1.0,,1.0
1,2016,2,405.0,8.0,1.0,414.0
2,2016,3,,1.0,,1.0
3,2016,4,439.0,11.0,1.0,451.0
4,2016,5,445.0,7.0,1.0,453.0


In [66]:
monthly_inmate_incidents.to_csv('../Data/monthly_inmate_incidents.csv')

#### Covariate 5: Arraignment Data

This data is stored outside of this repository due to privacy constraints

In [2]:
file_path = r"C:\Users\emjoh\OneDrive\Documents\DCJ Shared Drive Data\NYC_Arr_2018_2022.csv"

In [3]:
nyc_arr = pd.read_csv(file_path, index_col = 0)
nyc_arr.head()

  nyc_arr = pd.read_csv(file_path, index_col = 0)


Unnamed: 0,ginysids,giifid,id,sex,race.ethnicity,age.at.arrest,age.group,zip,county,precinct,...,partial.surety.bond_01,unsecured.app.bond_01,unsecured.surety.bond_01,secured.app.bond_01,secured.surety.bond_01,bail.set.date,cash.set,bail.post.date,cash.posted,comboid
1,B05D354F1CAC5A8DE871899393871D7D8B8F8A77411844...,74603AB7708F4F5C9827B32B298C8603C7E816A82D238D...,F449C676099DDEC1778C0D4485DE1D15F258D9FDE048A3...,Female,White,32.0,25-34,11207.0,Kings,,...,,,,,,,,,,B05D354F1CAC5A8DE871899393871D7D8B8F8A77411844...
2,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...,18F2265B6D480CBF7AC53559B680B8F739737DFB315A69...,0F541D510893B4F98AC76F3E4232138CC49E09DB657E46...,Male,White,29.0,25-34,,Kings,84.0,...,,,,,,,,,,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...
3,0329B94DB802E2D9ECDF58396B9933A15B80BAE2BEAF44...,0B15D8ADE603BA5D4A76480A951C17319B457862943814...,9A21D0221EDBAE066CDDB5305F637AEE8DF5FAEB4B2A37...,Male,Black,19.0,18-20,11212.0,Kings,84.0,...,,,,,,,,,,0329B94DB802E2D9ECDF58396B9933A15B80BAE2BEAF44...
4,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...,9FDF0946900CADA0BB94C5CFB1D508749CC98AAB9D3C49...,741480CF329014CB91D56FD0D0C0269D4DFA0151D7BFEF...,Male,,28.0,25-34,,Kings,75.0,...,,,,,,,,,,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...
5,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...,6F311200CE42C151E5A295FBACE0291CE50C17FC83613E...,FC2536176635A634B0328BADC34C5A4961270D68F2612D...,Male,,,,,Kings,,...,,,,,,,,,,635F74C5388A1E49E450DABB542DA242E5330C2587BE54...


In [4]:
nyc_arr.columns

Index(['ginysids', 'giifid', 'id', 'sex', 'race.ethnicity', 'age.at.arrest',
       'age.group', 'zip', 'county', 'precinct', 'arrestdate', 'arrestyr',
       'arrest.charge', 'arrest.charge.class', 'arrest.charge.level',
       'arr.arrgn.days', 'arrgntdate', 'arrgntyr', 'arrgnt.mo',
       'arrgnt.charge', 'arrgnt.charge.attempted', 'arrgnt.charge.class',
       'arrgnt.charge.level', 'arraignment.type', 'arraignment.release.status',
       'datflag', 'vfo', 'dvflag', 'release.status_01', 'release.status_02',
       'release.status_03', 'release.status_04', 'release.status_05',
       'release.status_06', 'release.status_07', 'release.status_08',
       'release.status_09', 'release.status_10', 'first.app.date_01',
       'bail.action.date_01', 'bail.action_01', 'bail.condition_01',
       'bail.condition_02', 'em_01', 'ins.bond_01', 'credit.set_01',
       'partial.app.bond_01', 'partial.surety.bond_01',
       'unsecured.app.bond_01', 'unsecured.surety.bond_01',
       'secured.app

In [5]:
nyc_arr['arraignment.type'].unique()

array(['DAT', 'Pre-arraignment Deposition', 'Regular', 'Criminal Summons',
       'Domestic Violence Case', 'Family Court', 'Hospital Arraignment',
       'Juvenile Violent Felony Offender',
       'Domestic Violence Case - Not Arraigned',
       'Violation of Probation Transfer from Another County',
       'Domestic Violence Case - Hospital Arraignment',
       'Transferred from Summons Part'], dtype=object)

In [6]:
nyc_arr['arraignment.release.status'].unique()

array([nan, 'ROR', 'Bail Not Posted', 'Remand', 'Non-Monetary Conditions',
       'Bail Set'], dtype=object)

In [7]:
#this column seems to hold the charge description we are most interested in
nyc_arr['arrgnt.charge.class'].unique()

array(['A Misdemeanor', 'B Felony', 'Unclassified Misdemeanor',
       'D Felony', 'B Misdemeanor', 'Violation', 'Infraction', 'C Felony',
       'E Felony', nan, 'A Felony'], dtype=object)

In [8]:
nyc_arr['arrgnt.charge.level'].unique()

array(['Misdemeanor', 'Felony', 'Violation', 'Infraction', nan],
      dtype=object)

In [9]:
print(len(nyc_arr['comboid'].unique()),len(nyc_arr['id'].unique()),len(nyc_arr['giifid'].unique()),len(nyc_arr['ginysids'].unique()))

706934 706701 706701 377995


In [20]:
#count the unique ids for each yr-mo and charge class
df = nyc_arr.copy()[['id','arrgnt.charge.level','arrgntdate', 'arrgntyr', 'arrgnt.mo']]
df['arrgnt.charge.level'] = df['arrgnt.charge.level'].fillna('Other')

df.head()

Unnamed: 0,id,arrgnt.charge.level,arrgntdate,arrgntyr,arrgnt.mo
1,F449C676099DDEC1778C0D4485DE1D15F258D9FDE048A3...,Misdemeanor,7/30/2018,2018,7
2,0F541D510893B4F98AC76F3E4232138CC49E09DB657E46...,Felony,1/2/2018,2018,1
3,9A21D0221EDBAE066CDDB5305F637AEE8DF5FAEB4B2A37...,Misdemeanor,11/26/2018,2018,11
4,741480CF329014CB91D56FD0D0C0269D4DFA0151D7BFEF...,Misdemeanor,3/5/2018,2018,3
5,FC2536176635A634B0328BADC34C5A4961270D68F2612D...,Misdemeanor,4/11/2018,2018,4


In [21]:
monthly_arr = df.groupby(by = ['arrgntyr', 'arrgnt.mo','arrgnt.charge.level']).agg({'id':'nunique'}).reset_index()
monthly_arr = monthly_arr.pivot(index = ['arrgntyr', 'arrgnt.mo'],columns = 'arrgnt.charge.level',values = 'id').reset_index().rename_axis(None, axis=1)
monthly_arr['Total_Arraignments'] = monthly_arr[['Other','Misdemeanor', 'Felony', 'Violation', 'Infraction']].sum(axis = 1)

monthly_arr = monthly_arr.rename(columns = {'arrgntyr':'year',
                                            'arrgnt.mo':'month'})
monthly_arr.head()

Unnamed: 0,year,month,Felony,Infraction,Misdemeanor,Other,Violation,Total_Arraignments
0,2018,1,3406,155,16328,65,437,20391
1,2018,2,3141,141,14384,64,440,18170
2,2018,3,3276,157,15493,64,388,19378
3,2018,4,3162,149,14286,60,399,18056
4,2018,5,3454,160,14517,59,459,18649


In [22]:
monthly_arr.to_csv('../Data/monthly_arr_by_charge_level.csv')

In [23]:
#count the unique ids for each yr-mo and release status
df = nyc_arr.copy()[['id','arraignment.release.status','arrgntdate', 'arrgntyr', 'arrgnt.mo']]
df.head()

Unnamed: 0,id,arraignment.release.status,arrgntdate,arrgntyr,arrgnt.mo
1,F449C676099DDEC1778C0D4485DE1D15F258D9FDE048A3...,,7/30/2018,2018,7
2,0F541D510893B4F98AC76F3E4232138CC49E09DB657E46...,ROR,1/2/2018,2018,1
3,9A21D0221EDBAE066CDDB5305F637AEE8DF5FAEB4B2A37...,,11/26/2018,2018,11
4,741480CF329014CB91D56FD0D0C0269D4DFA0151D7BFEF...,,3/5/2018,2018,3
5,FC2536176635A634B0328BADC34C5A4961270D68F2612D...,ROR,4/11/2018,2018,4


In [25]:
df['arraignment.release.status'] = df['arraignment.release.status'].fillna('Other')
df.head()

Unnamed: 0,id,arraignment.release.status,arrgntdate,arrgntyr,arrgnt.mo
1,F449C676099DDEC1778C0D4485DE1D15F258D9FDE048A3...,Other,7/30/2018,2018,7
2,0F541D510893B4F98AC76F3E4232138CC49E09DB657E46...,ROR,1/2/2018,2018,1
3,9A21D0221EDBAE066CDDB5305F637AEE8DF5FAEB4B2A37...,Other,11/26/2018,2018,11
4,741480CF329014CB91D56FD0D0C0269D4DFA0151D7BFEF...,Other,3/5/2018,2018,3
5,FC2536176635A634B0328BADC34C5A4961270D68F2612D...,ROR,4/11/2018,2018,4


In [26]:
monthly_arr = df.groupby(by = ['arrgntyr', 'arrgnt.mo','arraignment.release.status']).agg({'id':'nunique'}).reset_index()
monthly_arr = monthly_arr.pivot(index = ['arrgntyr', 'arrgnt.mo'],columns = 'arraignment.release.status',values = 'id').reset_index().rename_axis(None, axis=1)
monthly_arr['Total_Arraignments'] = monthly_arr[['Other','ROR', 'Bail Not Posted', 'Remand', 'Non-Monetary Conditions','Bail Set']].sum(axis = 1)

monthly_arr = monthly_arr.rename(columns = {'arrgntyr':'year',
                                            'arrgnt.mo':'month'})
monthly_arr.head()

Unnamed: 0,year,month,Bail Not Posted,Bail Set,Non-Monetary Conditions,Other,ROR,Remand,Total_Arraignments
0,2018,1,2684,369,426,5921,10866,125,20391
1,2018,2,2290,334,413,4773,10244,116,18170
2,2018,3,2332,392,461,5037,11058,97,19377
3,2018,4,2291,413,465,4608,10184,95,18056
4,2018,5,2487,419,493,4579,10571,101,18650


In [27]:
monthly_arr.to_csv('../Data/monthly_arr_by_release_status.csv')