In [14]:
## imports 
import pandas as pd
import numpy as np
import yaml
import plotnine
from plotnine import *

## way to connect to mysql 
import mysql.connector

## function to feed path name to load
## credentials
def load_creds(path: str):
    with open(path, 'r') as stream:
        try:
            creds = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return(creds)


# Preliminary: define connection and read sample of data

In [15]:
## read in creds; change the path name if stored
## elsewhere
creds = load_creds("../private_data/creds_forclass.yaml")

creds['practice_database']

{'db_user': 'qss20',
 'db_password': 'pZqJ8TEwdinXQOk3',
 'port': 3306,
 'database': 'rjohnson',
 'host': 'rc-db.dartmouth.edu'}

In [16]:
## connect to the database
cnx = mysql.connector.connect(user=creds['practice_database']['db_user'], 
                            password=creds['practice_database']['db_password'],
                            port=creds['practice_database']['port'],
                            database= creds['practice_database']['database'],
                            host = creds['practice_database']['host'])
cnx

<mysql.connector.connection.MySQLConnection at 0x7f92eb787070>

In [17]:
## define a query to pull first 5 rows
## from caseinit table
sample_case_q = """
select * 
from caseinit 
limit 5
"""

In [18]:
## feed read sql query the query
## and my database connection
read_sample_d = pd.read_sql_query(sample_case_q, cnx)

## print columns
print(read_sample_d.columns)
read_sample_d.INCIDENT_CITY.value_counts()

Index(['CASE_ID', 'CASE_PARTICIPANT_ID', 'RECEIVED_DATE', 'OFFENSE_CATEGORY',
       'PRIMARY_CHARGE_FLAG', 'CHARGE_ID', 'CHARGE_VERSION_ID',
       'CHARGE_OFFENSE_TITLE', 'CHARGE_COUNT', 'CHAPTER', 'ACT', 'SECTION',
       'CLASS', 'AOIC', 'EVENT', 'EVENT_DATE', 'FINDING_NO_PROBABLE_CAUSE',
       'ARRAIGNMENT_DATE', 'BOND_DATE_INITIAL', 'BOND_DATE_CURRENT',
       'BOND_TYPE_INITIAL', 'BOND_TYPE_CURRENT', 'BOND_AMOUNT_INITIAL',
       'BOND_AMOUNT_CURRENT', 'BOND_ELECTRONIC_MONITOR_FLAG_INITIAL',
       'BOND_ELECTROINIC_MONITOR_FLAG_CURRENT', 'AGE_AT_INCIDENT', 'RACE',
       'GENDER', 'INCIDENT_CITY', 'INCIDENT_BEGIN_DATE', 'INCIDENT_END_DATE',
       'LAW_ENFORCEMENT_AGENCY', 'LAW_ENFORCEMENT_UNIT', 'ARREST_DATE',
       'FELONY_REVIEW_DATE', 'FELONY_REVIEW_RESULT',
       'UPDATED_OFFENSE_CATEGORY', 'is_in_diversion'],
      dtype='object')


Chicago         2
Orland Hills    1
Elmwood Park    1
Wheeling        1
Name: INCIDENT_CITY, dtype: int64

In [50]:
## define a query to pull first 5 rows
## from caseinit table
sample_divert_q = """
select * 
from divert 
limit 5
"""
## feed read sql query the query
## and my database connection
read_sample_d2 = pd.read_sql_query(sample_divert_q, cnx)
read_sample_d2.head()

Unnamed: 0,CASE_ID,CASE_PARTICIPANT_ID,RECEIVED_DATE,OFFENSE_CATEGORY,DIVERSION_PROGRAM,REFERRAL_DATE,DIVERSION_COUNT,PRIMARY_CHARGE_OFFENSE_TITLE,STATUTE,RACE,GENDER,DIVERSION_RESULT,DIVERSION_CLOSED_DATE
0,382796674396,711100606974,01/07/2011 12:00:00 AM,Identity Theft,BR9,05/30/2012 12:00:00 AM,1,IDENTITY THEFT,720 ILCS 5/16G-15(a)(1),Black,Female,Graduated,05/30/2012 12:00:00 AM
1,382811359863,711141092656,01/10/2011 12:00:00 AM,Driving With Suspended Or Revoked License,DC,07/13/2011 12:00:00 AM,1,"FELONY DRIVING WHILE DRIVER?S LICENSE, PERMIT,...",625 ILCS 5/6-303(a),Black,Male,Failed,01/13/2012 12:00:00 AM
2,382879647285,711343992924,01/10/2011 12:00:00 AM,Narcotics,DS,04/26/2011 12:00:00 AM,1,POSSESSION OF A CONTROLLED SUBSTANCE,720 ILCS 570/402(c),White,Male,Graduated,04/26/2011 12:00:00 AM
3,382929724728,711486306229,01/14/2011 12:00:00 AM,Narcotics,DS,02/01/2011 12:00:00 AM,1,[POSSESSION OF CONTROLLED SUBSTANCE WITH INTEN...,720 ILCS 570/401(c)(2),Latinx,Male,Graduated,05/27/2011 12:00:00 AM
4,382981270718,711621541617,01/18/2011 12:00:00 AM,Narcotics,DS,02/10/2011 12:00:00 AM,1,POSSESSION OF A CONTROLLED SUBSTANCE,720 ILCS 570/402(c),Black,Male,Graduated,06/10/2011 12:00:00 AM


# Activity 1

- Create a new column -- `in_chicago` when pulling from the `caseinit` table that takes on the value of "YES" if INCIDENT_CITY = Chicago; "NO" otherwise (which represents incidents in Cook County suburbs outside the city limits)
- Use that column, along with the `is_in_diversion` column, to find the rate of diversions by whether the incident took place in Chicago or the suburbs
- Similarly, find the rate of diversions by city versus suburb and race 


In [19]:
query_createvar = """
select *,
CASE when INCIDENT_CITY = 'Chicago' then 'Yes'
ELSE 'No'
END in_chicago
from caseinit
"""

varcreate = pd.read_sql_query(query_createvar, cnx)
varcreate.head()
pd.crosstab(varcreate.in_chicago, varcreate.INCIDENT_CITY)



INCIDENT_CITY,Unnamed: 1_level_0,Alsip,Arlington Heights,Bartlett,Bedford Park,Bellwood,Berkeley,Berwyn,Blue Island,Bridgeview,...,Summit,Thornton,Tinley Park,Waukegan,Westchester,Western Springs,Wheeling,Wilmette,Winnetka,Worth
in_chicago,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,178,21,20,8,7,8,5,25,21,18,...,15,3,20,2,6,1,17,2,4,5
Yes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
## rj note: probably a more efficient way to do 
## that just creates the var once!
query_createvar = """
select distinct count_place, count_divert_place,
num.is_in_diversion, denom.in_chicago,
count_divert_place/count_place as prop_divert_place
from caseinit
inner join (select count(*) as count_place,
        in_chicago
        from caseinit
        inner join 
        (select CASE_ID as cid, CASE_PARTICIPANT_ID as cpid,
            CASE when INCIDENT_CITY = 'Chicago' then 'Yes'
            ELSE 'No'
            END in_chicago
            from caseinit) as newvar
        on newvar.cid = caseinit.CASE_ID
        and newvar.cpid = caseinit.CASE_PARTICIPANT_ID
    group by in_chicago) as denom
inner join (select count(*) as count_divert_place,
in_chicago, is_in_diversion
from caseinit
inner join 
(select CASE_ID as cid, CASE_PARTICIPANT_ID as cpid,
CASE when INCIDENT_CITY = 'Chicago' then 'Yes'
ELSE 'No'
END in_chicago
from caseinit) as newvar
on newvar.cid = caseinit.CASE_ID
and newvar.cpid = caseinit.CASE_PARTICIPANT_ID
group by in_chicago, is_in_diversion) as num
on num.in_chicago = denom.in_chicago
order by in_chicago
"""

count_incidents_city = pd.read_sql_query(query_createvar, cnx)
count_incidents_city

## 63% of crimes where the incident was in the suburbs
## get diverted compared to 45% of crimes where incident
## was in city

Unnamed: 0,count_place,count_divert_place,is_in_diversion,in_chicago,prop_divert_place
0,2234,1424,True,No,0.6374
1,2234,810,False,No,0.3626
2,3220,1470,True,Yes,0.4565
3,3220,1750,False,Yes,0.5435


# Activity 2 

- Use the following crosswalk and `CASE` to create a new variable `DIVERSION_PROGRAM_TEXT` that spells out the diversion programs

DC: Drug Court

DPPP: Drug Deferred Prosecution

DS: Drug School

RJCC: Restorative Justice

MHC: Mental Health Court

VC: Veteran Court

- Filter to Narcotics as the `UPDATED_OFFENSE_CATEGORY` and Black or White defendants

- Among defendants offered diversion (so defendants from caseinit also in the diversons table), the number sent to each diversion program (separated by RACE) 


In [54]:
## step 1; query creating var
dp_type = """
select *,
    CASE WHEN DIVERSION_PROGRAM = 'DC' THEN 'Drug Court'
    WHEN DIVERSION_PROGRAM = 'DPPP' THEN 'Drug Deferred Prosecution'
    WHEN DIVERSION_PROGRAM = 'DS' THEN 'Drug School'
    WHEN DIVERSION_PROGRAM = 'RJCC' THEN 'Restorative Justice'
    WHEN DIVERSION_PROGRAM = 'MHC' THEN 'Mental Health Court'
    WHEN DIVERSION_PROGRAM = 'VC' THEN 'Veteran Court'
ELSE 'Other'
END DIVERSION_PROGRAM_TEXT
from divert
"""

In [55]:
n = pd.read_sql_query(dp_type, cnx)
pd.crosstab(n.DIVERSION_PROGRAM,
           n.DIVERSION_PROGRAM_TEXT)

DIVERSION_PROGRAM_TEXT,Drug Court,Drug School,Mental Health Court,Other,Restorative Justice,Veteran Court
DIVERSION_PROGRAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACT,0,0,0,64,0,0
ARI,0,0,0,148,0,0
BR9,0,0,0,561,0,0
DC,443,0,0,0,0,0
DDPP,0,0,0,471,0,0
DS,0,558,0,0,0,0
MHC,0,0,278,0,0,0
RJCC,0,0,0,0,30,0
VC,0,0,0,0,0,117


In [120]:
## step 2: embed that into larger query and do filtering/agg
dp_type = """
select count(*) as count_divert_group,
count(*)/count_race as prop_ofrace,
d.RACE, DIVERSION_PROGRAM_TEXT
from  (select *,
CASE WHEN DIVERSION_PROGRAM = 'DC' THEN 'Drug Court'
    WHEN DIVERSION_PROGRAM = 'DPPP' THEN 'Drug Deferred Prosecution'
    WHEN DIVERSION_PROGRAM = 'DS' THEN 'Drug School'
    WHEN DIVERSION_PROGRAM = 'RJCC' THEN 'Restorative Justice'
    WHEN DIVERSION_PROGRAM = 'MHC' THEN 'Mental Health Court'
    WHEN DIVERSION_PROGRAM = 'VC' THEN 'Veteran Court'
ELSE 'Other'
END DIVERSION_PROGRAM_TEXT
from divert) as d
inner join (
    select UPDATED_OFFENSE_CATEGORY,
    CASE_ID, CASE_PARTICIPANT_ID
    from caseinit
    ) as c
on c.CASE_ID = d.CASE_ID and
c.CASE_PARTICIPANT_ID = d.CASE_PARTICIPANT_ID
inner join (select count(*) as count_race, RACE
from divert
group by RACE) as denom
on denom.RACE = d.RACE
where d.RACE IN ("Black", "White")
and UPDATED_OFFENSE_CATEGORY = 'Narcotics'
group by RACE, DIVERSION_PROGRAM_TEXT
order by DIVERSION_PROGRAM_TEXT
"""

In [121]:
pd.read_sql_query(dp_type, cnx)

## similar n for drug court
## more white defendants sent to drug school
## more Black defendants sent to other program

Unnamed: 0,count_divert_group,prop_ofrace,RACE,DIVERSION_PROGRAM_TEXT
0,55,0.0751,White,Drug Court
1,132,0.0905,Black,Drug Court
2,230,0.3142,White,Drug School
3,153,0.1049,Black,Drug School
4,11,0.015,White,Mental Health Court
5,44,0.0302,Black,Mental Health Court
6,453,0.3105,Black,Other
7,123,0.168,White,Other
8,17,0.0117,Black,Restorative Justice
9,16,0.011,Black,Veteran Court
