In [1]:
import numpy as np
import pandas as pd

In [2]:
cases = pd.read_csv('data100k.csv')

In [3]:
cases.head(3).T

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


## What code sections are most frequent?

In [5]:
codesections = cases['CodeSection'].value_counts().reset_index()
codesections.head(15)

Unnamed: 0,CodeSection,count
0,A.46.2-862,26379
1,B.46.2-301,25967
2,46.2-300,17934
3,C.46.2-862,11728
4,18.2-250.1,10573
5,A.18.2-266,8568
6,18.2-95,7561
7,18.2-250,6949
8,18.2-57,6699
9,A.46.2-852,6667


1. A.46.2-862 reckless driving
2. B.46.2-301


   18.2-250.1: possession of mariuana unlawful

## Which Code Sections most often lead to convictions?

In [7]:
cases['DispositionCode'].value_counts()
cases['conviction'] = [x in ['Guilty', 'Guilty In Absentia'] for x in cases['DispositionCode']]

In [15]:
''' SQL equivalance to pandas code
SELECT CodeSEction, AVG(conviction) AS conv_rate
FROM cases
GROUP BY CodeSection
'''

convict_rate = cases.groupby('CodeSection').agg({'conviction': ['mean', 'count']}).reset_index()
convict_rate.columns = ['CodeSection', 'conviction_rate', 'count'] # fix multi index situation that happened
convict_rate = convict_rate.query("count >= 30")
convict_rate

Unnamed: 0,CodeSection,conviction_rate,count
3,1-12,0.440000,75
4,1-200,0.227273,44
46,10-42,0.444444,63
47,10-43,0.288000,125
63,10-62,0.217054,129
...,...,...,...
4146,NO DMV,0.635236,403
4148,NODMV,0.476190,42
4194,Z.18.2-32,0.566667,30
4196,Z.18.2-47,0.447059,85


In [16]:
convict_rate.sort_values('conviction_rate', ascending = False)

Unnamed: 0,CodeSection,conviction_rate,count
1806,23-55,0.981818,55
1633,21-336,0.960000,50
1755,23-22.1(A),0.954198,131
2103,29-17(C),0.942857,70
4111,G.18.2-266,0.930233,43
...,...,...,...
2321,3.2-6503.1,0.071429,42
140,11.1-2,0.052632,38
2481,35-416,0.033333,30
1433,19.2-100,0.000000,238


In [18]:
#what is 23-55?
cases.query("CodeSection == '23-55'") # virginia beach - sleeping in public park

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime,conviction
8585,153020000000040,2018-03-08,23-55,covered elsewhere,Misdemeanor,Misdemeanor,U,Guilty,Conviction,Guilty,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,True,True
17940,82091000000145,2014-11-10,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty,Conviction,Guilty,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
34148,16050000000485,2010-09-07,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,,...,False,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
34168,16050000000485,2011-09-14,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True
34169,16050000000485,2011-10-11,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True
46925,66010000000463,2011-09-27,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
68396,315010000000605,2011-09-07,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120838,340010000000247,2016-06-20,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120844,340010000000247,2016-08-15,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120862,340010000000247,2016-12-13,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True


## Which CodeSections have the most severe racial disparities?

In [19]:
cases['Race'].value_counts()

Race
White Caucasian(Non-Hispanic)                  114421
Black(Non-Hispanic)                             80173
White Caucasian (Non-Hispanic)                  41679
Black (Non-Hispanic)                            33254
Hispanic                                         9319
White                                            3527
Other(Includes Not Applicable.. Unknown)         3452
Asian Or Pacific Islander                        2787
Black                                            2200
MISSING                                          1022
Unknown (Includes Not Applicable.. Unknown)       785
Other (Includes Not Applicable.. Unknown)         615
American Indian                                   302
Unknown                                            54
Asian or Pacific Islander                           7
American Indian Or Alaskan Native                   1
Name: count, dtype: int64

In [20]:
replace_map = { 'White Caucasian(Non-Hispanic)' : 'White',
'Black(Non-Hispanic)' : 'Black',
'White Caucasian (Non-Hispanic)' : 'White',
'Black (Non-Hispanic)' : 'Black',
'Hispanic' : 'Hispanic',
'White' : 'White',
'Other(Includes Not Applicable.. Unknown)' : 'Missing/Other',
'Asian Or Pacific Islander' : 'Asian or Pacific Islander',
'Black' : 'Black',
'MISSING' : 'Missing/Other',
'Unknown (Includes Not Applicable.. Unknown)' : 'Missing/Other',
'Other (Includes Not Applicable.. Unknown)': 'Missing/Other',
'American Indian' : 'Asian or Pacific Islander',
'Unknown' : 'Missing/Other',
'Asian or Pacific Islander' : 'Asian or Pacific Islander',
'American Indian Or Alaskan Native' : 'American Indian or Alaskan Native'  
}
cases['Race'] = cases['Race'].replace(replace_map)
cases['Race'].value_counts()

Race
White                                159627
Black                                115627
Hispanic                               9319
Missing/Other                          5928
Asian or Pacific Islander              3096
American Indian or Alaskan Native         1
Name: count, dtype: int64

In [21]:
# I choos to analyze only the convictions
cases_convict = cases.query("conviction == True")

In [22]:
cases_convict_race = cases_convict.groupby(['CodeSection', 'Race']).size().reset_index()
cases_convict_race = cases_convict_race.rename({0:'count'}, axis=1)
cases_convict_race

Unnamed: 0,CodeSection,Race,count
0,01-2007,White,1
1,1,Black,3
2,1,White,1
3,1-12,Black,27
4,1-12,White,6
...,...,...,...
4847,Z.18.2-91,Hispanic,2
4848,Z.18.2-91,White,123
4849,Z.18.2-91; 26,Black,1
4850,Z.18.2-95,Black,2


In [25]:
cases_reshape = cases_convict_race.pivot_table(index = 'CodeSection', 
                              columns = 'Race',
                              values = 'count', fill_value=0).reset_index() #replace nan with 0 (approprite here b/ we understand the data and we know there'e no data)
cases_reshape

Race,CodeSection,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White
0,01-2007,0.0,0.0,0.0,0.0,0.0,1.0
1,1,0.0,0.0,3.0,0.0,0.0,1.0
2,1-12,0.0,0.0,27.0,0.0,0.0,6.0
3,1-200,0.0,0.0,7.0,0.0,0.0,3.0
4,1.21,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
3105,Z.18.2-90,0.0,0.0,4.0,0.0,1.0,3.0
3106,Z.18.2-91,0.0,2.0,95.0,2.0,0.0,123.0
3107,Z.18.2-91; 26,0.0,0.0,1.0,0.0,0.0,0.0
3108,Z.18.2-95,0.0,0.0,2.0,0.0,0.0,0.0


In [27]:
cases_reshape = cases_reshape.assign(total  = cases_reshape['White'] +
                                    cases_reshape['Black'] +
                                    cases_reshape['Hispanic'] +
                                    cases_reshape['Missing/Other'] +
                                    cases_reshape['Asian or Pacific Islander'] +
                                    cases_reshape['American Indian or Alaskan Native'])
cases_reshape

Race,CodeSection,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total
0,01-2007,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,0.0,0.0,3.0,0.0,0.0,1.0,4.0
2,1-12,0.0,0.0,27.0,0.0,0.0,6.0,33.0
3,1-200,0.0,0.0,7.0,0.0,0.0,3.0,10.0
4,1.21,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...
3105,Z.18.2-90,0.0,0.0,4.0,0.0,1.0,3.0,8.0
3106,Z.18.2-91,0.0,2.0,95.0,2.0,0.0,123.0,222.0
3107,Z.18.2-91; 26,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3108,Z.18.2-95,0.0,0.0,2.0,0.0,0.0,0.0,2.0


In [32]:
cases_reshape = cases_reshape.query('total >= 50')

In [33]:
cases_reshape = cases_reshape.assign(black_percent = cases_reshape['Black']/cases_reshape['total'],
                                    white_percent = cases_reshape['White']/cases_reshape['total'])

In [34]:
cases_reshape.sort_values('black_percent', ascending=False)

Race,CodeSection,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total,black_percent,white_percent
1360,24-253,0.0,0.0,110.0,0.0,0.0,5.0,115.0,0.956522,0.043478
1555,29-48,0.0,0.0,131.0,0.0,1.0,13.0,145.0,0.903448,0.089655
2290,46.2-938,0.0,1.0,180.0,2.0,2.0,36.0,221.0,0.814480,0.162896
249,14.2-81,0.0,0.0,71.0,4.0,0.0,16.0,91.0,0.780220,0.175824
2767,66-3,0.0,0.0,81.0,0.0,0.0,23.0,104.0,0.778846,0.221154
...,...,...,...,...,...,...,...,...,...,...
2604,54.1-3466,0.0,1.0,30.0,1.0,1.0,260.0,293.0,0.102389,0.887372
2168,46.2-618,0.0,0.0,7.0,3.0,0.0,61.0,71.0,0.098592,0.859155
1604,29.1-521,0.0,0.0,8.0,0.0,0.0,124.0,132.0,0.060606,0.939394
1642,29.1-735,0.0,0.0,8.0,4.0,1.0,137.0,150.0,0.053333,0.913333


In [36]:
cases.query("CodeSection=='24-253'")['fips']

2319      740
2325      740
4399      740
4400      740
4401      740
         ... 
262328    740
262331    740
273650    740
278506    740
288281    740
Name: fips, Length: 131, dtype: int64

## In what localities (fips) are these disparities most severe?