In [1]:
import numpy as np
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup

In [2]:
# Read the case-level extract into `cases`, the frame the cells below query.
data_path = "data100k.csv"
cases = pd.read_csv(data_path)

In [4]:
# Show the first three records transposed, so each column reads as one row.
cases.iloc[:3].transpose()

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


## What code sections are most frequent?

In [6]:
# value_counts() sorts by frequency (descending), so the first 15 entries are
# the 15 most common code sections.
cases["CodeSection"].value_counts().head(15).reset_index()

Unnamed: 0,CodeSection,count
0,A.46.2-862,26379
1,B.46.2-301,25967
2,46.2-300,17934
3,C.46.2-862,11728
4,18.2-250.1,10573
5,A.18.2-266,8568
6,18.2-95,7561
7,18.2-250,6949
8,18.2-57,6699
9,A.46.2-852,6667


## Which code sections most often lead to convictions?

In [14]:
# Minimum number of charges a code section needs before its conviction rate is
# reported (strictly more than this many).
MIN_CASES_FOR_RATE = 30

# Flag a charge as a conviction when its disposition is one of the two guilty
# codes. `isin` is vectorised and yields False for missing dispositions, the
# same result as comparing each value with `==`.
cases["conviction"] = cases["DispositionCode"].isin(["Guilty", "Guilty In Absentia"])

# Per code section: share of charges that ended in a conviction and the number
# of charges. Named aggregation produces flat column names directly, so the
# columns do not have to be overwritten by position afterwards.
convict_rate = (
    cases.groupby("CodeSection")
    .agg(conviction_rate=("conviction", "mean"), count=("conviction", "count"))
    .reset_index()
    .query("count > @MIN_CASES_FOR_RATE")
)

# Highest conviction rates first.
convict_rate.sort_values("conviction_rate", ascending=False)

Unnamed: 0,CodeSection,conviction_rate,count
1806,23-55,0.981818,55
1633,21-336,0.960000,50
1755,23-22.1(A),0.954198,131
2103,29-17(C),0.942857,70
4111,G.18.2-266,0.930233,43
...,...,...,...
1450,19.2-135,0.084337,83
2321,3.2-6503.1,0.071429,42
140,11.1-2,0.052632,38
1433,19.2-100,0.000000,238


In [16]:
# Localities (fips) of every charge filed under code section 23-55.
cases.loc[cases["CodeSection"] == "23-55", "fips"]

8585      810
17940     810
34148     810
34168     810
34169     810
46925     810
68396     810
120838    810
120844    810
120862    810
120864    810
121263    810
132073    810
153191    810
153206    810
153208    810
153210    810
153222    810
155161    810
157747    810
159548    810
163720    810
173408    810
192200    810
196647    810
200799    810
200828    810
200922    810
230213    810
240905    810
240909    810
240912    810
240914    810
240916    810
240917    810
240918    810
240919    810
240920    810
240922    810
240923    810
240926    810
240927    810
240945    810
240946    810
240948    810
249880    810
251381    810
257688    810
257697    810
257699    810
257739    810
257747    810
266159    810
291045    810
291048    810
Name: fips, dtype: int64

## Which code sections have the most severe racial disparities?

In [18]:
# List the distinct raw race labels to see which spellings need merging.
cases.loc[:, "Race"].unique()

array(['Black(Non-Hispanic)', 'Hispanic', 'White Caucasian(Non-Hispanic)',
       'MISSING', 'Asian Or Pacific Islander', 'Black (Non-Hispanic)',
       'White Caucasian (Non-Hispanic)',
       'Other(Includes Not Applicable.. Unknown)',
       'Other (Includes Not Applicable.. Unknown)', 'Black', 'White',
       'Unknown (Includes Not Applicable.. Unknown)', 'American Indian',
       'Unknown', 'Asian or Pacific Islander',
       'American Indian Or Alaskan Native'], dtype=object)

In [19]:
# Canonical race label -> every raw spelling in the data that should collapse
# onto it (differences are spacing, capitalisation and "(Non-Hispanic)" tags).
race_variants = {
    "Black": [
        'Black(Non-Hispanic)',
        'Black (Non-Hispanic)',
        'Black',
    ],
    "Hispanic": [
        'Hispanic',
    ],
    "White": [
        'White Caucasian(Non-Hispanic)',
        'White Caucasian (Non-Hispanic)',
        'White',
    ],
    "Missing/Other": [
        'MISSING',
        'Other(Includes Not Applicable.. Unknown)',
        'Other (Includes Not Applicable.. Unknown)',
        'Unknown (Includes Not Applicable.. Unknown)',
        'Unknown',
    ],
    "Asian or Pacific Islander": [
        'Asian Or Pacific Islander',
        'Asian or Pacific Islander',
    ],
    "American Indian or Alaskan Native": [
        'American Indian',
        'American Indian Or Alaskan Native',
    ],
}

# Invert the table into the raw -> canonical lookup that `replace` expects.
replace_map = {
    raw_label: canonical
    for canonical, raw_labels in race_variants.items()
    for raw_label in raw_labels
}

# Labels that are not in the lookup are left unchanged by `replace`.
cases["Race"] = cases["Race"].replace(replace_map)
cases["Race"].value_counts()

Race
White                                159627
Black                                115627
Hispanic                               9319
Missing/Other                          5928
Asian or Pacific Islander              2794
American Indian or Alaskan Native       303
Name: count, dtype: int64

In [28]:
# Only convictions are analysed from here on.
cases_convict = cases[cases["conviction"]]

# Long format: one row per (code section, race) pair with its conviction count.
cases_convict_race = (
    cases_convict
    .groupby(["CodeSection", "Race"])
    .size()
    .reset_index(name="count")
)

# Wide format: one row per code section, one column per race, 0 where a pair
# never occurs. Each pair appears once in the long table, so pivot_table's
# default aggregation (mean) returns the count itself, as a float.
cases_reshape = pd.pivot_table(
    cases_convict_race,
    index="CodeSection",
    columns="Race",
    values="count",
    fill_value=0,
).reset_index()
cases_reshape

Race,CodeSection,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White
0,01-2007,0.0,0.0,0.0,0.0,0.0,1.0
1,1,0.0,0.0,3.0,0.0,0.0,1.0
2,1-12,0.0,0.0,27.0,0.0,0.0,6.0
3,1-200,0.0,0.0,7.0,0.0,0.0,3.0
4,1.21,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
3105,Z.18.2-90,0.0,0.0,4.0,0.0,1.0,3.0
3106,Z.18.2-91,0.0,2.0,95.0,2.0,0.0,123.0
3107,Z.18.2-91; 26,0.0,0.0,1.0,0.0,0.0,0.0
3108,Z.18.2-95,0.0,0.0,2.0,0.0,0.0,0.0


In [31]:
# Race columns that make up a code section's conviction total.
race_columns = [
    "American Indian or Alaskan Native",
    "Asian or Pacific Islander",
    "Black",
    "Hispanic",
    "Missing/Other",
    "White",
]
# Code sections with fewer total convictions than this are dropped.
min_convictions = 50

cases_reshape = (
    cases_reshape
    # Total convictions per code section across all race columns.
    .assign(total=lambda df: sum(df[col] for col in race_columns))
    .loc[lambda df: df["total"] >= min_convictions]
    # Share of each code section's convictions that are Black / White.
    .assign(
        black_pc=lambda df: df["Black"] / df["total"],
        white_pc=lambda df: df["White"] / df["total"],
    )
)
cases_reshape.sort_values("black_pc", ascending=False)

Race,CodeSection,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total,black_pc,white_pc
1360,24-253,0.0,0.0,110.0,0.0,0.0,5.0,115.0,0.956522,0.043478
1555,29-48,0.0,0.0,131.0,0.0,1.0,13.0,145.0,0.903448,0.089655
2290,46.2-938,0.0,1.0,180.0,2.0,2.0,36.0,221.0,0.814480,0.162896
249,14.2-81,0.0,0.0,71.0,4.0,0.0,16.0,91.0,0.780220,0.175824
2767,66-3,0.0,0.0,81.0,0.0,0.0,23.0,104.0,0.778846,0.221154
...,...,...,...,...,...,...,...,...,...,...
2604,54.1-3466,0.0,1.0,30.0,1.0,1.0,260.0,293.0,0.102389,0.887372
2168,46.2-618,0.0,0.0,7.0,3.0,0.0,61.0,71.0,0.098592,0.859155
1604,29.1-521,0.0,0.0,8.0,0.0,0.0,124.0,132.0,0.060606,0.939394
1642,29.1-735,0.0,0.0,8.0,4.0,1.0,137.0,150.0,0.053333,0.913333


In [32]:
# Localities (fips) of every charge filed under code section 24-253.
cases.loc[cases["CodeSection"] == "24-253", "fips"]

2319      740
2325      740
4399      740
4400      740
4401      740
         ... 
262328    740
262331    740
273650    740
278506    740
288281    740
Name: fips, Length: 131, dtype: int64

## In which localities (FIPS codes) are these disparities most severe?