In [1]:
import numpy as np
import pandas as pd

In [4]:
cases = pd.read_csv('data100k.csv')

In [5]:
cases.head(3).T

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


## What code sections are most frequent?

In [6]:
codesections = cases['CodeSection'].value_counts().reset_index()
codesections.head(15)

Unnamed: 0,CodeSection,count
0,A.46.2-862,26379
1,B.46.2-301,25967
2,46.2-300,17934
3,C.46.2-862,11728
4,18.2-250.1,10573
5,A.18.2-266,8568
6,18.2-95,7561
7,18.2-250,6949
8,18.2-57,6699
9,A.46.2-852,6667


1. A.46.2-862 reckless driving
2. B.46.2-301


   18.2-250.1: possession of mariuana unlawful

## Which Code Sections most often lead to convictions?

In [7]:
cases['DispositionCode'].value_counts()
cases['conviction'] = [x in ['Guilty', 'Guilty In Absentia'] for x in cases['DispositionCode']]

In [8]:
''' SQL equivalance to pandas code
SELECT CodeSEction, AVG(conviction) AS conv_rate
FROM cases
GROUP BY CodeSection
'''

convict_rate = cases.groupby('CodeSection').agg({'conviction': ['mean', 'count']}).reset_index()
convict_rate.columns = ['CodeSection', 'conviction_rate', 'count'] # fix multi index situation that happened
convict_rate = convict_rate.query("count >= 30")
convict_rate

Unnamed: 0,CodeSection,conviction_rate,count
3,1-12,0.440000,75
4,1-200,0.227273,44
46,10-42,0.444444,63
47,10-43,0.288000,125
63,10-62,0.217054,129
...,...,...,...
4146,NO DMV,0.635236,403
4148,NODMV,0.476190,42
4194,Z.18.2-32,0.566667,30
4196,Z.18.2-47,0.447059,85


In [9]:
convict_rate.sort_values('conviction_rate', ascending = False)

Unnamed: 0,CodeSection,conviction_rate,count
1806,23-55,0.981818,55
1633,21-336,0.960000,50
1755,23-22.1(A),0.954198,131
2103,29-17(C),0.942857,70
4111,G.18.2-266,0.930233,43
...,...,...,...
2321,3.2-6503.1,0.071429,42
140,11.1-2,0.052632,38
2481,35-416,0.033333,30
1433,19.2-100,0.000000,238


In [10]:
#what is 23-55?
cases.query("CodeSection == '23-55'") # virginia beach - sleeping in public park

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime,conviction
8585,153020000000040,2018-03-08,23-55,covered elsewhere,Misdemeanor,Misdemeanor,U,Guilty,Conviction,Guilty,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,True,True
17940,82091000000145,2014-11-10,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty,Conviction,Guilty,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
34148,16050000000485,2010-09-07,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,,...,False,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
34168,16050000000485,2011-09-14,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True
34169,16050000000485,2011-10-11,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True
46925,66010000000463,2011-09-27,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
68396,315010000000605,2011-09-07,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120838,340010000000247,2016-06-20,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120844,340010000000247,2016-08-15,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,False,True
120862,340010000000247,2016-12-13,23-55,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction or deferred dismissal of misdemeano...,False,True,True


## Which CodeSections have the most severe racial disparities?

In [11]:
cases['Race'].value_counts()

Race
White Caucasian(Non-Hispanic)                  114421
Black(Non-Hispanic)                             80173
White Caucasian (Non-Hispanic)                  41679
Black (Non-Hispanic)                            33254
Hispanic                                         9319
White                                            3527
Other(Includes Not Applicable.. Unknown)         3452
Asian Or Pacific Islander                        2787
Black                                            2200
MISSING                                          1022
Unknown (Includes Not Applicable.. Unknown)       785
Other (Includes Not Applicable.. Unknown)         615
American Indian                                   302
Unknown                                            54
Asian or Pacific Islander                           7
American Indian Or Alaskan Native                   1
Name: count, dtype: int64

In [12]:
replace_map = { 'White Caucasian(Non-Hispanic)' : 'White',
'Black(Non-Hispanic)' : 'Black',
'White Caucasian (Non-Hispanic)' : 'White',
'Black (Non-Hispanic)' : 'Black',
'Hispanic' : 'Hispanic',
'White' : 'White',
'Other(Includes Not Applicable.. Unknown)' : 'Missing/Other',
'Asian Or Pacific Islander' : 'Asian or Pacific Islander',
'Black' : 'Black',
'MISSING' : 'Missing/Other',
'Unknown (Includes Not Applicable.. Unknown)' : 'Missing/Other',
'Other (Includes Not Applicable.. Unknown)': 'Missing/Other',
'American Indian' : 'Asian or Pacific Islander',
'Unknown' : 'Missing/Other',
'Asian or Pacific Islander' : 'Asian or Pacific Islander',
'American Indian Or Alaskan Native' : 'American Indian or Alaskan Native'  
}
cases['Race'] = cases['Race'].replace(replace_map)
cases['Race'].value_counts()

Race
White                                159627
Black                                115627
Hispanic                               9319
Missing/Other                          5928
Asian or Pacific Islander              3096
American Indian or Alaskan Native         1
Name: count, dtype: int64

In [13]:
# I choos to analyze only the convictions
cases_convict = cases.query("conviction == True")

In [68]:
cases_convict_race = cases_convict.groupby(['CodeSection', 'Race', 'fips']).size().reset_index()
cases_convict_race = cases_convict_race.rename({0:'count'}, axis=1)
cases_convict_race

Unnamed: 0,CodeSection,Race,fips,count
0,01-2007,White,51,1
1,1,Black,550,3
2,1,White,550,1
3,1-12,Black,650,27
4,1-12,White,650,6
...,...,...,...,...
27418,Z.18.2-91,White,840,2
27419,Z.18.2-91; 26,Black,700,1
27420,Z.18.2-95,Black,67,1
27421,Z.18.2-95,Black,83,1


In [69]:
cases_reshape = cases_convict_race.pivot_table(index = ['CodeSection', 'fips'], 
                              columns = 'Race',
                              values = 'count', fill_value=0).reset_index() #replace nan with 0 (approprite here b/c we understand the data and we know there'e no data)
cases_reshape

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White
0,01-2007,51,0.0,0.0,0.0,0.0,0.0,1.0
1,1,550,0.0,0.0,3.0,0.0,0.0,1.0
2,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0
3,1-200,29,0.0,0.0,1.0,0.0,0.0,0.0
4,1-200,105,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
18700,Z.18.2-91,840,0.0,0.0,0.0,0.0,0.0,2.0
18701,Z.18.2-91; 26,700,0.0,0.0,1.0,0.0,0.0,0.0
18702,Z.18.2-95,67,0.0,0.0,1.0,0.0,0.0,0.0
18703,Z.18.2-95,83,0.0,0.0,1.0,0.0,0.0,0.0


In [70]:
cases_reshape = cases_reshape.assign(total  = cases_reshape['White'] +
                                    cases_reshape['Black'] +
                                    cases_reshape['Hispanic'] +
                                    cases_reshape['Missing/Other'] +
                                    cases_reshape['Asian or Pacific Islander'] +
                                    cases_reshape['American Indian or Alaskan Native'])
cases_reshape

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total
0,01-2007,51,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,550,0.0,0.0,3.0,0.0,0.0,1.0,4.0
2,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0,33.0
3,1-200,29,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1-200,105,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
18700,Z.18.2-91,840,0.0,0.0,0.0,0.0,0.0,2.0,2.0
18701,Z.18.2-91; 26,700,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18702,Z.18.2-95,67,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18703,Z.18.2-95,83,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [71]:
cases_reshape = cases_reshape.query('total >= 25')

In [72]:
cases_reshape = cases_reshape.assign(black_percent = cases_reshape['Black']/cases_reshape['total'],
                                    white_percent = cases_reshape['White']/cases_reshape['total'])

In [73]:
cases_reshape.sort_values('black_percent', ascending=False)

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total,black_percent,white_percent
6763,18.2-472.1,119,0.0,0.0,40.0,0.0,0.0,0.0,40.0,1.000000,0.000000
7375,18.2-53.1,740,0.0,0.0,30.0,0.0,0.0,0.0,30.0,1.000000,0.000000
1847,18.2-168,630,0.0,0.0,26.0,0.0,0.0,0.0,26.0,1.000000,0.000000
3040,18.2-248,760,0.0,0.0,228.0,0.0,0.0,3.0,231.0,0.987013,0.012987
2093,18.2-178,147,0.0,0.0,46.0,0.0,1.0,0.0,47.0,0.978723,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
8123,18.2-67.1,169,0.0,0.0,0.0,0.0,0.0,26.0,26.0,0.000000,1.000000
15405,A.18.2-266,105,0.0,0.0,0.0,0.0,0.0,32.0,32.0,0.000000,1.000000
3753,18.2-258.1,630,0.0,0.0,0.0,0.0,0.0,31.0,31.0,0.000000,1.000000
8770,18.2-95,105,0.0,0.0,0.0,0.0,0.0,34.0,34.0,0.000000,1.000000


In [20]:
cases.query("CodeSection=='24-253'")['fips']

2319      740
2325      740
4399      740
4400      740
4401      740
         ... 
262328    740
262331    740
273650    740
278506    740
288281    740
Name: fips, Length: 131, dtype: int64

## In what localities (fips) are these disparities most severe?

In [21]:
race_url = 'https://virginia.box.com/shared/static/i8i5onrkveks849pkky0gwgxlax8d8fe.xlsx'
hisp_url = 'https://virginia.box.com/shared/static/fegrn0p0igzl95snji3ku6edwu0hy3dj.xlsx'

In [41]:
race_pop = pd.read_excel(race_url, skiprows = [0, 1, 2, 3, 5, 6, 7])
race_pop.head(10)

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Unnamed: 4,Asian,Unnamed: 6,Black,Unnamed: 8,Pacific Islander,Unnamed: 10,Two or more races,Unnamed: 12,White,Unnamed: 14
0,1,Accomack County,33246,441,0.013265,370,0.011129,9859,0.296547,79,0.002376,609,0.018318,23125,0.695572
1,3,Albemarle County,113535,1069,0.009416,7925,0.069802,12581,0.110812,242,0.002132,3210,0.028273,95210,0.838596
2,5,Alleghany County,14986,126,0.008408,92,0.006139,906,0.060456,17,0.001134,283,0.018884,14136,0.94328
3,7,Amelia County,13268,177,0.01334,138,0.010401,2759,0.207944,15,0.001131,259,0.019521,10445,0.787232
4,9,Amherst County,31273,495,0.015828,339,0.01084,6475,0.207048,55,0.001759,828,0.026477,24796,0.792888
5,11,Appomattox County,16353,130,0.00795,101,0.006176,3200,0.195683,18,0.001101,374,0.02287,13286,0.81245
6,13,Arlington County,232965,3689,0.015835,31145,0.13369,26879,0.115378,760,0.003262,8882,0.038126,180541,0.77497
7,15,Augusta County,77563,672,0.008664,804,0.010366,4618,0.059539,75,0.000967,1443,0.018604,72898,0.939855
8,17,Bath County,4114,27,0.006563,33,0.008021,205,0.04983,0,0.0,67,0.016286,3918,0.952358
9,19,Bedford County,80131,723,0.009023,1430,0.017846,6486,0.080942,77,0.000961,1439,0.017958,72916,0.90996


In [42]:
race_pop.columns

Index(['FIPS', 'Jurisdiction', 'Total Population', 'American Indian',
       'Unnamed: 4', 'Asian ', 'Unnamed: 6', 'Black', 'Unnamed: 8',
       'Pacific Islander', 'Unnamed: 10', 'Two or more races', 'Unnamed: 12',
       'White ', 'Unnamed: 14'],
      dtype='object')

In [79]:
race_pop = race_pop[['FIPS', 'Jurisdiction', 'Total Population', 'American Indian', 'Asian ', 'Black', 'Pacific Islander', 'Two or more races', 'White ']]
race_pop

Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White
0,1,Accomack County,33246,441,370,9859,79,609,23125
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210
2,5,Alleghany County,14986,126,92,906,17,283,14136
3,7,Amelia County,13268,177,138,2759,15,259,10445
4,9,Amherst County,31273,495,339,6475,55,828,24796
...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018
130,820,Waynesboro city,22550,309,522,3665,35,792,18840
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055


In [80]:
race_pop = race_pop.rename({'FIPS': 'fips', 
                            'Jurisdiction' :'jurisdiction', 
                            'Total Population': 'total_pop', 
                            'American Indian': 'amerind_pop', 
                            'Asian ': "asian_pop", 
                            'Black': 'black_pop', 
                            'Pacific Islander': 'pacificisland_pop',  
                            'Two or more races': 'mixed_pop', 
                            'White ': 'white_pop'}, axis=1)
race_pop

Unnamed: 0,fips,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop
0,1,Accomack County,33246,441,370,9859,79,609,23125
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210
2,5,Alleghany County,14986,126,92,906,17,283,14136
3,7,Amelia County,13268,177,138,2759,15,259,10445
4,9,Amherst County,31273,495,339,6475,55,828,24796
...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018
130,820,Waynesboro city,22550,309,522,3665,35,792,18840
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055


In [81]:
hisp_pop = pd.read_excel(hisp_url, skiprows = [0,1,2,3,5,6,7,8,9])
hisp_pop.head(10)

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Jurisdiction,"Decennial Census Count, April 1, 2010",Unnamed: 3,Unnamed: 4,"Population Estimate, July 1, 2021",Unnamed: 6,Unnamed: 7,"April 1, 2010 - July 1, 2021",Unnamed: 9
0,1,"Accomack County, Virginia",33164,2850,8.593656,33246,3170,0.09535,320,0.112281
1,3,"Albemarle County, Virginia",98970,5417,5.473376,113535,6750,0.059453,1333,0.246077
2,5,"Alleghany County, Virginia",16250,176,1.083077,14986,265,0.017683,89,0.505682
3,7,"Amelia County, Virginia",12690,290,2.285264,13268,507,0.038212,217,0.748276
4,9,"Amherst County, Virginia",32353,625,1.931815,31273,849,0.027148,224,0.3584
5,11,"Appomattox County, Virginia",14973,167,1.115341,16353,360,0.022014,193,1.155689
6,13,"Arlington County, Virginia",207627,31382,15.114605,232965,36284,0.155749,4902,0.156204
7,15,"Augusta County, Virginia",73750,1525,2.067797,77563,2849,0.036731,1324,0.868197
8,17,"Bath County, Virginia",4731,101,2.134855,4114,101,0.02455,0,0.0
9,19,"Bedford County, Virginia",68676,1090,1.587163,80131,2179,0.027193,1089,0.999083


In [82]:
hisp_pop = hisp_pop[['FIPS', 'Unnamed: 6']]
hisp_pop

Unnamed: 0,FIPS,Unnamed: 6
0,1,3170
1,3,6750
2,5,265
3,7,507
4,9,849
...,...,...
128,800,4684
129,810,40525
130,820,2244
131,830,1183


In [83]:
hisp_pop = hisp_pop.rename({'FIPS':'fips',
                           'Unnamed: 6': 'hisp_pop'}, axis=1)
hisp_pop

Unnamed: 0,fips,hisp_pop
0,1,3170
1,3,6750
2,5,265
3,7,507
4,9,849
...,...,...
128,800,4684
129,810,40525
130,820,2244
131,830,1183


# Things that go wrong (without any programming error) when you merge
1. Rows that should mathc are unmatced, and either get missing data if a full/outer join, or get deleted in an inner join
2. You think you are doing a one-to-one merge but are actually doing a many-to-one, or many-to-many
   
IF yo have a "small" dataset, I recommend doing the merge twice: once with checks, once without

In [86]:
# Merge with checks
pop = pd.merge(race_pop, hisp_pop, 
               on='fips', 
               how = 'outer',
               validate = 'one_to_one',
               indicator = 'matched')

In [88]:
pop['matched'].value_counts()

matched
both          133
left_only       0
right_only      0
Name: count, dtype: int64

In [89]:
# Merge without checks
pop = pd.merge(race_pop, hisp_pop,
              on = 'fips',
              how = 'inner')

In [90]:
cases_pop = pd.merge(cases_reshape, pop, 
                     on='fips', 
                     how='outer', 
                     validate = 'many_to_one', 
                     indicator= 'matched')

In [91]:
cases_pop.query("matched=='left_only'")['fips'].unique()

array([701, 702, 711, 712, 761, 762, 764])

In [92]:
pop.tail(10)

Unnamed: 0,fips,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop,hisp_pop
123,750,Radford city,16499,150,480,1831,34,447,14459,500
124,760,Richmond city,226604,3052,7416,106660,677,5869,115477,17639
125,770,Roanoke city,98865,1050,3870,32251,205,3367,65021,7010
126,775,Salem city,25373,235,576,2603,37,652,22588,1011
127,790,Staunton city,25661,301,515,3578,40,884,22136,1016
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977,4684
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018,40525
130,820,Waynesboro city,22550,309,522,3665,35,792,18840,2244
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055,1183
132,840,Winchester city,28136,479,928,3908,62,1000,23797,5322


In [94]:
replace_map = {701: 700,
              711: 710,
              761: 760,
              762: 760,
              764: 760,
              712: 710,
              702: 700}
cases_reshape['fips'] = cases_reshape['fips'].replace(replace_map)

In [95]:
cases_pop = pd.merge(cases_reshape, pop, 
                     on='fips', 
                     how='outer', 
                     validate = 'many_to_one', 
                     indicator= 'matched')

In [96]:
cases_pop['matched'].value_counts()

matched
both          1284
right_only      21
left_only        0
Name: count, dtype: int64

In [97]:
cases_pop.query("matched=='right_only'")['fips']

75       17
108      29
135      36
183      45
193      49
257      63
388      91
394      95
441     115
482     133
571     157
707     181
806     530
864     580
890     620
930     660
937     678
956     683
957     685
1060    720
1077    735
Name: fips, dtype: int64

In [98]:
pop.query('fips==17')

Unnamed: 0,fips,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop,hisp_pop
8,17,Bath County,4114,27,33,205,0,67,3918,101


In [99]:
pop.query('fips==29')

Unnamed: 0,fips,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop,hisp_pop
14,29,Buckingham County,16947,147,115,5918,12,387,11145,427


In [101]:
pop.query('fips==36')

Unnamed: 0,fips,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop,hisp_pop
18,36,Charles City County,6594,539,93,2994,15,238,3201,156


Reasons why rows might fail to match:
1. Differnces in coding/spelling: recode the values in one dataset so that they match
2. Differences in coverage: notheing we can do, other than collect new data if feasible

In [102]:
cases_pop = pd.merge(cases_reshape, pop, 
                     on='fips', 
                     how='inner')
cases_pop

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black,Hispanic,Missing/Other,White,total,black_percent,white_percent,jurisdiction,total_pop,amerind_pop,asian_pop,black_pop,pacificisland_pop,mixed_pop,white_pop,hisp_pop
0,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0,33.0,0.818182,0.181818,Hampton city,137746,2615,5433,75602,541,6155,60473,9254
1,10-42,550,0.0,0.0,8.0,0.0,0.0,17.0,25.0,0.320000,0.680000,Chesapeake city,251269,3498,13826,82110,795,10008,162117,18427
2,10-43,550,0.0,0.0,4.0,0.0,0.0,21.0,25.0,0.160000,0.840000,Chesapeake city,251269,3498,13826,82110,795,10008,162117,18427
3,10-62,550,0.0,0.0,7.0,0.0,0.0,21.0,28.0,0.250000,0.750000,Chesapeake city,251269,3498,13826,82110,795,10008,162117,18427
4,13-1-5,510,0.0,0.0,37.0,0.0,1.0,54.0,92.0,0.402174,0.586957,Alexandria city,154706,2285,12722,37719,535,5434,107355,25586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,MISSING,13,0.0,0.0,35.0,2.0,0.0,5.0,42.0,0.833333,0.119048,Arlington County,232965,3689,31145,26879,760,8882,180541,36284
1280,NO DMV,710,0.0,0.0,18.0,1.0,0.0,8.0,27.0,0.666667,0.296296,Norfolk city,235089,4292,12673,103373,1219,9507,124482,21484
1281,NO DMV,770,0.0,0.0,14.0,3.0,0.0,15.0,32.0,0.437500,0.468750,Roanoke city,98865,1050,3870,32251,205,3367,65021,7010
1282,NO DMV,810,0.0,1.0,19.0,1.0,1.0,21.0,43.0,0.441860,0.488372,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018,40525


In [103]:
cases_pop['black_overrep_index'] = case_pop = cases_pop['black_percent']/(cases_pop['black_pop']/cases_pop['total_pop'])

In [104]:
cases_pop.sort_values('black_overrep_index', ascending=False)[['CodeSection', 'jurisdiction', 'Black', 'total', 'black_pop', 'total_pop', 'black_overrep_index']].head(20)

Unnamed: 0,CodeSection,jurisdiction,Black,total,black_pop,total_pop,black_overrep_index
690,46.2-300,Carroll County,35.0,171.0,530,29048,11.217919
1022,A.46.2-862,Carroll County,28.0,138.0,530,29048,11.120372
1219,C.46.2-862,Carroll County,95.0,519.0,530,29048,10.03221
734,46.2-300,Smyth County,27.0,99.0,969,29477,8.296369
162,18.2-192,Arlington County,38.0,40.0,26879,232965,8.233816
667,46.2-1172,Hanover County,81.0,95.0,11853,111603,8.02803
243,18.2-250,Rockingham County,16.0,54.0,3186,84394,7.848597
732,46.2-300,Scott County,9.0,84.0,301,21419,7.624229
599,19.2-306,Nelson County,35.0,38.0,1838,14790,7.411517
1058,A.46.2-862,Washington County,28.0,183.0,1110,53635,7.393196


In [55]:
# What a many-to -many merge looks like

data1 = [{'Country': 'USA', 'Value1': 5},
        {'Country': 'France', 'Value1': 15},
        {'Country': 'China', 'Value1': 50},
        {'Country': 'Spain', 'Value1': 25},
        {'Country': 'Uk', 'Value1': 0.5},
        {'Country': 'Thailand', 'Value1': 500}]
data1 = pd.DataFrame.from_records(data1)
data1

Unnamed: 0,Country,Value1
0,USA,5.0
1,France,15.0
2,China,50.0
3,Spain,25.0
4,Uk,0.5
5,Thailand,500.0


In [56]:
data2 = [{'Country': 'United States', 'Value2': 3},
        {'Country': 'France', 'Value2': 13},
        {'Country': 'China', 'Value2': 30},
        {'Country': 'Spain', 'Value2': 23},
        {'Country': 'Uk', 'Value2': 0.3},
        {'Country': 'Thailand', 'Value2': 300}]
data2 = pd.DataFrame.from_records(data2)
data2

Unnamed: 0,Country,Value2
0,United States,3.0
1,France,13.0
2,China,30.0
3,Spain,23.0
4,Uk,0.3
5,Thailand,300.0


In [58]:
pd.merge(data1, data2, on='Country', how='outer')

Unnamed: 0,Country,Value1,Value2
0,China,50.0,30.0
1,France,15.0,13.0
2,Spain,25.0,23.0
3,Thailand,500.0,300.0
4,USA,5.0,
5,Uk,0.5,0.3
6,United States,,3.0


In [107]:
# this is now curroupted data, it's not usable
# the default join in merge is inner