In [10]:
# Data preprocessing

import csv
from hypothesis_test import *
from kardis_test import *
from mg_calculator import *
from q_finder import *
from graph_distributions import *

"""----------------------------------------Process Charges and Priors Data------------------------------------------------"""
#0 denotes no priors. le5 denotes greater than 0 but less than equal to 5 priors. gt5 denotes greater than 5 priors
#Priors = 0 

file = open(r"Datasets/people.csv", mode='r', encoding='utf-8-sig')
csvreader = csv.reader(file)
rowlist = [row for row in csvreader]
for category in rowlist[0]:
    index = rowlist[0].index(category)
    exec(category + " = [row[index] for row in rowlist[1:]]")
file.close()

battery_ids = [int(id[x]) for x in range(len(id)) if c_charge_desc[x] == 'Battery']
battery_races = [race[x] for x in battery_ids]

#this groups together all instances of possession of contraband, sans aggravated assault. 
possession_ids = [int(id[x]) for x in range(len(id)) if ('Possession' in c_charge_desc[x]) and \
                  (not "Aggravated" in c_charge_desc[x])]
possession_races = [race[x] for x in possession_ids]

priors_0_ids = [int(id[x]) for x in range(len(id)) if int(priors_count[x]) == 0]
priors_0_races = [race[x] for x in possession_ids]

priors_le5_ids = [int(id[x]) for x in range(len(id)) if 0 < int(priors_count[x]) <= 5]
priors_le5_races = [race[x] for x in priors_le5_ids]

priors_gt5_ids = [int(id[x]) for x in range(len(id)) if int(priors_count[x]) > 5]
priors_gt5_races = [race[x] for x in priors_gt5_ids]

file = open(r"Datasets/compas.csv", mode='r', encoding='utf-8-sig')
csvreader = csv.reader(file)
rowlist = [row for row in csvreader]
for category in rowlist[0]:
    index = rowlist[0].index(category)
    exec(category + " = [row[index] for row in rowlist[1:]]")
file.close()

#the -1 *3 +2 is to selectively take the recidivism scores. 
battery_scores = [score_text[(x-1)*3+2] for x in battery_ids]
possession_scores = [score_text[(x-1)*3+2] for x in possession_ids]
priors_0_scores = [score_text[(x-1)*3+2] for x in priors_0_ids]
priors_le5_scores = [score_text[(x-1)*3+2] for x in priors_le5_ids]
priors_gt5_scores = [score_text[(x-1)*3+2] for x in priors_gt5_ids]



['Possession of Cocaine', 'Possession of Cannabis', 'Possession Burglary Tools', 'Possession Of Alprazolam', 'Possession Of Alprazolam', 'Possession Of Heroin', 'Possession Of Methamphetamine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Oxycodone', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession Burglary Tools', 'Possession Of Alprazolam', 'Possession Of Methamphetamine', 'Possession of Morphine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession Of Alprazolam', 'Possession Of Alprazolam', 'Possession of Cannabis', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Oxycodone', 'Possession of Cocaine', 'Possession Of Carisoprodol', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession Of Heroin', 'Possession of Cocaine', 'Possession of Cocaine', 'Possession of Cocaine', 

In [3]:
# Battery Test
recid_races_and_scores = list(filter(lambda x: not x[1] == 'N/A', list(zip(battery_races,battery_scores))))
white_count = len(list(filter(lambda x: x[0] == 'Caucasian', recid_races_and_scores)))
white_props = [recid_races_and_scores.count(('Caucasian','High'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Medium'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Low'))/white_count]
AA_scores = list(filter(lambda x: x[0] == 'African-American', recid_races_and_scores))

print("Null hypothesis (proposed distribution): " + str(white_props) + "\n")

hypothesis_test(AA_scores,\
                [('African-American','High'),('African-American','Medium'),('African-American','Low')],\
                alpha = 0.05, hypothesis = white_props)

Null hypothesis (proposed distribution): [0.09090909090909091, 0.15254237288135594, 0.7565485362095532]

Proposed distribution not rejected at alpha = 0.05. Kardis = 273.112234394461.


(mpf('273.11223439446104'), False)

In [4]:
#Possession Test
recid_races_and_scores = list(filter(lambda x: not x[1] == 'N/A', list(zip(possession_races,possession_scores))))
white_count = len(list(filter(lambda x: x[0] == 'Caucasian', recid_races_and_scores)))
white_props = [recid_races_and_scores.count(('Caucasian','High'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Medium'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Low'))/white_count]
AA_scores = list(filter(lambda x: x[0] == 'African-American', recid_races_and_scores))

print("Null hypothesis (proposed distribution): " + str(white_props) + "\n")

hypothesis_test(AA_scores,\
                [('African-American','High'),('African-American','Medium'),('African-American','Low')],\
                alpha = 0.05, hypothesis = white_props)

Null hypothesis (proposed distribution): [0.08788598574821853, 0.14489311163895488, 0.7672209026128266]

Proposed distribution not rejected at alpha = 0.05. Kardis = 1969.27223811894.


(mpf('1969.272238118939'), False)

In [5]:
#0 Priors Test
recid_races_and_scores = list(filter(lambda x: not x[1] == 'N/A', list(zip(priors_0_races,priors_0_scores))))
white_count = len(list(filter(lambda x: x[0] == 'Caucasian', recid_races_and_scores)))
white_props = [recid_races_and_scores.count(('Caucasian','High'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Medium'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Low'))/white_count]
AA_scores = list(filter(lambda x: x[0] == 'African-American', recid_races_and_scores))

print("Null hypothesis (proposed distribution): " + str(white_props) + "\n")

hypothesis_test(AA_scores,\
                [('African-American','High'),('African-American','Medium'),('African-American','Low')],\
                alpha = 0.05, hypothesis = white_props)

Null hypothesis (proposed distribution): [0.0665083135391924, 0.16389548693586697, 0.7695961995249406]

Proposed distribution not rejected at alpha = 0.05. Kardis = 2809.20381478302.


(mpf('2809.2038147830217'), False)

In [6]:
#<=5 Priors Test
recid_races_and_scores = list(filter(lambda x: not x[1] == 'N/A', list(zip(priors_le5_races,priors_le5_scores))))
white_count = len(list(filter(lambda x: x[0] == 'Caucasian', recid_races_and_scores)))
white_props = [recid_races_and_scores.count(('Caucasian','High'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Medium'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Low'))/white_count]
AA_scores = list(filter(lambda x: x[0] == 'African-American', recid_races_and_scores))

print("Null hypothesis (proposed distribution): " + str(white_props) + "\n")

hypothesis_test(AA_scores,\
                [('African-American','High'),('African-American','Medium'),('African-American','Low')],\
                alpha = 0.05, hypothesis = white_props)

Null hypothesis (proposed distribution): [0.06993006993006994, 0.16083916083916083, 0.7692307692307693]

Proposed distribution not rejected at alpha = 0.05. Kardis = 12657.5955958137.


(mpf('12657.59559581368'), False)

In [7]:
#>5 Priors test
recid_races_and_scores = list(filter(lambda x: not x[1] == 'N/A', list(zip(priors_gt5_races,priors_gt5_scores))))
white_count = len(list(filter(lambda x: x[0] == 'Caucasian', recid_races_and_scores)))
white_props = [recid_races_and_scores.count(('Caucasian','High'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Medium'))/white_count, \
                recid_races_and_scores.count(('Caucasian', 'Low'))/white_count]
AA_scores = list(filter(lambda x: x[0] == 'African-American', recid_races_and_scores))

print("Null hypothesis (proposed distribution): " + str(white_props) + "\n")

hypothesis_test(AA_scores,\
                [('African-American','High'),('African-American','Medium'),('African-American','Low')],\
                alpha = 0.05, hypothesis = white_props)

Null hypothesis (proposed distribution): [0.0776566757493188, 0.16893732970027248, 0.7534059945504087]

Proposed distribution not rejected at alpha = 0.05. Kardis = 11165.1678795278.


(mpf('11165.167879527762'), False)