# COMPAS Dataset Analysis

In [1]:
import pandas as pd
from scipy.constants import femto
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
df = pd.read_csv('../data/data_with_caucasian.csv')
df.head()

Unnamed: 0,Two_yr_Recidivism,Number_of_Priors,score_factor,Age_Above_FourtyFive,Age_Below_TwentyFive,Caucasian,African_American,Asian,Hispanic,Native_American,Other,Female,Misdemeanor
0,1,14,1,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,1,1
2,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,1,1
4,1,1,0,1,0,1,0,0,0,0,0,1,0


In [3]:
df.pop('score_factor')

0       1
1       0
2       0
3       0
4       0
       ..
6167    1
6168    0
6169    0
6170    0
6171    0
Name: score_factor, Length: 6172, dtype: int64

In [4]:
total = len(df)
male_df = df.loc[df['Female'] == 0]
total_male = len(male_df)
female_df = df.loc[df['Female'] == 1]
total_female = len(female_df)
print(total, total_male, total_female)
print(total_male, len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]), len(male_df.loc[male_df['Two_yr_Recidivism'] == 0]))
print(total_female, len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]), len(female_df.loc[female_df['Two_yr_Recidivism'] == 0]))
print('Male Rec Rate', 100/total_male*len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]))
print('Female Rec Rate', 100/total_female*len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]))

6172 4997 1175
4997 2396 2601
1175 413 762
Male Rec Rate 47.94876926155693
Female Rec Rate 35.148936170212764


In [5]:
df_fem_rec = female_df.loc[female_df['Two_yr_Recidivism'] == 1]
len(df_fem_rec)

413

In [6]:
female_df = pd.concat([female_df, df_fem_rec])
len(female_df)

1588

In [7]:
total = len(df)
total_male = len(male_df)
total_female = len(female_df)
print(total, total_male, total_female)
print(total_male, len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]), len(male_df.loc[male_df['Two_yr_Recidivism'] == 0]))
print(total_female, len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]), len(female_df.loc[female_df['Two_yr_Recidivism'] == 0]))
print('Male Rec Rate', 100/total_male*len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]))
print('Female Rec Rate', 100/total_female*len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]))

6172 4997 1588
4997 2396 2601
1588 826 762
Male Rec Rate 47.94876926155693
Female Rec Rate 52.015113350125944


### Male Racial Balancing

In [8]:
org_m_caucasian_df = male_df.loc[male_df['Caucasian'] == 1]
org_m_afroam_df = male_df.loc[male_df['African_American'] == 1]
org_m_asian_df = male_df.loc[male_df['Asian'] == 1]
org_m_hispanic_df = male_df.loc[male_df['Hispanic'] == 1]
org_m_native_df = male_df.loc[male_df['Native_American'] == 1]
org_m_other_df = male_df.loc[male_df['Other'] == 1]


In [9]:
# Caucasian balancing
org_m_caucasian_df_rec = org_m_caucasian_df.loc[org_m_caucasian_df['Two_yr_Recidivism'] == 1]
org_m_caucasian_df_nor = org_m_caucasian_df.loc[org_m_caucasian_df['Two_yr_Recidivism'] == 0]
print(len(org_m_caucasian_df), len(org_m_caucasian_df_rec), len(org_m_caucasian_df_nor))
print(100/len(org_m_caucasian_df)*len(org_m_caucasian_df_rec))
print('#####')

buffer_df = org_m_caucasian_df_rec.drop(org_m_caucasian_df_rec.sample(frac=.5).index)
print(len(buffer_df))
print('#####')

org_m_caucasian_df = pd.concat([org_m_caucasian_df, buffer_df])
org_m_caucasian_df_rec = org_m_caucasian_df.loc[org_m_caucasian_df['Two_yr_Recidivism'] == 1]
org_m_caucasian_df_nor = org_m_caucasian_df.loc[org_m_caucasian_df['Two_yr_Recidivism'] == 0]

print(len(org_m_caucasian_df), len(org_m_caucasian_df_rec), len(org_m_caucasian_df_nor))
print(100/len(org_m_caucasian_df)*len(org_m_caucasian_df_rec))

1621 652 969
40.22208513263418
#####
326
#####
1947 978 969
50.23112480739599


In [10]:
# African balancing
org_m_african_df_rec = org_m_afroam_df.loc[org_m_afroam_df['Two_yr_Recidivism'] == 1]
org_m_african_df_nor = org_m_afroam_df.loc[org_m_afroam_df['Two_yr_Recidivism'] == 0]
print(len(org_m_afroam_df), len(org_m_african_df_rec), len(org_m_african_df_nor))
print(100/len(org_m_afroam_df)*len(org_m_african_df_rec))
print('#####')

buffer_df = org_m_african_df_nor.drop(org_m_african_df_nor.sample(frac=.75).index)
print(len(buffer_df))
print('#####')

org_m_afroam_df = pd.concat([org_m_afroam_df, buffer_df])
org_m_african_df_rec = org_m_afroam_df.loc[org_m_afroam_df['Two_yr_Recidivism'] == 1]
org_m_african_df_nor = org_m_afroam_df.loc[org_m_afroam_df['Two_yr_Recidivism'] == 0]

print(len(org_m_afroam_df), len(org_m_african_df_rec), len(org_m_african_df_nor))
print(100/len(org_m_afroam_df)*len(org_m_african_df_rec))

2626 1458 1168
55.521706016755516
#####
292
#####
2918 1458 1460
49.96572995202193


In [11]:
# Asian balancing
org_m_asian_df_rec = org_m_asian_df.loc[org_m_asian_df['Two_yr_Recidivism'] == 1]
org_m_asian_df_nor = org_m_asian_df.loc[org_m_asian_df['Two_yr_Recidivism'] == 0]
print(len(org_m_asian_df), len(org_m_asian_df_rec), len(org_m_asian_df_nor))
print(100/len(org_m_asian_df)*len(org_m_asian_df_rec))
print('#####')

org_m_asian_df = pd.concat([org_m_asian_df, org_m_asian_df_rec, org_m_asian_df_rec])
print('#####')

org_m_asian_df_rec = org_m_asian_df.loc[org_m_asian_df['Two_yr_Recidivism'] == 1]
org_m_asian_df_nor = org_m_asian_df.loc[org_m_asian_df['Two_yr_Recidivism'] == 0]

print(len(org_m_asian_df), len(org_m_asian_df_rec), len(org_m_asian_df_nor))
print(100/len(org_m_asian_df)*len(org_m_asian_df_rec))

29 7 22
24.137931034482758
#####
#####
43 21 22
48.83720930232558


In [12]:
# Hispanic balancing
org_m_hispanic_df_rec = org_m_hispanic_df.loc[org_m_hispanic_df['Two_yr_Recidivism'] == 1]
org_m_hispanic_df_nor = org_m_hispanic_df.loc[org_m_hispanic_df['Two_yr_Recidivism'] == 0]
print(len(org_m_hispanic_df), len(org_m_hispanic_df_rec), len(org_m_hispanic_df_nor))
print(100/len(org_m_hispanic_df)*len(org_m_hispanic_df_rec))
print('#####')

buffer_df = org_m_hispanic_df_rec.drop(org_m_hispanic_df_rec.sample(frac=.38).index)
print(len(buffer_df))
print('#####')

org_m_hispanic_df = pd.concat([org_m_hispanic_df, buffer_df])
org_m_hispanic_df_rec = org_m_hispanic_df.loc[org_m_hispanic_df['Two_yr_Recidivism'] == 1]
org_m_hispanic_df_nor = org_m_hispanic_df.loc[org_m_hispanic_df['Two_yr_Recidivism'] == 0]
print(len(org_m_hispanic_df), len(org_m_hispanic_df_rec), len(org_m_hispanic_df_nor))
print(100/len(org_m_hispanic_df)*len(org_m_hispanic_df_rec))

427 163 264
38.17330210772834
#####
101
#####
528 264 264
50.0


In [13]:
# Native balancing
org_m_native_df_rec = org_m_native_df.loc[org_m_native_df['Two_yr_Recidivism'] == 1]
org_m_native_df_nor = org_m_native_df.loc[org_m_native_df['Two_yr_Recidivism'] == 0]
print(len(org_m_native_df), len(org_m_native_df_rec), len(org_m_native_df_nor))
print(100/len(org_m_native_df)*len(org_m_native_df_rec))
print('#####')

org_m_native_df = pd.concat([org_m_native_df, org_m_native_df_rec])
org_m_native_df_rec = org_m_native_df.loc[org_m_native_df['Two_yr_Recidivism'] == 1]
org_m_native_df_nor = org_m_native_df.loc[org_m_native_df['Two_yr_Recidivism'] == 0]
print(len(org_m_native_df), len(org_m_native_df_rec), len(org_m_native_df_nor))
print(100/len(org_m_native_df)*len(org_m_native_df_rec))

9 3 6
33.33333333333333
#####
12 6 6
50.0


### Female Racial Balancing

In [14]:
org_f_caucasian_df = female_df.loc[female_df['Caucasian'] == 1]
org_f_afroam_df = female_df.loc[female_df['African_American'] == 1]
org_f_asian_df = female_df.loc[female_df['Asian'] == 1]
org_f_hispanic_df = female_df.loc[female_df['Hispanic'] == 1]
org_f_native_df = female_df.loc[female_df['Native_American'] == 1]
org_f_other_df = female_df.loc[female_df['Other'] == 1]

In [15]:
# Caucasian balancing
org_f_caucasian_df_rec = org_f_caucasian_df.loc[org_f_caucasian_df['Two_yr_Recidivism'] == 1]
org_f_caucasian_df_nor = org_f_caucasian_df.loc[org_f_caucasian_df['Two_yr_Recidivism'] == 0]
print(len(org_f_caucasian_df), len(org_f_caucasian_df_rec), len(org_f_caucasian_df_nor))
print(100/len(org_f_caucasian_df)*len(org_f_caucasian_df_rec))
print('#####')

buffer_df = org_f_caucasian_df_nor.drop(org_f_caucasian_df_nor.sample(frac=.91).index)
print(len(buffer_df))
print('#####')

org_f_caucasian_df = pd.concat([org_f_caucasian_df, buffer_df])
org_f_caucasian_df_rec = org_f_caucasian_df.loc[org_f_caucasian_df['Two_yr_Recidivism'] == 1]
org_f_caucasian_df_nor = org_f_caucasian_df.loc[org_f_caucasian_df['Two_yr_Recidivism'] == 0]

print(len(org_f_caucasian_df), len(org_f_caucasian_df_rec), len(org_f_caucasian_df_nor))
print(100/len(org_f_caucasian_df)*len(org_f_caucasian_df_rec))

652 340 312
52.147239263803684
#####
28
#####
680 340 340
50.0


In [16]:
# African balancing
org_f_afroam_df_rec = org_f_afroam_df.loc[org_f_afroam_df['Two_yr_Recidivism'] == 1]
org_f_afroam_df_nor = org_f_afroam_df.loc[org_f_afroam_df['Two_yr_Recidivism'] == 0]
print(len(org_f_afroam_df), len(org_f_afroam_df_rec), len(org_f_afroam_df_nor))
print(100/len(org_f_afroam_df)*len(org_f_afroam_df_rec))
print('#####')

buffer_df = org_f_afroam_df_nor.drop(org_f_afroam_df_nor.sample(frac=.8255).index)
print(len(buffer_df))
print('#####')

org_f_afroam_df = pd.concat([org_f_afroam_df, buffer_df])
org_f_afroam_df_rec = org_f_afroam_df.loc[org_f_afroam_df['Two_yr_Recidivism'] == 1]
org_f_afroam_df_nor = org_f_afroam_df.loc[org_f_afroam_df['Two_yr_Recidivism'] == 0]

print(len(org_f_afroam_df), len(org_f_afroam_df_rec), len(org_f_afroam_df_nor))
print(100/len(org_f_afroam_df)*len(org_f_afroam_df_rec))

752 406 346
53.98936170212766
#####
60
#####
812 406 406
50.0


In [17]:
# Asian balancing
org_f_asian_df_rec = org_f_asian_df.loc[org_f_asian_df['Two_yr_Recidivism'] == 1]
org_f_asian_df_nor = org_f_asian_df.loc[org_f_asian_df['Two_yr_Recidivism'] == 0]
print(len(org_f_asian_df), len(org_f_asian_df_rec), len(org_f_asian_df_nor))
print(100/len(org_f_asian_df)*len(org_f_asian_df_rec))
print('#####')

org_f_asian_df = pd.concat([org_f_asian_df, org_f_asian_df_nor])
org_f_asian_df_rec = org_f_asian_df.loc[org_f_asian_df['Two_yr_Recidivism'] == 1]
org_f_asian_df_nor = org_f_asian_df.loc[org_f_asian_df['Two_yr_Recidivism'] == 0]
print(len(org_f_asian_df), len(org_f_asian_df_rec), len(org_f_asian_df_nor))
print(100/len(org_f_asian_df)*len(org_f_asian_df_rec))

3 2 1
66.66666666666667
#####
4 2 2
50.0


In [18]:
# Hispanic balancing
org_f_hispanic_df_rec = org_f_hispanic_df.loc[org_f_hispanic_df['Two_yr_Recidivism'] == 1]
org_f_hispanic_df_nor = org_f_hispanic_df.loc[org_f_hispanic_df['Two_yr_Recidivism'] == 0]
print(len(org_f_hispanic_df),len(org_f_hispanic_df_rec), len(org_f_hispanic_df_nor))
print(100/len(org_f_hispanic_df)*len(org_f_hispanic_df_rec))
print('#####')

buffer_df = org_f_hispanic_df_rec.drop(org_f_hispanic_df_rec.sample(frac=.7).index)
print(len(buffer_df))
print('#####')

org_f_hispanic_df = pd.concat([org_f_hispanic_df, buffer_df])
org_f_hispanic_df_rec = org_f_hispanic_df.loc[org_f_hispanic_df['Two_yr_Recidivism'] == 1]
org_f_hispanic_df_nor = org_f_hispanic_df.loc[org_f_hispanic_df['Two_yr_Recidivism'] == 0]
print(len(org_f_hispanic_df),len(org_f_hispanic_df_rec), len(org_f_hispanic_df_nor))
print(100/len(org_f_hispanic_df)*len(org_f_hispanic_df_rec))

108 52 56
48.148148148148145
#####
8
#####
116 60 56
51.72413793103448


In [19]:
# Native balancing
org_f_native_df_rec = org_f_native_df.loc[org_f_native_df['Two_yr_Recidivism'] == 1]
org_f_native_df_nor = org_f_native_df.loc[org_f_native_df['Two_yr_Recidivism'] == 0]
print(len(org_f_native_df), len(org_f_native_df_rec), len(org_f_native_df_nor))
print(100/len(org_f_native_df)*len(org_f_native_df_rec))
print('#####')



4 4 0
100.0
#####


In [20]:
# Other balancing
org_f_other_df_rec = org_f_other_df.loc[org_f_other_df['Two_yr_Recidivism'] == 1]
org_f_other_df_nor = org_f_other_df.loc[org_f_other_df['Two_yr_Recidivism'] == 0]
print(len(org_f_other_df), len(org_f_other_df_rec), len(org_f_other_df_nor))
print(100/len(org_f_other_df)*len(org_f_other_df_rec))
print('#####')

buffer_df = org_f_other_df_rec.drop(org_f_other_df_rec.sample(frac=.24).index)
print(len(buffer_df))
print('#####')

org_f_other_df = pd.concat([org_f_other_df, buffer_df, buffer_df])
org_f_other_df_rec = org_f_other_df.loc[org_f_other_df['Two_yr_Recidivism'] == 1]
org_f_other_df_nor = org_f_other_df.loc[org_f_other_df['Two_yr_Recidivism'] == 0]
print(len(org_f_other_df), len(org_f_other_df_rec), len(org_f_other_df_nor))
print(100/len(org_f_other_df)*len(org_f_other_df_rec))

69 22 47
31.884057971014492
#####
12
#####
93 46 47
49.46236559139785


In [21]:
# Concat everything together

ndf = pd.concat([org_m_caucasian_df,
                 org_f_caucasian_df,
                 org_m_afroam_df,
                 org_f_afroam_df,
                 org_m_asian_df,
                 org_f_asian_df,
                 org_m_hispanic_df,
                 org_f_hispanic_df,
                 org_m_native_df,
                 org_f_native_df,
                 org_m_other_df,
                 org_f_other_df])

In [22]:
total = len(ndf)
male_df = ndf.loc[ndf['Female'] == 0]
total_male = len(male_df)
female_df = ndf.loc[ndf['Female'] == 1]
total_female = len(female_df)
print(total, total_male, total_female)
print(total_male, len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]), len(male_df.loc[male_df['Two_yr_Recidivism'] == 0]))
print(total_female, len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]), len(female_df.loc[female_df['Two_yr_Recidivism'] == 0]))
print('Male Rec Rate', 100/total_male*len(male_df.loc[male_df['Two_yr_Recidivism'] == 1]))
print('Female Rec Rate', 100/total_female*len(female_df.loc[female_df['Two_yr_Recidivism'] == 1]))

7442 5733 1709
5733 2840 2893
1709 858 851
Male Rec Rate 49.53776382347811
Female Rec Rate 50.204798127559975


In [23]:
# export to csv
df.to_csv('../data/balanced_data.csv', index=False)