# COMPAS Dataset Analysis

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
# Load the data set into the frame used by the descriptive cells below
# (the path is resolved relative to the current working directory).
df = pd.read_csv(filepath_or_buffer='../data/data_with_caucasian.csv')
# Preview the first five rows.
df.head(n=5)

In [None]:
# Remove the `score_factor` column from the analysis frame.
# `drop(..., errors='ignore')` with re-assignment keeps this cell idempotent:
# the previous `df.pop(...)` raised KeyError when the cell was run a second
# time, and it echoed the entire popped column as the cell output.
df = df.drop(columns=['score_factor'], errors='ignore')

In [None]:
# Class balance of the target column: number of rows overall, rows with
# Two_yr_Recidivism == 1 and rows with Two_yr_Recidivism == 0.
total = df.shape[0]
did_rec = int((df['Two_yr_Recidivism'] == 1).sum())
did_not_rec = int((df['Two_yr_Recidivism'] == 0).sum())

print(total, did_rec, did_not_rec)

In [None]:
# Bar chart of the target-class balance (Two_yr_Recidivism == 1 vs == 0).
fig, ax = plt.subplots()

outcome_labels = ['Rückfällig', 'nicht Rückfällig']
# Bar heights are computed from `df` instead of being typed in by hand, so the
# figure cannot drift from the data (and does not depend on `did_rec` /
# `did_not_rec`, which later cells re-assign for sub-populations).
counts = [
    int((df['Two_yr_Recidivism'] == 1).sum()),
    int((df['Two_yr_Recidivism'] == 0).sum()),
]
bar_colors = ['tab:red', 'tab:green']

# No `label=` argument: the former labels ('red', 'green') were never shown
# because the figure has no legend.
ax.bar(outcome_labels, counts, color=bar_colors)

ax.set_ylabel('Anzahl')
ax.set_title('Balance Dataset')

# Render the figure explicitly; this also keeps the repr of the last
# expression out of the cell output.
plt.show()

In [None]:
# Share of rows with Two_yr_Recidivism == 1, in percent of `total`.
share_rec = 100/total*did_rec
print('Rückfällig %', share_rec)

In [None]:
# Gender ratio: rows with Female == 0 are counted as male, Female == 1 as female.
total = df.shape[0]
male = int((df['Female'] == 0).sum())
female = int((df['Female'] == 1).sum())

print(total, male, female)
# Percentage of male rows relative to all rows.
print('Male %', 100/total*male)

In [None]:
# Split the frame on the `Female` indicator; both subsets are reused by the
# male / female analysis sections further down.
df_male = df[df['Female'] == 0]
df_female = df[df['Female'] == 1]

## General Race Analysis

In [None]:
# Reset the counters to the full data set (they are re-assigned per
# sub-population in later sections).
total = df.shape[0]
did_rec = int((df['Two_yr_Recidivism'] == 1).sum())
did_not_rec = int((df['Two_yr_Recidivism'] == 0).sum())

print(total, did_rec, did_not_rec)
print('Rückfällig all %', 100/total*did_rec)

In [None]:
# One sub-frame per race indicator column, taken from the full data set.
white = df[df['Caucasian'] == 1]
black = df[df['African_American'] == 1]
hispanic = df[df['Hispanic'] == 1]
asian = df[df['Asian'] == 1]
native = df[df['Native_American'] == 1]
other = df[df['Other'] == 1]

print('Races split')
# Per group: row count and its percentage of `total`.
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    print(race_name, len(race_df), 100/total*len(race_df))

In [None]:
print('% of positive labels per race all')
# Per group: number of rows with Two_yr_Recidivism == 1 and that number as a
# percentage of `total`.
# NOTE(review): the denominator is `total` (all rows of the current subset),
# not the size of the race group and not `did_rec` -- confirm that this is the
# intended meaning of "per race".
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    n_pos = len(race_df[race_df['Two_yr_Recidivism'] == 1])
    print(race_name, n_pos, 100/total*n_pos)

## Male Analysis

In [None]:
# Counters restricted to the male subset (`df_male`).
total = df_male.shape[0]
did_rec = int((df_male['Two_yr_Recidivism'] == 1).sum())
did_not_rec = int((df_male['Two_yr_Recidivism'] == 0).sum())

print(total, did_rec, did_not_rec)
print('Rückfällig male %', 100/total*did_rec)

In [None]:
# One sub-frame per race indicator column, taken from the male subset.
white = df_male[df_male['Caucasian'] == 1]
black = df_male[df_male['African_American'] == 1]
hispanic = df_male[df_male['Hispanic'] == 1]
asian = df_male[df_male['Asian'] == 1]
native = df_male[df_male['Native_American'] == 1]
other = df_male[df_male['Other'] == 1]

print('Races split')
# Per group: row count and its percentage of `total`.
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    print(race_name, len(race_df), 100/total*len(race_df))

In [None]:
print('% of positive labels per race male')
# Per group: number of rows with Two_yr_Recidivism == 1 and that number as a
# percentage of `total`.
# NOTE(review): the denominator is `total` (all rows of the current subset),
# not the size of the race group and not `did_rec` -- confirm that this is the
# intended meaning of "per race".
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    n_pos = len(race_df[race_df['Two_yr_Recidivism'] == 1])
    print(race_name, n_pos, 100/total*n_pos)

## Female Analysis

In [None]:
# Counters restricted to the female subset (`df_female`).
total = df_female.shape[0]
did_rec = int((df_female['Two_yr_Recidivism'] == 1).sum())
did_not_rec = int((df_female['Two_yr_Recidivism'] == 0).sum())

print(total, did_rec, did_not_rec)
print('Rückfällig female %', 100/total*did_rec)

In [None]:
# One sub-frame per race indicator column, taken from the female subset.
white = df_female[df_female['Caucasian'] == 1]
black = df_female[df_female['African_American'] == 1]
hispanic = df_female[df_female['Hispanic'] == 1]
asian = df_female[df_female['Asian'] == 1]
native = df_female[df_female['Native_American'] == 1]
other = df_female[df_female['Other'] == 1]

print('Races split')
# Per group: row count and its percentage of `total`.
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    print(race_name, len(race_df), 100/total*len(race_df))

In [None]:
print('% of positive labels per race female')
# Per group: number of rows with Two_yr_Recidivism == 1 and that number as a
# percentage of `total`.
# NOTE(review): the denominator is `total` (all rows of the current subset),
# not the size of the race group and not `did_rec` -- confirm that this is the
# intended meaning of "per race".
for race_name, race_df in (
    ('white', white),
    ('black', black),
    ('hispanic', hispanic),
    ('asian', asian),
    ('native', native),
    ('other', other),
):
    n_pos = len(race_df[race_df['Two_yr_Recidivism'] == 1])
    print(race_name, n_pos, 100/total*n_pos)

## Logistic Regression

In [None]:
# Fit a logistic regression for Two_yr_Recidivism and print one
# (feature, coefficient) pair per predictor.
df_lr = pd.read_csv('../data/data_with_caucasian.csv')

# Predictors, in the order in which their coefficients are reported.
# `score_factor` is not part of the list; it is popped from the frame below.
feature_names = ['Number_of_Priors','Age_Above_FourtyFive','Age_Below_TwentyFive','Caucasian','African_American','Asian','Hispanic','Native_American','Other','Female','Misdemeanor']

z = df_lr.pop('score_factor')
y = df_lr.pop('Two_yr_Recidivism')
# Select the predictors explicitly and in `feature_names` order.
# `model.coef_[0]` follows the column order of X, so pairing it with a
# hand-written name list is only correct when X has exactly these columns in
# exactly this order. Passing the raw frame silently mislabelled (or
# truncated) the coefficients whenever the CSV layout differed; a missing
# column now raises KeyError instead.
X = df_lr[feature_names]

model = LogisticRegression(random_state=129).fit(X, y)
result = list(zip(feature_names, model.coef_[0]))

for r in result:
    print(r)

In [None]:
# Back to the full data set: keep both the outcome sub-frames and their sizes.
total = df.shape[0]
df_did_rec = df[df['Two_yr_Recidivism'] == 1]
df_did_not_rec = df[df['Two_yr_Recidivism'] == 0]
did_rec = len(df_did_rec)
did_not_rec = len(df_did_not_rec)

In [None]:
# Age breakdown of the rows with Two_yr_Recidivism == 1, based on the two age
# indicator columns:
#   did_age_45 -> Age_Above_FourtyFive == 1
#   did_age_18 -> Age_Below_TwentyFive == 1
#   did_age_25 -> neither indicator set (the band in between)
did_age_45 = df_did_rec.loc[df_did_rec['Age_Above_FourtyFive'] == 1]
did_age_18 = df_did_rec.loc[df_did_rec['Age_Below_TwentyFive'] == 1]
did_age_25 = df_did_rec.loc[(df_did_rec['Age_Below_TwentyFive'] == 0) & (df_did_rec['Age_Above_FourtyFive'] == 0)]

# Per band: row count and its percentage of all rows with a positive label.
# The first label used to read '<45' although the group is selected via
# Age_Above_FourtyFive, i.e. the direction was inverted.
# NOTE(review): '25' and '18' look like lower bounds of their age bands --
# confirm the intended labelling.
print('>45', len(did_age_45), 100/did_rec*len(did_age_45))
print('25', len(did_age_25), 100/did_rec*len(did_age_25))
print('18', len(did_age_18), 100/did_rec*len(did_age_18))