In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [144]:
df = pd.read_csv("compas-scores-two-years.csv")
df

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,10996,steven butler,steven,butler,2013-11-23,Male,1992-07-17,23,Less than 25,African-American,...,5,Medium,2013-11-23,2013-11-22,2013-11-24,0,1,860,0,0
7210,10997,malcolm simmons,malcolm,simmons,2014-02-01,Male,1993-03-25,23,Less than 25,African-American,...,5,Medium,2014-02-01,2014-01-31,2014-02-02,0,1,790,0,0
7211,10999,winston gregory,winston,gregory,2014-01-14,Male,1958-10-01,57,Greater than 45,Other,...,1,Low,2014-01-14,2014-01-13,2014-01-14,0,0,808,0,0
7212,11000,farrah jean,farrah,jean,2014-03-09,Female,1982-11-17,33,25 - 45,African-American,...,2,Low,2014-03-09,2014-03-08,2014-03-09,3,0,754,0,0


In [145]:
print(df["c_charge_degree"])

0       F
1       F
2       F
3       F
4       F
       ..
7209    F
7210    F
7211    F
7212    M
7213    F
Name: c_charge_degree, Length: 7214, dtype: object


In [116]:
condition_met = (df["score_text"] != 'N/A')

# Filtern Sie den DataFrame basierend auf der Bedingung
filtered_df = df[condition_met]

# Überprüfen Sie die Anzahl der Zeilen im gefilterten DataFrame
if len(filtered_df) > 0:
    print("Die Bedingung wurde erfüllt. Es gibt Zeilen im DataFrame, die nicht 'N/A' in der Spalte 'score_text' enthalten.")
else:
    print("Die Bedingung wurde nicht erfüllt. Es gibt keine Zeilen im DataFrame, die nicht 'N/A' in der Spalte 'score_text' enthalten.")

Die Bedingung wurde erfüllt. Es gibt Zeilen im DataFrame, die nicht 'N/A' in der Spalte 'score_text' enthalten.


Inspo
- https://fairlens.readthedocs.io/en/latest/user_guide/compas.html
- This: https://afraenkel.github.io/fairness-book/content/04-compas.html and this together: https://github.com/propublica/compas-analysis
- https://www.kaggle.com/code/osocapo/fairness-analysis-for-binary-classification
- https://github.com/huynhkthai/RecidivismPredictionModel/blob/main/Project_2.ipynb
- https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
- https://www.kaggle.com/code/zeedofa/racial-disparities-in-compas
- https://github.com/shebna12/COMPAS-recidivism

Data cleaning

However not all of the rows are useable for the first round of analysis.

There are a number of reasons remove rows because of missing data:

- If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
- We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
- In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed (only two of them).
- We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility

In [131]:
# Check if any row meets the condition
# Check if any row does not meet the condition
if ~((df["days_b_screening_arrest"] <= 30) & (df["days_b_screening_arrest"] >= -30)).any():
    print("All rows have 'days_b_screening_arrest' between -30 and 30.")
else:
    print("There are rows where 'days_b_screening_arrest' is not between -30 and 30.")


There are rows where 'days_b_screening_arrest' is not between -30 and 30.


In [137]:
# Apply filtering to the DataFrame and reset the index
df = df[(df["days_b_screening_arrest"] <= 30) & (df["days_b_screening_arrest"] >= -30)].reset_index(drop=True)

In [139]:
# Check if any value in the column "days_b_screening_arrest" is not between -30 and 30
if ((df["days_b_screening_arrest"] > 30) | (df["days_b_screening_arrest"] < -30)).any():
    print("There are values in 'days_b_screening_arrest' that are not between -30 and 30.")
else:
    print("All values in 'days_b_screening_arrest' are between -30 and 30.")

All values in 'days_b_screening_arrest' are between -30 and 30.


In [141]:
print(df["c_charge_degree"])

0       F
1       F
2       F
3       M
4       F
       ..
6167    F
6168    F
6169    F
6170    M
6171    F
Name: c_charge_degree, Length: 6172, dtype: object


### Training a Model 

In [97]:
# Select the features to use
print(df.columns)
print(df['priors_count'].dtype)

Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')
int64


In [107]:
# Select the features to use
df = df[["sex", "age", "c_charge_desc", "c_charge_degree", "juv_misd_count", "juv_fel_count", "priors_count", "is_receid"]]
print(df.columns)

KeyError: "['is_receid'] not in index"

In [99]:
# Split the dataset into train and test
sp = int(len(df) * 0.8)
df_train = df[:sp].reset_index(drop=True)
df_test = df[sp:].reset_index(drop=True)

In [91]:
# Convert categorical columns to numerical columns using one-hot encoding
# Define preprocess function
def preprocess(df):
    X = df.copy()

    # One-hot encode 'sex' column
    X = pd.get_dummies(X, columns=["sex"], prefix="sex")
    
    # One-hot encode 'c_charge_degree' column
    X = pd.get_dummies(X, columns=["c_charge_degree"], prefix="c_charge_degree")

    X = pd.get_dummies(df, columns=['c_charge_desc'], prefix='charge_desc')

    y = df["is_recid"] 
    
    return X, y # return the modified DataFrame

# Assuming df is your original DataFrame
# Call the preprocess function and print the result
#preprocessed_df = preprocess(df)
#print(preprocessed_df)

In [89]:
# Print all unique values from the 'c_charge_desc' column
"""
unique_charge_desc = df['c_charge_desc'].unique()
for charge_desc in unique_charge_desc:
    print(charge_desc)
"""

"\nunique_charge_desc = df['c_charge_desc'].unique()\nfor charge_desc in unique_charge_desc:\n    print(charge_desc)\n"

In [92]:
df_train = df[:sp].reset_index(drop=True)

# Train a regressor
X, y = preprocess(df_train)
clf = LogisticRegression(random_state=0).fit(X, y)

ValueError: could not convert string to float: 'miguel hernandez'