# Analysis with the `Race` column

In [130]:
import pandas as pd

# Aequitas

In [96]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')

raw_data = pd.read_csv("./compas-scores-two-years.csv")
print(f"Total number of rows in raw data: {raw_data.shape[0]}")

df = raw_data[['age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex', 
               'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 
               'two_year_recid', 'c_jail_in', 'c_jail_out', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'r_charge_degree', 'is_violent_recid', 'vr_charge_degree', 'v_score_text']]

df = df[(df['days_b_screening_arrest'] <= 30) &
        (df['days_b_screening_arrest'] >= -30) &
        (df['is_recid'] != -1) &
        (df['c_charge_degree'] != "O") &
        (df['score_text'] != 'N/A')]

df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
df['length_of_stay'] = (df['c_jail_out'] - df['c_jail_in']).dt.days


o_df = df.copy() 

df = pd.get_dummies(df, columns=['c_charge_degree', 'age_cat', 'race', 'sex'])

df['c_charge_degree'] = o_df['c_charge_degree']
df['age_cat'] = o_df['age_cat']
df['race'] = o_df['race']
df['sex'] = o_df['sex']


df['score_factor'] = np.where(df['score_text'] != "Low", 1, 0) 

print("Data columns after get_dummies:", df.columns.tolist())

X_columns = [
    'sex_Male', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'age_cat_25 - 45',
    'race_African-American', 'race_Asian', 'race_Hispanic', 'race_Native American', 
    'race_Other', 'priors_count', 'two_year_recid', 'length_of_stay'
]

X = df[X_columns]
y = df['score_factor']

scaler = StandardScaler()
X[['priors_count', 'length_of_stay']] = scaler.fit_transform(X[['priors_count', 'length_of_stay']])

log_reg = LogisticRegression()
log_reg.fit(X, y)

y_pred = log_reg.predict(X)
df['predicted_score'] = y_pred


# save the model to disk
df.to_csv('compas_with_predictions.csv', index=False)


Total number of rows in raw data: 7214
Data columns after get_dummies: ['age', 'score_text', 'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 'two_year_recid', 'c_jail_in', 'c_jail_out', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'r_charge_degree', 'is_violent_recid', 'vr_charge_degree', 'v_score_text', 'length_of_stay', 'c_charge_degree_F', 'c_charge_degree_M', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'race_African-American', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other', 'sex_Female', 'sex_Male', 'c_charge_degree', 'age_cat', 'race', 'sex', 'score_factor']


In [65]:
df

This cell creates a binary **score_factor** column where 1 indicates a high or medium score, and 0 indicates a low score. It also displays the column names to confirm dummy variable generation.

In [139]:
# Create a binary target variable for the logistic regression model
df['score_factor'] = np.where(df['score_text'] != "Low", 1, 0)  # HighScore=1, LowScore=0

# Display the final list of columns after dummy variable encoding
print("Data columns after get_dummies:", df.columns.tolist())


This cell selects feature columns based on expected dummy variables. It checks if any expected columns are missing, which helps avoid errors if certain categories were not created.

In [140]:
# Define feature columns for logistic regression based on actual dummy variable names
X_columns = [
    'sex_Male', 'age_cat_Greater than 45', 'age_cat_Less than 25', 
    'race_African-American', 'race_Asian', 'race_Hispanic', 'race_Native American', 
    'race_Other', 'priors_count', 'two_year_recid'
]

# Check for missing expected columns
missing_columns = [col for col in X_columns if col not in df.columns]
if missing_columns:
    print("Warning: The following expected columns are missing:", missing_columns)
else:
    X = df[X_columns]
    y = df['score_factor']


This cell standardizes **priors_count** to improve logistic regression model performance, ensuring itâ€™s on a comparable scale with other features.

In [141]:
# Standardize the 'priors_count' feature for better logistic regression performance
scaler = StandardScaler()
X[['priors_count']] = scaler.fit_transform(X[['priors_count']])


This cell trains the logistic regression model on the dataset and prints a classification report to evaluate performance on the binary classification task.

In [142]:
# Fit Logistic Regression Model and print classification report
log_reg = LogisticRegression()
log_reg.fit(X, y)
y_pred = log_reg.predict(X)
print(classification_report(y, y_pred))


This cell calculates and displays odds ratios for each feature in the logistic regression model, offering insight into how different factors (e.g., race, age) affect the likelihood of receiving a high COMPAS score.

In [143]:
# Calculate and display odds ratios for model coefficients
odds_ratios = np.exp(log_reg.coef_).flatten()
odds_ratios_dict = {feature: odds_ratio for feature, odds_ratio in zip(X.columns, odds_ratios)}
print("\nOdds Ratios for significant factors:")
print(f"Black defendants are {odds_ratios_dict.get('race_African-American', 'N/A')} times more likely to receive a higher score.")
print(f"Women are {odds_ratios_dict.get('sex_Male', 'N/A')} times less likely to receive a higher score.")
print(f"People under 25 are {odds_ratios_dict.get('age_cat_Less than 25', 'N/A')} times as likely to receive a higher score as middle-aged defendants.")


# Analysis without the `race` column

## Data processing

In [144]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Ignore warnings
warnings.filterwarnings('ignore')

# Load and filter the dataset
raw_data = pd.read_csv("./compas-scores-two-years.csv")
df = raw_data[['age', 'c_charge_degree', 'age_cat', 'score_text', 'sex', 
               'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid', 
               'two_year_recid', 'c_jail_in', 'c_jail_out']]

# Filter the data based on the specified conditions
df = df[(df['days_b_screening_arrest'] <= 30) &
        (df['days_b_screening_arrest'] >= -30) &
        (df['is_recid'] != -1) &
        (df['c_charge_degree'] != "O") &
        (df['score_text'] != 'N/A')]

# Calculate length of stay in jail
df['c_jail_in'] = pd.to_datetime(df['c_jail_in'])
df['c_jail_out'] = pd.to_datetime(df['c_jail_out'])
df['length_of_stay'] = (df['c_jail_out'] - df['c_jail_in']).dt.days


## Feature Engineering withour the `race` column

In [145]:
# Create dummy variables for categorical columns except for 'race'
df = pd.get_dummies(df, columns=['c_charge_degree', 'age_cat', 'sex'], drop_first=True)

# Create a binary target variable for the logistic regression model
df['score_factor'] = np.where(df['score_text'] != "Low", 1, 0)  # HighScore=1, LowScore=0


## Define Features and Fit Logistic Regression Model

In [None]:
X_columns = ['sex_Male', 'age_cat_Greater than 45', 'age_cat_Less than 25', 
             'priors_count', 'two_year_recid', 'length_of_stay']
X = df[X_columns]
y = df['score_factor']

scaler = StandardScaler()
X[['priors_count', 'length_of_stay']] = scaler.fit_transform(X[['priors_count', 'length_of_stay']])

log_reg = LogisticRegression()
log_reg.fit(X, y)

y_pred = log_reg.predict(X)
print(classification_report(y, y_pred))

# Add the predicted scores to the filtered DataFrame (df) with matching index
df['predicted_score'] = y_pred


## Analyze Race Distribution by Risk Category

In [147]:
# Add the predicted scores to the filtered DataFrame (df) with matching index
df['predicted_score'] = y_pred

# Extract the 'race' column for only the rows in 'df' (using the same index)
df['race'] = raw_data.loc[df.index, 'race']

# Now, we proceed to analyze the race distribution within each predicted score
race_distribution = df.groupby(['predicted_score', 'race']).size().unstack().fillna(0)

# Print the race distribution by predicted risk score to verify
print("\nRace distribution by predicted risk score:")
print(race_distribution)

# Plotting the race distribution as a stacked bar chart
race_distribution.T.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='viridis')

# Adding labels and title for clarity
plt.title("Race Distribution by Predicted Risk Score")
plt.xlabel("Predicted Risk Score")
plt.ylabel("Count")
plt.legend(title="Race", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


# Aequitas

## Columns to use

- sex
- age_cat
- race
- juv_fail_count
- decile_score
- juv_misd_count
- juv_other_count
- days_b_screening_arrest
- c_charge_degree
- is_recid
- r_charge_degree
- is_violent_recid
- vr_charge_degree
- score_text
- v_score_text
- two_year_recid

In [148]:
from aequitas import Audit
import aequitas.plot as ap

In [None]:

# DROPING days_b_screening_arrest, TO BE DISCUSSED

columns_to_select = [
    "sex", "age_cat", "race", "juv_fel_count", "decile_score", "juv_misd_count", "juv_other_count", 
    "c_charge_degree", "is_recid", "r_charge_degree", "is_violent_recid", 
    "vr_charge_degree", "score_text", "v_score_text", "two_year_recid", "days_b_screening_arrest",
    "score_factor", "predicted_score"
]


new_df = df[columns_to_select]



# add df['score_factor'] and df['predicted_score'] to new_df




In [69]:
new_df

In [83]:
# drop rows with missing values
new_df = new_df.dropna()

In [84]:
new_df = new_df.drop(columns=['days_b_screening_arrest'])

In [85]:
# convert columns to categorical
for column in new_df.columns:
    new_df[column] = new_df[column].astype('object')
    
# convert target columns to numerical

new_df['score_factor'] = new_df['score_factor'].astype('int')

In [88]:
# select a row where race = Caucasian, sex = Male, age_cat = 25-45, recidivism = no, c_charge_degree = F, decile_score = 5, juv_fel_count = 0, juv_misd_count = 0, juv_other_count = 0, r_charge_degree = F, vr_charge_degree = F, score_text = Low, v_score_text = Low, two_year_recid = 0
# this row will be used as reference group
reference = new_df.loc[(new_df['race'] == 'Caucasian') & (new_df['sex'] == 'Male') & (new_df['age_cat'] == '25 - 45') & (new_df['score_text'] == 'Low')]
                       
                       
reference = reference.iloc[0]
reference.drop(['predicted_score', 'score_factor'], inplace=True)

reference = reference.to_dict()
reference

In [90]:

audit = Audit(new_df, 
              label_column="predicted_score", 
              score_column="score_factor", 
              sensitive_attribute_column=["sex", "age_cat", "race", "is_recid", "is_violent_recid", "c_charge_degree", "decile_score", "juv_fel_count", "juv_misd_count", "juv_other_count", "r_charge_degree", "vr_charge_degree", "score_text", "v_score_text", "two_year_recid"],
              reference_groups=reference)
audit.audit()

In [91]:
audit.confusion_matrix

In [93]:
audit.metrics.round(2)

In [94]:
metrics = ['fpr','fdr']
disparity_tolerance = 1.25

In [80]:
audit.disparities.style

Unnamed: 0_level_0,Unnamed: 1_level_0,ppr_disparity,pprev_disparity,precision_disparity,fdr_disparity,for_disparity,fpr_disparity,fnr_disparity,tpr_disparity,tnr_disparity,npv_disparity
attribute_name,attribute_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
sex,Female,0.121588,0.904497,0.943397,1.305475,0.537662,0.858001,0.780952,1.045098,1.065299,1.236231
sex,Male,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
age_cat,25 - 45,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
age_cat,Greater than 45,0.109489,0.567474,0.61435,2.686275,0.470894,0.795564,2.007722,0.832799,1.091458,1.171727
age_cat,Less than 25,0.540146,1.240876,1.170585,0.254107,3.639883,2.058824,1.332712,0.944797,0.526316,0.143196
race,African-American,3.467391,1.513931,1.231086,0.38069,2.842221,1.584,0.796964,1.051517,0.81039,0.593276
race,Asian,0.021739,1.347826,0.686567,1.84,5.529412,4.08,2.470588,0.626866,0.0,0.0
race,Caucasian,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
race,Hispanic,0.206522,1.01087,0.939513,1.162105,0.582043,1.064348,0.658824,1.086567,0.979108,1.092276
race,Native American,0.021739,2.021739,1.373134,0.0,,,0.0,1.253731,,


In [95]:
audit.disparity_plot(metrics=metrics, attribute='race', fairness_threshold=disparity_tolerance)