# Exploratory Data Analysis: COMPAS Recidivism Data

This notebook analyzes the COMPAS (Correctional Offender Management Profiling for Alternative Sanctions) algorithm data, focusing on potential racial bias in recidivism predictions.

## Context

COMPAS is used by judges and parole officers to score criminal defendants' likelihood of reoffending. ProPublica's analysis revealed potential racial bias in the algorithm's predictions.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options: show every column of wide frames, but cap printed rows
# so a large frame does not flood the cell output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set style for visualizations.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer ship the plain 'seaborn' alias, so use whichever
# name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('deep')

## Loading the Datasets

We have multiple datasets available:
1. compas-scores-raw.csv - Raw COMPAS scores
2. cox-violent-parsed.csv - Parsed violent recidivism data
3. cox-violent-parsed_filt.csv - Filtered violent recidivism data
4. propublica_data_for_fairml.csv - Simplified dataset for fairness analysis

In [None]:
# Read the four CSV files into DataFrames (paths are relative to the notebook's
# working directory).
compas_raw = pd.read_csv('dataset/compas-scores-raw.csv')
violent_parsed = pd.read_csv('dataset/cox-violent-parsed.csv')
violent_filtered = pd.read_csv('dataset/cox-violent-parsed_filt.csv')
fairml_data = pd.read_csv('dataset/propublicaCompassRecividism_data_fairml.csv/propublica_data_for_fairml.csv')

# Report (rows, columns) for each frame as a quick sanity check on the load.
for shape_label, loaded_frame in [
    ("\nCompas Raw Shape:", compas_raw),
    ("Violent Parsed Shape:", violent_parsed),
    ("Violent Filtered Shape:", violent_filtered),
    ("FairML Data Shape:", fairml_data),
]:
    print(shape_label, loaded_frame.shape)

## Initial Data Exploration

In [None]:
# List the column names of every loaded dataset, one heading per dataset.
for columns_heading, dataset in (
    ("\nCompas Raw Columns:", compas_raw),
    ("\nViolent Parsed Columns:", violent_parsed),
    ("\nViolent Filtered Columns:", violent_filtered),
    ("\nFairML Data Columns:", fairml_data),
):
    print(columns_heading)
    print(list(dataset.columns))

In [None]:
# Basic statistics for the raw COMPAS scores.
# The heading is printed first; describe() is left as the last expression of
# the cell so its result is rendered as the cell's rich output. For a frame
# with mixed dtypes, describe() summarises only the numeric columns by default.
print("\nCompas Raw Summary Statistics:")
compas_raw.describe()

## Racial Bias Analysis

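Let's analyze the racial disparities in COMPAS predictions vs. actual recidivism. A `Medium` or `High` risk level is treated as a prediction that the defendant will reoffend, and `Low` as a prediction that they will not; for each racial group we then compare the false positive rate, the false negative rate, and the overall accuracy.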

In [None]:
# Function to calculate recidivism prediction accuracy by race
def analyze_racial_bias(df, race_col, score_col, recid_col,
                        positive_labels=('Medium', 'High'),
                        negative_labels=('Low',)):
    """Compute per-group confusion-matrix metrics for a categorical risk score.

    A score in ``positive_labels`` counts as "predicted to reoffend" and a
    score in ``negative_labels`` as "predicted not to reoffend". The outcome
    column is compared against 1 (reoffended) and 0 (did not reoffend). Rows
    whose score or outcome matches neither side are left out of every count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    race_col : str
        Column holding the group label. Rows with a missing value are skipped.
    score_col : str
        Column holding the risk-level label.
    recid_col : str
        Column holding the observed outcome (1 / 0).
    positive_labels : iterable, default ('Medium', 'High')
        Score values treated as a positive prediction.
    negative_labels : iterable, default ('Low',)
        Score values treated as a negative prediction.

    Returns
    -------
    pandas.DataFrame
        One row per group, in order of first appearance, with the columns
        'Race', 'Total', 'False Positive Rate', 'False Negative Rate' and
        'Accuracy'. The columns are present even when there are no groups.
        A rate whose denominator is zero is reported as 0.
    """
    metric_columns = ['Race', 'Total', 'False Positive Rate',
                      'False Negative Rate', 'Accuracy']

    # Build each boolean mask once for the whole frame instead of re-filtering
    # every per-race slice four times.
    predicted_recid = df[score_col].isin(list(positive_labels))
    predicted_no_recid = df[score_col].isin(list(negative_labels))
    did_recid = df[recid_col] == 1
    did_not_recid = df[recid_col] == 0

    results = []
    # dropna(): a missing race never satisfies `== race`, so keeping it would
    # only add a meaningless all-zero row to the result.
    for race in df[race_col].dropna().unique():
        in_group = df[race_col] == race

        # True negatives (correctly predicted no recidivism)
        tn = int((in_group & predicted_no_recid & did_not_recid).sum())
        # False positives (incorrectly predicted recidivism)
        fp = int((in_group & predicted_recid & did_not_recid).sum())
        # True positives (correctly predicted recidivism)
        tp = int((in_group & predicted_recid & did_recid).sum())
        # False negatives (incorrectly predicted no recidivism)
        fn = int((in_group & predicted_no_recid & did_recid).sum())

        # Calculate metrics; each guard avoids a ZeroDivisionError for groups
        # that have no rows in the relevant outcome class.
        total = tn + fp + tp + fn
        false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
        false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
        accuracy = (tp + tn) / total if total > 0 else 0

        results.append({
            'Race': race,
            'Total': total,
            'False Positive Rate': false_positive_rate,
            'False Negative Rate': false_negative_rate,
            'Accuracy': accuracy
        })

    # Passing the column list keeps the schema stable for empty input, so
    # callers can index the metric columns unconditionally.
    return pd.DataFrame(results, columns=metric_columns)

In [None]:
# Bucket the decile score into three risk levels. pd.cut intervals are
# right-closed, so (-inf, 4] -> Low, (4, 7] -> Medium, (7, inf) -> High.
# NOTE(review): assumes compas_raw exposes 'decile_score', 'race' and
# 'is_recid' columns — confirm against the column listing printed earlier.
risk_levels = pd.cut(
    compas_raw['decile_score'],
    bins=[-np.inf, 4, 7, np.inf],
    labels=['Low', 'Medium', 'High'],
)
compas_raw['score_text'] = risk_levels

# Per-race confusion-matrix metrics for the bucketed score
racial_bias_analysis = analyze_racial_bias(
    compas_raw, 'race', 'score_text', 'is_recid'
)

print("Racial Bias Analysis:")
display(racial_bias_analysis)

# Grouped bar chart: one cluster per race, one bar per metric
bar_width = 0.25
index = np.arange(len(racial_bias_analysis))

fig, ax = plt.subplots(figsize=(12, 6))
for slot, metric in enumerate(['False Positive Rate', 'False Negative Rate', 'Accuracy']):
    # Shift each metric one bar-width to the right of the previous one
    ax.bar(index + slot * bar_width, racial_bias_analysis[metric], bar_width, label=metric)

ax.set_xlabel('Race')
ax.set_ylabel('Rate')
ax.set_title('COMPAS Prediction Metrics by Race')
# Centre the tick under the middle bar of each three-bar cluster
ax.set_xticks(index + bar_width)
ax.set_xticklabels(racial_bias_analysis['Race'], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

## Analysis of Violent Recidivism

In [None]:
# Analyze violent recidivism predictions: per-race metrics computed from the
# violent-risk level ('v_score_text') against the observed violent
# re-offence flag ('is_violent_recid').
violent_analysis = analyze_racial_bias(violent_parsed,
                                      'race',
                                      'v_score_text',
                                      'is_violent_recid')

print("Violent Recidivism Analysis:")
display(violent_analysis)

# Visualize violent recidivism analysis as a grouped bar chart.
# bar_width is defined here so this cell does not depend on a variable that
# only exists if an earlier plotting cell has already been run.
bar_width = 0.25
index = np.arange(len(violent_analysis))

fig, ax = plt.subplots(figsize=(12, 6))
for slot, metric in enumerate(['False Positive Rate', 'False Negative Rate', 'Accuracy']):
    # Shift each metric one bar-width to the right of the previous one
    ax.bar(index + slot * bar_width, violent_analysis[metric], bar_width, label=metric)

ax.set_xlabel('Race')
ax.set_ylabel('Rate')
ax.set_title('Violent Recidivism Prediction Metrics by Race')
# Centre the tick under the middle bar of each three-bar cluster
ax.set_xticks(index + bar_width)
ax.set_xticklabels(violent_analysis['Race'], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

## Feature Importance Analysis

In [None]:
# Using the FairML dataset for feature importance analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Resolve the outcome column by name instead of assuming one spelling.
# NOTE(review): the published FairML CSV is believed to call the outcome
# 'Two_yr_Recidivism' rather than 'two_year_recid' — confirm against the
# column listing printed earlier. Both spellings are accepted here.
target_candidates = ['two_year_recid', 'Two_yr_Recidivism']
target_col = next((col for col in target_candidates if col in fairml_data.columns), None)
if target_col is None:
    raise KeyError(
        f"None of the expected target columns {target_candidates} found in "
        f"fairml_data; available columns: {fairml_data.columns.tolist()}"
    )

# Prepare the data: every column except the outcome is used as a feature
feature_cols = [col for col in fairml_data.columns if col != target_col]
X = fairml_data[feature_cols].copy()  # copy so the encoding below leaves fairml_data untouched
y = fairml_data[target_col]

# Encode categorical variables as integer codes; fit_transform refits the
# encoder for each column, so the codes are independent per column
le = LabelEncoder()
for col in X.select_dtypes(include=['object']).columns:
    X[col] = le.fit_transform(X[col])

# Train a random forest model (fixed random_state for reproducible importances)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance. These are the forest's impurity-based importances,
# computed on the same data the model was fitted on.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Visualize feature importance, most important feature first
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance in Predicting Recidivism')
plt.tight_layout()
plt.show()