## Project 3

Dataset: COMPAS

In [2]:
import pandas as pd

compas_data = pd.read_csv("compas-scores-two-years.csv")
print(compas_data.shape)
print(compas_data.columns)

(7214, 53)
Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


In [3]:
compas_data.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Select relevant columns for features and target
features = compas_data[['sex', 'age', 'race', 'priors_count', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'decile_score']]
target = compas_data['two_year_recid']

# Convert categorical variables (e.g., sex, race) to numeric
features = pd.get_dummies(features, columns=['sex', 'race'], drop_first=True)

# Handle missing values if any
features = features.fillna(0)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report


(0.6822170900692841,
 '              precision    recall  f1-score   support\n\n           0       0.69      0.77      0.73      1207\n           1       0.67      0.57      0.61       958\n\n    accuracy                           0.68      2165\n   macro avg       0.68      0.67      0.67      2165\nweighted avg       0.68      0.68      0.68      2165\n')

In [None]:
import numpy as np

# Add predictions and demographic data to a results DataFrame for analysis
results = X_test.copy()
results['actual'] = y_test
results['predicted'] = y_pred

# Add back original demographic data
results['race'] = compas_data.loc[results.index, 'race']
results['sex'] = compas_data.loc[results.index, 'sex']

# Define a function to calculate TPR and FPR for groups
def calculate_group_metrics(data, group_column, actual='actual', predicted='predicted'):
    metrics = []
    for group in data[group_column].unique():
        group_data = data[data[group_column] == group]
        tp = np.sum((group_data[actual] == 1) & (group_data[predicted] == 1))
        fn = np.sum((group_data[actual] == 1) & (group_data[predicted] == 0))
        fp = np.sum((group_data[actual] == 0) & (group_data[predicted] == 1))
        tn = np.sum((group_data[actual] == 0) & (group_data[predicted] == 0))
        
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # True Positive Rate
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        
        metrics.append({'Group': group, 'TPR': tpr, 'FPR': fpr, 'Size': len(group_data)})
    
    return pd.DataFrame(metrics)

# Calculate metrics for race and sex
race_metrics = calculate_group_metrics(results, 'race')
sex_metrics = calculate_group_metrics(results, 'sex')



race_metrics, sex_metrics


(              Group       TPR       FPR  Size
 0         Caucasian  0.489130  0.178879   740
 1  African-American  0.653448  0.298872  1112
 2             Other  0.261905  0.151899   121
 3          Hispanic  0.310345  0.144000   183
 4   Native American  1.000000  0.500000     3
 5             Asian  0.000000  0.200000     6,
     Group       TPR       FPR  Size
 0    Male  0.606577  0.259768  1768
 1  Female  0.335766  0.107692   397)

In [6]:
from tabulate import tabulate

# Display as a formatted table
print("Race-Wise Metrics")
print(tabulate(race_metrics, headers="keys", tablefmt="pretty"))

print("\nSex-Wise Metrics")
print(tabulate(sex_metrics, headers="keys", tablefmt="pretty"))


Race-Wise Metrics
+---+------------------+--------------------+---------------------+------+
|   |      Group       |        TPR         |         FPR         | Size |
+---+------------------+--------------------+---------------------+------+
| 0 |    Caucasian     | 0.4891304347826087 | 0.1788793103448276  | 740  |
| 1 | African-American | 0.653448275862069  | 0.29887218045112784 | 1112 |
| 2 |      Other       | 0.2619047619047619 | 0.1518987341772152  | 121  |
| 3 |     Hispanic     | 0.3103448275862069 |        0.144        | 183  |
| 4 | Native American  |        1.0         |         0.5         |  3   |
| 5 |      Asian       |        0.0         |         0.2         |  6   |
+---+------------------+--------------------+---------------------+------+

Sex-Wise Metrics
+---+--------+--------------------+---------------------+------+
|   | Group  |        TPR         |         FPR         | Size |
+---+--------+--------------------+---------------------+------+
| 0 |  Male  | 0.60