## Project 3

Dataset: COMPAS

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [48]:
# Load the raw COMPAS two-year recidivism file, then report its dimensions
# and column names as a quick overview of what is available.
df = pd.read_csv("compas-scores-two-years.csv")
for overview in (df.shape, df.columns):
    print(overview)

(7214, 53)
Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


We will first fit a model that predicts whether a person will reoffend within the next two years. Because the target is binary, this is a classification problem.

Next steps:

* look at the data for bias
* build a classifier

Sensitive variables: race, sex

Target variable: two_year_recid

Features: age/age_cat, juv_fel_count, juv_misd_count, juv_other_count, priors_count, c_charge_degree, score_text, days_b_screening_arrest, decile_score, length_of_stay

## Twin test

Feature: prior convictions

In [49]:
# Keep only the sensitive attributes, the candidate features and the target.
# .copy() makes the selection an independent frame, so the column assignments
# below do not write into a slice of the raw data (SettingWithCopyWarning).
df = df[["race", "sex", "age_cat", "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree", "two_year_recid"]].copy()

# Drop rows whose charge degree is 'O'.
df = df[df["c_charge_degree"] != 'O'].copy()

# Binarise the juvenile counts: 0 = none, 1 = at least one.
for juvenile_column in ("juv_fel_count", "juv_misd_count", "juv_other_count"):
    df[juvenile_column] = np.where(df[juvenile_column] == 0, 0, 1)

# Bin the number of priors: 0-10 -> 0, 11-20 -> 1, more than 20 -> 2.
# All masks are computed from the original counts *before* anything is
# overwritten, so an already-assigned bin code can never be re-binned.
priors = df["priors_count"]
is_low = (priors >= 0) & (priors <= 10)
is_mid = (priors > 10) & (priors <= 20)
is_high = priors > 20
df.loc[is_low, "priors_count"] = 0
df.loc[is_mid, "priors_count"] = 1
df.loc[is_high, "priors_count"] = 2

# Charge degree: 'M' -> 0, anything else -> 1.
df["c_charge_degree"] = np.where(df["c_charge_degree"] == "M", 0, 1)

# Ordinal-encode the age category. .map is used instead of .replace because
# replacing strings with ints on an object column relies on pandas' deprecated
# silent downcasting. Any label not listed here becomes NaN.
df["age_cat"] = df["age_cat"].map({"Less than 25": 0, "25 - 45": 1, "Greater than 45": 2})

# Pool the small race groups into a single "Minority" category.
df["race"] = df["race"].replace({"Asian": "Minority", "Native American": "Minority", "Other": "Minority"})

# Two-letter group code: first letter of race + first letter of sex.
df["sensitive"] = df["race"].str[0] + df["sex"].str[0]

# NOTE: the default index=True is kept on purpose. It adds a leading unnamed
# column when the file is read back, and downstream positional column
# selection may depend on that layout.
df.to_csv("data_clean.csv")

df.head()

  df["age_cat"] = df["age_cat"].replace({"Less than 25": 0, "25 - 45": 1, "Greater than 45": 2})


In [80]:
import pandas as pd

def individual_fairness(df, charge_column, outcome_column, sensitive_column="sensitive"):
    """
    Tabulate outcome rates per sensitive group, conditioned on one feature.

    For every (sensitive group, feature value) pair, the rows are split by
    outcome value and each outcome's share of that pair is computed.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing `sensitive_column`, `charge_column` and `outcome_column`.
    charge_column : str
        Name of the feature column to condition on (e.g. the charge degree).
    outcome_column : str
        Name of the outcome column (e.g. actual or predicted recidivism).
    sensitive_column : str, default "sensitive"
        Name of the column holding the sensitive group label.

    Returns
    -------
    pd.DataFrame
        One row per (sensitive group, feature value, outcome value) with
        columns `count` (rows in that cell), `sum` (rows in the
        sensitive-group/feature-value pair) and `prob` (`count / sum`).
        The table is also printed.
    """
    condition_keys = [sensitive_column, charge_column]

    # Rows per (sensitive group, feature value, outcome value).
    df_summary = (
        df.groupby(condition_keys + [outcome_column])
        .size()
        .reset_index(name="count")
    )

    # Rows per (sensitive group, feature value), broadcast back to each row.
    df_summary["sum"] = df_summary.groupby(condition_keys)["count"].transform("sum")

    # Share of each outcome within its (sensitive group, feature value) pair.
    df_summary["prob"] = df_summary["count"] / df_summary["sum"]

    print(df_summary)

    return df_summary

In [113]:
# Reload the cleaned data and tabulate the observed recidivism rates per
# sensitive group, conditioned on a single feature.
# Other features to condition on: "priors_count", "age_cat",
# "juv_fel_count", "juv_other_count".
df = pd.read_csv('data_clean.csv')

df_summary = individual_fairness(
    df,
    charge_column="c_charge_degree",
    outcome_column='two_year_recid',
)


   sensitive  c_charge_degree  two_year_recid  count   sum      prob
0         AF                0               0    157   229  0.685590
1         AF                0               1     72   229  0.314410
2         AF                1               0    248   423  0.586288
3         AF                1               1    175   423  0.413712
4         AM                0               0    470   920  0.510870
5         AM                0               1    450   920  0.489130
6         AM                1               0    920  2124  0.433145
7         AM                1               1   1204  2124  0.566855
8         CF                0               0    190   259  0.733591
9         CF                0               1     69   259  0.266409
10        CF                1               0    178   308  0.577922
11        CF                1               1    130   308  0.422078
12        CM                0               0    459   715  0.641958
13        CM                0     

In [114]:
# Shape of the reloaded frame. The CSV round-trip adds the saved index as an
# extra leading column on top of the cleaned columns.
df.shape

(7214, 11)

## Group fairness

Here we condition only on the sensitive attribute (the combined race/sex group), without any additional feature.

In [88]:
import pandas as pd

def group_fairness(df, sensitive_column, outcome_column):
    """
    Tabulate outcome rates per sensitive group.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing `sensitive_column` and `outcome_column`.
    sensitive_column : str
        The name of the column representing the sensitive attribute.
    outcome_column : str
        The name of the column representing the outcome (e.g., recidivism).

    Returns
    -------
    pd.DataFrame
        One row per (sensitive group, outcome value) with columns `count`
        (rows in that cell), `sum` (rows in the sensitive group) and
        `prob` (`count / sum`). The table is also printed.
    """
    # Rows per (sensitive group, outcome value). size() counts rows directly,
    # so the function works for any `sensitive_column` and no longer needs a
    # column that is literally named "sensitive".
    df_summary = (
        df.groupby([sensitive_column, outcome_column])
        .size()
        .reset_index(name="count")
    )

    # Rows per sensitive group, broadcast back to each row.
    df_summary["sum"] = df_summary.groupby(sensitive_column)["count"].transform("sum")

    # Share of each outcome within its sensitive group.
    df_summary["prob"] = df_summary["count"] / df_summary["sum"]

    print(df_summary)

    return df_summary


In [108]:
# Reload the cleaned data and tabulate the observed outcome rates for each
# sensitive group.
df = pd.read_csv('data_clean.csv')
df_summary = group_fairness(df,"sensitive",'two_year_recid')

   sensitive  two_year_recid  count   sum      prob
0         AF               0    405   652  0.621166
1         AF               1    247   652  0.378834
2         AM               0   1390  3044  0.456636
3         AM               1   1654  3044  0.543364
4         CF               0    368   567  0.649030
5         CF               1    199   567  0.350970
6         CM               0   1120  1887  0.593535
7         CM               1    767  1887  0.406465
8         HF               0     70   103  0.679612
9         HF               1     33   103  0.320388
10        HM               0    335   534  0.627341
11        HM               1    199   534  0.372659
12        MF               0     54    73  0.739726
13        MF               1     19    73  0.260274
14        MM               0    221   354  0.624294
15        MM               1    133   354  0.375706


## Model

In [117]:
df = pd.read_csv("data_clean.csv")

# Select the model inputs by name rather than by position, so the selection
# does not silently change if the column layout of data_clean.csv changes.
# The sensitive attributes (race, sex, sensitive) are deliberately not inputs.
feature_columns = [
    "age_cat",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
]
# Missing feature values, if any, are replaced by 0.
features = df[feature_columns].fillna(0)

target = df['two_year_recid']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Logistic regression baseline. RandomForestClassifier and XGBClassifier are
# imported at the top of the notebook as alternatives to try here.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions on both splits; the train predictions are used later when the
# fairness tables are computed over all rows.
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Evaluate on the held-out test split.
accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.65      0.80      0.72       823
           1       0.62      0.42      0.50       620

    accuracy                           0.64      1443
   macro avg       0.63      0.61      0.61      1443
weighted avg       0.64      0.64      0.63      1443



In [118]:
# Test rows: features plus true label, predicted label, sensitive group
# (looked up in df by row index) and a marker for the split.
df_test = X_test.assign(
    two_year_recid_actual=y_test,
    two_year_recid_predicted=y_test_pred,
    sensitive=df.loc[X_test.index.values, "sensitive"],
    data_split="test",
)

# Training rows, annotated the same way.
df_train = X_train.assign(
    two_year_recid_actual=y_train,
    two_year_recid_predicted=y_train_pred,
    sensitive=df.loc[X_train.index.values, "sensitive"],
    data_split="train",
)

# Stack both splits (train first) under a fresh 0..n-1 index.
all_results = pd.concat([df_train, df_test], ignore_index=True)

In [120]:
# Sanity check: train and test rows combined.
all_results.shape

(7214, 10)

In [119]:
# Predicted-outcome rates per sensitive group and charge degree, computed
# over the train and test rows combined (all_results).
result = individual_fairness(all_results, "c_charge_degree", 'two_year_recid_predicted')

   sensitive  c_charge_degree  two_year_recid_predicted  count   sum      prob
0         AF                0                         0    207   229  0.903930
1         AF                0                         1     22   229  0.096070
2         AF                1                         0    270   423  0.638298
3         AF                1                         1    153   423  0.361702
4         AM                0                         0    740   920  0.804348
5         AM                0                         1    180   920  0.195652
6         AM                1                         0   1062  2124  0.500000
7         AM                1                         1   1062  2124  0.500000
8         CF                0                         0    244   259  0.942085
9         CF                0                         1     15   259  0.057915
10        CF                1                         0    237   308  0.769481
11        CF                1                       

In [99]:
# Predicted-outcome rates per sensitive group, computed on the test split
# only (df_test).
df_summary = group_fairness(df_test,"sensitive",'two_year_recid_predicted')

   sensitive  two_year_recid_predicted  count  sum      prob
0         AF                         0     80  107  0.747664
1         AF                         1     27  107  0.252336
2         AM                         0    362  624  0.580128
3         AM                         1    262  624  0.419872
4         CF                         0     88  107  0.822430
5         CF                         1     19  107  0.177570
6         CM                         0    322  398  0.809045
7         CM                         1     76  398  0.190955
8         HF                         0     17   20  0.850000
9         HF                         1      3   20  0.150000
10        HM                         0     72   97  0.742268
11        HM                         1     25   97  0.257732
12        MF                         0     14   16  0.875000
13        MF                         1      2   16  0.125000
14        MM                         0     60   74  0.810811
15        MM            

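Note: when conditioning on `c_charge_degree` (value 1, i.e. non-misdemeanor charges), the model widens the gap between African-American males (AM) and Caucasian males (CM), i.e. it is more biased against AM than the data itself: the predicted recidivism rate is 0.50 for AM vs. 0.27 for CM, whereas the observed rates in the data are 0.57 for AM and 0.43 for CM.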

In [None]:
# Load the data
df = pd.read_csv("data_clean.csv")

# Select relevant columns for features and target
features = df.iloc[:, 3:-2]
target = df['two_year_recid']

# Handle missing values if any
features = features.fillna(0)

# Convert target to categorical if necessary
# (Assuming binary classification: 0 and 1)
target = to_categorical(target, num_classes=2)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Build a simple neural network
model = Sequential([
    Dense(64, input_dim=features.shape[1], activation='relu'),  # Hidden layer with 64 neurons
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dense(2, activation='softmax')  # Output layer (2 classes)
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test_classes, y_pred_classes)
report = classification_report(y_test_classes, y_pred_classes)

# Print results
print(f"Accuracy: {accuracy:.2f}")
print(report)


In [None]:
# Test rows for the neural network: features plus true class, predicted
# class (both recovered with argmax) and the sensitive group looked up in df
# by row index.
df_test = X_test.assign(
    two_year_recid_actual=y_test.argmax(axis=1),
    two_year_recid_predicted=y_pred.argmax(axis=1),
    sensitive=df.loc[X_test.index.values, "sensitive"],
)