## Project 3

Dataset: COMPAS

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [5]:
df = pd.read_csv("compas-scores-two-years.csv")
print(df.shape)
print(df.columns)

(7214, 53)
Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


We will first fit a model predicting if the person will reoffend in the next two years. We could use a classifier for this.

Next steps:

* look at the data for bias
* build a classifier

Sensitive variables: race, sex

Target variable: two_year_recid

Features: age/age_cat, juv_fel_count, juv_misd_count, juv_other_count, priors_count, c_charge_degree, score_text, days_b_screening_arrest, decile_score, length_of_stay

## Twin test

Feature: prior convictions

In [6]:
df = df[["race", "sex", "age_cat", "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree", "two_year_recid"]]

df = df[(df["c_charge_degree"] != 'O')]

# sum 1-20 together
df["juv_fel_count"] = np.where(df["juv_fel_count"] == 0, 0, 1)
df["juv_misd_count"] = np.where(df["juv_misd_count"] == 0, 0, 1)
df["juv_other_count"] = np.where(df["juv_other_count"] == 0, 0, 1)
df["priors_count"]

df.loc[(df['priors_count'] >= 0) & (df['priors_count'] <= 10), 'priors_count'] = 0
df.loc[(df['priors_count'] > 10) & (df['priors_count'] <= 20), 'priors_count'] = 1
df.loc[(df['priors_count'] > 20), 'priors_count'] = 2

df["c_charge_degree"] = np.where(df["c_charge_degree"] == "M", 0, 1)
df["age_cat"] = df["age_cat"].replace({"Less than 25": 0, "25 - 45": 1, "Greater than 45": 2})
df["race"] = df["race"].replace({"Asian": "Minority", "Native American": "Minority", "Other": "Minority"})

df["sensitive"] = df.apply(lambda x: x["race"][0] + x["sex"][0], axis=1)

df.head()
df.to_csv("data_clean.csv")

  df["age_cat"] = df["age_cat"].replace({"Less than 25": 0, "25 - 45": 1, "Greater than 45": 2})


In [7]:
df = pd.read_csv("data_clean.csv")

df_count = df.groupby(["sensitive", "c_charge_degree", "two_year_recid"]).agg(
    count=('race', 'count')).reset_index()

df_sum = df_count.groupby(["sensitive", "c_charge_degree"]).agg(
    sum=('count', 'sum')).reset_index()

df_summary = pd.merge(df_count, df_sum, on=["sensitive", "c_charge_degree"])

df_summary["prob"] = df_summary["count"] / df_summary["sum"]
print(df_summary)

   sensitive  c_charge_degree  two_year_recid  count   sum      prob
0         AF                0               0    157   229  0.685590
1         AF                0               1     72   229  0.314410
2         AF                1               0    248   423  0.586288
3         AF                1               1    175   423  0.413712
4         AM                0               0    470   920  0.510870
5         AM                0               1    450   920  0.489130
6         AM                1               0    920  2124  0.433145
7         AM                1               1   1204  2124  0.566855
8         CF                0               0    190   259  0.733591
9         CF                0               1     69   259  0.266409
10        CF                1               0    178   308  0.577922
11        CF                1               1    130   308  0.422078
12        CM                0               0    459   715  0.641958
13        CM                0     

## Group fairness

only sensitive data as prior

In [8]:
df = pd.read_csv("data_clean.csv")

df_count = df.groupby(["sensitive", "two_year_recid"]).agg(
    count=('race', 'count')).reset_index()

df_sum = df_count.groupby(["sensitive"]).agg(
    sum=('count', 'sum')).reset_index()

df_summary = pd.merge(df_count, df_sum, on=["sensitive"])

df_summary["prob"] = df_summary["count"] / df_summary["sum"]
print(df_summary)

   sensitive  two_year_recid  count   sum      prob
0         AF               0    405   652  0.621166
1         AF               1    247   652  0.378834
2         AM               0   1390  3044  0.456636
3         AM               1   1654  3044  0.543364
4         CF               0    368   567  0.649030
5         CF               1    199   567  0.350970
6         CM               0   1120  1887  0.593535
7         CM               1    767  1887  0.406465
8         HF               0     70   103  0.679612
9         HF               1     33   103  0.320388
10        HM               0    335   534  0.627341
11        HM               1    199   534  0.372659
12        MF               0     54    73  0.739726
13        MF               1     19    73  0.260274
14        MM               0    221   354  0.624294
15        MM               1    133   354  0.375706


## Model

In [18]:
df = pd.read_csv("data_clean.csv")

# Select relevant columns for features and target
features = df.iloc[:, 3:-2]

target = df['two_year_recid']

# Handle missing values if any
features = features.fillna(0)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train a logistic regression model
#model = LogisticRegression(max_iter=1000)
#model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train an XGBoost model (replacing Logistic Regression)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

#model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(report)

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.65      0.80      0.72       823
           1       0.61      0.42      0.50       620

    accuracy                           0.64      1443
   macro avg       0.63      0.61      0.61      1443
weighted avg       0.63      0.64      0.62      1443



In [19]:
# Load the data
df = pd.read_csv("data_clean.csv")

# Select relevant columns for features and target
features = df.iloc[:, 3:-2]
target = df['two_year_recid']

# Handle missing values if any
features = features.fillna(0)

# Convert target to categorical if necessary
# (Assuming binary classification: 0 and 1)
target = to_categorical(target, num_classes=2)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Build a simple neural network
model = Sequential([
    Dense(64, input_dim=features.shape[1], activation='relu'),  # Hidden layer with 64 neurons
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dense(2, activation='softmax')  # Output layer (2 classes)
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test_classes, y_pred_classes)
report = classification_report(y_test_classes, y_pred_classes)

# Print results
print(f"Accuracy: {accuracy:.2f}")
print(report)


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6031 - loss: 0.6666 - val_accuracy: 0.6149 - val_loss: 0.6508
Epoch 2/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6068 - loss: 0.6531 - val_accuracy: 0.6119 - val_loss: 0.6499
Epoch 3/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6162 - loss: 0.6446 - val_accuracy: 0.6129 - val_loss: 0.6515
Epoch 4/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6264 - loss: 0.6463 - val_accuracy: 0.6109 - val_loss: 0.6509
Epoch 5/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6276 - loss: 0.6475 - val_accuracy: 0.6119 - val_loss: 0.6512
Epoch 6/20
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6277 - loss: 0.6392 - val_accuracy: 0.6030 - val_loss: 0.6604
Epoch 7/20
[1m127/127[0m [32m━━━━━━━

In [10]:
df_test = X_test.copy()
df_test['two_year_recid_actual'] = y_test
df_test['two_year_recid_predicted'] = y_pred

df_test["sensitive"] = df.loc[df_test.index.values, "sensitive"]

In [11]:
# twin test for model

df_count = df_test.groupby(["sensitive", "c_charge_degree", "two_year_recid_predicted"]).agg(
    count=('sensitive', 'count')).reset_index()

df_sum = df_count.groupby(["sensitive", "c_charge_degree"]).agg(
    sum=('count', 'sum')).reset_index()

df_summary = pd.merge(df_count, df_sum, on=["sensitive", "c_charge_degree"])

df_summary["prob"] = df_summary["count"] / df_summary["sum"]
print(df_summary)

   sensitive  c_charge_degree  two_year_recid_predicted  count  sum      prob
0         AF                0                         0     59   65  0.907692
1         AF                0                         1      6   65  0.092308
2         AF                1                         0     73  109  0.669725
3         AF                1                         1     36  109  0.330275
4         AM                0                         0    213  272  0.783088
5         AM                0                         1     59  272  0.216912
6         AM                1                         0    331  666  0.496997
7         AM                1                         1    335  666  0.503003
8         CF                0                         0     73   78  0.935897
9         CF                0                         1      5   78  0.064103
10        CF                1                         0     61   87  0.701149
11        CF                1                         1     26  

Note! When looking at c_charge_degree we found that the model was more biased toward African-American males than Caucasian males (prediction for AM 0.5 vs. CM 0.27; in data AM 0.56 and CM 0.43)

In [12]:
df_test = df_test[df_test["two_year_recid_actual"] != df_test['two_year_recid_predicted']]

# samples misclassified, count by sensitive information
df_test["sensitive"].value_counts()

Unnamed: 0_level_0,count
sensitive,Unnamed: 1_level_1
AM,351
CM,203
CF,65
AF,55
HM,49
MM,32
HF,11
MF,8
