## Project 3

Dataset: COMPAS

In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [257]:
# Read the raw COMPAS two-year recidivism file, then report its dimensions
# and column names as a first sanity check.
df = pd.read_csv(filepath_or_buffer="compas-scores-two-years.csv")
print(df.shape, df.columns, sep="\n")

(7214, 53)
Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')


We first fit a model that predicts whether a person will reoffend within the next two years (`two_year_recid`). Because the target is binary, this is a classification problem.

Next steps:

* look at the data for bias
* build a classifier

Sensitive variables: race, sex

Target variable: two_year_recid

Features: age/age_cat, juv_fel_count, juv_misd_count, juv_other_count, priors_count, c_charge_degree, score_text, days_b_screening_arrest, decile_score, length_of_stay

## Twin test

Feature: prior convictions

In [258]:
# Keep only the sensitive attributes, the candidate features and the target.
df = df[["race", "sex", "age_cat", "juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree", "two_year_recid"]]

# Drop rows whose charge degree is 'O'. The explicit copy makes `df` an
# independent frame, so the column assignments below never write into a slice
# of the raw data (this is what triggers pandas' SettingWithCopyWarning).
df = df[df["c_charge_degree"] != 'O'].copy()

# Binarise the juvenile counts: 0 = no record, 1 = at least one.
for juv_col in ["juv_fel_count", "juv_misd_count", "juv_other_count"]:
    df[juv_col] = np.where(df[juv_col] == 0, 0, 1)

# Bucket the number of priors: up to 10 -> 0, 11-20 -> 1, more than 20 -> 2.
# All buckets are derived from the original counts in a single step, so the
# result cannot depend on the order in which the buckets are written.
df["priors_count"] = np.select(
    [df["priors_count"] <= 10, df["priors_count"] <= 20],
    [0, 1],
    default=2,
)

# Charge degree: "M" -> 0, anything else -> 1.
df["c_charge_degree"] = np.where(df["c_charge_degree"] == "M", 0, 1)

# Ordinal encoding of the age bracket. `map` is used instead of `replace` to
# avoid the silent-downcasting FutureWarning; unlike `replace`, it turns
# unknown labels into NaN, so fail loudly if that ever happens.
df["age_cat"] = df["age_cat"].map({"Less than 25": 0, "25 - 45": 1, "Greater than 45": 2})
assert df["age_cat"].notna().all(), "unexpected value in age_cat"

# Merge the small race groups into a single "Minority" category.
df["race"] = df["race"].replace({"Asian": "Minority", "Native American": "Minority", "Other": "Minority"})
print(df["race"].unique())

# Two-letter group code: first letter of race + first letter of sex
# (e.g. "AF", "CM"). Vectorised string slicing replaces the row-wise apply.
df["sensitive"] = df["race"].str[0] + df["sex"].str[0]

# The index is written as the first CSV column. Cells that re-read this file
# may slice columns by position, so keep this layout unchanged.
df.to_csv("data_clean.csv")

['Minority' 'African-American' 'Caucasian' 'Hispanic']



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [284]:
import pandas as pd

def individual_fairness(df, charge_column, outcome_column, sensitive_column="sensitive"):
    """
    Estimate P(outcome = 1 | sensitive group, feature value) from row counts.

    Rows are counted per (sensitive group, feature value, outcome) and divided
    by the number of rows in the same (sensitive group, feature value) cell.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing `sensitive_column`, `charge_column` and
        `outcome_column`.
    charge_column : str
        Name of the feature to condition on in addition to the sensitive
        attribute (e.g. the charge degree or the age category).
    outcome_column : str
        Name of the outcome column; only rows with value 1 are reported.
    sensitive_column : str, default "sensitive"
        Name of the column holding the sensitive group label.

    Returns
    -------
    pd.DataFrame
        One row per (sensitive group, feature value) that has at least one
        row with outcome 1, with columns `sensitive_column`, `charge_column`,
        `outcome_column`, `count`, `sum` and `prob` (= count / sum). Cells
        without any positive outcome are absent. The result is an independent
        copy, so callers may add columns to it.

    Notes
    -----
    Prints `outcome_column` and the resulting summary as a side effect.
    """
    group_keys = [sensitive_column, charge_column]

    # Number of rows for every (group, feature value, outcome) combination.
    df_count = (
        df.groupby(group_keys + [outcome_column])
        .size()
        .reset_index(name="count")
    )

    # Total number of rows in each (group, feature value) cell, broadcast back
    # onto the per-outcome rows.
    df_count["sum"] = df_count.groupby(group_keys)["count"].transform("sum")

    # Share of each outcome within its cell.
    df_count["prob"] = df_count["count"] / df_count["sum"]

    print(outcome_column)
    # Keep only the positive outcome. The copy detaches the result from
    # `df_count`, so adding columns later does not raise SettingWithCopyWarning.
    df_summary = df_count[df_count[outcome_column] == 1].copy()
    print(df_summary)

    return df_summary

In [286]:
# Call the function with the appropriate file path and charge column
df = pd.read_csv('data_clean.csv')

#data_summary = individual_fairness(df, "c_charge_degree", 'two_year_recid')
data_summary = individual_fairness(df, "age_cat", 'two_year_recid')
#df_summary = individual_fairness(df, "age_cat" , 'two_year_recid')
#df_summary = individual_fairness(df, "juv_fel_count" , 'two_year_recid')
#df_summary = individual_fairness(df, "c_charge_degree", 'two_year_recid')
#df_summary = individual_fairness(df, "juv_other_count", 'two_year_recid')


two_year_recid
   sensitive  age_cat  two_year_recid  count   sum      prob
1         AF        0               1     76   169  0.449704
3         AF        1               1    151   395  0.382278
5         AF        2               1     20    88  0.227273
7         AM        0               1    485   751  0.645806
9         AM        1               1    959  1799  0.533074
11        AM        2               1    210   494  0.425101
13        CF        0               1     27    87  0.310345
15        CF        1               1    131   309  0.423948
17        CF        2               1     41   171  0.239766
19        CM        0               1    164   303  0.541254
21        CM        1               1    435  1003  0.433699
23        CM        2               1    168   581  0.289157
25        HF        0               1      7    17  0.411765
27        HF        1               1     21    63  0.333333
29        HF        2               1      5    23  0.217391
31       

In [114]:
# Dimensions (rows, columns) of the frame currently bound to `df`.
df.shape

(7214, 11)

## Group fairness

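Here we condition only on the sensitive attribute (the combined race/sex group) and compare the share of positive outcomes across groups.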

In [296]:
import pandas as pd

def group_fairness(df, sensitive_column, outcome_column):
    """
    Estimate P(outcome = 1 | sensitive group) from row counts.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing `sensitive_column` and `outcome_column`.
    sensitive_column : str
        The name of the column representing the sensitive attribute.
    outcome_column : str
        The name of the column representing the outcome (e.g., recidivism);
        only rows with value 1 are reported.

    Returns
    -------
    pd.DataFrame
        One row per sensitive group that has at least one row with outcome 1,
        with columns `sensitive_column`, `outcome_column`, `count`, `sum` and
        `prob` (= count / sum). Groups without any positive outcome are
        absent. The result is an independent copy, so callers may add columns
        to it.

    Notes
    -----
    Prints `outcome_column` and the resulting summary as a side effect.
    """
    # Number of rows for every (group, outcome) combination. `size()` counts
    # rows directly, so the function no longer requires a column literally
    # named 'sensitive' to exist in `df`.
    df_count = (
        df.groupby([sensitive_column, outcome_column])
        .size()
        .reset_index(name="count")
    )

    # Total number of rows per group, broadcast back onto the per-outcome rows.
    df_count["sum"] = df_count.groupby(sensitive_column)["count"].transform("sum")

    # Share of each outcome within its group.
    df_count["prob"] = df_count["count"] / df_count["sum"]

    print(outcome_column)
    # Keep only the positive outcome. The copy detaches the result from
    # `df_count`, so adding columns later does not raise SettingWithCopyWarning.
    df_summary = df_count[df_count[outcome_column] == 1].copy()
    print(df_summary)

    return df_summary


In [297]:
# Observed share of two-year recidivists per sensitive group in the cleaned data.
df = pd.read_csv(filepath_or_buffer='data_clean.csv')
data_summary = group_fairness(
    df,
    sensitive_column="sensitive",
    outcome_column='two_year_recid',
)

two_year_recid
   sensitive  two_year_recid  count   sum      prob
1         AF               1    247   652  0.378834
3         AM               1   1654  3044  0.543364
5         CF               1    199   567  0.350970
7         CM               1    767  1887  0.406465
9         HF               1     33   103  0.320388
11        HM               1    199   534  0.372659
13        MF               1     19    73  0.260274
15        MM               1    133   354  0.375706


## Model

In [298]:
df = pd.read_csv("data_clean.csv")

# Select the model inputs by name rather than by position. These are the same
# columns, in the same order, that `df.iloc[:, 3:-2]` picked from the file
# written by the cleaning cell, but the selection no longer breaks silently
# if the CSV layout changes (e.g. the saved index column disappears).
# The sensitive attributes (race, sex, sensitive) are deliberately excluded.
feature_columns = [
    "age_cat",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
]
features = df[feature_columns]

target = df['two_year_recid']

# Handle missing values if any
features = features.fillna(0)

# Split into train and test sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train a logistic regression model. RandomForestClassifier and XGBClassifier
# (both imported at the top of the notebook) can be swapped in here.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions on both splits; the training-set predictions are used later
# when the fairness summaries are computed over all rows.
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Evaluate model performance on the held-out test set
accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)
print(report)

              precision    recall  f1-score   support

           0       0.65      0.80      0.72       823
           1       0.62      0.42      0.50       620

    accuracy                           0.64      1443
   macro avg       0.63      0.61      0.61      1443
weighted avg       0.64      0.64      0.63      1443



In [299]:
# Test-set rows: features plus actual/predicted outcome, the sensitive group
# (looked up in `df` through the preserved row index) and a split marker.
df_test = X_test.assign(
    two_year_recid_actual=y_test,
    two_year_recid_predicted=y_test_pred,
    sensitive=df.loc[X_test.index.values, "sensitive"],
    data_split="test",
)

# Same layout for the training-set rows.
df_train = X_train.assign(
    two_year_recid_actual=y_train,
    two_year_recid_predicted=y_train_pred,
    sensitive=df.loc[X_train.index.values, "sensitive"],
    data_split="train",
)

# Stack train on top of test with a fresh 0..n-1 index.
all_results = pd.concat([df_train, df_test], ignore_index=True)

In [289]:
# Predicted positive rate per sensitive group and age category (train + test rows).
model_summary = individual_fairness(
    all_results,
    charge_column="age_cat",
    outcome_column='two_year_recid_predicted',
)

two_year_recid_predicted
   sensitive  age_cat  two_year_recid_predicted  count   sum      prob
1         AF        0                         1    118   169  0.698225
3         AF        1                         1     47   395  0.118987
5         AF        2                         1     10    88  0.113636
7         AM        0                         1    620   751  0.825566
9         AM        1                         1    484  1799  0.269038
11        AM        2                         1    138   494  0.279352
13        CF        0                         1     60    87  0.689655
15        CF        1                         1     19   309  0.061489
17        CF        2                         1      7   171  0.040936
19        CM        0                         1    241   303  0.795380
21        CM        1                         1    109  1003  0.108674
23        CM        2                         1     42   581  0.072289
25        HF        0                         1     

In [300]:
# Predicted positive rate per sensitive group (train + test rows).
model_summary = group_fairness(
    all_results,
    sensitive_column="sensitive",
    outcome_column='two_year_recid_predicted',
)

two_year_recid_predicted
   sensitive  two_year_recid_predicted  count   sum      prob
1         AF                         1    175   652  0.268405
3         AM                         1   1242  3044  0.408016
5         CF                         1     86   567  0.151675
7         CM                         1    392  1887  0.207737
9         HF                         1     14   103  0.135922
11        HM                         1    127   534  0.237828
13        MF                         1     10    73  0.136986
15        MM                         1     72   354  0.203390


In [280]:
# Display the summary computed on the observed data.
# NOTE(review): the output stored below is broken down by c_charge_degree,
# which suggests it comes from an earlier run of
# individual_fairness(df, "c_charge_degree", ...) rather than from the calls
# visible above. Re-run the notebook top to bottom to confirm.
data_summary

Unnamed: 0,sensitive,c_charge_degree,two_year_recid,count,sum,prob
1,AF,0,1,72,229,0.31441
3,AF,1,1,175,423,0.413712
5,AM,0,1,450,920,0.48913
7,AM,1,1,1204,2124,0.566855
9,CF,0,1,69,259,0.266409
11,CF,1,1,130,308,0.422078
13,CM,0,1,256,715,0.358042
15,CM,1,1,511,1172,0.436007
17,HF,0,1,12,50,0.24
19,HF,1,1,21,53,0.396226


In [290]:
import pandas as pd
import plotly.express as px

def plot_probabilities_plotly(model_summary, data_summary, sensitive_columns, prob_columns):
    """
    Plot model vs. data probabilities as grouped bars, one facet per group.

    Parameters:
    - model_summary: DataFrame containing model-specific summaries.
    - data_summary: DataFrame containing data-specific summaries.
    - sensitive_columns: List of column names that are sensitive (e.g., gender, race).
      They are kept as identifier columns when reshaping. The plot is faceted
      by 'sensitive' when it is listed, otherwise by the first entry.
    - prob_columns: List of columns that represent probabilities (e.g., prob1, prob2).

    Returns nothing. Prints the long-form table that is plotted and shows the
    figure. The input DataFrames are not modified.
    """
    sensitive_columns = list(sensitive_columns)
    prob_columns = list(prob_columns)

    # Tag each summary with its origin on a copy (`assign` returns a new
    # frame), so the caller's DataFrames do not silently gain a 'Type' column.
    combined_df = pd.concat([
        model_summary.assign(Type='Model'),
        data_summary.assign(Type='Data'),
    ])

    # Melt the dataframe to get a long-form table: one row per
    # (sensitive group, Type, probability column).
    melted_df = combined_df.melt(id_vars=sensitive_columns + ['Type'], value_vars=prob_columns,
                                 var_name='Probability', value_name='Value')
    print(melted_df)

    # Facet by 'sensitive' when available (the original behaviour); otherwise
    # fall back to the first sensitive column instead of failing on a missing
    # column, and use no facets when no sensitive column was given.
    if 'sensitive' in sensitive_columns:
        facet_column = 'sensitive'
    elif sensitive_columns:
        facet_column = sensitive_columns[0]
    else:
        facet_column = None

    # Create an interactive bar plot using Plotly Express
    fig = px.bar(melted_df, x='Probability', y='Value', color='Type',
                 barmode='group', facet_col=facet_column,
                 title="Probabilities by Sensitive Columns and Model/Data Type",
                 labels={'Value': 'Probability'},
                 category_orders={'Probability': prob_columns})  # Preserve the order of prob columns

    # Show the plot
    fig.update_layout(yaxis_title="Probability")
    fig.show()
# Example usage:
# Let's assume `model_summary` and `data_summary` are your dataframes


In [302]:
# Compare the model-predicted and the observed positive rate for every
# sensitive group. To focus on a single age bracket, filter both summaries on
# age_cat (e.g. age_cat == 1) before plotting.
df_model, df_data = model_summary, data_summary
sensitive_columns, prob_columns = ['sensitive'], ['prob']

plot_probabilities_plotly(
    model_summary=df_model,
    data_summary=df_data,
    sensitive_columns=sensitive_columns,
    prob_columns=prob_columns,
)


   sensitive   Type Probability     Value
0         AF  Model        prob  0.268405
1         AM  Model        prob  0.408016
2         CF  Model        prob  0.151675
3         CM  Model        prob  0.207737
4         HF  Model        prob  0.135922
5         HM  Model        prob  0.237828
6         MF  Model        prob  0.136986
7         MM  Model        prob  0.203390
8         AF   Data        prob  0.378834
9         AM   Data        prob  0.543364
10        CF   Data        prob  0.350970
11        CM   Data        prob  0.406465
12        HF   Data        prob  0.320388
13        HM   Data        prob  0.372659
14        MF   Data        prob  0.260274
15        MM   Data        prob  0.375706


In [99]:
# Predicted positive rate per sensitive group on the test split only.
df_summary = group_fairness(
    df_test,
    sensitive_column="sensitive",
    outcome_column='two_year_recid_predicted',
)

   sensitive  two_year_recid_predicted  count  sum      prob
0         AF                         0     80  107  0.747664
1         AF                         1     27  107  0.252336
2         AM                         0    362  624  0.580128
3         AM                         1    262  624  0.419872
4         CF                         0     88  107  0.822430
5         CF                         1     19  107  0.177570
6         CM                         0    322  398  0.809045
7         CM                         1     76  398  0.190955
8         HF                         0     17   20  0.850000
9         HF                         1      3   20  0.150000
10        HM                         0     72   97  0.742268
11        HM                         1     25   97  0.257732
12        MF                         0     14   16  0.875000
13        MF                         1      2   16  0.125000
14        MM                         0     60   74  0.810811
15        MM            

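Note: when conditioning on `c_charge_degree`, the model widened the gap between African-American males (AM) and Caucasian males (CM). The predicted recidivism rate was 0.50 for AM vs. 0.27 for CM (a gap of 0.23), whereas the observed rates in the data were 0.56 for AM and 0.43 for CM (a gap of 0.13; these are the `c_charge_degree = 1` rows). In other words, the model is more biased against African-American males than the data it was trained on.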

In [None]:
# Load the data
df = pd.read_csv("data_clean.csv")

# Select the model inputs by name rather than by position. These are the same
# columns, in the same order, that `df.iloc[:, 3:-2]` picked from the file
# written by the cleaning cell, but the selection no longer breaks silently
# if the CSV layout changes (e.g. the saved index column disappears).
# The sensitive attributes (race, sex, sensitive) are deliberately excluded.
feature_columns = [
    "age_cat",
    "juv_fel_count",
    "juv_misd_count",
    "juv_other_count",
    "priors_count",
    "c_charge_degree",
]
features = df[feature_columns]
target = df['two_year_recid']

# Handle missing values if any
features = features.fillna(0)

# One-hot encode the binary target (classes 0 and 1) to match the two-unit
# softmax output layer and the categorical cross-entropy loss.
target = to_categorical(target, num_classes=2)

# Split into train and test sets. X_test keeps the row index of `df`, which
# lets later cells look up the sensitive group of each test row.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Build a simple neural network
model = Sequential([
    Dense(64, input_dim=features.shape[1], activation='relu'),  # Hidden layer with 64 neurons
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons
    Dense(2, activation='softmax')  # Output layer (2 classes)
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training rows are held out for validation.
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions: per-class probabilities, then the most likely class.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Evaluate model performance
accuracy = accuracy_score(y_test_classes, y_pred_classes)
report = classification_report(y_test_classes, y_pred_classes)

# Print results
print(f"Accuracy: {accuracy:.2f}")
print(report)


In [None]:
# Test-set rows for the neural network: features, actual and predicted class
# (argmax over the one-hot / softmax columns) and the sensitive group looked
# up in `df` through the preserved row index.
df_test = X_test.assign(
    two_year_recid_actual=y_test.argmax(axis=1),
    two_year_recid_predicted=y_pred.argmax(axis=1),
    sensitive=df.loc[X_test.index.values, "sensitive"],
)