In [10]:
import numpy as np
import pandas as pd
import random
import math

# --------------- KiTE Imports ---------------
from KiTE.metrics import ELCE2
from KiTE.calibrate import calibrate, calibration_error
from KiTE.calibration_models import EWF_calibration, KRR_calibration

# --------------- Visualization Imports ---------------
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import seaborn as sns

# Select matplotlib's "tableau-colorblind10" style sheet for the figures in this notebook.
plt.style.use("tableau-colorblind10")
# sns.set()

# --------------- Model Imports ---------------
from scipy import stats
from sklearn.calibration import calibration_curve
from sklearn.metrics import pairwise_distances, pairwise_kernels, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import (
    LinearRegression,
    TheilSenRegressor,
    RANSACRegressor,
    HuberRegressor,
    Ridge,
)
from sklearn.pipeline import make_pipeline

In [11]:
# Constants: these column names MUST be maintained. build_model stores the
# split label and the predicted probability under them.
probability_col_name = "probability"
model_class_col_name = "model_split_class"

In [12]:
def load_recidivism_data(file_name="../../KiTE-utils/notebooks/BROWARD_ORIGINAL.csv"):
    """Load the recidivism CSV and return a numerically encoded feature table.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to read. It must contain the columns ``sex``, ``age``,
        ``race``, ``juv_fel_count``, ``juv_misd_count``, ``priors_count``,
        ``c_charge_degree`` and ``is_recid``; any other column is dropped.

    Returns
    -------
    pandas.DataFrame
        The selected columns with

        * ``c_charge_degree`` recoded ``"F" -> 0`` and ``"M" -> 1``,
        * ``sex`` recoded ``"Male" -> 0`` and ``"Female" -> 1``,
        * ``age`` divided by 10,
        * ``race`` replaced by one 0/1 indicator column per distinct value.

        Values of ``sex`` / ``c_charge_degree`` outside the mappings above are
        left unchanged.
    """
    columns = [
        "sex",
        "age",
        "race",
        "juv_fel_count",
        "juv_misd_count",
        "priors_count",
        "c_charge_degree",
        "is_recid",
    ]
    # Explicit copy so the column assignments below act on an independent frame.
    df = pd.read_csv(file_name)[columns].copy()

    # Assign the recoded column back instead of calling replace(inplace=True)
    # on ``df[col]``: the in-place form works on a temporary selection and does
    # not update ``df`` when pandas Copy-on-Write is active.
    # infer_objects() turns an all-integer object column into an integer dtype
    # on pandas versions where replace() no longer downcasts by itself.
    df["c_charge_degree"] = (
        df["c_charge_degree"].replace({"F": 0, "M": 1}).infer_objects()
    )
    df["sex"] = df["sex"].replace({"Male": 0, "Female": 1}).infer_objects()
    df["age"] = df["age"] / 10.0

    # One-hot encode ``race``. dtype=int keeps the indicators as 0/1 integers;
    # recent pandas versions would otherwise produce boolean columns.
    one_hot = pd.get_dummies(df["race"], dtype=int)

    # Swap the original ``race`` column for its indicator columns.
    return df.drop(columns="race").join(one_hot)

In [13]:
# Load the recidivism table from the function's default CSV path and display it.
df = load_recidivism_data()
df

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,priors_count,c_charge_degree,is_recid,African-American,Asian,Caucasian,Hispanic,Native American,Other
0,0,6.9,0,0,0,0,0,0,0,0,0,0,1
1,0,3.4,0,0,0,0,1,1,0,0,0,0,0
2,0,2.4,0,0,4,0,1,1,0,0,0,0,0
3,0,2.3,0,1,1,0,0,1,0,0,0,0,0
4,0,4.3,0,0,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,0,2.3,0,0,0,0,0,1,0,0,0,0,0
7210,0,2.3,0,0,0,0,0,1,0,0,0,0,0
7211,0,5.7,0,0,0,0,0,0,0,0,0,0,1
7212,1,3.3,0,0,3,1,0,1,0,0,0,0,0


In [14]:
def build_model(df, random_state=None):
    """Train a random forest on part of ``df`` and score the remaining rows.

    The rows are shuffled and cut at 33% and 66% of the row count. The first
    part trains a ``RandomForestClassifier``; the second part is labelled
    ``"cv"`` and the third ``"test"``. Only the "cv" and "test" rows are
    returned; the training rows are not.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``features`` below plus the
        ``is_recid`` target column. It is not modified.
    random_state : int or None, optional
        Seed passed to both the shuffle and the classifier so a run can be
        reproduced. The default ``None`` leaves both unseeded, as before.

    Returns
    -------
    pandas.DataFrame
        The "cv" rows followed by the "test" rows, re-indexed from 0, with two
        extra columns: ``model_class_col_name`` holding the split label and
        ``probability_col_name`` holding the classifier's probability for the
        second class (``predict_proba(...)[:, 1]``).
    """
    features = [
        "age",
        "sex",
        "African-American",
        "juv_fel_count",
        "juv_misd_count",
        "priors_count",
        "c_charge_degree",
        "Asian",
        "Caucasian",
        "Hispanic",
        "Native American",
        "Other",
    ]
    target = "is_recid"

    # Shuffle, then cut by position into train / validate / test.
    # Positional slicing replaces np.split on a DataFrame, which depends on
    # the deprecated DataFrame.swapaxes.
    shuffled = df.sample(frac=1, random_state=random_state)
    n_rows = len(shuffled)
    train_end = int(0.33 * n_rows)
    cv_end = int(0.66 * n_rows)
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:cv_end]
    test = shuffled.iloc[cv_end:]

    # Train the Random Forest model on the 1st subset of data (training set)
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(np.array(train[features]), np.array(train[target]))

    # -------- USER SHOULD: Label with how we split the data -----------
    # assign() returns a new frame, so no column is written onto a slice of
    # ``shuffled``.
    scored_parts = []
    for split_label, part in (("cv", validate), ("test", test)):
        prob = clf.predict_proba(np.array(part[features]))[:, 1]
        scored_parts.append(
            part.assign(
                **{model_class_col_name: split_label, probability_col_name: prob}
            )
        )

    return pd.concat(scored_parts).reset_index(drop=True)

In [15]:
# Rebind `df` to the scored cv/test rows returned by build_model, then display them.
# NOTE: after this cell the frame loaded earlier is no longer reachable as `df`.
df = build_model(df)
df

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,priors_count,c_charge_degree,is_recid,African-American,Asian,Caucasian,Hispanic,Native American,Other,model_split_class,probability
0,1,4.0,0,0,13,0,1,1,0,0,0,0,0,cv,0.630000
1,0,2.7,0,0,1,0,1,0,0,0,0,0,1,cv,0.556167
2,0,3.0,0,0,1,1,1,0,0,0,0,0,1,cv,0.163333
3,0,3.4,0,0,3,1,0,1,0,0,0,0,0,cv,0.352500
4,0,3.1,0,0,11,0,1,0,0,1,0,0,0,cv,0.910000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4829,0,5.5,0,0,0,1,0,0,0,1,0,0,0,test,0.478107
4830,0,2.5,0,0,1,0,1,0,0,0,1,0,0,test,0.672722
4831,0,3.1,0,0,3,0,1,1,0,0,0,0,0,test,0.847121
4832,0,3.8,0,0,0,0,0,0,0,1,0,0,0,test,0.066667


In [16]:
# Write the current `df` to CSV, omitting the index column.
df.to_csv("../sample_data/compass.csv",index=False)