In [8]:
import numpy as np
import pandas as pd
import random
import math

# --------------- KiTE Imports ---------------
from KiTE.metrics import ELCE2
from KiTE.calibrate import calibrate, calibration_error
from KiTE.calibration_models import EWF_calibration, KRR_calibration

# --------------- Visualization Imports ---------------
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import seaborn as sns

plt.style.use("tableau-colorblind10")
# sns.set()

# --------------- Model Imports ---------------
from scipy import stats
from sklearn.calibration import calibration_curve
from sklearn.metrics import pairwise_distances, pairwise_kernels, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import (
    LinearRegression,
    TheilSenRegressor,
    RANSACRegressor,
    HuberRegressor,
    Ridge,
)
from sklearn.pipeline import make_pipeline

In [9]:
# Constants: MUST be maintained.
# Column names shared across the notebook. `probability_col_name` is the column
# under which build_model stores the predicted probabilities.
# NOTE(review): `model_class_col_name` is not referenced by any cell shown here;
# presumably it is consumed by downstream tooling -- confirm before renaming.
model_class_col_name = "model_split_class"
probability_col_name = "probability"

In [10]:
def load_recidivism_data(file_name="../../KiTE-utils/notebooks/BROWARD_ORIGINAL.csv"):
    """Load the recidivism CSV and return a numerically encoded DataFrame.

    Parameters
    ----------
    file_name : str
        Path to a CSV file containing at least the columns ``sex``, ``age``,
        ``race``, ``juv_fel_count``, ``juv_misd_count``, ``priors_count``,
        ``c_charge_degree`` and ``is_recid``. Any other columns are discarded.

    Returns
    -------
    pandas.DataFrame
        The selected columns with ``c_charge_degree`` encoded as F -> 0, M -> 1,
        ``sex`` encoded as Male -> 0, Female -> 1, and ``race`` replaced by one
        0/1 indicator column per distinct race value. Values outside the listed
        codes are left unchanged by the encoding.
    """
    selected_columns = [
        "sex",
        "age",
        "race",
        "juv_fel_count",
        "juv_misd_count",
        "priors_count",
        "c_charge_degree",
        "is_recid",
    ]
    df = pd.read_csv(file_name)[selected_columns]

    # Assign the encoded Series back instead of calling
    # `df[col].replace(..., inplace=True)`: the in-place form mutates a temporary
    # under pandas Copy-on-Write and would leave `df` unencoded.
    df["c_charge_degree"] = df["c_charge_degree"].replace({"F": 0, "M": 1})
    df["sex"] = df["sex"].replace({"Male": 0, "Female": 1})

    # One-hot encode `race`. dtype=int keeps the indicators as 0/1 integers;
    # recent pandas versions would otherwise produce True/False columns.
    race_indicators = pd.get_dummies(df["race"], dtype=int)

    # Replace the original categorical column with its indicator columns.
    return df.drop("race", axis=1).join(race_indicators)

In [11]:
# Load the encoded recidivism data from the default CSV path and display it.
df = load_recidivism_data()
df

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,priors_count,c_charge_degree,is_recid,African-American,Asian,Caucasian,Hispanic,Native American,Other
0,0,69,0,0,0,0,0,0,0,0,0,0,1
1,0,34,0,0,0,0,1,1,0,0,0,0,0
2,0,24,0,0,4,0,1,1,0,0,0,0,0
3,0,23,0,1,1,0,0,1,0,0,0,0,0
4,0,43,0,0,2,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7209,0,23,0,0,0,0,0,1,0,0,0,0,0
7210,0,23,0,0,0,0,0,1,0,0,0,0,0
7211,0,57,0,0,0,0,0,0,0,0,0,0,1
7212,1,33,0,0,3,1,0,1,0,0,0,0,0


In [12]:
def build_model(df, random_state=None):
    """Train a random forest on part of ``df`` and score the held-out rows.

    The rows are shuffled and cut at 33% and 66% of the row count into
    train / validate / test subsets. A ``RandomForestClassifier`` is fitted on
    the train subset, and its predicted probability for class index 1 is
    attached to the validate and test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed below and the ``is_recid``
        target column.
    random_state : int or None, optional
        Seed forwarded to both the shuffle and the classifier. The default
        ``None`` keeps the original unseeded (non-reproducible) behaviour.

    Returns
    -------
    pandas.DataFrame
        The validate rows followed by the test rows, with a fresh 0..n-1 index
        and one extra column named by ``probability_col_name``. The training
        rows are not included.
    """
    features = [
        "age",
        "sex",
        "African-American",
        "juv_fel_count",
        "juv_misd_count",
        "priors_count",
        "c_charge_degree",
        "Asian",
        "Caucasian",
        "Hispanic",
        "Native American",
        "Other",
    ]
    target = "is_recid"

    # Shuffle, then slice positionally. iloc slicing yields the same three
    # subsets as np.split(frame, [a, b]) without going through the deprecated
    # DataFrame.swapaxes path that np.split uses on DataFrames.
    shuffled = df.sample(frac=1, random_state=random_state)
    n_rows = len(shuffled)
    train_end = int(0.33 * n_rows)
    validate_end = int(0.66 * n_rows)

    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    # Train the Random Forest model on the 1st subset of data (training set)
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train[features].to_numpy(), train[target].to_numpy())

    # Column 1 of predict_proba is the probability of the second class label.
    prob_cv = clf.predict_proba(validate[features].to_numpy())[:, 1]
    prob_test = clf.predict_proba(test[features].to_numpy())[:, 1]

    # assign() returns new frames, so the probability column is never written
    # onto a slice of `shuffled` (avoids SettingWithCopyWarning).
    validate = validate.assign(**{probability_col_name: prob_cv})
    test = test.assign(**{probability_col_name: prob_test})

    return pd.concat([validate, test]).reset_index(drop=True)

In [13]:
# NOTE: this rebinds `df` to build_model's output (validate + test rows with the
# probability column), so re-running this cell on its own feeds already-scored
# rows back into build_model. Re-run the loading cell first.
df = build_model(df)
df

Unnamed: 0,sex,age,juv_fel_count,juv_misd_count,priors_count,c_charge_degree,is_recid,African-American,Asian,Caucasian,Hispanic,Native American,Other,probability
0,0,25,0,0,1,1,1,0,0,0,1,0,0,0.493333
1,1,22,0,1,2,0,0,0,0,1,0,0,0,0.672167
2,0,24,0,0,0,1,1,1,0,0,0,0,0,0.495517
3,0,42,0,0,5,0,0,1,0,0,0,0,0,0.185714
4,1,42,0,0,0,0,1,0,0,1,0,0,0,0.505833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4829,0,23,0,0,10,0,1,0,0,1,0,0,0,0.880000
4830,0,65,0,0,11,0,0,1,0,0,0,0,0,0.370000
4831,0,20,1,0,1,0,1,1,0,0,0,0,0,1.000000
4832,0,38,0,0,1,1,0,1,0,0,0,0,0,0.288333


In [14]:
# Write the scored rows to CSV; index=False omits the row index column.
df.to_csv("../sample_data/compass.csv", index=False)