In [None]:
import folktables
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import dill as pkl
import numpy as np
import pdl

Pull the income dataset from folktables:

In [None]:
# Income prediction problem built on folktables. No target_transform is passed,
# so the PINCP target is kept as its raw value (a regression target) rather
# than being thresholded into a binary label.
ACSIncome = folktables.BasicProblem(
    features=[
        "ST",
        "AGEP",
        "CIT",
        "COW",
        "DDRS",
        "DEAR",
        "DEYE",
        "DOUT",
        "DRAT",
        "DREM",
        "ENG",
        "FER",
        "JWTRNS",
        "LANX",
        "MAR",
        "MIL",
        "SCHL",
        "SEX",
        'WKHP',
        "OCCP",
        "RAC1P"
    ],
    target='PINCP',
    preprocess=folktables.adult_filter,
    # Encode missing values as -1. `nan` has to be given by keyword: the second
    # positional parameter of np.nan_to_num is `copy`, so the earlier
    # `np.nan_to_num(x, -1)` silently replaced NaNs with 0.0 instead of -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

# Load the 2021 1-Year ACS person-level survey for the selected states
# (download=True permits fetching the files) and convert it to arrays.
data_source = folktables.ACSDataSource(
    survey_year='2021',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(
    states=[
        "VA", "TX", "WV", "KY", "FL", "OK", "TN", "AK",
        "SC", "AL", "NC", "LA", "MS", "MD", "GA", "DE",
    ],
    download=True,
)
data_np, labels, _ = ACSIncome.df_to_numpy(acs_data)

# Keep only the rows whose label is below 100000.
indices = labels < 100000
data_np, labels = data_np[indices], labels[indices]

# Wrap the feature matrix in a DataFrame; the column names repeat the feature
# list given to ACSIncome, in the same order.
data = pd.DataFrame(
    data_np,
    columns=[
        "ST", "AGEP", "CIT", "COW", "DDRS", "DEAR", "DEYE",
        "DOUT", "DRAT", "DREM", "ENG", "FER", "JWTRNS", "LANX",
        "MAR", "MIL", "SCHL", "SEX", 'WKHP', "OCCP", 'RAC1P',
    ],
)

# 70% of the rows go to training; the held-out 30% is then halved into
# validation and test sets. Both splits use a fixed seed for reproducibility.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    data, labels, test_size=.3, random_state=23
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=.5, random_state=23
)

Train the initial model that will serve as the base of the pointer decision list (PDL), and initialize the PDL with it.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Base model: a depth-1 regression tree fit on the training split.
reg = DecisionTreeRegressor(random_state=42, max_depth=1)
reg.fit(x_train, y_train)

# Initialize the PDL from the base model and the train/validation splits.
team_pdl = pdl.PointerDecisionList(
    reg,
    x_train,
    y_train,
    x_val,
    y_val,
    alpha=100000,
    min_group_size=1,
)

Define an initial group function `g` that checks whether an individual's `RAC1P` code is 1 (i.e., that they are encoded as White alone). Then fit a decision tree regressor of depth 5 and use its `predict` method as the model `h`. Here we train it only on the group of interest, but any other choice of model would work as well.

In [None]:
def g(X):
    """Group-membership function: flag the rows of X whose RAC1P value is 1.

    X is indexed by the 'RAC1P' column name; the result is the elementwise
    comparison of that column with 1 (True where the row is in the group).
    """
    race_code = X['RAC1P']
    return race_code == 1

# Candidate model h: a depth-5 regression tree fit only on the training rows
# that belong to the group (same membership test as g, computed once here).
in_group = x_train['RAC1P'] == 1

# Fix the seed, as the base model does: scikit-learn permutes the features at
# every split, so ties between equally good splits could otherwise be resolved
# differently from one run to the next.
reg = sklearn.tree.DecisionTreeRegressor(max_depth=5, random_state=42)
reg.fit(x_train[in_group], y_train[in_group])

# The PDL update takes the prediction function itself, not the estimator.
h = reg.predict

Run an update on the PDL; if it accepts the update it returns True. 

In [None]:
# Offer the (g, h) pair to the PDL. The call is left as the cell's final
# expression so its result (True when the update is accepted) is displayed.
team_pdl.update(
    g, h,
    x_train, y_train,
    x_val, y_val,
)