In [1]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from data_loader import ACSEmploymentDataset
from folk_utils import format_params, initialize_base_model, set_protected_groups_by_input, set_protected_groups_config, intialize_splits_with_cluster_labels, predict_with_subdomain_model




In [2]:
# Optional one-time setup step (Homebrew); left commented out so it does not
# run as part of the notebook. Uncomment to execute it from this cell.
#! brew install lightgbm

In [3]:
# Optional one-time setup step; left commented out so it does not run as part
# of the notebook. Uncomment to upgrade threadpoolctl via pip.
#! pip install --upgrade threadpoolctl

In [4]:
# Which Folktables slice to load.
folktables_year = 2018
folktables_state = 'GA'

# Experiment grid: the training loop iterates over every seed and, for each
# seed, over every number of clusters.
num_clusters_lst = list(range(1, 9))        # 1, 2, ..., 8
seed_lst = list(range(100, 1001, 100))      # 100, 200, ..., 1000

In [5]:
# Build the dataset object for the configured state and year.
folk = ACSEmploymentDataset(
    state=[folktables_state],
    year=folktables_year,
    with_nulls=False,
    optimize=False,
    subsample=20000,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))


In [6]:
# Display the loaded frame (bare last expression -> rich notebook output) as a
# quick sanity check of the columns and values before training.
folk.dataset

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
57859,59,20.0,1,1,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,1
94503,13,10.0,5,2,2,2.0,1,1.0,0.0,1,1,2,2,2.0,2,1,0
85297,81,10.0,2,6,1,0.0,1,1.0,4.0,4,1,2,2,2.0,2,9,0
89183,28,20.0,1,0,2,0.0,1,3.0,4.0,1,1,2,2,2.0,1,1,1
37297,53,16.0,1,0,2,0.0,1,1.0,4.0,4,1,2,2,2.0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5577,28,20.0,1,1,1,0.0,1,1.0,4.0,4,1,1,2,1.0,1,2,0
19406,60,13.0,1,1,2,0.0,1,1.0,4.0,3,1,2,2,2.0,2,1,0
57983,29,16.0,5,0,2,0.0,1,1.0,2.0,1,1,2,2,2.0,1,6,0
90793,68,19.0,1,1,2,0.0,1,1.0,4.0,2,1,2,2,2.0,2,1,1


In [7]:
import json

# Load the tuned hyper-parameters that are later handed to
# initialize_base_model. The `with` block closes the file handle as soon as
# the JSON has been parsed, instead of leaving it open for the kernel's
# lifetime.
with open('best_params_folk.json') as f:
    saved_params_after_tuning = json.load(f)

In [8]:
# Column-oriented accumulator for the experiment results: every key maps to
# its own (initially empty) list of per-run values.
res_df = {
    column: []
    for column in ("seed", "model_name", "accuracy", "f1_score", "training_time", "n_clusters")
}

In [9]:
# For every (seed, number of clusters, model type) combination: fit one model
# per training group, predict on the test groups, and append the scores and
# the total fitting time to res_df.
for SEED in seed_lst:
    print(SEED)
    for num_clusters in num_clusters_lst:
        protected_groups = set_protected_groups_config(num_clusters)
        train_group_names = list(protected_groups.keys())
        train, test = intialize_splits_with_cluster_labels(SEED=SEED, k=num_clusters, dataset=folk, test_size=0.2)
        train_groups = set_protected_groups_by_input(train, protected_groups)
        test_groups = set_protected_groups_by_input(test, protected_groups)

        for model_name in ['rf', 'lgbm']:
            models = {}
            t_init = datetime.now()
            # One model per training group; SEED+i hands each group's model a
            # different seed value.
            for i, train_group in enumerate(train_groups.keys()):
                models[train_group] = initialize_base_model(model_name, saved_params_after_tuning, SEED=SEED+i)
                models[train_group].fit(train_groups[train_group][folk.features], train_groups[train_group][folk.target])
            t_end = datetime.now()
            # total_seconds() returns the full elapsed time as a float.
            # timedelta.seconds is only the whole-second component (0..86399),
            # so it records 0 for sub-second fits and ignores the days part.
            train_time = (t_end - t_init).total_seconds()
            y_true, y_pred = predict_with_subdomain_model(models, train_group_names, test_groups, folk.features, folk.target)
            res_df["seed"].append(SEED)
            res_df["model_name"].append(model_name)
            res_df["accuracy"].append(accuracy_score(y_true, y_pred))
            res_df["f1_score"].append(f1_score(y_true, y_pred))
            res_df["training_time"].append(train_time)
            res_df["n_clusters"].append(num_clusters)

100




100




100




KeyboardInterrupt: 

In [None]:
# Convert the accumulated per-run lists into a DataFrame, then show how many
# runs were recorded for each model type.
results = pd.DataFrame(data=res_df)
results["model_name"].value_counts()

In [None]:
# Distribution of accuracy over the recorded runs for each number of clusters,
# split by model type. Title and axis labels let the figure stand on its own;
# the trailing ';' suppresses the text repr of the last expression.
ax = sns.boxplot(data=results, x='n_clusters', y='accuracy', hue='model_name')
ax.set(title='Accuracy by number of clusters', xlabel='Number of clusters', ylabel='Accuracy');

In [None]:
# Distribution of F1 score over the recorded runs for each number of clusters,
# split by model type. Title and axis labels let the figure stand on its own;
# the trailing ';' suppresses the text repr of the last expression.
ax = sns.boxplot(data=results, x='n_clusters', y='f1_score', hue='model_name')
ax.set(title='F1 score by number of clusters', xlabel='Number of clusters', ylabel='F1 score');

In [None]:
# Distribution of total training time (recorded in seconds) over the runs for
# each number of clusters, split by model type. Title and axis labels let the
# figure stand on its own; the trailing ';' suppresses the text repr of the
# last expression.
ax = sns.boxplot(data=results, x='n_clusters', y='training_time', hue='model_name')
ax.set(title='Training time by number of clusters', xlabel='Number of clusters', ylabel='Training time (s)');