In [1]:
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from data_loader import ACSEmploymentDataset
from folk_utils import format_params, initialize_base_model, set_protected_groups_by_input, set_protected_groups_config, intialize_splits_with_cluster_labels, predict_with_subdomain_model




In [2]:
#! brew install lightgbm

In [3]:
#! pip install --upgrade threadpoolctl

In [4]:
states = ['NY', 'CA', 'MS', 'LA']
folktables_year = 2018
data_with_subdomains = pd.DataFrame({})
for state in states:
    folk = ACSEmploymentDataset(state=[state], year=folktables_year, with_nulls=False, optimize=False, subsample = 2500)
    data_with_subdomains = data_with_subdomains.append(folk.dataset, ignore_index=True)
    print(folk.dataset[folk.target].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))
  data_with_subdomains = data_with_subdomains.append(folk.dataset, ignore_index=True)


0    1341
1    1159
Name: ESR, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))
  data_with_subdomains = data_with_subdomains.append(folk.dataset, ignore_index=True)


0    1310
1    1190
Name: ESR, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))
  data_with_subdomains = data_with_subdomains.append(folk.dataset, ignore_index=True)


0    1497
1    1003
Name: ESR, dtype: int64
0    1424
1    1076
Name: ESR, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))
  data_with_subdomains = data_with_subdomains.append(folk.dataset, ignore_index=True)


In [5]:
data_with_subdomains[folk.target].value_counts()

0    5572
1    4428
Name: ESR, dtype: int64

In [6]:
folk.update_data(data_with_subdomains)

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,75,16.0,1,1,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0
1,46,22.0,1,0,2,0.0,1,1.0,4.0,2,1,2,2,2.0,1,1,1
2,25,17.0,5,15,2,0.0,1,3.0,4.0,4,1,2,2,2.0,1,2,0
3,63,1.0,5,17,1,0.0,1,1.0,4.0,1,1,2,2,2.0,1,1,0
4,10,7.0,5,2,1,7.0,1,1.0,0.0,1,1,2,2,1.0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,74,21.0,1,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0
9996,48,22.0,3,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,1
9997,63,16.0,1,0,2,0.0,1,1.0,2.0,1,1,2,2,2.0,1,1,1
9998,45,19.0,1,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0


In [7]:
folk.dataset

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,75,16.0,1,1,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0
1,46,22.0,1,0,2,0.0,1,1.0,4.0,2,1,2,2,2.0,1,1,1
2,25,17.0,5,15,2,0.0,1,3.0,4.0,4,1,2,2,2.0,1,2,0
3,63,1.0,5,17,1,0.0,1,1.0,4.0,1,1,2,2,2.0,1,1,0
4,10,7.0,5,2,1,7.0,1,1.0,0.0,1,1,2,2,1.0,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,74,21.0,1,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0
9996,48,22.0,3,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,1
9997,63,16.0,1,0,2,0.0,1,1.0,2.0,1,1,2,2,2.0,1,1,1
9998,45,19.0,1,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,1,0


In [8]:
seed_lst = [100, 200, 300]
num_clusters_lst = [1, 2, 3, 4, 5, 6, 7]

base_models = {
    'rf': RandomForestClassifier(random_state=0),
    'lgbm': LGBMClassifier(random_state=0)
}

rf_random_grid = {'learner__n_estimators': [100, 200, 500, 700, 1000],
               'learner__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               'learner__min_samples_split': [2, 5, 10],
               'learner__min_samples_leaf': [1, 2, 4],
               'learner__bootstrap': [True, False]}

lgbm_random_grid = {
                'learner__max_depth' : [i for i in range(3,12)],
                'learner__num_leaves' : [int(x) for x in np.linspace(start = 20, stop = 3000, num = 10)],
                'learner__min_data_in_leaf' : [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
}

params = {
    'rf': rf_random_grid,
    'lgbm': lgbm_random_grid
}

In [9]:
res_df = {"seed": [], "model_name": [], "accuracy": [], "f1_score":[], "n_clusters": []}
best_params = {"seed": [], "train_group":[], "params":[], "training_time": []}

In [10]:
encoder = ColumnTransformer(transformers=[
                        ('categorical_features', OneHotEncoder(categories='auto', handle_unknown='ignore'), folk.categorical_columns),
                        ('numerical_features', StandardScaler(), folk.numerical_columns)])

for SEED in seed_lst:
    print(SEED)
    for num_clusters in  num_clusters_lst:
        protected_groups = set_protected_groups_config(num_clusters)
        train_group_names = list(protected_groups.keys())
        train, test = intialize_splits_with_cluster_labels(SEED=SEED, k=num_clusters, dataset=folk, test_size=0.2)
        train_groups = set_protected_groups_by_input(train, protected_groups) 
        test_groups = set_protected_groups_by_input(test, protected_groups) 
        
        for model_name in ['rf', 'lgbm']:
            trained_models = {}
            
            for i, train_group in enumerate(train_groups.keys()):
                model = Pipeline([
                                    ('features', encoder),
                                    ('learner', base_models[model_name])
                        ])
                search = RandomizedSearchCV(estimator = model, param_distributions = params[model_name], n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
                search.fit(train_groups[train_group][folk.features], train_groups[train_group][folk.target])
                best_params["seed"].append(SEED)
                best_params["train_group"].append(train_group)
                best_params["params"].append(search.best_params_)
                
                #t_init = datetime.now()
                trained_models[train_group] = search.best_estimator_
                #trained_models[train_group] = initialize_base_model(model_name, search.best_params_, SEED=SEED+i)
                trained_models[train_group].fit(train_groups[train_group][folk.features], train_groups[train_group][folk.target])
                #t_end = datetime.now()
                #best_params["training_time"] = (t_end - t_init).seconds
            
            y_true, y_pred = predict_with_subdomain_model(trained_models, train_group_names, test_groups, folk.features, folk.target)
            res_df["seed"].append(SEED)
            res_df["model_name"].append(model_name)
            res_df["accuracy"].append(accuracy_score(y_true, y_pred))
            res_df["f1_score"].append(f1_score(y_true, y_pred))
            #res_df["training_time"].append(train_time)
            res_df["n_clusters"].append(num_clusters)

100
Fitting 3 folds for each of 100 candidates, totalling 300 fits




[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner__n_estimators=200; total time=   2.0s
[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner__n_estimators=200; total time=   2.0s
[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner__n_estimators=200; total time=   1.8s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   4.3s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   4.4s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   4.4s


KeyboardInterrupt: 

In [None]:
results = pd.DataFrame(res_df)

In [None]:
results

In [None]:
sns.boxplot(data=results, x='n_clusters', y='accuracy', hue='model_name')

In [None]:
sns.boxplot(data=results, x='n_clusters', y='f1_score', hue='model_name')

In [None]:
sns.boxplot(data=results, x='n_clusters', y='training_time', hue='model_name')