In [25]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from data_loader import ACSEmploymentDataset

from data_loader import CompasDataset, ACSEmploymentDataset

In [27]:
# IPython shell escape: asks Homebrew to install the `lightgbm` formula.
# NOTE(review): this runs *after* the import cell that already imports
# LGBMClassifier, and it only works where Homebrew is available. The captured
# output below suggests version 3.3.5 was already installed -- confirm whether
# this cell is still needed, or move the requirement into environment setup.
! brew install lightgbm

[34m==>[0m [1mDownloading https://formulae.brew.sh/api/formula.jws.json[0m
######################################################################### 100.0%
[34m==>[0m [1mDownloading https://formulae.brew.sh/api/cask.jws.json[0m
######################################################################### 100.0%
To reinstall 3.3.5, run:
  brew reinstall lightgbm


In [38]:
# Which slice of the folktables data to load; both values are handed to
# ACSEmploymentDataset in the data-loading cell.
folktables_state = 'GA'
folktables_year = 2018

# Candidate learners, keyed by the short name that also indexes `params`.
models = dict(
    rf=RandomForestClassifier(random_state=0),
    lgbm=LGBMClassifier(random_state=0),
)

# Every grid key carries the `learner__` prefix so that it addresses the
# 'learner' step of the Pipeline built in the search cell.

# Random forest search space.
rf_random_grid = dict(
    learner__n_estimators=[100, 200, 500, 700, 1000],
    # Depths 10, 20, ..., 100 plus None.
    learner__max_depth=[*range(10, 101, 10), None],
    learner__min_samples_split=[2, 5, 10],
    learner__min_samples_leaf=[1, 2, 4],
    learner__bootstrap=[True, False],
)

# LightGBM search space: ten evenly spaced values (truncated to int) for the
# two leaf-related parameters, and every integer depth from 3 to 11.
lgbm_random_grid = dict(
    learner__max_depth=list(range(3, 12)),
    learner__num_leaves=[int(v) for v in np.linspace(20, 3000, num=10)],
    learner__min_data_in_leaf=[int(v) for v in np.linspace(100, 1000, num=10)],
)

# Model name -> search space, looked up with the same keys as `models`.
params = dict(rf=rf_random_grid, lgbm=lgbm_random_grid)

In [29]:
# Build the dataset object for the configured state and year.
# NOTE(review): the exact meaning of `with_nulls`, `optimize` and `subsample`
# is defined in data_loader, which is not visible here -- confirm there.
folk = ACSEmploymentDataset(
    state=[folktables_state],
    year=folktables_year,
    with_nulls=False,
    optimize=False,
    subsample=20000,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[self.target] = acs_data[self.target].apply(lambda x: int(x == 1))


In [30]:
# Bare last expression: renders the table held in `folk.dataset` as the cell
# output (the display below elides the middle rows with "...").
folk.dataset

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
7477,50,16.0,1,1,2,0.0,1,1.0,4.0,2,1,2,2,2.0,1,1,1
96089,66,17.0,1,0,2,0.0,4,1.0,4.0,4,2,2,2,2.0,1,1,0
82289,1,0.0,5,2,2,7.0,1,1.0,0.0,1,1,2,2,0.0,1,1,0
99909,64,16.0,1,0,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,2,0
12128,33,21.0,1,1,2,0.0,1,1.0,4.0,1,1,2,2,2.0,2,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98050,45,21.0,1,1,1,0.0,1,1.0,4.0,1,1,1,2,2.0,2,2,1
62908,27,22.0,1,1,2,0.0,1,1.0,4.0,4,1,2,2,2.0,1,1,1
74880,73,10.0,1,1,1,0.0,1,1.0,4.0,4,1,2,2,1.0,2,1,0
95160,46,15.0,1,0,2,0.0,4,1.0,4.0,3,2,2,2,2.0,1,2,1


In [39]:
# Preprocessing shared by every candidate pipeline: one-hot encode the
# categorical columns (categories unseen at fit time are ignored at transform
# time) and standardise the numerical columns.
encoder = ColumnTransformer(
    transformers=[
        (
            'categorical_features',
            OneHotEncoder(categories='auto', handle_unknown='ignore'),
            folk.categorical_columns,
        ),
        ('numerical_features', StandardScaler(), folk.numerical_columns),
    ]
)

# 80/20 split with a fixed seed; only `train` is used by the search below.
train, test = train_test_split(folk.dataset, test_size=0.2, random_state=0)

# Model name -> best hyperparameters found by the randomized search.
best_params = {}
for model_name, learner in models.items():
    # The step name 'learner' is what the `learner__*` grid keys refer to.
    model = Pipeline(steps=[('features', encoder), ('learner', learner)])
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params[model_name],
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(train[folk.features], train[folk.target])
    best_params[model_name] = search.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner__n_estimators=200; total time=   1.2s
[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner__n_estimators=200; total time=   1.2s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   1.9s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   2.0s
[CV] END learner__bootstrap=True, learner__max_depth=80, learner__min_samples_leaf=2, learner__min_samples_split=2, learner__n_estimators=200; total time=   2.0s
[CV] END learner__bootstrap=False, learner__max_depth=10, learner__min_samples_leaf=1, learner__min_samples_split=5, learner_

In [40]:
# Bare last expression: displays the dict mapping each model name to the
# hyperparameters stored from `search.best_params_`.
best_params

{'rf': {'learner__n_estimators': 200,
  'learner__min_samples_split': 5,
  'learner__min_samples_leaf': 2,
  'learner__max_depth': 20,
  'learner__bootstrap': True},
 'lgbm': {'learner__num_leaves': 20,
  'learner__min_data_in_leaf': 100,
  'learner__max_depth': 11}}

In [41]:
import json

# Persist the tuned hyperparameters as JSON in the working directory.
with open('best_params_folk.json', 'w') as out_file:
    out_file.write(json.dumps(best_params))