# Random Forest Classifier

This is a fast sklearn classifier, to investigate the model performance in the exploration phase. Further tuning of hyperparameters must be done. Also, the data needs further preprocessing, so we leave out all the object type columns.

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np

import glob

ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']
path = './data/preprocessing_IV/'

def read_train_val_test(name, path):
    train = pd.read_csv(path + name + '_train.csv')
    validate = pd.read_csv(path + name + '_validate.csv')
    test = pd.read_csv(path + name + '_test.csv')
    return train, validate, test

def readHyperparams():
    return pd.read_csv('hyperparameters.csv')

In [18]:
# split features and labels
def splitFeaturesLabels(df):
    # X = df.copy()
    # X.drop('has_kidney_issue', axis=1)
    X = df[['anchor_age', 'anchor_year']]
    return X, df.has_kidney_issue

In [19]:
def getBestParams(hyperparams, ethnicity):
    return hyperparams.loc[hyperparams['ethnicity']==ethnicity]

In [20]:
def trainModel(name, path, all_params):
    train, val, test = read_train_val_test(name, path)
    hyperparams = getBestParams(all_params, name)
    X_train, y_train = splitFeaturesLabels(train)
    clf = RandomForestClassifier(n_estimators=int(hyperparams['n_estimators']),
                                 max_depth=int(hyperparams['max_depth']),
                                 random_state=42)
    clf.fit(X_train, y_train)
    return clf

In [21]:
def aggregateForests(agg, model):
    agg.estimators_ += model.estimators_
    agg.n_estimators = len(model.estimators_)
    return agg

In [22]:
models = []
params = readHyperparams()
for eth in ethnic_group_names:
    models.append(trainModel(eth, path, params))
print(len(models))

8


In [23]:
def federatedForest(model_list):
    ff = model_list[0]
    for i, model in enumerate(model_list):
        if i > 0:
            ff = aggregateForests(ff, model_list[i])
    return ff

In [24]:
fed_test = federatedForest(models)

In [25]:
train, val, test = read_train_val_test('white', path)
X_test, y_test = splitFeaturesLabels(test)
y_pred = fed_test.predict(X_test)

In [26]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.7592888415572122
0.0
0.0
0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from pprint import pprint
clf = RandomForestClassifier(random_state=42)
pprint(clf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [28]:
# for each local model and the federated model, evaluate metrics like this:

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
