# Random Forest Classifier

This is a fast sklearn classifier, to investigate the model performance in the exploration phase. Further tuning of hyperparameters must be done. Also, the data needs further preprocessing, so we leave out all the object type columns.

In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
import math

import glob

ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']
path = './data/preprocessing_IV/'

def read_train_val_test(name, path):
    train = pd.read_csv(path + name + '_train.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    train = train.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    validate = pd.read_csv(path + name + '_validate.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    validate = validate.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    test = pd.read_csv(path + name + '_test.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    test = test.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    return train, validate, test

def readHyperparams():
    return pd.read_csv('hyperparameters.csv')

In [6]:
# split features and labels
def splitFeaturesLabels(df):
    X = df.copy()
    X = X.drop('has_kidney_issue', axis=1)
    # X = df[['anchor_age', 'anchor_year']]
    return X, df.has_kidney_issue

### Train each local model

In [7]:
def getBestParams(hyperparams, ethnicity):
    # get the hyperparameters corresponding to the ethnicity
    return hyperparams.loc[hyperparams['ethnicity']==ethnicity]

In [8]:
def trainModel(name, path, all_params):
    # train a model with the specified params
    train, val, test = read_train_val_test(name, path)
    hyperparams = getBestParams(all_params, name)
    X_train, y_train = splitFeaturesLabels(train)
    clf = RandomForestClassifier(n_estimators=int(hyperparams['n_estimators']),
                                 max_depth=int(hyperparams['max_depth']),
                                 random_state=42)
    clf.fit(X_train, y_train)
    return clf

In [11]:
models = []
params = readHyperparams()
for eth in ethnic_group_names:
    models.append(trainModel(eth, path, params))
print(len(models))

8


### Aggregate models into federated forest

In [9]:
def aggregateForests(agg, model, n):
    # Aggregate the estimators of
    size = model.n_estimators
    dupl = math.floor(n/size)
    for i in range(dupl):
        agg.estimators_ += model.estimators_
        agg.n_estimators = len(agg.estimators_)
    return agg

In [21]:
def getMaxForest(model_list):
    # normalize over the maximum number of trees so each model is weighted the same in the aggregated model
    sizes = [len(model.estimators_) for model in model_list]
    return max(sizes)

In [12]:
def federatedForest(model_list):
    # Aggregate all models and normalize
    ff = model_list[0]
    max_size = getMaxForest(model_list)
    for i, model in enumerate(model_list):
        if i > 0:
            ff = aggregateForests(ff, model_list[i], max_size)
    return ff

In [18]:
fed = federatedForest(models)

### Model evaluations

In [15]:
def evaluateMetrics(model, X, y):
    y_pr = model.predict(X)
    accuracy = accuracy_score(y, y_pr)
    precision = precision_score(y, y_pr)
    recall = recall_score(y, y_pr)
    auc = roc_auc_score(y, y_pr)
    return accuracy, precision, recall, auc

In [28]:
# for each local model and the federated model, evaluate metrics like this:

def printMetrics(model, ethnicity, path):
    train, val, test = read_train_val_test(ethnicity, path)
    X, y = splitFeaturesLabels(test)
    acc, prec, rec, auc = evaluateMetrics(model, X, y)
    print('Local model:\t\t' + ethnicity)
    print('Accuracy: \t\t\t' + str(acc))
    print('Precision: \t\t\t' + str(prec))
    print('Recall: \t\t\t' + str(rec))
    print('AUC: \t\t\t\t' + str(auc) + '\n\n')

def printFedMetrics(model, ethnicity, path):
    train, val, test = read_train_val_test(ethnicity, path)
    X, y = splitFeaturesLabels(test)
    acc, prec, rec, auc = evaluateMetrics(model, X, y)
    print('Federated model: \t' + ethnicity)
    print('Accuracy: \t\t\t' + str(acc))
    print('Precision: \t\t\t' + str(prec))
    print('Recall: \t\t\t' + str(rec))
    print('AUC: \t\t\t\t' + str(auc) + '\n\n')

In [29]:
for i in range(len(models)):
    # Since 'unknown' is overwritten as the aggregated model, redefine it
    if i == 0:
        unknown = trainModel('unknown', path, params)
        printMetrics(unknown, ethnic_group_names[i], path)
    else:
        printMetrics(models[i], ethnic_group_names[i], path)
    printFedMetrics(fed, ethnic_group_names[i], path)


Local model:		unknown
Accuracy: 			0.8856848609680742
Precision: 			0.6822033898305084
Recall: 			0.5227272727272727
AUC: 				0.7384138199621676


Federated model: 	unknown
Accuracy: 			0.8764160659114315
Precision: 			0.6055900621118012
Recall: 			0.6331168831168831
AUC: 				0.7776967524519544


Local model:		white
Accuracy: 			0.8174476393326233
Precision: 			0.721097615834458
Recall: 			0.394002703699152
AUC: 				0.6728458214019195




KeyboardInterrupt: 