# Random Forest Classifier

This is a fast sklearn classifier, to investigate the model performance in the exploration phase. Further tuning of hyperparameters must be done. Also, the data needs further preprocessing, so we leave out all the object type columns.

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
import math

import glob

ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']
path = './data/preprocessing_IV/'

def read_train_val_test(name, path):
    train = pd.read_csv(path + name + '_train.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    train = train.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    validate = pd.read_csv(path + name + '_validate.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    validate = validate.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    test = pd.read_csv(path + name + '_test.csv')
    # NOTE: drope datetime columns for now, because sklearn does not support it
    test = test.drop(columns=['admittime', 'edregtime', 'emar_charttime', 'lab_charttime'])
    return train, validate, test

def readHyperparams():
    return pd.read_csv('hyperparameters.csv')

In [3]:
# split features and labels
def splitFeaturesLabels(df):
    X = df.copy()
    X = X.drop('has_kidney_issue', axis=1)
    # X = df[['anchor_age', 'anchor_year']]
    return X, df.has_kidney_issue

### Train each local model

In [4]:
def getBestParams(hyperparams, ethnicity):
    # get the hyperparameters corresponding to the ethnicity
    return hyperparams.loc[hyperparams['ethnicity']==ethnicity]

In [5]:
def trainModel(name, path, all_params):
    # train a model with the specified params
    train, val, test = read_train_val_test(name, path)
    hyperparams = getBestParams(all_params, name)
    X_train, y_train = splitFeaturesLabels(train)
    clf = RandomForestClassifier(n_estimators=int(hyperparams['n_estimators']),
                                 max_depth=int(hyperparams['max_depth']),
                                 random_state=42)
    clf.fit(X_train, y_train)
    return clf

In [6]:
models = []
params = readHyperparams()
for eth in ethnic_group_names:
    models.append(trainModel(eth, path, params))
print(len(models))

8


### Aggregate models into federated forest

In [7]:
def aggregateForests(agg, model, n):
    # Aggregate the estimators of
    size = model.n_estimators
    dupl = math.floor(n/size)
    for i in range(dupl):
        agg.estimators_ += model.estimators_
        agg.n_estimators = len(agg.estimators_)
    return agg

In [8]:
def getMaxForest(model_list):
    # normalize over the maximum number of trees so each model is weighted the same in the aggregated model
    sizes = [len(model.estimators_) for model in model_list]
    return max(sizes)

In [9]:
def federatedForest(model_list):
    # Aggregate all models and normalize
    ff = model_list[0]
    max_size = getMaxForest(model_list)
    for i, model in enumerate(model_list):
        if i > 0:
            ff = aggregateForests(ff, model_list[i], max_size)
    return ff

In [10]:
fed = federatedForest(models)

### Model evaluations

In [11]:
def evaluateMetrics(model, X, y):
    y_pr = model.predict(X)
    accuracy = accuracy_score(y, y_pr)
    precision = precision_score(y, y_pr)
    recall = recall_score(y, y_pr)
    auc = roc_auc_score(y, y_pr)
    return accuracy, precision, recall, auc

In [18]:
# for each local model and the federated model, evaluate metrics like this:

def printMetrics(model, ethnicity, path):
    train, val, test = read_train_val_test(ethnicity, path)
    X, y = splitFeaturesLabels(test)
    acc, prec, rec, auc = evaluateMetrics(model, X, y)
    print('Local model:\t\t' + ethnicity)
    print('Accuracy: \t\t\t' + str(acc))
    print('Precision: \t\t\t' + str(prec))
    print('Recall: \t\t\t' + str(rec))
    print('AUC: \t\t\t\t' + str(auc) + '\n\n')

def printFedMetrics(model, ethnicity, path):
    train, val, test = read_train_val_test(ethnicity, path)
    X, y = splitFeaturesLabels(test)
    acc, prec, rec, auc = evaluateMetrics(model, X, y)
    print('Federated model: \t' + ethnicity)
    print('Accuracy: \t\t\t' + str(acc))
    print('Precision: \t\t\t' + str(prec))
    print('Recall: \t\t\t' + str(rec))
    print('AUC: \t\t\t\t' + str(auc) + '\n\n')

def metricsDF(model_type, model, ethnicity, path):
    train, val, test = read_train_val_test(ethnicity, path)
    X, y = splitFeaturesLabels(test)
    acc, prec, rec, auc = evaluateMetrics(model, X, y)
    return [model_type, ethnicity, round(acc, 4), round(prec,4), round(rec, 4), round(auc, 4)]

In [19]:
dfs = []
columns = ['Model', 'Ethnicity', 'Accuracy', 'Precision', 'Recall', 'AUC']

for i in range(len(models)):
    # Since 'unknown' is overwritten as the aggregated model, redefine it
    if i == 0:
        unknown = trainModel('unknown', path, params)
        printMetrics(unknown, ethnic_group_names[i], path)
        dfs.append(metricsDF('Local', unknown, ethnic_group_names[i], path))
    else:
        printMetrics(models[i], ethnic_group_names[i], path)
        dfs.append(metricsDF('Local', models[i], ethnic_group_names[i], path))
    printFedMetrics(fed, ethnic_group_names[i], path)
    dfs.append(metricsDF('Federated', fed, ethnic_group_names[i], path))



Local model:		unknown
Accuracy: 			0.8851699279093718
Precision: 			0.6855895196506551
Recall: 			0.5097402597402597
AUC: 				0.7328383061247199


Federated model: 	unknown
Accuracy: 			0.8748712667353244
Precision: 			0.6006191950464397
Recall: 			0.6298701298701299
AUC: 				0.7754613807245386


Local model:		white
Accuracy: 			0.8144894095373328
Precision: 			0.7039790118058592
Recall: 			0.3957232395231658
AUC: 				0.6714853389340611


Federated model: 	white
Accuracy: 			0.8155543722636375
Precision: 			0.6778900112233446
Recall: 			0.445372987587563
AUC: 				0.689141474897923


Local model:		other
Accuracy: 			0.8319672131147541
Precision: 			0.5949367088607594
Recall: 			0.2842741935483871
AUC: 				0.6201992539954001


Federated model: 	other
Accuracy: 			0.8438897168405365
Precision: 			0.6222222222222222
Recall: 			0.3951612903225806
AUC: 				0.6703868608834109


Local model:		asian
Accuracy: 			0.8715334420880914
Precision: 			0.6263345195729537
Recall: 			0.45595854922279794


In [31]:
df = pd.DataFrame(dfs, columns=columns)
df.head()

Unnamed: 0,Model,Ethnicity,Accuracy,Precision,Recall,AUC
0,Local,unknown,0.8852,0.6856,0.5097,0.7328
1,Federated,unknown,0.8749,0.6006,0.6299,0.7755
2,Local,white,0.8145,0.704,0.3957,0.6715
3,Federated,white,0.8156,0.6779,0.4454,0.6891
4,Local,other,0.832,0.5949,0.2843,0.6202


In [35]:
df.to_csv('./metrics.csv', index=False)

In [34]:
df.loc['mean1'] = df[df['Model']=='Local'].mean()
df.loc['mean2'] = df[df['Model']=='Federated'].mean()


  df.loc['mean1'] = df[df['Model']=='Local'].mean()
  df.loc['mean2'] = df[df['Model']=='Federated'].mean()


In [30]:
df[df['Model']=='Local']

Unnamed: 0,Model,Ethnicity,Accuracy,Precision,Recall,AUC
0,Local,unknown,0.8852,0.6856,0.5097,0.7328
1,Federated,unknown,0.8749,0.6006,0.6299,0.7755
2,Local,white,0.8145,0.704,0.3957,0.6715
3,Federated,white,0.8156,0.6779,0.4454,0.6891
4,Local,other,0.832,0.5949,0.2843,0.6202
5,Federated,other,0.8439,0.6222,0.3952,0.6704
6,Local,asian,0.8715,0.6263,0.456,0.7026
7,Federated,asian,0.8752,0.6695,0.4093,0.6858
8,Local,hispanic_latino,0.8341,0.6885,0.3013,0.6338
9,Federated,hispanic_latino,0.8391,0.7116,0.3199,0.6439


In [36]:
df

Unnamed: 0,Model,Ethnicity,Accuracy,Precision,Recall,AUC
0,Local,unknown,0.8852,0.6856,0.5097,0.7328
1,Federated,unknown,0.8749,0.6006,0.6299,0.7755
2,Local,white,0.8145,0.704,0.3957,0.6715
3,Federated,white,0.8156,0.6779,0.4454,0.6891
4,Local,other,0.832,0.5949,0.2843,0.6202
5,Federated,other,0.8439,0.6222,0.3952,0.6704
6,Local,asian,0.8715,0.6263,0.456,0.7026
7,Federated,asian,0.8752,0.6695,0.4093,0.6858
8,Local,hispanic_latino,0.8341,0.6885,0.3013,0.6338
9,Federated,hispanic_latino,0.8391,0.7116,0.3199,0.6439
