# Random Forest Classifier

This notebook trains a quick scikit-learn random forest classifier to gauge model performance during the exploration phase. The hyperparameters still need further tuning. The data also needs further preprocessing, so all object-type columns are left out for now.

In [1]:
import glob
from copy import deepcopy
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Group names; each one is used as a CSV file-name prefix and as a key into
# the `ethnicity` column of the hyperparameter table.
ethnic_group_names = [
    'unknown',
    'white',
    'other',
    'asian',
    'hispanic_latino',
    'black_african_american',
    'unable_to_obtain',
    'american_indian_alaska_native',
]
# Prefix prepended to every per-group CSV file name.
path = './data/preprocessing_IV/'

def read_train_val_test(name, path):
    """Load the train, validate and test CSV files belonging to one group.

    File names are built by plain string concatenation:
    ``path + name + '_train.csv'`` (and likewise for ``_validate.csv`` and
    ``_test.csv``), so ``path`` must already end with a separator.

    Parameters
    ----------
    name : str
        Group name used as the file-name stem.
    path : str
        Prefix prepended to the file name.

    Returns
    -------
    tuple
        ``(train, validate, test)`` DataFrames, in that order.
    """
    prefix = path + name
    train, validate, test = (
        pd.read_csv(prefix + suffix)
        for suffix in ('_train.csv', '_validate.csv', '_test.csv')
    )
    return train, validate, test

def readHyperparams(path='hyperparameters.csv'):
    """Read the per-group hyperparameter table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``'hyperparameters.csv'`` in
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(path)

In [2]:
# split features and labels
def splitFeaturesLabels(df):
    """Separate the model inputs from the target column.

    Only ``anchor_age`` and ``anchor_year`` are used as features; the label
    is the ``has_kidney_issue`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three columns named above.

    Returns
    -------
    tuple
        ``(X, y)``: the two-column feature frame and the label Series.
    """
    feature_columns = ['anchor_age', 'anchor_year']
    features = df[feature_columns]
    labels = df.has_kidney_issue
    return features, labels

In [3]:
def getBestParams(hyperparams, ethnicity):
    """Select the hyperparameter row(s) stored for one group.

    Parameters
    ----------
    hyperparams : pandas.DataFrame
        Table with an ``ethnicity`` column.
    ethnicity : str
        Value to match against the ``ethnicity`` column.

    Returns
    -------
    pandas.DataFrame
        Every row whose ``ethnicity`` equals the requested value (possibly
        empty), with the original index preserved.
    """
    is_requested_group = hyperparams['ethnicity'] == ethnicity
    return hyperparams.loc[is_requested_group]

In [4]:
def trainModel(name, path, all_params):
    """Train a random forest for one group using its stored hyperparameters.

    Parameters
    ----------
    name : str
        Group name. It selects the CSV files to read and the row of
        ``all_params`` whose ``ethnicity`` column equals it.
    path : str
        Prefix passed through to ``read_train_val_test``.
    all_params : pandas.DataFrame
        Table with ``ethnicity``, ``n_estimators`` and ``max_depth`` columns.

    Returns
    -------
    RandomForestClassifier
        Classifier fitted on the group's training split (``random_state=42``).

    Raises
    ------
    ValueError
        If ``all_params`` does not contain exactly one row for ``name``.
    """
    # Only the training split is used here; the other two frames are discarded.
    train, _, _ = read_train_val_test(name, path)

    hyperparams = getBestParams(all_params, name)
    if len(hyperparams) != 1:
        raise ValueError(
            'expected exactly one hyperparameter row for %r, found %d'
            % (name, len(hyperparams))
        )
    # Pull scalars out of the single row explicitly: calling int() on a
    # one-element Series is deprecated in recent pandas versions.
    row = hyperparams.iloc[0]
    n_estimators = int(row['n_estimators'])
    max_depth = row['max_depth']
    # A missing max_depth (NaN after CSV parsing) is treated as "no depth
    # limit", which is scikit-learn's own default (None).
    max_depth = None if pd.isna(max_depth) else int(max_depth)

    X_train, y_train = splitFeaturesLabels(train)
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 random_state=42)
    clf.fit(X_train, y_train)
    return clf

In [5]:
def aggregateForests(agg, model):
    """Append the trees of ``model`` to ``agg`` and return ``agg``.

    ``agg`` is modified in place: its ``estimators_`` list is extended with
    the entries of ``model.estimators_`` and its ``n_estimators`` is updated
    to the new total. ``model`` itself is left unchanged.

    Parameters
    ----------
    agg : object
        Fitted forest that receives the additional trees.
    model : object
        Fitted forest whose trees are appended.

    Returns
    -------
    object
        The same ``agg`` object that was passed in.
    """
    # NOTE(review): assumes both forests were fitted on the same feature
    # columns and the same set of class labels -- TODO confirm.
    agg.estimators_ += model.estimators_
    # Keep the declared size in sync with the combined tree list, not just
    # with the size of the forest that was appended.
    agg.n_estimators = len(agg.estimators_)
    return agg

In [6]:
# Train one forest per group; `models` ends up index-aligned with
# `ethnic_group_names` because the models are appended in iteration order.
models = []
params = readHyperparams()
for eth in ethnic_group_names:
    models.append(trainModel(eth, path, params))
# Show how many models were trained.
print(len(models))

8


In [7]:
def federatedForest(model_list):
    """Merge several fitted forests into one combined forest.

    The result starts as a deep copy of the first model and then receives
    the trees of every remaining model via ``aggregateForests``. Copying
    first means none of the input models is modified, so the function can be
    called repeatedly on the same list and the individual models stay usable
    for comparison.

    Parameters
    ----------
    model_list : sequence
        Fitted forests to merge; must contain at least one model.

    Returns
    -------
    object
        A new forest holding the trees of all input models.

    Raises
    ------
    ValueError
        If ``model_list`` is empty.
    """
    if len(model_list) == 0:
        raise ValueError('model_list must contain at least one fitted model')
    # Copy so that aggregation does not extend model_list[0] in place.
    ff = deepcopy(model_list[0])
    for model in model_list[1:]:
        ff = aggregateForests(ff, model)
    return ff

In [8]:
# Combine all per-group forests into a single aggregated forest.
fed_test = federatedForest(models)

In [9]:
# Evaluate on the test split of the 'white' group only; the train and
# validate frames loaded here are not used in this cell.
train, val, test = read_train_val_test('white', path)
X_test, y_test = splitFeaturesLabels(test)
# Predictions of the aggregated forest on that test split.
y_pred = fed_test.predict(X_test)

In [10]:
# Accuracy of the most recent predictions against the test labels.
print(accuracy_score(y_test, y_pred))

0.7592888415572122


In [11]:
# Reference: list the default hyperparameters of an untuned random forest.
# (`pprint` is imported in the import cell at the top of the notebook.)
clf = RandomForestClassifier(random_state=42)
pprint(clf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [12]:
# predict labels
def pred(clf, X_test):
    """Run each classifier on the feature set at the same position.

    Parameters
    ----------
    clf : sequence
        Objects exposing ``predict``; indexed by position.
    X_test : sequence
        Feature sets, one per classifier; its length decides how many
        predictions are made.

    Returns
    -------
    list
        ``clf[i].predict(X_test[i])`` for every index ``i`` of ``X_test``.
    """
    predictions = []
    for idx in range(len(X_test)):
        model = clf[idx]
        predictions.append(model.predict(X_test[idx]))
    return predictions

# calculate metrics
def printMetrics(clf, y_test, y_pred, names=None):
    """Print accuracy, precision and recall for each per-group classifier.

    Parameters
    ----------
    clf : sequence
        Per-group classifiers; only its length is used, to decide how many
        groups to report.
    y_test : sequence
        True labels, one entry per group, index-aligned with ``clf``.
    y_pred : sequence
        Predicted labels, one entry per group, index-aligned with ``clf``
        (the list-of-predictions shape that ``pred`` returns).
    names : sequence of str, optional
        Display name for each group. Defaults to the notebook-level
        ``ethnic_group_names`` list.
    """
    if names is None:
        names = ethnic_group_names
    for i in range(len(clf)):
        # Score group i against its own labels and predictions.
        print('Accuracy for ', names[i], ' is ', accuracy_score(y_test[i], y_pred[i]))
        print('Precision for ', names[i], ' is ', precision_score(y_test[i], y_pred[i]))
        print('Recall for ', names[i], ' is ', recall_score(y_test[i], y_pred[i]))

In [14]:
# Reuse the already-aggregated forest instead of aggregating `models` a
# second time.
federated = fed_test

In [15]:
# Score the aggregated forest on the test split loaded earlier; this
# overwrites the previous `y_pred`.
y_pred = federated.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7592888415572122


In [21]:
# Score a single entry of `models` on the same test split for comparison.
# models[0] corresponds to the first name in `ethnic_group_names` ('unknown').
# NOTE(review): X_test was loaded from the 'white' split -- confirm whether
# models[0] or the model trained on 'white' is the intended baseline.
y_pred = models[0].predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7592888415572122


33804