# Random Forest Classifier

This is a fast sklearn classifier, to investigate the model performance in the exploration phase. Further tuning of hyperparameters must be done. Also, the data needs further preprocessing, so we leave out all the object type columns.

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

import glob

ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']

train_arr = [None]*len(ethnic_group_names)
test_arr = [None]*len(ethnic_group_names)
val_arr = [None]*len(ethnic_group_names)

path = './data/preprocessing_IV'
files = glob.glob(path + "/*.csv")

for i, group in enumerate(ethnic_group_names):
    test_arr[i] = pd.read_csv(files[3*i])
    train_arr[i] = pd.read_csv(files[3*i+1])
    val_arr[i] = pd.read_csv(files[3*i+2])

In [47]:
def printEthnicities(csv_files):
    return [csv_files[counter].split("\\")[1].split(".")[0] for counter in range(len(csv_files))]

ethnicities = printEthnicities(files)

In [48]:
def dropCols(dfs, cols):
    for col in cols:
        for df in range(len(dfs)):
            dfs[df] = dfs[df].drop(col, axis=1)
    return dfs

drop_cols = ['Unnamed: 0.1', 'Unnamed: 0', 'hadm_id', 'subject_id', 'edregtime', 'emar_charttime', 'lab_charttime']
dfs = dropCols(dfs, drop_cols)

In [51]:
# split features and labels
y = [df['has_kidney_issue'] for df in dfs]
X = [df.drop('has_kidney_issue', axis=1) for df in dfs]

In [66]:
def train_models(features, labels, test_size=0.3, random_state=42, max_depth=10):
    """
    Function to train a model for each ethnicity
    :param features: list of dataframes containing the features of each ethnicity respectively
    :param labels: list of dataframes containing the labels for each ethnicity respectively
    :param test_size: hyperparameter for random forests
    :param random_state: hyperparameter for random forests
    :param max_depth: hyperparameter for random forests
    :return: list of models for each ethnicity respectively
    """

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

    # n is the number of ethnicities
    n = len(features)
    X_train = [None]*n
    X_test = [None]*n
    y_train = [None]*n
    y_test = [None]*n
    clf = [RandomForestClassifier()]*n
    clf_validated = [RandomizedSearchCV(estimator = clf[i], param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=random_state, n_jobs = -1) for i in range(n)]
    for i, ethnicity in enumerate(features):
    # # split train and test
        X_train[i], X_test[i], y_train[i], y_test[i] = train_test_split(features, labels, test_size=test_size, random_state=random_state)

    # # train classifier
    for i in range(n):
        clf_validated[i].fit(X_train[i], y_train[i])

    return clf_validated, X_test, y_test



In [67]:
# predict labels
def pred(clf, X_test):
    return [clf[i].predict(X_test[i]) for i in range(len(X_test))]

# calculate metrics
def printMetrics(clf, y_test, y_pred):
    for i in range(len(clf)):
        print('Accuracy for ', ethnicities[i], ' is ', accuracy_score(y_test, y_pred))
        print('Precision for ', ethnicities[i], ' is ', precision_score(y_test, y_pred))
        print('Recall for ', ethnicities[i], ' is ', recall_score(y_test, y_pred))

In [68]:
models_validated, X_test, y_test = train_models(features=X, labels=y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


300 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alexa\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\alexa\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "C:\Users\alexa\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\alexa\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 964, in check_X_y
    X = check_array(
  File "C:\Users\a

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.