# Random Forest Classifier

This is a quick scikit-learn baseline used to investigate model performance during the exploration phase; one random forest is trained and evaluated separately for each ethnic group. The hyperparameters still need to be tuned. The data also needs further preprocessing, so for now we leave out all object-type columns.

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [19]:
# Fit and score one random-forest baseline per ethnic group. Each group has its
# own CSV file; a model is trained on 67% of that group's rows and evaluated on
# the remaining 33%.
ethnic_group_names = [
    'unknown',
    'white',
    'other',
    'asian',
    'hispanic_latino',
    'black_african_american',
    'unable_to_obtain',
    'american_indian_alaska_native',
]

# Count columns: missing values are filled with 0 before casting to int.
count_columns = ['icd_code_count', 'emar_count', 'lab_count']

# Columns handed to the classifier, in this order.
# NOTE(review): hadm_id and subject_id look like record identifiers rather than
# predictive signals -- confirm they are really meant to be model features.
feature_columns = [
    'hadm_id',
    'subject_id',
    'admittime',
    'anchor_age',
    'anchor_year',
    'icd_code_count',
    'emar_count',
    'lab_count',
]

# (printed label, metric function) pairs, reported in this order for each group.
metric_table = (
    ('Accuracy for ', accuracy_score),
    ('Precision for ', precision_score),
    ('Recall for ', recall_score),
)

for name in ethnic_group_names:
    # Load the preprocessed table of the current group.
    df = pd.read_csv('data/preprocessing_II/' + name + '.csv')

    # Parse the admission time as a datetime, then turn it into a plain number
    # so it can be used as a numeric feature.
    admit_timestamps = pd.to_datetime(df['admittime'])
    df['admittime'] = pd.to_numeric(admit_timestamps)

    # Column-by-column assignment so each column really ends up with int dtype.
    for count_column in count_columns:
        df[count_column] = df[count_column].fillna(0).astype(int)

    # Separate the feature matrix from the binary target.
    X = df[feature_columns]
    y = df['has_kidney_issue']

    # Hold out a third of the rows for evaluation; the seed is fixed so the
    # split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.33,
        random_state=42,
    )

    # Depth-limited forest with a fixed seed so repeated runs give the same model.
    clf = RandomForestClassifier(max_depth=10, random_state=42)
    clf.fit(X_train, y_train)

    # Predict on the held-out rows and report every metric for this group.
    y_pred = clf.predict(X_test)
    for metric_label, metric_fn in metric_table:
        print(metric_label, name, ' is ', metric_fn(y_test, y_pred))

Accuracy for  unknown  is  0.8860976751443282
Precision for  unknown  is  0.6598746081504702
Recall for  unknown  is  0.45074946466809424
Accuracy for  white  is  0.8521984671238403
Precision for  white  is  0.6528939306119069
Recall for  white  is  0.39071403795649373
Accuracy for  other  is  0.8807991872671859
Precision for  other  is  0.6501340482573726
Recall for  other  is  0.37890625
Accuracy for  asian  is  0.9099221549487211
Precision for  asian  is  0.6927374301675978
Recall for  asian  is  0.3974358974358974
Accuracy for  hispanic_latino  is  0.8820845584507756
Precision for  hispanic_latino  is  0.6890574214517876
Recall for  hispanic_latino  is  0.42063492063492064
Accuracy for  black_african_american  is  0.8507187476480771
Precision for  black_african_american  is  0.6987478975892356
Recall for  black_african_american  is  0.6135543157203807
Accuracy for  unable_to_obtain  is  0.9360323886639677
Precision for  unable_to_obtain  is  0.625
Recall for  unable_to_obtain  is  