# Preprocessing Ethnicities

This file takes the preprocessed ethnicity files and labels the dependent variables.

In [2]:
import pandas as pd
import numpy as np

## Determine most frequently occuring dissease: kidney disease

We want to label our data with 'has dissease x' or 'does not have dissease x' for our classification model. For the purpose of this research, what the model predicts is not really of importance with respect to the model. Therefore we decided to take the most occuring disease, as this will give the model more data to train on. The general class of disease or failure most occuring is linked to kidneys.

In [3]:
d_icd_diagnoses = pd.read_csv("mimic-iv-0.4/hosp/d_icd_diagnoses.csv.gz")
diagnoses_icd = pd.read_csv("mimic-iv-0.4/hosp/diagnoses_icd.csv.gz")

In [4]:
icd_merged = diagnoses_icd.merge(d_icd_diagnoses, how='left', on='icd_code')
icd_merged

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version_x,icd_version_y,long_title
0,11603789,25552978,3,65801,9,9.0,"Oligohydramnios, delivered, with or without me..."
1,11603789,25552978,4,64881,9,9.0,"Abnormal glucose tolerance of mother, delivere..."
2,11603789,25552978,5,V270,9,9.0,"Outcome of delivery, single liveborn"
3,11603789,25552978,1,65221,9,9.0,Breech presentation without mention of version...
4,11603789,25552978,2,64421,9,9.0,"Early onset of delivery, delivered, with or wi..."
...,...,...,...,...,...,...,...
4694781,13747041,25594844,10,N189,10,10.0,"Chronic kidney disease, unspecified"
4694782,13747041,25594844,5,N179,10,10.0,"Acute kidney failure, unspecified"
4694783,13747041,25594844,6,R531,10,10.0,Weakness
4694784,13747041,25594844,1,T50901A,10,10.0,"Poisoning by unspecified drugs, medicaments an..."


In [5]:
s = icd_merged['long_title'].str.split(expand=True).stack().value_counts()
print (s.head(60))

of               1778104
unspecified      1088674
and               544971
without           449505
Other             442281
or                346949
with              340853
Unspecified       299702
history           255947
kidney            249894
disease           237345
Personal          229676
mention           228753
not               224852
in                221578
chronic           210894
use               194642
other             192502
the               188993
heart             174470
specified         150981
hypertension      149858
Acute             145776
as                144116
neoplasm          141014
stage             131863
Chronic           128664
elsewhere         127653
disease,          126194
for               126045
classified        122058
(current)         118890
disorder,         115209
to                115118
status            111231
malignant         105934
mellitus          103659
failure            95474
initial            90766
encounter          89497


## Collect hadm_ids codes for kidney related diagnoses

In [6]:
# remove subjects without known diagnose
icd_merged = icd_merged.dropna(subset=['long_title'])

# get dataframe where everyone has kidney related diagnose
kidney = icd_merged[icd_merged['long_title'].str.contains('kidney')]
kidney

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version_x,icd_version_y,long_title
76,19586042,23279761,2,5849,9,9.0,"Acute kidney failure, unspecified"
517,16108683,21607477,4,5845,9,9.0,Acute kidney failure with lesion of tubular ne...
537,14411859,24976204,12,5845,9,9.0,Acute kidney failure with lesion of tubular ne...
818,10401102,22101463,2,40390,9,9.0,"Hypertensive chronic kidney disease, unspecifi..."
819,10401102,22101463,3,5859,9,9.0,"Chronic kidney disease, unspecified"
...,...,...,...,...,...,...,...
4694671,17533213,27704968,6,N179,10,10.0,"Acute kidney failure, unspecified"
4694673,17533213,27704968,5,E1122,10,10.0,Type 2 diabetes mellitus with diabetic chronic...
4694690,16781303,20378962,2,N179,10,10.0,"Acute kidney failure, unspecified"
4694781,13747041,25594844,10,N189,10,10.0,"Chronic kidney disease, unspecified"


In [7]:
kidney_hadm_ids = kidney['hadm_id'].unique()
kidney_hadm_ids

array([23279761, 21607477, 24976204, ..., 27704968, 20378962, 25594844],
      dtype=int64)

## Add labels to ethnicity files

Adds column 'has_kidney_issue' with True if the ICD code is contained by the kidney_hadm_icds, and False otherwise. The icd_code column is removed afterwards, otherwise the model could use the pattern in the icd_codes to determine the value of has_kidney_issue.

In [8]:
def add_labels (df_name):
    # import data
    print('Importing ', df_name)
    df = pd.read_csv('data/preprocessing_I/' + df_name + '.csv')

    # add has_kidney_issue column
    print("Adding labels for ", df_name)
    df['has_kidney_issue'] = False
    for row in df.itertuples():
        if row.hadm_id in kidney_hadm_ids:
            df.at[row.Index, 'has_kidney_issue'] = True

    # remove icd column
    df = df.drop(columns='icd_code')

    # save csv
    print('Saving .csv for ', df_name)
    df.to_csv("data/preprocessing_II/" + df_name + ".csv")

In [9]:
ethnic_group_names = ['unknown', 'white', 'other', 'asian', 'hispanic_latino', 'black_african_american', 'unable_to_obtain', 'american_indian_alaska_native']

for name in ethnic_group_names:
    add_labels(name)

Importing  unknown
Adding labels for  unknown
Saving .csv for  unknown
Importing  white
Adding labels for  white
Saving .csv for  white
Importing  other
Adding labels for  other
Saving .csv for  other
Importing  asian
Adding labels for  asian
Saving .csv for  asian
Importing  hispanic_latino
Adding labels for  hispanic_latino
Saving .csv for  hispanic_latino
Importing  black_african_american
Adding labels for  black_african_american
Saving .csv for  black_african_american
Importing  unable_to_obtain
Adding labels for  unable_to_obtain
Saving .csv for  unable_to_obtain
Importing  american_indian_alaska_native
Adding labels for  american_indian_alaska_native
Saving .csv for  american_indian_alaska_native
