### Import most libraries needed 

In [1]:
import pandas as pd
from datetime import datetime
from pathlib import Path

### Define most constants needed

In [33]:
# All ICD codes below are written WITHOUT the decimal point, because that is how
# the icd_code column of diagnoses_icd.csv stores them (e.g. "5723", "E1122").

# Define ICD-9/10 codes for DKA
# ICD-9 entries are used as prefixes (str.startswith); ICD-10 entries are
# matched exactly (isin).
# NOTE(review): E1021/E1022/E1121/E1122 look like diabetic kidney-complication
# codes rather than ketoacidosis codes - confirm the intended cohort definition.
dka_code9 = ["2501"]
dka_code10 = ["E1010", "E1011", "E1021", "E1022", "E1110", "E1111", "E1121", "E1122"]

# Define CKD stage 5 codes
# ckd5_code9 used to be an empty string, which is not a usable code list; it now
# holds the same ICD-9 codes as ckd_stage5_codes, with the dots removed.
# NOTE(review): 753.13 does not look like a CKD-stage code and 585.5 is absent -
# verify this list against the ICD-9-CM tables.
ckd5_code9 = ["5856", "75313"]
ckd5_code10 = ["N185", "N186"]
ckd_stage5_codes = {
    "icd9": list(ckd5_code9),
    "icd10": list(ckd5_code10),
}

# MIMIC root directory (contains the "hosp" and "icu" sub-directories)
MIMIC_PATH = Path("../mimiciv2.2/")

### Filter patients 
1. Keep only patients diagnosed with DKA
1. Exclude patients with CKD stage 5
1. When a patient has repeated admissions during one hospitalization, keep only the first
1. Remove patients with more than 20% missing data

#### Read icd_code 

In [3]:
# Read icd_code as text from the start. If pandas is left to infer the type,
# all-numeric codes can be parsed as integers and lose their leading zeros,
# which a later astype(str) cannot restore.
df_diagnoses_icd = pd.read_csv(
    MIMIC_PATH / "hosp" / "diagnoses_icd.csv",
    dtype={"icd_code": str},
)
df_diagnoses_icd.dtypes

subject_id      int64
hadm_id         int64
seq_num         int64
icd_code       object
icd_version     int64
dtype: object

In [4]:
# Normalise the column types in one pass: codes as text, ICD version as integer.
df_diagnoses_icd = df_diagnoses_icd.astype({"icd_code": str, "icd_version": int})

df_diagnoses_icd.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


#### Keep only patients diagnosed with DKA

In [34]:

# A diagnosis row counts as DKA when its code matches the list for its own ICD
# version: exact match for ICD-10, prefix match for ICD-9.
is_icd10_dka = (df_diagnoses_icd["icd_version"] == 10) & df_diagnoses_icd["icd_code"].isin(dka_code10)

# str.startswith accepts only a str or a tuple of str, so the list of prefixes
# is converted to a tuple; passing the list itself makes the ICD-9 branch fail.
# na=False keeps the mask strictly boolean even if a code is missing.
is_icd9_dka = (df_diagnoses_icd["icd_version"] == 9) & df_diagnoses_icd["icd_code"].str.startswith(
    tuple(dka_code9), na=False
)

dka_condition = is_icd10_dka | is_icd9_dka

df_dka_diagnoses = df_diagnoses_icd[dka_condition]
df_dka_diagnoses

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
235,10000980,20897796,5,E1122,10
358,10000980,29659838,4,E1121,10
922,10002013,21763296,2,E1110,10
923,10002013,21763296,3,E1122,10
1055,10002013,25442395,25,E1122,10
...,...,...,...,...,...
4753469,19993951,28863685,5,E1122,10
4754014,19995012,27305089,3,E1121,10
4754479,19996654,26946592,8,E1122,10
4755147,19997538,22701415,14,E1121,10


#### Exclude CKD stage 5 patients 

In [None]:
# The icd_code column stores codes without a decimal point, so strip any "."
# from the reference lists before comparing.
ckd5_icd9 = [code.replace(".", "") for code in ckd_stage5_codes["icd9"]]
ckd5_icd10 = [code.replace(".", "") for code in ckd_stage5_codes["icd10"]]

# A row is a CKD stage 5 diagnosis when its code is in the list that belongs to
# the row's own ICD version.
ckd5_condition = (
    (df_diagnoses_icd["icd_version"] == 9) & df_diagnoses_icd["icd_code"].isin(ckd5_icd9)
) | (
    (df_diagnoses_icd["icd_version"] == 10) & df_diagnoses_icd["icd_code"].isin(ckd5_icd10)
)
ckd_stage5_patients = df_diagnoses_icd[ckd5_condition]

# Exclusion is per patient: one CKD stage 5 diagnosis on any admission removes
# every DKA row of that subject. The result is rebuilt from dka_condition so
# that re-running this cell always gives the same frame.
has_ckd5 = df_diagnoses_icd["subject_id"].isin(ckd_stage5_patients["subject_id"])
df_dka_diagnoses = df_diagnoses_icd[dka_condition & ~has_ckd5]
df_dka_diagnoses.head()

In [None]:
# Keep only the first ICU stay of each DKA patient.
# NOTE(review): assumes the MIMIC-IV layout, i.e. <MIMIC_PATH>/icu/icustays.csv
# with lowercase columns subject_id, hadm_id, stay_id and intime - confirm
# against the local copy (the file may be gzip-compressed).
icu_stays = pd.read_csv(MIMIC_PATH / "icu" / "icustays.csv", parse_dates=["intime"])

# A hospital admission can carry several DKA diagnosis rows; collapse them so
# the merge below does not multiply ICU stays.
dka_admissions = df_dka_diagnoses[["subject_id", "hadm_id"]].drop_duplicates()

# ICU stays that belong to a DKA hospital admission.
dka_icu_stays = icu_stays.merge(dka_admissions, on=["subject_id", "hadm_id"], how="inner")

# Earliest such ICU stay per patient -> exactly one row per subject.
# NOTE(review): "first" is taken across all of a patient's DKA admissions;
# switch the subset to ["subject_id", "hadm_id"] if the first stay of every
# hospitalization is wanted instead.
dka_patients_df = (
    dka_icu_stays
    .sort_values(["subject_id", "intime"])
    .drop_duplicates(subset="subject_id", keep="first")
    .reset_index(drop=True)
)
assert dka_patients_df["subject_id"].is_unique

# Exclude patients with more than 20% missing data
# TODO: Implement missing data calculation and filtering

print(f"Number of filtered DKA patients: {len(dka_patients_df)}")
