In [10]:
import pandas as pd

# Columns loaded from hosp/admissions.csv; `subject_id` is the join key to patients.csv.
ADMISSION_DATA_COLS = [
    "subject_id",
    "hadm_id",
    "marital_status",
    "race",
]

# Columns loaded from hosp/patients.csv.
PERSONAL_DATA_COLS = [
    "subject_id",
    "gender",
    "anchor_age",
]

# Column subset for measurement (chart event) tables.
MESURE_DATA_COLS = [
    "hadm_id",
    "stay_id",
    "charttime",
    "itemid",
    "valuenum",
]

In [11]:
from constants import TEMP_PATH


# Load the target table, keeping only the two identifier columns:
# `hadm_id` (used to join admission data) and `stay_id` (used to join ICU data).
dfPatient = pd.read_csv(
    TEMP_PATH / "icu_target.csv",
    usecols=["hadm_id", "stay_id"],
)
dfPatient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8866 entries, 0 to 8865
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   hadm_id  8866 non-null   int64
 1   stay_id  8866 non-null   int64
dtypes: int64(2)
memory usage: 138.7 KB


In [12]:
from constants import MIMIC_PATH


# Read the admission-level and patient-level tables, restricted to the needed columns.
dfPatientAdmission = pd.read_csv(
    MIMIC_PATH / "hosp" / "admissions.csv",
    usecols=ADMISSION_DATA_COLS,
)
dfPatientPersonal = pd.read_csv(
    MIMIC_PATH / "hosp" / "patients.csv",
    usecols=PERSONAL_DATA_COLS,
)

# Join both tables on the patient key, discard the key afterwards (only
# `hadm_id` is needed downstream) and store the text columns as categoricals.
dfPatientInfo = (
    dfPatientAdmission
    .merge(dfPatientPersonal, on="subject_id")
    .drop(columns="subject_id")
    .astype({
        "marital_status": "category",
        "race": "category",
        "gender": "category",
    })
)

# The raw tables are no longer needed; release them.
del dfPatientAdmission, dfPatientPersonal

dfPatientInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431231 entries, 0 to 431230
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   hadm_id         431231 non-null  int64   
 1   marital_status  421998 non-null  category
 2   race            431231 non-null  category
 3   gender          431231 non-null  category
 4   anchor_age      431231 non-null  int64   
dtypes: category(3), int64(2)
memory usage: 7.8 MB


In [13]:
# Attach the admission/personal attributes to each target stay via `hadm_id`
# (default inner join), then drop the join key, which is not used afterwards.
dfPatient = (
    dfPatient
    .merge(dfPatientInfo, on="hadm_id")
    .drop(columns="hadm_id")
)

# The lookup table has served its purpose; release it.
del dfPatientInfo

dfPatient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8866 entries, 0 to 8865
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   stay_id         8866 non-null   int64   
 1   marital_status  8293 non-null   category
 2   race            8866 non-null   category
 3   gender          8866 non-null   category
 4   anchor_age      8866 non-null   int64   
dtypes: category(3), int64(2)
memory usage: 166.3 KB


In [14]:
# Read the ICU stay boundaries; both timestamps are parsed as datetimes so they
# can be used in time arithmetic later on.
dfIcuStay = pd.read_csv(
    MIMIC_PATH / "icu" / "icustays.csv",
    parse_dates=["intime", "outtime"],
    usecols=["stay_id", "intime", "outtime"],
)

# Add `intime` / `outtime` to every target stay (default inner join on `stay_id`).
dfPatient = dfPatient.merge(dfIcuStay, on="stay_id")

# Release the raw ICU stay table.
del dfIcuStay

dfPatient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8866 entries, 0 to 8865
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   stay_id         8866 non-null   int64         
 1   marital_status  8293 non-null   category      
 2   race            8866 non-null   category      
 3   gender          8866 non-null   category      
 4   anchor_age      8866 non-null   int64         
 5   intime          8866 non-null   datetime64[ns]
 6   outtime         8866 non-null   datetime64[ns]
dtypes: category(3), datetime64[ns](2), int64(2)
memory usage: 304.8 KB


In [19]:
# Field labels of a single `dfPatient` row. Each row Series yielded by
# `iterrows()` is indexed by the frame's columns, so read them directly
# instead of starting a row loop and breaking out of it (which also left `x`
# undefined when the frame was empty).
x = dfPatient.columns
x

Index(['stay_id', 'marital_status', 'race', 'gender', 'anchor_age', 'intime',
       'outtime'],
      dtype='object')

In [22]:
from constants import IMPORTANT_MESUREMENTS_ICU


# Length of one measurement interval.
MESURE_INTERVAL = pd.Timedelta(hours=6)
# Number of consecutive intervals that make up one feature window.
MESURE_GROUP_LENGTH = 6
# Upper bound on the number of sliding windows generated per ICU stay
# (previously the bare literal in `range(99)`).
MAX_WINDOWS_PER_STAY = 99


def _group_by_stay(df):
    """Split `df` into a {stay_id: sub-frame} dict so each stay is looked up once."""
    return {stayId: group for stayId, group in df.groupby("stay_id")}


def _will_akd_in_window(stayWillAkd, windowStart, windowEnd):
    """Return True if any `will_akd` flag of this stay is set in [windowStart, windowEnd)."""
    inWindow = (
        (windowStart <= stayWillAkd["charttime"]) &
        (stayWillAkd["charttime"] < windowEnd)
    )
    # The flag is read from the per-stay subset itself, so the mask and the
    # flag column always share the same index.
    return bool(stayWillAkd.loc[inWindow, "will_akd"].any())


def _window_std(stayMesures, windowStart, windowEnd):
    """Return the std of `valuenum` in [windowStart, windowEnd), or 0 if there is no reading."""
    inWindow = (
        (windowStart <= stayMesures["charttime"]) &
        (stayMesures["charttime"] < windowEnd)
    )
    values = stayMesures.loc[inWindow, "valuenum"]
    if len(values) <= 0:
        return 0
    # NOTE(review): the standard deviation is NaN when the interval holds a
    # single reading - confirm that std (and not e.g. mean) is the intended
    # statistic.
    return values.std()


dfWillAkd = pd.read_csv(TEMP_PATH / "will_akd.csv", parse_dates=["charttime"])
willAkdByStay = _group_by_stay(dfWillAkd)
# Empty frame with the right columns, used for stays without any will_akd row.
noWillAkd = dfWillAkd.iloc[0:0].copy()

# Read every measurement file exactly once (it used to be re-read for every
# stay and every window) and index it by stay. This keeps all measurement
# tables in memory at the same time.
mesuresByStay = {}
for mesureName in IMPORTANT_MESUREMENTS_ICU.values():
    dfCurrentMesure = pd.read_csv(
        TEMP_PATH / ("chartevent_" + mesureName + ".csv"),
        parse_dates=["charttime"],
    )
    mesuresByStay[mesureName] = (
        _group_by_stay(dfCurrentMesure),
        dfCurrentMesure.iloc[0:0].copy(),  # fallback for stays without readings
    )
    del dfCurrentMesure

# One dict per (stay, window); the frame is built once at the end so that
# every window becomes exactly one row.
records = []

for rowId, row in dfPatient.iterrows():
    stayId = row["stay_id"]
    stayWillAkd = willAkdByStay.get(stayId, noWillAkd)
    stayMesures = {
        mesureName: groups.get(stayId, emptyMesure)
        for mesureName, (groups, emptyMesure) in mesuresByStay.items()
    }

    for i in range(MAX_WINDOWS_PER_STAY):
        # End of the feature window that starts `i` intervals after admission.
        lastFrameTime = row["intime"] + (i + MESURE_GROUP_LENGTH) * MESURE_INTERVAL
        if lastFrameTime > row["outtime"]:
            # The window would extend past the end of the ICU stay.
            break

        # Start from the patient's static attributes ...
        record = row.to_dict()

        # ... add the label, taken from the interval right after the window ...
        record["will_akd"] = _will_akd_in_window(
            stayWillAkd, lastFrameTime, lastFrameTime + MESURE_INTERVAL
        )

        # ... and one feature per measurement and per interval of the window.
        for mesureName, dfStayMesure in stayMesures.items():
            for ii in range(MESURE_GROUP_LENGTH):
                lowTime = row["intime"] + (i + ii) * MESURE_INTERVAL
                highTime = lowTime + MESURE_INTERVAL
                record[mesureName + str(ii)] = _window_std(dfStayMesure, lowTime, highTime)

        records.append(record)

# Building from records also works when no window qualified (empty frame),
# whereas concatenating an empty list raises.
mainDf = pd.DataFrame(records)
mainDf.to_csv(TEMP_PATH / "maindf.csv")