#### Loading libraries

In [1]:
library(dplyr)
library(tidyr)
library(tibble)
library(lubridate)
library(readr)
library(stringr)
library(ggplot2)
library(data.table)
library(odbc)
library(RMariaDB)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year


The following objects are masked from ‘package:dplyr’:

    between, first, last




#### Connecting to the database

In [2]:
con <- dbConnect(
  drv = RMariaDB::MariaDB(),
  username = "gabriel.torresz",
  password = "tie9booX",
  host = "ehr3.deim.urv.cat",
  dbname = "mimiciiiv14",
  port = 3306
)

List and comprobation of the connection

In [3]:
dbListTables(con)

### Data selection
#### First filters: 
1. Age at admission [18 - 85] y.
2. More than 24 hours at ICU

Then we add some other demographic information 

### 1. Get first ICU stay per patient 

In [4]:
sql_first_icu <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    MIN(icu.INTIME) AS ICU_ADMIT
FROM ICUSTAYS icu
GROUP BY icu.SUBJECT_ID
;
"

df_first_icu <- dbGetQuery(con, sql_first_icu)

### 2. Get detailed information for first ICU stay

In [5]:
sql_icu_full <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    icu.INTIME AS ICU_ADMIT,
    icu.OUTTIME AS ICU_DISCH,
    TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) AS ICU_LOS_HOURS,
    a.ADMITTIME AS HOSP_ADMIT,
    a.DISCHTIME AS HOSP_DISCH,
    TIMESTAMPDIFF(HOUR, a.ADMITTIME, a.DISCHTIME) AS HOSP_LOS_HOURS,
    p.GENDER,
    a.ETHNICITY,
    p.DOB,
    p.DOD
FROM ICUSTAYS icu
JOIN ADMISSIONS a ON icu.HADM_ID = a.HADM_ID
JOIN PATIENTS p ON icu.SUBJECT_ID = p.SUBJECT_ID
JOIN (
    SELECT SUBJECT_ID, MIN(INTIME) AS FIRST_ICU
    FROM ICUSTAYS
    GROUP BY SUBJECT_ID
) first_icu ON icu.SUBJECT_ID = first_icu.SUBJECT_ID AND icu.INTIME = first_icu.FIRST_ICU
WHERE TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) >= 24
;
"

df_first_icu_full <- dbGetQuery(con, sql_icu_full)

### 3. Data cleaning and feature creation

In [6]:
df_clean <- df_first_icu_full %>%
  
  # Calculate age at ICU admission
  mutate(AGE = as.numeric(format(ICU_ADMIT, "%Y")) - as.numeric(format(DOB, "%Y"))) %>%
  
  # Keep only adult patients between 18 and 85 years
  filter(AGE >= 18 & AGE <= 85) %>%
  
  # Define mortality: 1 if patient died before or during hospital discharge
  mutate(MORTALITY = ifelse(!is.na(DOD) & DOD <= HOSP_DISCH, 1, 0)) %>%
  
  # Remove missing or unknown gender/ethnicity
  filter(
    !is.na(GENDER), !is.na(ETHNICITY),
    GENDER != "UNKNOWN", ETHNICITY != "UNKNOWN/NOT SPECIFIED"
  ) %>%
  
  # Group ethnicity into major categories
  mutate(
    ETHNICITY_GROUP = case_when(
      grepl("^WHITE", ETHNICITY) ~ "WHITE",
      grepl("^BLACK", ETHNICITY) ~ "BLACK",
      grepl("^HISPANIC", ETHNICITY) ~ "HISPANIC",
      grepl("^ASIAN", ETHNICITY) ~ "ASIAN",
      TRUE ~ "OTHER"
    ),
    ETHNICITY_GROUP = factor(ETHNICITY_GROUP, levels = c("WHITE", "BLACK", "HISPANIC", "ASIAN", "OTHER"))
  )


### 4. Check cleaned data

In [7]:
table(df_clean$GENDER)
table(df_clean$ETHNICITY_GROUP)
dim(df_clean)
head(df_clean)


    F     M 
10997 15507 


   WHITE    BLACK HISPANIC    ASIAN    OTHER 
   20697     2303     1008      703     1793 

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,DOB,DOD,AGE,MORTALITY,ETHNICITY_GROUP
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,<dttm>,<dttm>,<dbl>,<dbl>,<fct>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,2025-04-11,2102-06-14,76,0,WHITE
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,2143-05-12,,48,0,WHITE
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,2109-06-21,,66,0,WHITE
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,2128-02-22,2178-11-14,50,0,WHITE
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,2032-03-24,2104-08-20,72,1,WHITE
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,2127-02-27,,40,0,WHITE


## Phase 1 of ICU Data Cleaning and Feature Engineering – Summary

**So far, we have:**

1. **Selected the first ICU admission per patient**  
   - For each patient, we keep only their **first ICU stay**.

2. **Joined the ICU, Admissions, and Patients tables**  
   - This allows us to have **demographics** (gender, date of birth, ethnicity) and **admission/discharge times** for both the ICU and the hospital.  
   - We also calculate **lengths of stay (LOS) in hours** for the ICU and hospital.

3. **Calculated age at ICU admission and filtered adult patients (18–85 years old)**  
   - Ensures the analysis focuses on **adult patients** only.

4. **Created a mortality flag**  
   - `MORTALITY = 1` if the patient **died before or during hospital discharge**, otherwise `0`.

5. **Cleaned missing or unknown gender/ethnicity values**  
   - Removes entries with **unknown or missing gender or ethnicity** to ensure data quality.

6. **Grouped ethnicity into five categories and converted to a factor**  
   - Categories: `WHITE`, `BLACK`, `HISPANIC`, `ASIAN`, `OTHER`.

7. **Checked the cleaned dataset**  
   - Verified **tables of gender and ethnicity**, **dimensions**, and a **preview of the first rows**.


# ICU Cohort – Adding Comorbidity Flags

This section adds **binary comorbidity flags** to the existing cleaned ICU cohort. 
The flags are derived from ICD-9 diagnosis descriptions.
## 1. SQL query to extract comorbidities

In [8]:
# ---------------------------------------------
# 1. Traer tablas de diagnósticos desde la base de datos
# ---------------------------------------------
dx <- dbGetQuery(con, "SELECT * FROM DIAGNOSES_ICD")
d_icd_diagnoses <- dbGetQuery(con, "SELECT * FROM D_ICD_DIAGNOSES")

# ---------------------------------------------
# 2. Filtrar solo los HADM_ID de nuestra cohorte
# ---------------------------------------------
df_diag <- dx %>%
  filter(HADM_ID %in% df_clean$HADM_ID) %>%
  left_join(d_icd_diagnoses, by = c("ICD9_CODE" = "ICD9_CODE"))

# ---------------------------------------------
# 3. Crear flags de comorbilidades agrupando por HADM_ID
# ---------------------------------------------
df_flags <- df_diag %>%
  group_by(HADM_ID) %>%
  summarise(
    flag_diabetes = max(grepl("diabetes", tolower(LONG_TITLE))),
    flag_hypertension = max(grepl("hypertension|high blood pressure", tolower(LONG_TITLE))),
    flag_ckd = max(grepl("chronic kidney|renal failure|kidney failure", tolower(LONG_TITLE))),
    flag_chf = max(grepl("heart failure|congestive heart", tolower(LONG_TITLE))),
    flag_copd = max(grepl("copd|chronic obstructive|emphysema|chronic bronchitis", tolower(LONG_TITLE))),
    flag_cancer = max(grepl("malignan|cancer|carcinoma|neoplasm|tumor", tolower(LONG_TITLE)))
  )

# ---------------------------------------------
# 4. Unir los flags con df_clean usando HADM_ID
# ---------------------------------------------
df_final <- df_clean %>%
  left_join(df_flags, by = "HADM_ID")

# ---------------------------------------------
# 5. Revisar resultado
# ---------------------------------------------
head(df_final)
table(df_final$flag_diabetes, useNA = "ifany")
table(df_final$flag_hypertension, useNA = "ifany")
table(df_final$flag_ckd, useNA = "ifany")


Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,⋯,DOD,AGE,MORTALITY,ETHNICITY_GROUP,flag_diabetes,flag_hypertension,flag_ckd,flag_chf,flag_copd,flag_cancer
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,⋯,<dttm>,<dbl>,<dbl>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,⋯,2102-06-14,76,0,WHITE,0,0,1,1,0,0
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,⋯,,48,0,WHITE,0,0,0,0,0,0
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,⋯,,66,0,WHITE,0,0,1,0,0,0
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,⋯,2178-11-14,50,0,WHITE,0,0,0,0,0,1
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,⋯,2104-08-20,72,1,WHITE,0,1,0,0,0,1
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,⋯,,40,0,WHITE,1,1,0,0,0,0



    0     1 
19405  7099 


    0     1 
14409 12095 


    0     1 
19434  7070 

In [9]:
# Elimino etnia porque ya tengo una columna agrupada, DOD porque me interesan las horas, no solo la fecha de Discharge, lo mismo con Data Birth, me interesaba calcular los años en ese momento no la fecha como tal 
# Todo este proceso es implicito en la conversion de data -> knowledge!

df_final <- df_final %>%
    select(-c(ETHNICITY, DOD))%>%
    rename(
        diabetes_comorbidity = flag_diabetes, 
        hypertension_comorbidity = flag_hypertension, 
        ckd_comorbidity = flag_ckd, 
        chf_comorbidity = flag_chf, 
        copd_comorbidity = flag_copd, 
        cancer_comorbidity = flag_cancer)

In [10]:
colnames(df_final)

In [11]:
diagnosis_group <- dbGetQuery(con, "
SELECT
  HADM_ID,
  CASE
    -- Cardiovascular
    WHEN LOWER(DIAGNOSIS) LIKE '%coronary%'
      OR LOWER(DIAGNOSIS) LIKE '%myocard%'
      OR LOWER(DIAGNOSIS) LIKE '%angina%'
      OR LOWER(DIAGNOSIS) LIKE '%cardiac%'
      OR LOWER(DIAGNOSIS) LIKE '%chf%'
      OR LOWER(DIAGNOSIS) LIKE '%valve%'
      OR LOWER(DIAGNOSIS) LIKE '%tachy%'
      OR LOWER(DIAGNOSIS) LIKE '%brady%' 
      THEN 'cardiovascular'

    -- Neurological
    WHEN LOWER(DIAGNOSIS) LIKE '%cva%'
      OR LOWER(DIAGNOSIS) LIKE '%seiz%'
      OR LOWER(DIAGNOSIS) LIKE '%brain%'
      OR LOWER(DIAGNOSIS) LIKE '%head%'
      OR LOWER(DIAGNOSIS) LIKE '%spinal%'
      OR LOWER(DIAGNOSIS) LIKE '%cereb%'
      THEN 'neurological'

    -- Infectious
    WHEN LOWER(DIAGNOSIS) LIKE '%sepsis%'
      OR LOWER(DIAGNOSIS) LIKE '%pneumon%'
      OR LOWER(DIAGNOSIS) LIKE '%infection%'
      THEN 'infectious'

    -- Renal
    WHEN LOWER(DIAGNOSIS) LIKE '%renal%'
      OR LOWER(DIAGNOSIS) LIKE '%arf%'
      THEN 'renal'

    -- Respiratory
    WHEN LOWER(DIAGNOSIS) LIKE '%respir%'
      OR LOWER(DIAGNOSIS) LIKE '%copd%'
      OR LOWER(DIAGNOSIS) LIKE '%tracheal%'
      OR LOWER(DIAGNOSIS) LIKE '%pulmonary%'
      THEN 'respiratory'

    -- Oncology
    WHEN LOWER(DIAGNOSIS) LIKE '%cancer%'
      OR LOWER(DIAGNOSIS) LIKE '%lymphoma%'
      OR LOWER(DIAGNOSIS) LIKE '%tumor%'
      OR LOWER(DIAGNOSIS) LIKE '%hemangioma%'
      THEN 'oncology'

    -- Trauma / Musculoskeletal
    WHEN LOWER(DIAGNOSIS) LIKE '%fracture%'
      OR LOWER(DIAGNOSIS) LIKE '%hernia%'
      OR LOWER(DIAGNOSIS) LIKE '%spinal%'
      OR LOWER(DIAGNOSIS) LIKE '%post-operative%'
      OR LOWER(DIAGNOSIS) LIKE '%sda%'
      THEN 'trauma'

    -- Gastrohepatic / Liver / Biliary
    WHEN LOWER(DIAGNOSIS) LIKE '%liver%'
      OR LOWER(DIAGNOSIS) LIKE '%cirrhosis%'
      OR LOWER(DIAGNOSIS) LIKE '%biliary%'
      OR LOWER(DIAGNOSIS) LIKE '%pancreatic%'
      OR LOWER(DIAGNOSIS) LIKE '%chole%'
      THEN 'gastrohepatic'

    -- Metabolic / Endocrine
    WHEN LOWER(DIAGNOSIS) LIKE '%hypoglycem%'
      OR LOWER(DIAGNOSIS) LIKE '%hyperglycem%'
      OR LOWER(DIAGNOSIS) LIKE '%dehydration%'
      OR LOWER(DIAGNOSIS) LIKE '%failure to thrive%'
      THEN 'metabolic'

    -- Psychiatric / Toxicology
    WHEN LOWER(DIAGNOSIS) LIKE '%overdose%'
      OR LOWER(DIAGNOSIS) LIKE '%withdrawal%'
      OR LOWER(DIAGNOSIS) LIKE '%assault%'
      THEN 'psychiatric'

    -- Neonatal
    WHEN LOWER(DIAGNOSIS) LIKE '%newborn%'
      THEN 'neonatal'

    -- Hematologic / Other
    WHEN LOWER(DIAGNOSIS) LIKE '%splenomegaly%'
      OR LOWER(DIAGNOSIS) LIKE '%anemia%'
      THEN 'hematologic'

    -- Catch-all (todo lo demás)
    ELSE 'other'
  END AS diagnosis_group
FROM ADMISSIONS
")


In [12]:
table(diagnosis_group$diagnosis_group)


cardiovascular  gastrohepatic    hematologic     infectious      metabolic 
          8294            938            276           4508            488 
      neonatal   neurological       oncology          other    psychiatric 
          7823           2163            821          26245            654 
         renal    respiratory         trauma 
          1095           1893           3778 

In [13]:
str(df_final)

'data.frame':	26504 obs. of  19 variables:
 $ SUBJECT_ID              : int  3 4 6 11 12 13 17 18 20 22 ...
 $ HADM_ID                 : int  145834 185777 107064 194540 112213 143045 194023 188822 157681 165315 ...
 $ ICU_ADMIT               : POSIXct, format: "2101-10-20 19:10:11" "2191-03-16 00:29:31" ...
 $ ICU_DISCH               : POSIXct, format: "2101-10-26 20:43:09" "2191-03-17 16:46:31" ...
 $ ICU_LOS_HOURS           :integer64 145 40 88 38 183 87 49 30 ... 
 $ HOSP_ADMIT              : POSIXct, format: "2101-10-20 19:08:00" "2191-03-16 00:28:00" ...
 $ HOSP_DISCH              : POSIXct, format: "2101-10-31 13:58:00" "2191-03-23 18:41:00" ...
 $ HOSP_LOS_HOURS          :integer64 258 186 392 612 304 164 104 52 ... 
 $ GENDER                  : chr  "M" "F" "F" "F" ...
 $ DOB                     : POSIXct, format: "2025-04-11" "2143-05-12" ...
 $ AGE                     : num  76 48 66 50 72 40 47 51 76 65 ...
 $ MORTALITY               : num  0 0 0 0 1 0 0 0 0 0 ...
 $ ETHNIC

In [14]:
# Se debe añladir con el hospitial admission id, dado que si se hace con el subject id el datafarame duplicara cada uan de las filas por cada vez que es apersona tenga registro de ingreos 
df_final <- df_final %>%
  left_join(diagnosis_group, by = "HADM_ID")

In [15]:
table(df_final$diagnosis_group)


cardiovascular  gastrohepatic    hematologic     infectious      metabolic 
          5255            540            111           1820            208 
  neurological       oncology          other    psychiatric          renal 
          1144            481          13126            376            554 
   respiratory         trauma 
           875           2014 

In [16]:
min(df_final$HOSP_ADMIT, na.rm = TRUE)
max(df_final$HOSP_DISCH, na.rm = TRUE)


[1] "2100-06-09 01:39:00 UTC"

[1] "2208-08-25 14:59:00 UTC"

Debemos acotar la ventana temporal, no tene sentido hacer un estudio a lo largo de 100 años ( o sí, al ser una simulación, podemos jugar con esto, pero como queremos ser un poco fieles, pongamos que queremos lso 100 años exactos más recientes) 


In [17]:
# Acote de la ventana temporal -> 100 últimos años 

max_adm<-max(df_final$HOSP_DISCH, na.rm = TRUE)
cutoff <- max_adm - (100*365.25*24*60*60)  # en segundos porque es POSIXct
df <- df_final %>%
  filter(HOSP_ADMIT >= cutoff)

In [18]:
str(df)

'data.frame':	24387 obs. of  20 variables:
 $ SUBJECT_ID              : int  4 6 11 13 17 18 20 22 23 25 ...
 $ HADM_ID                 : int  185777 107064 194540 143045 194023 188822 157681 165315 152223 129635 ...
 $ ICU_ADMIT               : POSIXct, format: "2191-03-16 00:29:31" "2175-05-30 21:30:54" ...
 $ ICU_DISCH               : POSIXct, format: "2191-03-17 16:46:31" "2175-06-03 13:39:54" ...
 $ ICU_LOS_HOURS           :integer64 40 88 38 87 49 30 25 27 ... 
 $ HOSP_ADMIT              : POSIXct, format: "2191-03-16 00:28:00" "2175-05-30 07:15:00" ...
 $ HOSP_DISCH              : POSIXct, format: "2191-03-23 18:41:00" "2175-06-15 16:00:00" ...
 $ HOSP_LOS_HOURS          :integer64 186 392 612 164 104 52 125 27 ... 
 $ GENDER                  : chr  "F" "F" "F" "F" ...
 $ DOB                     : POSIXct, format: "2143-05-12" "2109-06-21" ...
 $ AGE                     : num  48 66 50 40 47 51 76 65 71 59 ...
 $ MORTALITY               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ ETHNICIT

No ha sido tan drástico el cambio en el numero de muestras, pero ya tenemos algo de rigor clínico actual en cuento estudios. Es decir, podemos asegurar que nuestro estudio se centra en la ventana de 100 años mas recientes. Si fuera un estudio real, esta ventana seria mucho mas pequeña, y podriamos argumentar la variabilidad en motivos de enfermedad a lo largo de los años. Una sociedad se mantiene viva y por ello todo cambia. Ejemplo: no podemos comparar la incidencia de enfermedades en el año 1800 que en 2026. Pero esque no hace falta ser tan exagerado, sino que solo con mirar del año 2000 a ahora, tendriamos un sesgo muy fuertemente marcado. Por no mencionar la pandemia del COVID. 

In [19]:
#AAqui lo que estoy haciendo e sunir el codigo CIE de cada uno de los tipos de ventilacion intrusiva (por eso es procedure) para poder diferenciar entre aquellos que, al final de su estancia en la UCI (por eso es importante coger esta table y no otras) , necesitaron ser intervenidos para asistir la respiración y lso que no. 
resp_support <- dbGetQuery(con, "
SELECT p.HADM_ID, p.ICD9_CODE, d.LONG_TITLE
FROM PROCEDURES_ICD p
JOIN D_ICD_PROCEDURES d ON p.ICD9_CODE = d.ICD9_CODE
WHERE LOWER(d.LONG_TITLE) LIKE '%ventilation%'
   OR LOWER(d.LONG_TITLE) LIKE '%intubation%'
")

In [20]:
table(resp_support$LONG_TITLE)


  Continuous invasive mechanical ventilation for 96 consecutive hours or more 
                                                                         6048 
Continuous invasive mechanical ventilation for less than 96 consecutive hours 
                                                                         9100 
           Continuous invasive mechanical ventilation of unspecified duration 
                                                                            6 
                                              Intubation of nasolacrimal duct 
                                                                            1 
                                          Non-invasive mechanical ventilation 
                                                                         2727 
                                        Other intubation of respiratory tract 
                                                                          759 

In [21]:
# Tambien creare una columan binaria en la que si han necesitado se les marcara como 1 y si no, o ha sido una interbvencion superficial, sera 0 w
df <- df %>%
  left_join(resp_support, by = "HADM_ID")%>%
    mutate(resp_procedure = if_else(
    is.na(LONG_TITLE) | trimws(LONG_TITLE) == "",
    0L,
    1L
  ))

In [22]:
table(df$resp_procedure)


    0     1 
16806  8518 

No tenemos tablas como tal de las puntuaciones así que la stenemos que calcular nosotros mismos 

In [23]:
# Buscar ITEMID por nombre de variable
dic_items<-dbGetQuery(con, "
SELECT ITEMID, LABEL, CATEGORY
FROM D_ITEMS
WHERE LABEL LIKE '%Heart Rate%'
   OR LABEL LIKE '%Respiratory Rate%'
   OR LABEL LIKE '%Temperature%'
   OR LABEL LIKE '%MAP%'
   OR LABEL LIKE '%Glasgow%'
")

In [24]:
print.data.frame(dic_items, row.names = FALSE)


 ITEMID                                    LABEL               CATEGORY
    211                               Heart Rate                   <NA>
    283                     INV#1 WaveformAppear                   <NA>
    284                     INV#2 WaveformAppear                   <NA>
    285                     INV#3 WaveformAppear                   <NA>
    286                     INV#4 WaveformAppear                   <NA>
    287                     INV#5 WaveformAppear                   <NA>
    288                     INV#6 WaveformAppear                   <NA>
    289                     INV#7 WaveformAppear                   <NA>
    290                     INV#8 WaveformAppear                   <NA>
    438                                      MAP                   <NA>
    591                        RLE [Temperature]                   <NA>
    597                        RUE [Temperature]                   <NA>
    618                         Respiratory Rate                

In [25]:
table(dic_items$CATEGORY)


                Alarms       Free Form Intake                   Labs 
                     2                      7                      1 
           Respiratory    Routine Vital Signs     Scores - APACHE II 
                     4                      5                      6 
Scores - APACHE IV (2)      Skin - Assessment             Toxicology 
                     3                      1                      1 

In [26]:
colnames(df)

yo creo que hasta aqui podemos tener buenos indicadores, falta SOFA score pero no se como lo deberiamos calcular,porque no encuentro la bilirrubina 

INFO: 
SOFA usa 6 sistemas:

Respiratorio → PaO₂/FiO₂ → ya tienes PaO2 y FiO2

Coagulación → plaquetas → no lo tienes aún

Hígado → Bilirrubina → tienes Bilirubin

Cardiovascular → MAP → tienes MapApacheIIScore o MAP_ApacheIV

Sistema nervioso → Glasgow → tienes Glasgow si lo añades

Renal → Creatinina → tienes Creatinine

“simplificado” significa que no estamos implementando todo el APACHE II o IV al detalle, sino que usamos solo las columnas que ya tienes en tu dataset y asignamos puntos básicos para generar un valor aproximado de la puntuación

## Dobutamine treatment 

In [28]:
dobutamine <- dbGetQuery(con, "
  SELECT
      ICUSTAY_ID,
      LINKORDERID,
      RATE AS vaso_rate,
      AMOUNT AS vaso_amount,
      STARTTIME,
      ENDTIME
  FROM INPUTEVENTS_MV
  WHERE ITEMID = 221653
")

In [29]:
dobutamine_24h <- dobutamine %>%
  left_join(
    dbGetQuery(con, "SELECT ICUSTAY_ID, INTIME FROM ICUSTAYS"),
    by = "ICUSTAY_ID"
  ) %>%
  filter(
    STARTTIME <= INTIME + hours(24)
  ) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(
    max_dobutamine_rate = max(vaso_rate, na.rm = TRUE),
    any_dobutamine = as.integer(any(vaso_rate > 0)),
    .groups = "drop"
  )

In [30]:
# Check data about dobutamine 
ls()
head(dobutamine_24h)
str(dobutamine_24h)

ICUSTAY_ID,max_dobutamine_rate,any_dobutamine
<int>,<dbl>,<int>
200586,20.008002,1
200977,0.500002,1
201098,5.286218,1
201234,2.501038,1
201906,2.50115,1
203665,2.50052,1


tibble [134 × 3] (S3: tbl_df/tbl/data.frame)
 $ ICUSTAY_ID         : int [1:134] 200586 200977 201098 201234 201906 203665 204615 204787 204829 207804 ...
 $ max_dobutamine_rate: num [1:134] 20.01 0.5 5.29 2.5 2.5 ...
 $ any_dobutamine     : int [1:134] 1 1 1 1 1 1 1 1 1 1 ...


So at this point we've achieved the following information: 
- Max. Dobutamine rate by patient in the first 24 h
- If there was Dobutamine treatment applied or not 

## Norepinephrine treatment 

In [31]:
norepinephrine <- dbGetQuery(con, "
  SELECT
      ICUSTAY_ID,
      LINKORDERID,
      CASE
        WHEN RATEUOM = 'mg/kg/min' AND PATIENTWEIGHT = 1 THEN RATE
        WHEN RATEUOM = 'mg/kg/min' THEN RATE * 1000.0
        ELSE RATE
      END AS vaso_rate,
      AMOUNT AS vaso_amount,
      STARTTIME,
      ENDTIME
  FROM INPUTEVENTS_MV
  WHERE ITEMID = 221906
")

In [32]:
norepi_24h <- norepinephrine %>%
  left_join(
    dbGetQuery(con, "SELECT ICUSTAY_ID, INTIME FROM ICUSTAYS"),
    by = "ICUSTAY_ID"
  ) %>%
  filter(
    STARTTIME <= INTIME + hours(24)
  ) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(
    max_norepi_rate = ifelse(
      all(is.na(vaso_rate)),
      NA_real_,
      max(vaso_rate, na.rm = TRUE)
    ),
    any_norepi = as.integer(any(vaso_rate > 0, na.rm = TRUE)),
    .groups = "drop"
  )


But since now, we have not the ICUSTAY_ID at our DataFrame, so we have to do the relationship: 

In [33]:
icustays <- dbGetQuery(con, "
  SELECT
    ICUSTAY_ID,
    HADM_ID,
    INTIME,
    OUTTIME
  FROM ICUSTAYS
")

But a HADM_ID can have multiple ICUSTAY_ID : "Reingresos a la UCI en una misma hospitalizacion". 
So, we have to get the first ICU STAY, like the most part of papers. 

In [34]:
icustays_1st <- dbGetQuery(con, "
  SELECT
    ICUSTAY_ID,
    HADM_ID,
    INTIME
  FROM ICUSTAYS
") %>%
  arrange(HADM_ID, INTIME) %>%
  group_by(HADM_ID) %>%
  slice(1) %>%
  ungroup()


In [35]:
df <- df %>%
  left_join(icustays_1st, by = "HADM_ID")

## Merge some treatments to our DF

In [36]:
df <- df %>%
  left_join(dobutamine_24h, by = "ICUSTAY_ID") %>%
  mutate(
    any_dobutamine = coalesce(any_dobutamine, 0),
    max_dobutamine_rate = coalesce(max_dobutamine_rate, 0)
  )


In [37]:
df <- df %>%
  left_join(norepi_24h, by = "ICUSTAY_ID") %>%
  mutate(
    any_norepi = coalesce(any_norepi, 0),
    max_norepi_rate = coalesce(max_norepi_rate, 0)
  )

## SOFA score (cardio) 

In [38]:
# MAP extraction 
dic_items <- dbGetQuery(con, "
  SELECT ITEMID, LABEL
  FROM D_ITEMS
  WHERE LABEL LIKE '%MAP_Apache%'
")
dic_items # here we can find more indicators!!!

ITEMID,LABEL
<int>,<chr>
227023,MAP_ApacheIV


In [39]:
# Traer todos los registros de MAP_ApacheIV
MAP <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS MAP_ApacheIV
  FROM CHARTEVENTS
  WHERE ITEMID = 227023
")

# Unir con ICUSTAYS para obtener INTIME
icustays <- dbGetQuery(con, "SELECT ICUSTAY_ID, INTIME FROM ICUSTAYS")

MAP_24h <- MAP %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%   # primeras 24h
  group_by(ICUSTAY_ID) %>%
  summarise(
    min_MAP_24h = min(MAP_ApacheIV, na.rm = TRUE),  # peor MAP
    .groups = "drop"
  )

# Unir con tu df
df <- df %>%
  left_join(MAP_24h, by = "ICUSTAY_ID")

In [40]:
#colnames(df)

In [41]:
df <- df %>%
  mutate(
    sofa_cv = case_when(
      max_norepi_rate > 0.1 ~ 4,
      max_norepi_rate > 0   ~ 3,
      any_dobutamine == 1   ~ 2,
      min_MAP_24h < 70     ~ 1,  # usando ApacheIV directamente
      TRUE                  ~ 0
    )
  )

In [42]:
table(df$sofa_cv)


    0     2     3     4 
24123    31   322   848 

## Baseline BMI 

In [52]:
bmi_query <- "
WITH weight_first_day AS (
    SELECT
        ie.icustay_id,
        AVG(CASE WHEN weight_type = 'admit' THEN ce.weight ELSE NULL END) AS weight_admit
    FROM mimiciv_icu.icustays ie
    LEFT JOIN mimiciv_derived.weight_durations ce
        ON ie.icustay_id = ce.stay_id
        AND ce.starttime <= ie.intime + INTERVAL 1 DAY
    GROUP BY ie.icustay_id
),
height_first AS (
    SELECT
        icustay_id,
        FIRST_VALUE(VALUENUM) OVER (PARTITION BY icustay_id ORDER BY charttime) AS height_init
    FROM mimiciv_icu.chartevents
    WHERE ITEMID IN (<ITEMID_altura>)
      AND VALUENUM IS NOT NULL
)
SELECT
    w.icustay_id,
    w.weight_admit,
    h.height_init,
    w.weight_admit / POWER(h.height_init/100, 2) AS BMI
FROM weight_first_day w
LEFT JOIN height_first h
    ON w.icustay_id = h.icustay_id
"

# Ejecutar query
bmi_df <- dbGetQuery(con, bmi_query)

# Unir al df principal
df <- df %>%
  left_join(bmi_df %>% select(icustay_id, BMI), by = "icustay_id")

ERROR: Error: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '<ITEMID_altura>)  -- reemplaza con tus ITEMID de altura
      AND VALUENUM IS...' at line 16 [1064]


In [53]:
SELECT DISTINCT itemid, label
FROM mimiciv_icu.d_items
WHERE LOWER(label) LIKE '%height%'
   OR LOWER(label) LIKE '%tall%'
ORDER BY label;

ERROR: Error in parse(text = x, srcfile = src): <text>:1:8: unexpected symbol
1: SELECT DISTINCT
           ^


In [44]:
 ITEMID IN (<ID_ALTURA>)
")

ERROR: Error: You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near '<ID_PESO>)' at line 3 [1064]


In [None]:
# Baseline weigh and height
weight_initial <- weight %>%
  group_by(ICUSTAY_ID) %>%
  summarise(weight_kg = first(weight_kg[!is.na(weight_kg)]), .groups = "drop")

height_initial <- height %>%
  group_by(ICUSTAY_ID) %>%
  summarise(height_cm = first(height_cm[!is.na(height_cm)]), .groups = "drop")

In [None]:
# Compute BMI 
bmi_df <- weight_initial %>%
  left_join(height_initial, by = "ICUSTAY_ID") %>%
  mutate(
    height_m = height_cm / 100,
    BMI = weight_kg / (height_m^2)
  )

In [None]:
# Merge BMI 
df <- df %>%
  left_join(bmi_df %>% select(ICUSTAY_ID, BMI), by = "ICUSTAY_ID")

## IN DEVELOPMENT 

In [None]:

## -> STILL WE DON'T HAVE CALCULATED THEM 

# ---------- Creatinina / Urine output - Renal ----------
Creat <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS Creatinine_ApacheIV
  FROM CHARTEVENTS
  WHERE ITEMID = 227005
")

Urine <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS `Urine output_ApacheIV`
  FROM CHARTEVENTS
  WHERE ITEMID = 227519
")

Creat_Urine_24h <- Creat %>%
  left_join(Urine, by = "ICUSTAY_ID") %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME.x <= INTIME + hours(24) | CHARTTIME.y <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(
    max_Creat = max(Creatinine_ApacheIV, na.rm = TRUE),
    total_urine = sum(`Urine output_ApacheIV`, na.rm = TRUE),
    .groups = "drop"
  )

df <- df %>% left_join(Creat_Urine_24h, by = "ICUSTAY_ID")

# ---------- Bilirrubina - Hepático ----------
Bili <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS Bilirubin_ApacheIV
  FROM CHARTEVENTS
  WHERE ITEMID = 226998
")

Bili_24h <- Bili %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(
    max_Bili = max(Bilirubin_ApacheIV, na.rm = TRUE),
    .groups = "drop"
  )

df <- df %>% left_join(Bili_24h, by = "ICUSTAY_ID")

# ---------- GCS - SNC ----------
GCS <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS GcsScore_ApacheIV
  FROM CHARTEVENTS
  WHERE ITEMID =227013
")

GCS_24h <- GCS %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(
    min_GCS = min(GcsScore_ApacheIV, na.rm = TRUE),
    .groups = "drop"
  )

df <- df %>% left_join(GCS_24h, by = "ICUSTAY_ID")



In [None]:
# We get information from chartevents 
# Platelets
platelets <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS platelet
  FROM CHARTEVENTS
  WHERE ITEMID IN (828, 30006, 30105, 225170)
")

# PaO2
pao2 <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS pao2
  FROM CHARTEVENTS
  WHERE ITEMID IN (490, 779)
")

# FiO2
fio2 <- dbGetQuery(con, "
  SELECT ICUSTAY_ID, CHARTTIME, VALUENUM AS fio2
  FROM CHARTEVENTS
  WHERE ITEMID IN (185,186,189,190,191,727,3420,3421,3422,
                   1040,1206,5955,1863,2518,2981,7018,7041,7570,
                   8517,226754,227009,227010)
")

In [None]:
# Traer INTIME de ICUSTAYS
icustays <- dbGetQuery(con, "SELECT ICUSTAY_ID, INTIME FROM ICUSTAYS")

# Plaquetas - mínimo en 24h (peor coagulación)
platelets_24h <- platelets %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(min_platelet_24h = min(platelet, na.rm = TRUE), .groups = "drop")

# PaO2 - mínimo en 24h
pao2_24h <- pao2 %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(min_pao2_24h = min(pao2, na.rm = TRUE), .groups = "drop")

# FiO2 - máximo en 24h
fio2_24h <- fio2 %>%
  left_join(icustays, by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID) %>%
  summarise(max_fio2_24h = max(fio2, na.rm = TRUE), .groups = "drop")

In [None]:
resp_24h <- pao2_24h %>%
  inner_join(fio2_24h, by = "ICUSTAY_ID") %>%
  mutate(pao2fio2_min = min_pao2_24h / (max_fio2_24h/100))  # PaO2/FiO2 ratio


In [None]:
df <- df %>%
  left_join(platelets_24h, by = "ICUSTAY_ID") %>%
  left_join(resp_24h %>% select(ICUSTAY_ID, pao2fio2_min), by = "ICUSTAY_ID")

In [None]:
df <- df %>%
  mutate(
    # Coagulación
    sofa_coag = case_when(
      min_platelet_24h < 20 ~ 4,
      min_platelet_24h < 50 ~ 3,
      min_platelet_24h < 100 ~ 2,
      min_platelet_24h < 150 ~ 1,
      is.na(min_platelet_24h) ~ NA_real_,
      TRUE ~ 0
    ),
    # Respiratorio
    sofa_resp = case_when(
      pao2fio2_min < 100 ~ 4,
      pao2fio2_min < 200 ~ 3,
      pao2fio2_min < 300 ~ 2,
      pao2fio2_min < 400 ~ 1,
      is.na(pao2fio2_min) ~ NA_real_,
      TRUE ~ 0
    )
  )
