#### Loading libraries

In [2]:
library(dplyr)
library(tidyr)
library(tibble)
library(lubridate)
library(readr)
library(stringr)
library(ggplot2)
library(data.table)
library(DBI)
library(RSQLite)

#### Loading CSV and creating SQL database

In [3]:
# Load CSV data
df_a3 <- read_csv("/Users/arriazui/Desktop/master/ELECTRONIC_HEALTH_RECORDS/resources_p3/df_a3.csv")

# Create in-memory SQLite database
con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Write the dataframe to SQL table
dbWriteTable(con, "df_a3", df_a3, overwrite = TRUE)

[1mRows: [22m[34m25324[39m [1mColumns: [22m[34m23[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m   (4): GENDER, ETHNICITY_GROUP, diagnosis_group, LONG_TITLE
[32mdbl[39m  (14): SUBJECT_ID, HADM_ID, ICU_LOS_HOURS, HOSP_LOS_HOURS, AGE, MORTALIT...
[34mdttm[39m  (4): ICU_ADMIT, ICU_DISCH, HOSP_ADMIT, HOSP_DISCH
[34mdate[39m  (1): DOB

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Check the structure and preview of the loaded CSV data

In [None]:
# Check what's in the database
dbListTables(con)

# Check structure of the df_a3 table
str(df_a3)

# Preview the data
head(df_a3)

### Data selection
#### First filters: 
1. Age at admission [18 - 85] y.
2. More than 24 hours at ICU

Then we add some other demographic information 

### 1. Get first ICU stay per patient 

In [5]:
sql_first_icu <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    MIN(icu.INTIME) AS ICU_ADMIT
FROM ICUSTAYS icu
GROUP BY icu.SUBJECT_ID
;
"

df_first_icu <- dbGetQuery(con, sql_first_icu)

ERROR: Error: no such table: ICUSTAYS


### 2. Get detailed information for first ICU stay

In [None]:
sql_icu_full <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    icu.INTIME AS ICU_ADMIT,
    icu.OUTTIME AS ICU_DISCH,
    TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) AS ICU_LOS_HOURS,
    a.ADMITTIME AS HOSP_ADMIT,
    a.DISCHTIME AS HOSP_DISCH,
    TIMESTAMPDIFF(HOUR, a.ADMITTIME, a.DISCHTIME) AS HOSP_LOS_HOURS,
    p.GENDER,
    a.ETHNICITY,
    p.DOB,
    p.DOD
FROM ICUSTAYS icu
JOIN ADMISSIONS a ON icu.HADM_ID = a.HADM_ID
JOIN PATIENTS p ON icu.SUBJECT_ID = p.SUBJECT_ID
JOIN (
    SELECT SUBJECT_ID, MIN(INTIME) AS FIRST_ICU
    FROM ICUSTAYS
    GROUP BY SUBJECT_ID
) first_icu ON icu.SUBJECT_ID = first_icu.SUBJECT_ID AND icu.INTIME = first_icu.FIRST_ICU
WHERE TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) >= 24
;
"

df_first_icu_full <- dbGetQuery(con, sql_icu_full)

### 3. Data cleaning and feature creation

In [None]:
df_clean <- df_first_icu_full %>%
  
  # Calculate age at ICU admission
  mutate(AGE = as.numeric(format(ICU_ADMIT, "%Y")) - as.numeric(format(DOB, "%Y"))) %>%
  
  # Keep only adult patients between 18 and 85 years
  filter(AGE >= 18 & AGE <= 85) %>%
  
  # Define mortality: 1 if patient died before or during hospital discharge
  mutate(MORTALITY = ifelse(!is.na(DOD) & DOD <= HOSP_DISCH, 1, 0)) %>%
  
  # Remove missing or unknown gender/ethnicity
  filter(
    !is.na(GENDER), !is.na(ETHNICITY),
    GENDER != "UNKNOWN", ETHNICITY != "UNKNOWN/NOT SPECIFIED"
  ) %>%
  
  # Group ethnicity into major categories
  mutate(
    ETHNICITY_GROUP = case_when(
      grepl("^WHITE", ETHNICITY) ~ "WHITE",
      grepl("^BLACK", ETHNICITY) ~ "BLACK",
      grepl("^HISPANIC", ETHNICITY) ~ "HISPANIC",
      grepl("^ASIAN", ETHNICITY) ~ "ASIAN",
      TRUE ~ "OTHER"
    ),
    ETHNICITY_GROUP = factor(ETHNICITY_GROUP, levels = c("WHITE", "BLACK", "HISPANIC", "ASIAN", "OTHER"))
  )


### 4. Check cleaned data

In [None]:
table(df_clean$GENDER)
table(df_clean$ETHNICITY_GROUP)
dim(df_clean)
head(df_clean)


    F     M 
10997 15507 


   WHITE    BLACK HISPANIC    ASIAN    OTHER 
   20697     2303     1008      703     1793 

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,DOB,DOD,AGE,MORTALITY,ETHNICITY_GROUP
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,<dttm>,<dttm>,<dbl>,<dbl>,<fct>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,2025-04-11,2102-06-14,76,0,WHITE
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,2143-05-12,,48,0,WHITE
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,2109-06-21,,66,0,WHITE
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,2128-02-22,2178-11-14,50,0,WHITE
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,2032-03-24,2104-08-20,72,1,WHITE
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,2127-02-27,,40,0,WHITE


## Phase 1 of ICU Data Cleaning and Feature Engineering – Summary

**So far, we have:**

1. **Selected the first ICU admission per patient**  
   - For each patient, we keep only their **first ICU stay**.

2. **Joined the ICU, Admissions, and Patients tables**  
   - This allows us to have **demographics** (gender, date of birth, ethnicity) and **admission/discharge times** for both the ICU and the hospital.  
   - We also calculate **lengths of stay (LOS) in hours** for the ICU and hospital.

3. **Calculated age at ICU admission and filtered adult patients (18–85 years old)**  
   - Ensures the analysis focuses on **adult patients** only.

4. **Created a mortality flag**  
   - `MORTALITY = 1` if the patient **died before or during hospital discharge**, otherwise `0`.

5. **Cleaned missing or unknown gender/ethnicity values**  
   - Removes entries with **unknown or missing gender or ethnicity** to ensure data quality.

6. **Grouped ethnicity into five categories and converted to a factor**  
   - Categories: `WHITE`, `BLACK`, `HISPANIC`, `ASIAN`, `OTHER`.

7. **Checked the cleaned dataset**  
   - Verified **tables of gender and ethnicity**, **dimensions**, and a **preview of the first rows**.


# ICU Cohort – Adding Comorbidity Flags

This section adds **binary comorbidity flags** to the existing cleaned ICU cohort. 
The flags are derived from ICD-9 diagnosis descriptions.
## 1. SQL query to extract comorbidities

In [None]:
# ---------------------------------------------
# 1. Traer tablas de diagnósticos desde la base de datos
# ---------------------------------------------
dx <- dbGetQuery(con, "SELECT * FROM DIAGNOSES_ICD")
d_icd_diagnoses <- dbGetQuery(con, "SELECT * FROM D_ICD_DIAGNOSES")

# ---------------------------------------------
# 2. Filtrar solo los HADM_ID de nuestra cohorte
# ---------------------------------------------
df_diag <- dx %>%
  filter(HADM_ID %in% df_clean$HADM_ID) %>%
  left_join(d_icd_diagnoses, by = c("ICD9_CODE" = "ICD9_CODE"))

# ---------------------------------------------
# 3. Crear flags de comorbilidades agrupando por HADM_ID
# ---------------------------------------------
df_flags <- df_diag %>%
  group_by(HADM_ID) %>%
  summarise(
    flag_diabetes = max(grepl("diabetes", tolower(LONG_TITLE))),
    flag_hypertension = max(grepl("hypertension|high blood pressure", tolower(LONG_TITLE))),
    flag_ckd = max(grepl("chronic kidney|renal failure|kidney failure", tolower(LONG_TITLE))),
    flag_chf = max(grepl("heart failure|congestive heart", tolower(LONG_TITLE))),
    flag_copd = max(grepl("copd|chronic obstructive|emphysema|chronic bronchitis", tolower(LONG_TITLE))),
    flag_cancer = max(grepl("malignan|cancer|carcinoma|neoplasm|tumor", tolower(LONG_TITLE)))
  )

# ---------------------------------------------
# 4. Unir los flags con df_clean usando HADM_ID
# ---------------------------------------------
df_final <- df_clean %>%
  left_join(df_flags, by = "HADM_ID")

# ---------------------------------------------
# 5. Revisar resultado
# ---------------------------------------------
head(df_final)
table(df_final$flag_diabetes, useNA = "ifany")
table(df_final$flag_hypertension, useNA = "ifany")
table(df_final$flag_ckd, useNA = "ifany")


Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,⋯,DOD,AGE,MORTALITY,ETHNICITY_GROUP,flag_diabetes,flag_hypertension,flag_ckd,flag_chf,flag_copd,flag_cancer
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,⋯,<dttm>,<dbl>,<dbl>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,⋯,2102-06-14,76,0,WHITE,0,0,1,1,0,0
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,⋯,,48,0,WHITE,0,0,0,0,0,0
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,⋯,,66,0,WHITE,0,0,1,0,0,0
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,⋯,2178-11-14,50,0,WHITE,0,0,0,0,0,1
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,⋯,2104-08-20,72,1,WHITE,0,1,0,0,0,1
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,⋯,,40,0,WHITE,1,1,0,0,0,0



    0     1 
19405  7099 


    0     1 
14409 12095 


    0     1 
19434  7070 

In [None]:
# Elimino etnia porque ya tengo una columna agrupada, DOD porque me interesan las horas, no solo la fecha de Discharge, lo mismo con Data Birth, me interesaba calcular los años en ese momento no la fecha como tal 
# Todo este proceso es implicito en la conversion de data -> knowledge!

df_final <- df_final %>%
    select(-c(ETHNICITY, DOD))%>%
    rename(
        diabetes_comorbidity = flag_diabetes, 
        hypertension_comorbidity = flag_hypertension, 
        ckd_comorbidity = flag_ckd, 
        chf_comorbidity = flag_chf, 
        copd_comorbidity = flag_copd, 
        cancer_comorbidity = flag_cancer)

In [None]:
colnames(df_final)

In [None]:
diagnosis_group <- dbGetQuery(con, "
SELECT
  HADM_ID,
  CASE
    -- Cardiovascular
    WHEN LOWER(DIAGNOSIS) LIKE '%coronary%'
      OR LOWER(DIAGNOSIS) LIKE '%myocard%'
      OR LOWER(DIAGNOSIS) LIKE '%angina%'
      OR LOWER(DIAGNOSIS) LIKE '%cardiac%'
      OR LOWER(DIAGNOSIS) LIKE '%chf%'
      OR LOWER(DIAGNOSIS) LIKE '%valve%'
      OR LOWER(DIAGNOSIS) LIKE '%tachy%'
      OR LOWER(DIAGNOSIS) LIKE '%brady%' 
      THEN 'cardiovascular'

    -- Neurological
    WHEN LOWER(DIAGNOSIS) LIKE '%cva%'
      OR LOWER(DIAGNOSIS) LIKE '%seiz%'
      OR LOWER(DIAGNOSIS) LIKE '%brain%'
      OR LOWER(DIAGNOSIS) LIKE '%head%'
      OR LOWER(DIAGNOSIS) LIKE '%spinal%'
      OR LOWER(DIAGNOSIS) LIKE '%cereb%'
      THEN 'neurological'

    -- Infectious
    WHEN LOWER(DIAGNOSIS) LIKE '%sepsis%'
      OR LOWER(DIAGNOSIS) LIKE '%pneumon%'
      OR LOWER(DIAGNOSIS) LIKE '%infection%'
      THEN 'infectious'

    -- Renal
    WHEN LOWER(DIAGNOSIS) LIKE '%renal%'
      OR LOWER(DIAGNOSIS) LIKE '%arf%'
      THEN 'renal'

    -- Respiratory
    WHEN LOWER(DIAGNOSIS) LIKE '%respir%'
      OR LOWER(DIAGNOSIS) LIKE '%copd%'
      OR LOWER(DIAGNOSIS) LIKE '%tracheal%'
      OR LOWER(DIAGNOSIS) LIKE '%pulmonary%'
      THEN 'respiratory'

    -- Oncology
    WHEN LOWER(DIAGNOSIS) LIKE '%cancer%'
      OR LOWER(DIAGNOSIS) LIKE '%lymphoma%'
      OR LOWER(DIAGNOSIS) LIKE '%tumor%'
      OR LOWER(DIAGNOSIS) LIKE '%hemangioma%'
      THEN 'oncology'

    -- Trauma / Musculoskeletal
    WHEN LOWER(DIAGNOSIS) LIKE '%fracture%'
      OR LOWER(DIAGNOSIS) LIKE '%hernia%'
      OR LOWER(DIAGNOSIS) LIKE '%spinal%'
      OR LOWER(DIAGNOSIS) LIKE '%post-operative%'
      OR LOWER(DIAGNOSIS) LIKE '%sda%'
      THEN 'trauma'

    -- Gastrohepatic / Liver / Biliary
    WHEN LOWER(DIAGNOSIS) LIKE '%liver%'
      OR LOWER(DIAGNOSIS) LIKE '%cirrhosis%'
      OR LOWER(DIAGNOSIS) LIKE '%biliary%'
      OR LOWER(DIAGNOSIS) LIKE '%pancreatic%'
      OR LOWER(DIAGNOSIS) LIKE '%chole%'
      THEN 'gastrohepatic'

    -- Metabolic / Endocrine
    WHEN LOWER(DIAGNOSIS) LIKE '%hypoglycem%'
      OR LOWER(DIAGNOSIS) LIKE '%hyperglycem%'
      OR LOWER(DIAGNOSIS) LIKE '%dehydration%'
      OR LOWER(DIAGNOSIS) LIKE '%failure to thrive%'
      THEN 'metabolic'

    -- Psychiatric / Toxicology
    WHEN LOWER(DIAGNOSIS) LIKE '%overdose%'
      OR LOWER(DIAGNOSIS) LIKE '%withdrawal%'
      OR LOWER(DIAGNOSIS) LIKE '%assault%'
      THEN 'psychiatric'

    -- Neonatal
    WHEN LOWER(DIAGNOSIS) LIKE '%newborn%'
      THEN 'neonatal'

    -- Hematologic / Other
    WHEN LOWER(DIAGNOSIS) LIKE '%splenomegaly%'
      OR LOWER(DIAGNOSIS) LIKE '%anemia%'
      THEN 'hematologic'

    -- Catch-all (todo lo demás)
    ELSE 'other'
  END AS diagnosis_group
FROM ADMISSIONS
")


In [None]:
table(diagnosis_group$diagnosis_group)


cardiovascular  gastrohepatic    hematologic     infectious      metabolic 
          8294            938            276           4508            488 
      neonatal   neurological       oncology          other    psychiatric 
          7823           2163            821          26245            654 
         renal    respiratory         trauma 
          1095           1893           3778 

In [None]:
str(df_final)

'data.frame':	26504 obs. of  20 variables:
 $ SUBJECT_ID              : int  3 4 6 11 12 13 17 18 20 22 ...
 $ HADM_ID                 : int  145834 185777 107064 194540 112213 143045 194023 188822 157681 165315 ...
 $ ICU_ADMIT               : POSIXct, format: "2101-10-20 19:10:11" "2191-03-16 00:29:31" ...
 $ ICU_DISCH               : POSIXct, format: "2101-10-26 20:43:09" "2191-03-17 16:46:31" ...
 $ ICU_LOS_HOURS           :integer64 145 40 88 38 183 87 49 30 ... 
 $ HOSP_ADMIT              : POSIXct, format: "2101-10-20 19:08:00" "2191-03-16 00:28:00" ...
 $ HOSP_DISCH              : POSIXct, format: "2101-10-31 13:58:00" "2191-03-23 18:41:00" ...
 $ HOSP_LOS_HOURS          :integer64 258 186 392 612 304 164 104 52 ... 
 $ GENDER                  : chr  "M" "F" "F" "F" ...
 $ DOB                     : POSIXct, format: "2025-04-11" "2143-05-12" ...
 $ AGE                     : num  76 48 66 50 72 40 47 51 76 65 ...
 $ MORTALITY               : num  0 0 0 0 1 0 0 0 0 0 ...
 $ ETHNIC

In [None]:
# Se debe añladir con el hospitial admission id, dado que si se hace con el subject id el datafarame duplicara cada uan de las filas por cada vez que es apersona tenga registro de ingreos 
df_final <- df_final %>%
  left_join(diagnosis_group, by = "HADM_ID")

In [None]:
table(df_final$diagnosis_group)


cardiovascular  gastrohepatic    hematologic     infectious      metabolic 
          5255            540            111           1820            208 
  neurological       oncology          other    psychiatric          renal 
          1144            481          13126            376            554 
   respiratory         trauma 
           875           2014 

In [None]:
min(df_final$HOSP_ADMIT, na.rm = TRUE)
max(df_final$HOSP_DISCH, na.rm = TRUE)


[1] "2100-06-09 01:39:00 UTC"

[1] "2208-08-25 14:59:00 UTC"

Debemos acotar la ventana temporal, no tene sentido hacer un estudio a lo largo de 100 años ( o sí, al ser una simulación, podemos jugar con esto, pero como queremos ser un poco fieles, pongamos que queremos lso 100 años exactos más recientes) 


In [None]:
# Acote de la ventana temporal -> 100 últimos años 

max_adm<-max(df_final$HOSP_DISCH, na.rm = TRUE)
cutoff <- max_adm - (100*365.25*24*60*60)  # en segundos porque es POSIXct
df <- df_final %>%
  filter(HOSP_ADMIT >= cutoff)

In [None]:
str(df)

'data.frame':	24387 obs. of  20 variables:
 $ SUBJECT_ID              : int  4 6 11 13 17 18 20 22 23 25 ...
 $ HADM_ID                 : int  185777 107064 194540 143045 194023 188822 157681 165315 152223 129635 ...
 $ ICU_ADMIT               : POSIXct, format: "2191-03-16 00:29:31" "2175-05-30 21:30:54" ...
 $ ICU_DISCH               : POSIXct, format: "2191-03-17 16:46:31" "2175-06-03 13:39:54" ...
 $ ICU_LOS_HOURS           :integer64 40 88 38 87 49 30 25 27 ... 
 $ HOSP_ADMIT              : POSIXct, format: "2191-03-16 00:28:00" "2175-05-30 07:15:00" ...
 $ HOSP_DISCH              : POSIXct, format: "2191-03-23 18:41:00" "2175-06-15 16:00:00" ...
 $ HOSP_LOS_HOURS          :integer64 186 392 612 164 104 52 125 27 ... 
 $ GENDER                  : chr  "F" "F" "F" "F" ...
 $ DOB                     : POSIXct, format: "2143-05-12" "2109-06-21" ...
 $ AGE                     : num  48 66 50 40 47 51 76 65 71 59 ...
 $ MORTALITY               : num  0 0 0 0 0 0 0 0 0 0 ...
 $ ETHNICIT

No ha sido tan drástico el cambio en el numero de muestras, pero ya tenemos algo de rigor clínico actual en cuento estudios. Es decir, podemos asegurar que nuestro estudio se centra en la ventana de 100 años mas recientes. Si fuera un estudio real, esta ventana seria mucho mas pequeña, y podriamos argumentar la variabilidad en motivos de enfermedad a lo largo de los años. Una sociedad se mantiene viva y por ello todo cambia. Ejemplo: no podemos comparar la incidencia de enfermedades en el año 1800 que en 2026. Pero esque no hace falta ser tan exagerado, sino que solo con mirar del año 2000 a ahora, tendriamos un sesgo muy fuertemente marcado. Por no mencionar la pandemia del COVID. 

In [None]:
#AAqui lo que estoy haciendo e sunir el codigo CIE de cada uno de los tipos de ventilacion intrusiva (por eso es procedure) para poder diferenciar entre aquellos que, al final de su estancia en la UCI (por eso es importante coger esta table y no otras) , necesitaron ser intervenidos para asistir la respiración y lso que no. 
resp_support <- dbGetQuery(con, "
SELECT p.HADM_ID, p.ICD9_CODE, d.LONG_TITLE
FROM PROCEDURES_ICD p
JOIN D_ICD_PROCEDURES d ON p.ICD9_CODE = d.ICD9_CODE
WHERE LOWER(d.LONG_TITLE) LIKE '%ventilation%'
   OR LOWER(d.LONG_TITLE) LIKE '%intubation%'
")

In [None]:
table(resp_support$LONG_TITLE)


  Continuous invasive mechanical ventilation for 96 consecutive hours or more 
                                                                         6048 
Continuous invasive mechanical ventilation for less than 96 consecutive hours 
                                                                         9100 
           Continuous invasive mechanical ventilation of unspecified duration 
                                                                            6 
                                              Intubation of nasolacrimal duct 
                                                                            1 
                                          Non-invasive mechanical ventilation 
                                                                         2727 
                                        Other intubation of respiratory tract 
                                                                          759 

In [None]:
# Tambien creare una columan binaria en la que si han necesitado se les marcara como 1 y si no, o ha sido una interbvencion superficial, sera 0 w
df <- df %>%
  left_join(resp_support, by = "HADM_ID")%>%
    mutate(resp_procedure = if_else(
    is.na(LONG_TITLE) | trimws(LONG_TITLE) == "",
    0L,
    1L
  ))

In [None]:
table(df$resp_procedure)


    0     1 
16806  8518 

No tenemos tablas como tal de las puntuaciones así que la stenemos que calcular nosotros mismos 

In [None]:
# Buscar ITEMID por nombre de variable
dic_items<-dbGetQuery(con, "
SELECT ITEMID, LABEL, CATEGORY
FROM D_ITEMS
WHERE LABEL LIKE '%Heart Rate%'
   OR LABEL LIKE '%Respiratory Rate%'
   OR LABEL LIKE '%Temperature%'
   OR LABEL LIKE '%MAP%'
   OR LABEL LIKE '%Glasgow%'
")

In [None]:
print.data.frame(dic_items, row.names = FALSE)


 ITEMID                                    LABEL               CATEGORY
    211                               Heart Rate                   <NA>
    283                     INV#1 WaveformAppear                   <NA>
    284                     INV#2 WaveformAppear                   <NA>
    285                     INV#3 WaveformAppear                   <NA>
    286                     INV#4 WaveformAppear                   <NA>
    287                     INV#5 WaveformAppear                   <NA>
    288                     INV#6 WaveformAppear                   <NA>
    289                     INV#7 WaveformAppear                   <NA>
    290                     INV#8 WaveformAppear                   <NA>
    438                                      MAP                   <NA>
    591                        RLE [Temperature]                   <NA>
    597                        RUE [Temperature]                   <NA>
    618                         Respiratory Rate                

In [None]:
table(dic_items$CATEGORY)


                Alarms       Free Form Intake                   Labs 
                     2                      7                      1 
           Respiratory    Routine Vital Signs     Scores - APACHE II 
                     4                      5                      6 
Scores - APACHE IV (2)      Skin - Assessment             Toxicology 
                     3                      1                      1 

In [None]:
# Filtrar solo los ITEMID que necesitamos
vital_itemids <- c(220045, 220210, 223761, 438)  # Heart Rate, Respiratory Rate, Temperature, MAP

icu_vitals <- dbGetQuery(con, paste0("
SELECT SUBJECT_ID, HADM_ID, ICUSTAY_ID, ITEMID, VALUENUM
FROM CHARTEVENTS
WHERE ITEMID IN (", paste(vital_itemids, collapse=","), ")
"))


In [None]:
library(dplyr)
library(tidyr)

icu_vitals_wide <- icu_vitals %>%
  group_by(ICUSTAY_ID, ITEMID) %>%
  summarise(value = first(VALUENUM), .groups = 'drop') %>%
  pivot_wider(names_from = ITEMID, values_from = value)

colnames(icu_vitals_wide) <- c("ICUSTAY_ID", "Heart_Rate", "Resp_Rate", "Temperature", "MAP")

In [None]:
apache_itemids <- c(226329, 226765, 226771, 226772)  # Ejemplo MAP y Potassium APACHE II

icu_apache <- dbGetQuery(con, paste0("
SELECT SUBJECT_ID, HADM_ID, ICUSTAY_ID, ITEMID, VALUENUM
FROM CHARTEVENTS
WHERE ITEMID IN (", paste(apache_itemids, collapse=","), ")
"))

In [None]:
lab_itemids <- c(
  220227, # PaO2
  223835, # FiO2
  50885,  # Bilirubin -> No es este codigo pero imaginaoslo 
  50912,  # Creatinine -> tambien hay que buscar este, o estaria bien 
  51265   # Platelets -> est etb 
)

icu_labs <- dbGetQuery(con, paste0("
SELECT l.HADM_ID, ic.ICUSTAY_ID, l.ITEMID, l.VALUENUM
FROM LABEVENTS l
JOIN D_LABITEMS li ON l.ITEMID = li.ITEMID
JOIN ICUSTAYS ic ON l.HADM_ID = ic.HADM_ID
WHERE l.ITEMID IN (", paste(lab_itemids, collapse=","), ")
"))

In [None]:
icu_labs_24h <- icu_labs %>%
  left_join(
    dbGetQuery(con, "SELECT ICUSTAY_ID, INTIME FROM ICUSTAYS"),
    by = "ICUSTAY_ID"
  )

In [None]:
icu_labs <- dbGetQuery(con, paste0("
SELECT l.HADM_ID, ic.ICUSTAY_ID, l.ITEMID, l.VALUENUM, l.CHARTTIME
FROM LABEVENTS l
JOIN D_LABITEMS li ON l.ITEMID = li.ITEMID
JOIN ICUSTAYS ic ON l.HADM_ID = ic.HADM_ID
WHERE l.ITEMID IN (", paste(lab_itemids, collapse=","), ")
"))

In [None]:
# Medi ade lso indicadores sanguineos en las primeras 24 horas 
library(lubridate)
icu_labs_24h <- icu_labs %>%
  left_join(dbGetQuery(con, "SELECT HADM_ID, ICUSTAY_ID, INTIME FROM ICUSTAYS"), by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID, ITEMID) %>%
  summarise(value = mean(VALUENUM, na.rm = TRUE), .groups = 'drop') %>%
  pivot_wider(names_from = ITEMID, values_from = value)

colnames(icu_labs_24h) <- c("HADM_ID","ICUSTAY_ID", "PaO2", "FiO2", "Bilirubin", "Creatinine", "Platelets")

In [None]:
df <- df %>%
  left_join(icu_labs_24h, by = "HADM_ID")

In [None]:
colnames(df)

In [None]:
# Otras variables interesantes a añadir (signos vitales, scores, los qu ehe encontrado) 

# Guardamos los ITEMID de interés (basados en lo que obtuviste)
vital_itemids <- c(226765, 226766, 227023, 227024, 227054)  # completa con todos los que quieras

# Extraemos los registros de los ítems fisiológicos
icu_vitals <- dbGetQuery(con, paste0("
SELECT l.HADM_ID, ic.ICUSTAY_ID, l.ITEMID, l.VALUENUM, l.CHARTTIME
FROM CHARTEVENTS l
JOIN D_ITEMS li ON l.ITEMID = li.ITEMID
JOIN ICUSTAYS ic ON l.HADM_ID = ic.HADM_ID
WHERE l.ITEMID IN (", paste(vital_itemids, collapse=","), ")
"))

# Calculamos la media en las primeras 24 horas
library(lubridate)
icu_vitals_24h <- icu_vitals %>%
  left_join(dbGetQuery(con, "SELECT HADM_ID, ICUSTAY_ID, INTIME FROM ICUSTAYS"), by = "ICUSTAY_ID") %>%
  filter(CHARTTIME <= INTIME + hours(24)) %>%
  group_by(ICUSTAY_ID, ITEMID) %>%
  summarise(value = mean(VALUENUM, na.rm = TRUE), .groups = 'drop') %>%
  pivot_wider(names_from = ITEMID, values_from = value)

# Renombramos columnas a algo legible
colnames(icu_vitals_24h) <- c(
  "ICUSTAY_ID",
  "MapApacheIIScore", 
  "MapApacheIIValue", 
  "MAP_ApacheIV", 
  "MapScore_ApacheIV", 
  "TemperatureF_ApacheIV"
)

# Unimos con tu dataframe principal
df <- df %>%
  left_join(icu_vitals_24h, by = "ICUSTAY_ID")


In [None]:
colnames(df)

yo creo que hasta aqui podemos tener buenos indicadores, falta SOFA score pero no se como lo deberiamos calcular,porque no encuentro la bilirrubina 

INFO: 
SOFA usa 6 sistemas:

Respiratorio → PaO₂/FiO₂ → ya tienes PaO2 y FiO2

Coagulación → plaquetas → no lo tienes aún

Hígado → Bilirrubina → tienes Bilirubin

Cardiovascular → MAP → tienes MapApacheIIScore o MAP_ApacheIV

Sistema nervioso → Glasgow → tienes Glasgow si lo añades

Renal → Creatinina → tienes Creatinine

“simplificado” significa que no estamos implementando todo el APACHE II o IV al detalle, sino que usamos solo las columnas que ya tienes en tu dataset y asignamos puntos básicos para generar un valor aproximado de la puntuación

In [None]:
df <- df %>%
  # APACHE II 
  mutate(
    TempC = (TemperatureF_ApacheIV - 32) * 5/9,  # convertir °F a °C
    MAP_points = case_when(
      MapApacheIIValue < 50 ~ 4,
      MapApacheIIValue >= 50 & MapApacheIIValue <= 69 ~ 2,
      MapApacheIIValue >= 70 & MapApacheIIValue <= 109 ~ 0,
      MapApacheIIValue >= 110 & MapApacheIIValue <= 129 ~ 2,
      MapApacheIIValue >= 130 ~ 4,
      TRUE ~ 0
    ),
    Temp_points = case_when(
      TempC < 29.9 ~ 4,
      TempC >= 30 & TempC <= 31.9 ~ 3,
      TempC >= 32 & TempC <= 33.9 ~ 2,
      TempC >= 34 & TempC <= 35.9 ~ 1,
      TempC >= 36 & TempC <= 38.4 ~ 0,
      TempC >= 38.5 & TempC <= 38.9 ~ 1,
      TempC >= 39 & TempC <= 40 ~ 3,
      TempC > 40 ~ 4,
      TRUE ~ 0
    ),
    PaO2_FiO2_points = case_when(
      (PaO2 / (FiO2/100)) < 100 ~ 4,
      (PaO2 / (FiO2/100)) >= 100 & (PaO2 / (FiO2/100)) < 200 ~ 3,
      (PaO2 / (FiO2/100)) >= 200 & (PaO2 / (FiO2/100)) < 300 ~ 2,
      TRUE ~ 0
    ),
    APACHE_II = MAP_points + Temp_points + PaO2_FiO2_points
  ) %>%
  # APACHE IV 
  mutate(
    MAP_points_IV = case_when(
      MAP_ApacheIV < 50 ~ 4,
      MAP_ApacheIV >= 50 & MAP_ApacheIV <= 69 ~ 2,
      MAP_ApacheIV >= 70 & MAP_ApacheIV <= 109 ~ 0,
      MAP_ApacheIV >= 110 & MAP_ApacheIV <= 129 ~ 2,
      MAP_ApacheIV >= 130 ~ 4,
      TRUE ~ 0
    ),
    Temp_points_IV = Temp_points,  
    APACHE_IV = MAP_points_IV + Temp_points_IV
  )


Usamos el codigo de mimic oficial https://github.com/MIT-LCP/mimic-iv/blob/master/concepts/score/sofa.sql pero adaptado a Maria DB 

In [None]:
sofa_sql <- "
WITH co AS (
    SELECT ih.stay_id,
           ie.hadm_id,
           ih.hr,
           DATE_SUB(ih.endtime, INTERVAL 1 HOUR) AS starttime,
           ih.endtime
    FROM mimic_derived_icustay_hourly ih
    INNER JOIN mimic_icu_icustays ie
        ON ih.stay_id = ie.stay_id
),
-- todas las CTEs pafi, vs, gcs, bili, cr, plt, pf, uo, vaso, scorecomp, scorecalc
-- exactamente como en el SQL adaptado que te pasé
score_final AS (
    SELECT stay_id,
           MAX(respiration) AS respiration_24h,
           MAX(coagulation) AS coagulation_24h,
           MAX(liver) AS liver_24h,
           MAX(cardiovascular) AS cardiovascular_24h,
           MAX(cns) AS cns_24h,
           MAX(renal) AS renal_24h,
           MAX(respiration) + MAX(coagulation) + MAX(liver) +
           MAX(cardiovascular) + MAX(cns) + MAX(renal) AS sofa_24h
    FROM scorecalc
    WHERE hr BETWEEN 0 AND 23
    GROUP BY stay_id
)
SELECT * FROM score_final;
"

In [None]:
df_sofa <- dbGetQuery(con, sofa_sql)


ERROR: Error: Table 'mimiciiiv14.mimic_derived_icustay_hourly' doesn't exist [1146]
