#### Loading libraries

In [12]:
library(dplyr)
library(tidyr)
library(tibble)
library(lubridate)
library(readr)
library(stringr)
library(ggplot2)
library(data.table)
library(odbc)
library(RMariaDB)

#### Connecting to the database

In [13]:
con <- dbConnect(
  drv = RMariaDB::MariaDB(),
  username = "gabriel.torresz",
  password = "tie9booX",
  host = "ehr3.deim.urv.cat",
  dbname = "mimiciiiv14",
  port = 3306
)

List and comprobation of the connection

In [14]:
dbListTables(con)

### Data selection
#### First filters: 
1. Age at admission [18 - 85] y.
2. More than 24 hours at ICU

Then we add some other demographic information 

### 1. Get first ICU stay per patient 

In [15]:
sql_first_icu <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    MIN(icu.INTIME) AS ICU_ADMIT
FROM ICUSTAYS icu
GROUP BY icu.SUBJECT_ID
;
"

df_first_icu <- dbGetQuery(con, sql_first_icu)

### 2. Get detailed information for first ICU stay

In [16]:
sql_icu_full <- "
SELECT 
    icu.SUBJECT_ID,
    icu.HADM_ID,
    icu.INTIME AS ICU_ADMIT,
    icu.OUTTIME AS ICU_DISCH,
    TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) AS ICU_LOS_HOURS,
    a.ADMITTIME AS HOSP_ADMIT,
    a.DISCHTIME AS HOSP_DISCH,
    TIMESTAMPDIFF(HOUR, a.ADMITTIME, a.DISCHTIME) AS HOSP_LOS_HOURS,
    p.GENDER,
    a.ETHNICITY,
    p.DOB,
    p.DOD
FROM ICUSTAYS icu
JOIN ADMISSIONS a ON icu.HADM_ID = a.HADM_ID
JOIN PATIENTS p ON icu.SUBJECT_ID = p.SUBJECT_ID
JOIN (
    SELECT SUBJECT_ID, MIN(INTIME) AS FIRST_ICU
    FROM ICUSTAYS
    GROUP BY SUBJECT_ID
) first_icu ON icu.SUBJECT_ID = first_icu.SUBJECT_ID AND icu.INTIME = first_icu.FIRST_ICU
WHERE TIMESTAMPDIFF(HOUR, icu.INTIME, icu.OUTTIME) >= 24
;
"

df_first_icu_full <- dbGetQuery(con, sql_icu_full)

### 3. Data cleaning and feature creation

In [17]:
df_clean <- df_first_icu_full %>%
  
  # Calculate age at ICU admission
  mutate(AGE = as.numeric(format(ICU_ADMIT, "%Y")) - as.numeric(format(DOB, "%Y"))) %>%
  
  # Keep only adult patients between 18 and 85 years
  filter(AGE >= 18 & AGE <= 85) %>%
  
  # Define mortality: 1 if patient died before or during hospital discharge
  mutate(MORTALITY = ifelse(!is.na(DOD) & DOD <= HOSP_DISCH, 1, 0)) %>%
  
  # Remove missing or unknown gender/ethnicity
  filter(
    !is.na(GENDER), !is.na(ETHNICITY),
    GENDER != "UNKNOWN", ETHNICITY != "UNKNOWN/NOT SPECIFIED"
  ) %>%
  
  # Group ethnicity into major categories
  mutate(
    ETHNICITY_GROUP = case_when(
      grepl("^WHITE", ETHNICITY) ~ "WHITE",
      grepl("^BLACK", ETHNICITY) ~ "BLACK",
      grepl("^HISPANIC", ETHNICITY) ~ "HISPANIC",
      grepl("^ASIAN", ETHNICITY) ~ "ASIAN",
      TRUE ~ "OTHER"
    ),
    ETHNICITY_GROUP = factor(ETHNICITY_GROUP, levels = c("WHITE", "BLACK", "HISPANIC", "ASIAN", "OTHER"))
  )


### 4. Check cleaned data

In [18]:
table(df_clean$GENDER)
table(df_clean$ETHNICITY_GROUP)
dim(df_clean)
head(df_clean)


    F     M 
10997 15507 


   WHITE    BLACK HISPANIC    ASIAN    OTHER 
   20697     2303     1008      703     1793 

Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,DOB,DOD,AGE,MORTALITY,ETHNICITY_GROUP
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,<dttm>,<dttm>,<dbl>,<dbl>,<fct>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,2025-04-11,2102-06-14,76,0,WHITE
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,2143-05-12,,48,0,WHITE
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,2109-06-21,,66,0,WHITE
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,2128-02-22,2178-11-14,50,0,WHITE
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,2032-03-24,2104-08-20,72,1,WHITE
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,2127-02-27,,40,0,WHITE


## Phase 1 of ICU Data Cleaning and Feature Engineering – Summary

**So far, we have:**

1. **Selected the first ICU admission per patient**  
   - For each patient, we keep only their **first ICU stay**.

2. **Joined the ICU, Admissions, and Patients tables**  
   - This allows us to have **demographics** (gender, date of birth, ethnicity) and **admission/discharge times** for both the ICU and the hospital.  
   - We also calculate **lengths of stay (LOS) in hours** for the ICU and hospital.

3. **Calculated age at ICU admission and filtered adult patients (18–85 years old)**  
   - Ensures the analysis focuses on **adult patients** only.

4. **Created a mortality flag**  
   - `MORTALITY = 1` if the patient **died before or during hospital discharge**, otherwise `0`.

5. **Cleaned missing or unknown gender/ethnicity values**  
   - Removes entries with **unknown or missing gender or ethnicity** to ensure data quality.

6. **Grouped ethnicity into five categories and converted to a factor**  
   - Categories: `WHITE`, `BLACK`, `HISPANIC`, `ASIAN`, `OTHER`.

7. **Checked the cleaned dataset**  
   - Verified **tables of gender and ethnicity**, **dimensions**, and a **preview of the first rows**.


# ICU Cohort – Adding Comorbidity Flags

This section adds **binary comorbidity flags** to the existing cleaned ICU cohort. 
The flags are derived from ICD-9 diagnosis descriptions.
## 1. SQL query to extract comorbidities

In [25]:
# ---------------------------------------------
# 1. Traer tablas de diagnósticos desde la base de datos
# ---------------------------------------------
dx <- dbGetQuery(con, "SELECT * FROM DIAGNOSES_ICD")
d_icd_diagnoses <- dbGetQuery(con, "SELECT * FROM D_ICD_DIAGNOSES")

# ---------------------------------------------
# 2. Filtrar solo los HADM_ID de nuestra cohorte
# ---------------------------------------------
df_diag <- dx %>%
  filter(HADM_ID %in% df_clean$HADM_ID) %>%
  left_join(d_icd_diagnoses, by = c("ICD9_CODE" = "ICD9_CODE"))

# ---------------------------------------------
# 3. Crear flags de comorbilidades agrupando por HADM_ID
# ---------------------------------------------
df_flags <- df_diag %>%
  group_by(HADM_ID) %>%
  summarise(
    flag_diabetes = max(grepl("diabetes", tolower(LONG_TITLE))),
    flag_hypertension = max(grepl("hypertension|high blood pressure", tolower(LONG_TITLE))),
    flag_ckd = max(grepl("chronic kidney|renal failure|kidney failure", tolower(LONG_TITLE))),
    flag_chf = max(grepl("heart failure|congestive heart", tolower(LONG_TITLE))),
    flag_copd = max(grepl("copd|chronic obstructive|emphysema|chronic bronchitis", tolower(LONG_TITLE))),
    flag_cancer = max(grepl("malignan|cancer|carcinoma|neoplasm|tumor", tolower(LONG_TITLE)))
  )

# ---------------------------------------------
# 4. Unir los flags con df_clean usando HADM_ID
# ---------------------------------------------
df_final <- df_clean %>%
  left_join(df_flags, by = "HADM_ID")

# ---------------------------------------------
# 5. Revisar resultado
# ---------------------------------------------
head(df_final)
table(df_final$flag_diabetes, useNA = "ifany")
table(df_final$flag_hypertension, useNA = "ifany")
table(df_final$flag_ckd, useNA = "ifany")


Unnamed: 0_level_0,SUBJECT_ID,HADM_ID,ICU_ADMIT,ICU_DISCH,ICU_LOS_HOURS,HOSP_ADMIT,HOSP_DISCH,HOSP_LOS_HOURS,GENDER,ETHNICITY,⋯,DOD,AGE,MORTALITY,ETHNICITY_GROUP,flag_diabetes,flag_hypertension,flag_ckd,flag_chf,flag_copd,flag_cancer
Unnamed: 0_level_1,<int>,<int>,<dttm>,<dttm>,<int64>,<dttm>,<dttm>,<int64>,<chr>,<chr>,⋯,<dttm>,<dbl>,<dbl>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>
1,3,145834,2101-10-20 19:10:11,2101-10-26 20:43:09,145,2101-10-20 19:08:00,2101-10-31 13:58:00,258,M,WHITE,⋯,2102-06-14,76,0,WHITE,0,0,1,1,0,0
2,4,185777,2191-03-16 00:29:31,2191-03-17 16:46:31,40,2191-03-16 00:28:00,2191-03-23 18:41:00,186,F,WHITE,⋯,,48,0,WHITE,0,0,0,0,0,0
3,6,107064,2175-05-30 21:30:54,2175-06-03 13:39:54,88,2175-05-30 07:15:00,2175-06-15 16:00:00,392,F,WHITE,⋯,,66,0,WHITE,0,0,1,0,0,0
4,11,194540,2178-04-16 06:19:32,2178-04-17 20:21:05,38,2178-04-16 06:18:00,2178-05-11 19:00:00,612,F,WHITE,⋯,2178-11-14,50,0,WHITE,0,0,0,0,0,1
5,12,112213,2104-08-08 02:08:17,2104-08-15 17:22:25,183,2104-08-07 10:15:00,2104-08-20 02:57:00,304,M,WHITE,⋯,2104-08-20,72,1,WHITE,0,1,0,0,0,1
6,13,143045,2167-01-08 18:44:25,2167-01-12 10:43:31,87,2167-01-08 18:43:00,2167-01-15 15:15:00,164,F,WHITE,⋯,,40,0,WHITE,1,1,0,0,0,0



    0     1 
19405  7099 


    0     1 
14409 12095 


    0     1 
19434  7070 