In [1]:
library(bigrquery)
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Google Cloud project ID passed to bq_project_query() for the BigQuery jobs
# run in this notebook.
projectid <- "yhcr-prd-phm-bia-core"

In [3]:
# Pull the two source tables from BigQuery into local data frames.
# Each download is a two-step process: run the query as a job under
# `projectid`, then download the table holding that job's result.

# Cardiovascular disease episodes -> cvd_ep
sql3 <- "SELECT *  FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Cardiovascular_Diseases_Episodes`"
tb3 <- bq_project_query(x = projectid, query = sql3)
cvd_ep <- bq_table_download(x = tb3)

# Diabetes -> diabetes
sql1 <- "SELECT *  FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Diabetes`"
tb1 <- bq_project_query(x = projectid, query = sql1)
diabetes <- bq_table_download(x = tb1)

In [5]:
# List the columns available in the downloaded CVD episodes table.
names(cvd_ep)

In [12]:
# Reduce the per-condition CVD columns to a single "earliest CVD" record per
# row: the earliest diagnosis date across the nine conditions, plus the age and
# length of stay (LOS, in days) recorded for the condition diagnosed on that
# date. Only the identifier, CVD flags and these derived columns are kept.
cvd <- cvd_ep %>%
  mutate(
    # Row-wise minimum over the diagnosis dates; missing dates are skipped.
    earliest_cvd_date = pmin(
      date_angina_diagnosis,
      date_cardiomyopathy_diagnosis,
      date_heart_failure_diagnosis,
      date_hypertension_diagnosis,
      date_ihd_diagnosis,
      date_mi_diagnosis,
      date_pvd_diagnosis,
      date_stroke_diagnosis,
      date_valve_disease_diagnosis,
      na.rm = TRUE
    ),
    # For both lookups below, when several conditions share the earliest date
    # the first matching branch (in the order listed) supplies the value; rows
    # with no match get NA.
    age_earliest_cvd = case_when(
      date_angina_diagnosis == earliest_cvd_date ~ age_angina_diag,
      date_cardiomyopathy_diagnosis == earliest_cvd_date ~ age_cardiomyopathy_diag,
      date_heart_failure_diagnosis == earliest_cvd_date ~ age_heart_failure_diag,
      date_hypertension_diagnosis == earliest_cvd_date ~ age_hypertension_diag,
      date_ihd_diagnosis == earliest_cvd_date ~ age_ihd_diag,
      date_mi_diagnosis == earliest_cvd_date ~ age_mi_diag,
      date_pvd_diagnosis == earliest_cvd_date ~ age_pvd_diag,
      date_stroke_diagnosis == earliest_cvd_date ~ age_stroke_diag,
      date_valve_disease_diagnosis == earliest_cvd_date ~ age_valve_disease_diag
    ),
    los_earliest_cvd = case_when(
      date_angina_diagnosis == earliest_cvd_date ~ los_angina_days,
      date_cardiomyopathy_diagnosis == earliest_cvd_date ~ los_cardiomyopathy_days,
      date_heart_failure_diagnosis == earliest_cvd_date ~ los_heart_failure_days,
      date_hypertension_diagnosis == earliest_cvd_date ~ los_hypertension_days,
      date_ihd_diagnosis == earliest_cvd_date ~ los_ihd_days,
      date_mi_diagnosis == earliest_cvd_date ~ los_mi_days,
      date_pvd_diagnosis == earliest_cvd_date ~ los_pvd_days,
      date_stroke_diagnosis == earliest_cvd_date ~ los_stroke_days,
      date_valve_disease_diagnosis == earliest_cvd_date ~ los_valve_disease_days
    )
  ) %>%
  select(
    person_id,
    has_cvd,
    cvd_number,
    earliest_cvd_date,
    los_earliest_cvd,
    age_earliest_cvd
  )

In [15]:
# Trim the diabetes table down to the identifier and the three diabetes
# columns used in the rest of the analysis.
diab <- select(
  diabetes,
  person_id,
  age_earliest_diab,
  earliest_diabetes_diag,
  any_diabetes
)

In [17]:
# Preview the first six rows of each trimmed table.
head(cvd, n = 6L)
head(diab, n = 6L)

person_id,has_cvd,cvd_number,earliest_cvd_date,los_earliest_cvd,age_earliest_cvd
<int>,<int>,<int>,<date>,<int>,<int>
13377215,1,2,2013-04-21,12,81
12693795,1,2,2016-10-21,8,73
13674850,1,2,2020-10-27,37,50
12517678,1,2,2021-07-27,0,50
12457945,1,2,2014-11-21,15,86
13054895,1,2,2014-01-15,15,77


person_id,age_earliest_diab,earliest_diabetes_diag,any_diabetes
<int>,<int>,<date>,<int>
13548546,60,2007-01-11,1
12975170,80,2020-04-28,1
13450523,36,2011-10-13,1
12431500,75,2011-08-16,1
13235332,47,2007-07-05,1
12805324,39,2008-04-08,1


In [20]:
# Combine the diabetes and CVD tables on person_id. An inner join keeps only
# people present in both tables.
# Open question: should this be an inner, left or full join?
db_cvd <- diab %>%
  inner_join(cvd, by = join_by(person_id))
names(db_cvd)

In [25]:
# Count joined rows whose earliest-CVD length of stay (LOS) is exactly 1 day.
# na.rm = TRUE so that rows with a missing LOS do not turn the count into NA.
sum(db_cvd$los_earliest_cvd == 1, na.rm = TRUE)

# Keep only stays longer than 1 day. Note that this removes 0-day stays as
# well as 1-day stays, and filter() also drops rows where LOS is NA.
# Written as `> 1` rather than `!x <= 1`: the two are equivalent, but the
# latter relies on `!` binding more loosely than `<=`.
exclude_1_day <- db_cvd %>%
  filter(los_earliest_cvd > 1)

# Distribution of the remaining lengths of stay.
summary(exclude_1_day$los_earliest_cvd)
IQR(exclude_1_day$los_earliest_cvd)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.000   2.000   4.000   6.963   7.000 223.000 