In [1]:
library(bigrquery)
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Google Cloud project that the BigQuery query jobs below are run against.
projectid <- "yhcr-prd-phm-bia-core"

In [3]:
# Load the three source CVD tables (primary care, A&E, episodes).
# Each query selects every column of its table; the query is run under
# `projectid` and the result is then downloaded into the R session.
sql1 <- "SELECT *  FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Cardiovascular_Diseases_Primary_Care`"
sql2 <- "SELECT *  FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Cardiovascular_Diseases_AE`"
sql3 <- "SELECT *  FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Cardiovascular_Diseases_Episodes`"

# CVD Primary Care
tb1 <- bq_project_query(projectid, query = sql1)
cvd_pc <- bq_table_download(tb1)

# CVD A&E
tb2 <- bq_project_query(projectid, query = sql2)
cvd_ae <- bq_table_download(tb2)

# CVD Episodes
tb3 <- bq_project_query(projectid, query = sql3)
cvd_ep <- bq_table_download(tb3)


In [4]:
# For every source table and every condition, keep only three columns:
# person_id, the condition's diagnosis date, and its has_* flag.
# Naming: <source>_<condition>, where source is pc (primary care),
# ae (A&E) or ep (episodes).

# Primary Care
pc_angina         <- select(cvd_pc, person_id, date_angina_diagnosis, has_angina)
pc_cardiomyopathy <- select(cvd_pc, person_id, date_cardiomyopathy_diagnosis, has_cardiomyopathy)
pc_hf             <- select(cvd_pc, person_id, date_heart_failure_diagnosis, has_heart_failure)
pc_ht             <- select(cvd_pc, person_id, date_hypertension_diagnosis, has_hypertension)
pc_ihd            <- select(cvd_pc, person_id, date_ihd_diagnosis, has_ihd)
pc_mi             <- select(cvd_pc, person_id, date_mi_diagnosis, has_mi)
pc_pvd            <- select(cvd_pc, person_id, date_pvd_diagnosis, has_pvd)
pc_stroke         <- select(cvd_pc, person_id, date_stroke_diagnosis, has_stroke)
pc_vd             <- select(cvd_pc, person_id, date_valve_disease_diagnosis, has_valve_disease)

# A&E
ae_angina         <- select(cvd_ae, person_id, date_angina_diagnosis, has_angina)
ae_cardiomyopathy <- select(cvd_ae, person_id, date_cardiomyopathy_diagnosis, has_cardiomyopathy)
ae_hf             <- select(cvd_ae, person_id, date_heart_failure_diagnosis, has_heart_failure)
ae_ht             <- select(cvd_ae, person_id, date_hypertension_diagnosis, has_hypertension)
ae_ihd            <- select(cvd_ae, person_id, date_ihd_diagnosis, has_ihd)
ae_mi             <- select(cvd_ae, person_id, date_mi_diagnosis, has_mi)
ae_pvd            <- select(cvd_ae, person_id, date_pvd_diagnosis, has_pvd)
ae_stroke         <- select(cvd_ae, person_id, date_stroke_diagnosis, has_stroke)
ae_vd             <- select(cvd_ae, person_id, date_valve_disease_diagnosis, has_valve_disease)

# Episodes
ep_angina         <- select(cvd_ep, person_id, date_angina_diagnosis, has_angina)
ep_cardiomyopathy <- select(cvd_ep, person_id, date_cardiomyopathy_diagnosis, has_cardiomyopathy)
ep_hf             <- select(cvd_ep, person_id, date_heart_failure_diagnosis, has_heart_failure)
ep_ht             <- select(cvd_ep, person_id, date_hypertension_diagnosis, has_hypertension)
ep_ihd            <- select(cvd_ep, person_id, date_ihd_diagnosis, has_ihd)
ep_mi             <- select(cvd_ep, person_id, date_mi_diagnosis, has_mi)
ep_pvd            <- select(cvd_ep, person_id, date_pvd_diagnosis, has_pvd)
ep_stroke         <- select(cvd_ep, person_id, date_stroke_diagnosis, has_stroke)
ep_vd             <- select(cvd_ep, person_id, date_valve_disease_diagnosis, has_valve_disease)

In [5]:
# Stack the primary-care, A&E and episodes rows for each condition into a
# single table per condition (row-wise append, in that source order).
angina         <- do.call(rbind, list(pc_angina, ae_angina, ep_angina))
cardiomyopathy <- do.call(rbind, list(pc_cardiomyopathy, ae_cardiomyopathy, ep_cardiomyopathy))
heart_failure  <- do.call(rbind, list(pc_hf, ae_hf, ep_hf))
hypertension   <- do.call(rbind, list(pc_ht, ae_ht, ep_ht))
ihd            <- do.call(rbind, list(pc_ihd, ae_ihd, ep_ihd))
mi             <- do.call(rbind, list(pc_mi, ae_mi, ep_mi))
pvd            <- do.call(rbind, list(pc_pvd, ae_pvd, ep_pvd))
stroke         <- do.call(rbind, list(pc_stroke, ae_stroke, ep_stroke))
valve_disease  <- do.call(rbind, list(pc_vd, ae_vd, ep_vd))

In [6]:
# Keep first occurrence for each cvd
#
# Every stacked condition table is reduced to one row per person: the row
# with the earliest diagnosis date among rows whose has_* flag is non-zero.

# Reduce one condition table to each person's earliest flagged row.
#
# Arguments:
#   records  - data frame with a person_id column plus the two columns
#              named by flag_col and date_col.
#   flag_col - name (string) of the has_* indicator column.
#   date_col - name (string) of the diagnosis-date column.
#
# Returns an ungrouped data frame with at most one row per person_id.
# Side effect: prints the row count after flag filtering and after the
# per-person reduction, so the two can be compared.
keep_first_occurrence <- function(records, flag_col, date_col) {
    # Drop rows where the condition is not flagged. Rows with an NA flag
    # are dropped as well, because filter() only keeps TRUE.
    flagged <- records %>%
        filter(.data[[flag_col]] != 0)

    earliest <- flagged %>%
        group_by(person_id) %>%
        # with_ties = FALSE guarantees a single row per person even when
        # several sources share the same earliest date.
        slice_min(.data[[date_col]], with_ties = FALSE) %>%
        # Remove the grouping so it does not leak into later joins/mutates.
        ungroup()

    cat(sprintf("%s: %d flagged rows -> %d rows after keeping the earliest per person\n",
                flag_col, nrow(flagged), nrow(earliest)))
    earliest
}

angina         <- keep_first_occurrence(angina, "has_angina", "date_angina_diagnosis")
cardiomyopathy <- keep_first_occurrence(cardiomyopathy, "has_cardiomyopathy", "date_cardiomyopathy_diagnosis")
heart_failure  <- keep_first_occurrence(heart_failure, "has_heart_failure", "date_heart_failure_diagnosis")
hypertension   <- keep_first_occurrence(hypertension, "has_hypertension", "date_hypertension_diagnosis")
ihd            <- keep_first_occurrence(ihd, "has_ihd", "date_ihd_diagnosis")
mi             <- keep_first_occurrence(mi, "has_mi", "date_mi_diagnosis")
pvd            <- keep_first_occurrence(pvd, "has_pvd", "date_pvd_diagnosis")
stroke         <- keep_first_occurrence(stroke, "has_stroke", "date_stroke_diagnosis")
valve_disease  <- keep_first_occurrence(valve_disease, "has_valve_disease", "date_valve_disease_diagnosis")

In [7]:
# Spot-check the first six rows of two of the reduced condition tables.
head(angina, n = 6)
head(cardiomyopathy, n = 6)

person_id,date_angina_diagnosis,has_angina
<int>,<date>,<int>
1393,2002-01-22,1
1836,2004-11-03,1
1947,2006-05-05,1
2718,2012-05-29,1
5071,2004-01-05,1
5571,2004-06-22,1


person_id,date_cardiomyopathy_diagnosis,has_cardiomyopathy
<int>,<date>,<int>
5218,2009-12-01,1
6847,2021-06-02,1
17269,2013-10-21,1
18995,2005-05-11,1
25173,2011-12-09,1
33683,2007-10-12,1


In [8]:
# Combine the per-condition tables into one wide table keyed on person_id.
# Full joins are folded left to right, so anyone present in at least one
# condition table appears in the result.
cvd <- reduce(
    list(angina, cardiomyopathy, heart_failure, hypertension, ihd,
         mi, pvd, stroke, valve_disease),
    full_join,
    by = "person_id"
)

In [9]:
# After the full joins, a person missing from a condition table has NA in
# that condition's has_* flag; recode those NAs to 0. Date columns are
# left as they are.
has_flag_cols <- c('has_angina', 'has_cardiomyopathy', 'has_heart_failure',
                   'has_hypertension', 'has_ihd', 'has_mi', 'has_pvd',
                   'has_stroke', 'has_valve_disease')
cvd <- cvd %>%
    mutate(across(all_of(has_flag_cols), ~replace_na(.x, 0)))
head(cvd)

person_id,date_angina_diagnosis,has_angina,date_cardiomyopathy_diagnosis,has_cardiomyopathy,date_heart_failure_diagnosis,has_heart_failure,date_hypertension_diagnosis,has_hypertension,date_ihd_diagnosis,has_ihd,date_mi_diagnosis,has_mi,date_pvd_diagnosis,has_pvd,date_stroke_diagnosis,has_stroke,date_valve_disease_diagnosis,has_valve_disease
<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>,<date>,<int>
1393,2002-01-22,1,,0,,0,,0,,0,,0,,0,,0,,0
1836,2004-11-03,1,,0,,0,2002-06-14,1,,0,,0,,0,,0,,0
1947,2006-05-05,1,,0,,0,,0,,0,,0,,0,,0,,0
2718,2012-05-29,1,,0,,0,,0,,0,,0,,0,,0,,0
5071,2004-01-05,1,,0,,0,2002-10-02,1,,0,,0,,0,,0,,0
5571,2004-06-22,1,,0,,0,,0,,0,,0,,0,,0,,0


In [10]:
# Number of rows in the combined table.
dim(cvd)[1]

In [11]:
# Count rows that are exact duplicates of an earlier row.
length(which(duplicated(cvd)))

In [12]:
# Save the combined table back to BigQuery.

# Destination handles; these are built whether or not the table exists yet.
dest_dataset <- bq_dataset("yhcr-prd-phm-bia-core", "CB_MYSPACE_AH")
dest_table <- bq_table(dest_dataset, "All_CVD_First_Occurrence")

# Explicit schema passed to the upload: person_id is the only REQUIRED
# field, followed by a DATE / INT64 pair for each condition.
dest_fields <- bq_fields(list(
    bq_field("person_id", "INT64", "REQUIRED"),
    bq_field("date_angina_diagnosis", "DATE"),
    bq_field("has_angina", "INT64"),
    bq_field("date_cardiomyopathy_diagnosis", "DATE"),
    bq_field("has_cardiomyopathy", "INT64"),
    bq_field("date_heart_failure_diagnosis", "DATE"),
    bq_field("has_heart_failure", "INT64"),
    bq_field("date_hypertension_diagnosis", "DATE"),
    bq_field("has_hypertension", "INT64"),
    bq_field("date_ihd_diagnosis", "DATE"),
    bq_field("has_ihd", "INT64"),
    bq_field("date_mi_diagnosis", "DATE"),
    bq_field("has_mi", "INT64"),
    bq_field("date_pvd_diagnosis", "DATE"),
    bq_field("has_pvd", "INT64"),
    bq_field("date_stroke_diagnosis", "DATE"),
    bq_field("has_stroke", "INT64"),
    bq_field("date_valve_disease_diagnosis", "DATE"),
    bq_field("has_valve_disease", "INT64")
))

# Remove any previous copy of the destination table.
# NOTE(review): the upload below already requests WRITE_TRUNCATE, so this
# explicit delete looks redundant, and it leaves no table behind if the
# upload then fails - confirm whether it is still needed.
if (bq_table_exists(dest_table)) {
    bq_table_delete(dest_table)
}

# Upload cvd to dest_table using the schema above.
bq_table_upload(
    dest_table,
    cvd,
    fields = dest_fields,
    create_disposition = "CREATE_IF_NEEDED",
    write_disposition = "WRITE_TRUNCATE"
)

Auto-refreshing stale OAuth token.

