In [1]:
library(bigrquery)
library(tidyverse)
library(lubridate)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Store the project ID
projectid = "yhcr-prd-phm-bia-core"

In [3]:
# Set your query
sql1 <- "SELECT * FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Type1Diabetes`"
sql2 <- "SELECT * FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Type2Diabetes`"
sql3 <- "SELECT * FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Diabetic_Meds`"

# Run the query; this returns a bq_table object that you can query further
tb1 <- bq_project_query(projectid, sql1)
tb2 <- bq_project_query(projectid, sql2)
tb3 <- bq_project_query(projectid, sql3)

# Store data in a tibble
t1d <-bq_table_download(tb1)
t2d <-bq_table_download(tb2)
meds <-bq_table_download(tb3)

In [4]:
# join t1d and t2d tables
diabetes <- full_join(t1d, t2d, by = "person_id")
head(diabetes)
rm(t1d, t2d)

person_id,src_dateevent.x,t1d_status,src_dateevent.y,t2d_status
<int>,<date>,<int>,<date>,<int>
12822328,2007-11-07,1,,
711752,2009-12-14,1,,
13561032,2009-12-14,1,,
12763062,2014-02-27,1,,
12883541,2014-02-27,1,2011-10-28,1.0
13481752,2015-07-24,1,,


In [5]:
diabetes <- diabetes %>%
    mutate(t1d_status = ifelse(is.na(t1d_status),0,1)) %>%
    mutate(t2d_status = ifelse(is.na(t2d_status),0,1)) %>%
    rename(date_t1d_diag = src_dateevent.x) %>%
    rename(date_t2d_diag = src_dateevent.y) %>%
    mutate(any_diabetes = ifelse(t1d_status == 1 | t2d_status == 1,1,0))

head(diabetes)

person_id,date_t1d_diag,t1d_status,date_t2d_diag,t2d_status,any_diabetes
<int>,<date>,<dbl>,<date>,<dbl>,<dbl>
12822328,2007-11-07,1,,0,1
711752,2009-12-14,1,,0,1
13561032,2009-12-14,1,,0,1
12763062,2014-02-27,1,,0,1
12883541,2014-02-27,1,2011-10-28,1,1
13481752,2015-07-24,1,,0,1


In [7]:
nrow(diabetes)
print("type 1 and type 2 diabetes")
table(diabetes$t1d_status, diabetes$t2d_status)
print("type 1 diabetes")
table(diabetes$t1d_status)
print("type 2 diabetes")
table(diabetes$t2d_status)
sum(duplicated(diabetes))

[1] "type 1 and type 2 diabetes"


   
        0     1
  0     0 55923
  1  2371   679

[1] "type 1 diabetes"



    0     1 
55923  3050 

[1] "type 2 diabetes"



    0     1 
 2371 56602 

In [8]:
# Join patient table to diabetes patients
sql3 <- "SELECT * FROM `yhcr-prd-phm-bia-core.CB_MYSPACE_AH.Patient_Denom`"
tb3 <- bq_project_query(projectid, sql3)
patient <- bq_table_download(tb3)
head(patient)

person_id,birth_datetime,death_datetime,gender,ethnicity,imd_decile
<int>,<date>,<date>,<int>,<int>,<int>
16813063,,,2,2,1
16832562,,,1,3,1
16769967,,,1,1,1
16832450,,,1,2,1
16814272,,,1,2,1
13715865,,,1,2,1


In [9]:
sum(is.na(patient$birth_datetime))
nrow(patient)

In [10]:
diabetes_patient <- inner_join(patient, diabetes, by = "person_id")
head(diabetes_patient)

person_id,birth_datetime,death_datetime,gender,ethnicity,imd_decile,date_t1d_diag,t1d_status,date_t2d_diag,t2d_status,any_diabetes
<int>,<date>,<date>,<int>,<int>,<int>,<date>,<dbl>,<date>,<dbl>,<dbl>
12736614,1931-06-15,2012-11-15,1,1,1,,0,2007-01-10,1,1
12663073,1931-06-15,2014-12-15,2,2,1,,0,2010-04-01,1,1
12679588,1931-06-15,,2,1,1,,0,2016-03-30,1,1
13203209,1931-06-15,2020-01-15,1,2,1,,0,2007-09-13,1,1
12760436,1931-06-15,2020-07-15,1,2,1,,0,2009-12-29,1,1
12602566,1931-06-15,2014-01-15,2,1,1,,0,2007-09-06,1,1


In [11]:
# calculate age at diabetes
diabetes_patient$age_t1d_diag <- interval(diabetes_patient$birth_datetime, diabetes_patient$date_t1d_diag) / years(1)
diabetes_patient$age_t1d_diag <- floor(diabetes_patient$age_t1d_diag)
diabetes_patient$age_t2d_diag <- interval(diabetes_patient$birth_datetime, diabetes_patient$date_t2d_diag) / years(1)
diabetes_patient$age_t2d_diag <- floor(diabetes_patient$age_t2d_diag)
head(diabetes_patient)

person_id,birth_datetime,death_datetime,gender,ethnicity,imd_decile,date_t1d_diag,t1d_status,date_t2d_diag,t2d_status,any_diabetes,age_t1d_diag,age_t2d_diag
<int>,<date>,<date>,<int>,<int>,<int>,<date>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>
12736614,1931-06-15,2012-11-15,1,1,1,,0,2007-01-10,1,1,,75
12663073,1931-06-15,2014-12-15,2,2,1,,0,2010-04-01,1,1,,78
12679588,1931-06-15,,2,1,1,,0,2016-03-30,1,1,,84
13203209,1931-06-15,2020-01-15,1,2,1,,0,2007-09-13,1,1,,76
12760436,1931-06-15,2020-07-15,1,2,1,,0,2009-12-29,1,1,,78
12602566,1931-06-15,2014-01-15,2,1,1,,0,2007-09-06,1,1,,76


In [12]:
# age at earliest diabetes 
diabetes_patient <- diabetes_patient %>%
    mutate(age_earliest_diabetes = case_when(
    (age_t1d_diag <= age_t2d_diag) | is.na(age_t2d_diag) ~ age_t1d_diag,
    (age_t1d_diag > age_t2d_diag) | is.na(age_t1d_diag) ~ age_t2d_diag)) %>%
select(-gender, -ethnicity, -imd_decile)
sum(is.na(diabetes_patient$birth_datetime))
nrow(diabetes_patient)

In [13]:
table(diabetes_patient$t2d_status)


    0     1 
 1661 42589 

In [14]:
(15228/43568)*100

In [15]:
# add date at earliest diabetes
diabetes_patient <- diabetes_patient %>%
    mutate(date_earliest_diabetes = case_when(
    (date_t1d_diag <= date_t2d_diag) | is.na(date_t2d_diag) ~ date_t1d_diag,
    (date_t1d_diag > date_t2d_diag) | is.na(date_t1d_diag) ~ date_t2d_diag,
    is.na(date_t1d_diag) & is.na(date_t2d_diag) ~ NA))
head(diabetes_patient)

person_id,birth_datetime,death_datetime,date_t1d_diag,t1d_status,date_t2d_diag,t2d_status,any_diabetes,age_t1d_diag,age_t2d_diag,age_earliest_diabetes,date_earliest_diabetes
<int>,<date>,<date>,<date>,<dbl>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<date>
12736614,1931-06-15,2012-11-15,,0,2007-01-10,1,1,,75,75,2007-01-10
12663073,1931-06-15,2014-12-15,,0,2010-04-01,1,1,,78,78,2010-04-01
12679588,1931-06-15,,,0,2016-03-30,1,1,,84,84,2016-03-30
13203209,1931-06-15,2020-01-15,,0,2007-09-13,1,1,,76,76,2007-09-13
12760436,1931-06-15,2020-07-15,,0,2009-12-29,1,1,,78,78,2009-12-29
12602566,1931-06-15,2014-01-15,,0,2007-09-06,1,1,,76,76,2007-09-06


In [17]:
nrow(diabetes_patient)
table(diabetes_patient$t1d_status)
table(diabetes_patient$t2d_status)
table(diabetes_patient$t1d_status, diabetes_patient$t2d_status)


    0     1 
42084  2166 


    0     1 
 1661 42589 

   
        0     1
  0     0 42084
  1  1661   505

In [18]:
# Join medication data to diabetes_patient
dm <- full_join(diabetes_patient, meds, by = "person_id")

In [20]:
nrow(dm)

In [21]:
# Remove females who are less than 50 who are on metformin but don't have a diabetes diagnostic code
dm <- dm %>%
    mutate(diab_date_present = ifelse(!is.na(date_earliest_diabetes),1,0)) %>% # 1 = diabetes date is present i.e. patient has diabetes diagnostic code
    mutate(metformin_pls_5yrs = (date_metformin %m+% years(5))) %>% # calculate 5 years from metformin prescription
    mutate(diab_met_diff = abs(floor(difftime(date_earliest_diabetes, date_metformin, units = c("days"))))) %>% # absolute number of days between diabetes diagnosis date and metformin date - if >1826 days, drop
    mutate(diab_met_diff = as.integer(diab_met_diff))

In [22]:
dm %>%
select(date_earliest_diabetes, diab_date_present, metformin_pls_5yrs, diab_met_diff, met_fe_lsthn_50, met_plus, gender) %>%
filter(!is.na(diab_met_diff)) %>%
arrange(desc(diab_met_diff)) 

date_earliest_diabetes,diab_date_present,metformin_pls_5yrs,diab_met_diff,met_fe_lsthn_50,met_plus,gender
<date>,<dbl>,<date>,<int>,<int>,<int>,<int>
2007-02-08,1,2021-06-08,3408,0,2,2
2007-05-10,1,2021-08-22,3392,0,2,1
2018-07-06,1,2014-12-01,3139,0,2,1
2021-11-25,1,2019-09-24,2619,0,2,2
2017-10-18,1,2015-09-22,2583,0,1,2
2016-03-08,1,2014-04-29,2505,0,2,1
2007-01-22,1,2018-02-19,2220,0,2,2
2011-09-22,1,2021-11-08,1874,0,2,1
2008-07-01,1,2018-04-23,1757,0,2,1
2011-09-22,1,2012-02-01,1694,0,2,1


In [18]:
colnames(dm)

In [23]:
# diabetes diagnosis from medications, choose earliest diabetes diagnosis date between earliest date and earliest med, calculate age at earliest diabetes diag
dm <- dm %>%
    mutate(diabetes_meds = ifelse(is.na(date_earliest_diabetes) & (has_metformin == 1 | has_thiazolid == 1 | has_gliptins == 1 | has_sulfon == 1 | has_glpra == 1 |
                                                                  has_glucinhib == 1 | has_combo == 1 | has_insulin == 1),1,0)) %>%
    mutate(earliest_diabetes_diag = case_when(
    (date_earliest_diabetes <= date_earliest_med) | is.na(date_earliest_med) ~ date_earliest_diabetes,
    (date_earliest_diabetes > date_earliest_med) | is.na(date_earliest_diabetes) ~ date_earliest_med)) %>%
    mutate(age_earliest_diab = floor(interval(birth_datetime, earliest_diabetes_diag) / years(1))) %>%
    # time between diabetes and death
    mutate(diab_death_time = abs(difftime(earliest_diabetes_diag, death_datetime, units = "days")),
          diab_death_time = as.integer(diab_death_time))

In [24]:
dm <- dm %>% select(person_id, date_t1d_diag, age_t1d_diag, t1d_status, date_t2d_diag, age_t2d_diag, t2d_status, any_diabetes, diabetes_meds, age_earliest_diab, earliest_diabetes_diag, diab_death_time)

In [25]:
nrow(dm)
table(dm$t1d_status)
table(dm$t2d_status)
table(dm$t1d_status, dm$t2d_status)
table(dm$any_diabetes)


    0     1 
42084  2166 


    0     1 
 1661 42589 

   
        0     1
  0     0 42084
  1  1661   505


    1 
44250 

In [21]:
colnames(dm)

In [26]:
# set destination table - do this regardless of whether it already exists
dest_dataset <- bq_dataset("yhcr-prd-phm-bia-core","CB_MYSPACE_AH")
dest_table <- bq_table(dest_dataset, "Diabetes")

# create a full table spec as follows
# then pass this to bq_table_upload
dest_fields <- bq_fields(list(bq_field("person_id", "INT64", "REQUIRED"),
                              bq_field("age_t1d_diag", "INT64"),
                              bq_field("date_t1d_diag", "DATE"),
                              bq_field("t1d_status", "INT64"),
                              bq_field("age_t2d_diag", "INT64"),
                              bq_field("date_t2d_diag", "DATE"),
                              bq_field("t2d_status", "INT64"),
                              bq_field("age_earliest_diab", "INT64"),
                              bq_field("earliest_diabetes_diag", "DATE"),
                              bq_field("any_diabetes", "INT64"),
                              bq_field("diab_death_time", "INT64"),
                              bq_field("diabetes_meds", "INT64")
))

# delete an existing table if necessary
if(bq_table_exists(dest_table)) bq_table_delete(dest_table)
# upload file_data to dest_table
bq_table_upload(dest_table, dm, fields = dest_fields, create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_TRUNCATE')

In [23]:
sum(is.na(diabetes_patient$gender))

“Unknown or uninitialised column: `gender`.”


In [21]:
nrow(diabetes_patient)

In [None]:
43568-15244