In [9]:
library(bigrquery)
library(tidyverse)

In [10]:
# BigQuery project ID, passed to bq_project_query() when the query is run
projectid <- "yhcr-prd-phm-bia-core"

In [11]:
# Build the query.
#
# CTV3 codes to look for; each one is matched as a substring of src_ctv3code.
# NOTE(review): presumably the type 1 diabetes code list (going by the t1d /
# Type1Diabetes naming used later in this script) - confirm against the
# study's code list.
ctv3_codes <- c(
  "C1010", "C1030", "C1080", "C1081", "C1082", "C1083", "C1085",
  "C1086", "C1087", "C1088", "C1089", "L1805", "X40J4", "X40JY",
  "Xa4g7", "XaELP", "XaEnn", "XaEno", "XaF04", "XaFWG", "XaFm8",
  "XaFmK", "XaFmL", "XaFmM", "XaIzM", "XaIzN", "XaJSr", "XaKyW"
)

# One "src_ctv3code LIKE '%<code>%'" term per code, OR-ed together
code_filter <- paste0(
  "src_ctv3code LIKE '%", ctv3_codes, "%'",
  collapse = " OR\n"
)

# src_dateevent carries a time of day, so an upper bound of '2022-12-31'
# (as in BETWEEN ... AND '2022-12-31') is read as midnight and silently drops
# every event later on 31 Dec 2022. A half-open range covers the whole of
# 2007-2022 regardless of the time component.
sql <- paste0(
  "SELECT * FROM `yhcr-prd-phm-bia-core.CB_FDM_PrimaryCare_v5.tbl_SRCode` ",
  "WHERE src_dateevent >= '2007-01-01' AND src_dateevent < '2023-01-01' ",
  "AND (", code_filter, ")"
)

In [12]:
# Submit the SQL under project `projectid`. The value returned is a bq_table
# object referring to the result, which can be queried further or downloaded
tb <- bq_project_query(x = projectid, query = sql)

In [13]:
# Pull the query result down into a tibble.
# NOTE(review): this step reported "NAs produced by integer overflow", so at
# least one value did not fit R's 32-bit integer type and came back as NA.
# If an affected column is needed, bq_table_download()'s `bigint` argument
# (e.g. bigint = "integer64") may be required - confirm which column it was.
t1d <- tb %>% bq_table_download()

“NAs produced by integer overflow”


In [14]:
# Retain only the 1st, 4th, 7th and 8th columns of the downloaded data
t1d <- select(t1d, c(1, 4, 7, 8))
# Quick checks: row count, total number of NA cells, number of duplicated rows
nrow(t1d)
sum(is.na(t1d))
sum(duplicated(t1d))

In [15]:
# Drop rows that exactly repeat an earlier row, then show how many remain
t1d <- unique(t1d)
nrow(t1d)

In [16]:
# Drop the 3rd and 4th columns (ctv3code and ctv3name), then flag every
# remaining row with the indicator t1d_status = 1
t1d <- t1d %>%
  select(-c(3, 4)) %>%
  mutate(t1d_status = 1)
head(t1d)

person_id,src_dateevent,t1d_status
<int>,<dttm>,<dbl>
7777930,2013-03-20 11:10:06,1
11093217,2022-02-21 10:46:35,1
12562319,2007-07-10 00:00:00,1
12669405,2008-12-03 15:53:16,1
13384762,2009-10-20 00:00:01,1
12391036,2012-03-06 00:00:01,1


In [17]:
# Convert the event date-time to a Date and the status flag to an integer
t1d <- t1d %>%
  mutate(
    src_dateevent = as.Date(src_dateevent),
    t1d_status = as.integer(t1d_status)
  )

In [18]:
# Keep one row per person: the one with the earliest src_dateevent.
# with_ties = FALSE guarantees exactly one row per person_id even when several
# rows share that earliest date. ungroup() drops the person_id grouping so it
# does not leak into later steps (the previous version left the tibble grouped).
t1d <- t1d %>%
  group_by(person_id) %>%
  slice_min(src_dateevent, n = 1, with_ties = FALSE) %>%
  ungroup()

head(t1d)

person_id,src_dateevent,t1d_status
<int>,<date>,<int>
1153,2007-02-19,1
5867,2015-04-10,1
8985,2007-05-15,1
11297,2007-06-12,1
13003,2022-07-26,1
19698,2020-11-04,1


In [19]:
# Row count after reducing the data to one row per person_id
nrow(t1d)

In [20]:
# Destination for the upload: table Type1Diabetes in dataset CB_MYSPACE_AH.
# Run this whether or not the table already exists
dest_dataset <- bq_dataset(
  project = "yhcr-prd-phm-bia-core",
  dataset = "CB_MYSPACE_AH"
)
dest_table <- dest_dataset %>% bq_table("Type1Diabetes")

In [21]:
# Explicit schema handed to bq_table_upload(): person_id is declared REQUIRED,
# the other two fields are left at bq_field()'s default mode
dest_fields <- bq_fields(
  list(
    bq_field("person_id", "INT64", "REQUIRED"),
    bq_field("src_dateevent", "DATE"),
    bq_field("t1d_status", "INT64")
  )
)

# Remove any existing copy of the destination table first
if (bq_table_exists(dest_table)) {
  bq_table_delete(dest_table)
}
# Upload t1d to dest_table using the schema defined above
bq_table_upload(
  dest_table,
  t1d,
  fields = dest_fields,
  create_disposition = "CREATE_IF_NEEDED",
  write_disposition = "WRITE_TRUNCATE"
)