# Getting the final file to use in our analysis

In this section we access the clinical data needed for our analysis and join it with the imputed outcome variables and the genetic principal components (PCs).

# Setup

## Loading libraries

In [1]:
    library(tidyverse)  # Data wrangling packages.
library(reticulate) # R Interface to Python

# Expect these to be installed by default
library(Ronaldo)    # Leonardo R package.
library(bigrquery)  # BigQuery R client.
library(ggplot2)
library(repr)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.4     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



## Set up utilities

In [2]:
# Echo a shell command to the notebook output, then run it.
# Returns the command's captured stdout as a character vector.
shell_do <- function(command) {
    banner <- paste('Executing: ', command)
    print(banner)

    captured <- system(command, intern = TRUE)
    captured
}

# Utility routines for reading files from Google Cloud Storage

# Create a pipe connection that streams a GCS object through `gsutil cat`.
#
# Args:
#   path: gs:// URI of the object to read.
# Returns:
#   The connection created by pipe(); nothing is read until a consumer
#   (for example a readr reader) pulls from it.
# Uses the global BILLING_PROJECT_ID as the value of gsutil's -u option.
gcs_read_file <- function(path) {
    # Quote the path so object names containing spaces or shell
    # metacharacters reach gsutil as a single argument.
    pipe(str_glue('gsutil -u {BILLING_PROJECT_ID} cat {shQuote(path)}'))
}
# Read a delimited text file stored in GCS into a tibble.
#
# Args:
#   path: gs:// URI of the file.
#   sep:  single-character field delimiter; defaults to a comma.
# Returns:
#   The tibble produced by readr.
gcs_read_csv <- function(path, sep=',') {
    source_con <- gcs_read_file(path)
    if (identical(sep, ',')) {
        # Default case: keep the exact read_csv() behaviour callers rely on
        readr::read_csv(source_con)
    } else {
        # Previously `sep` was accepted but ignored; honour it here
        readr::read_delim(source_con, delim = sep)
    }
}

# Utility routine for displaying a message and a link.
# Renders `description` followed by a hyperlink (target=_blank) whose text is
# `link_text` and whose destination is `url`.
display_html_link <- function(description, link_text, url) {
    markup <- str_glue('
    <p>
    </p>
    <p>
    {description}
    <a target=_blank href="{url}">{link_text}</a>.
    </p>
    ')
    IRdisplay::display_html(markup)
}

# Display a message with a link to a GCS location in the Cloud Console
# storage browser. The link carries BILLING_PROJECT_ID as `userProject`.
link_to_cloud_console_gcs <- function(description, link_text, gcs_path) {
    # Strip the gs:// scheme; what remains is appended to the browser URL
    bucket_and_object <- str_replace(gcs_path, "gs://","")
    url_path <- file.path('https://console.cloud.google.com/storage/browser',
                          bucket_and_object)
    url_query <- str_glue('userProject={URLencode(BILLING_PROJECT_ID)}')

    console_url <- str_glue('{url_path}?{url_query}')
    display_html_link(description, link_text, console_url)
}


# Setup notebook globals

In [3]:
# Project and workspace identifiers are read from environment variables
BILLING_PROJECT_ID <- Sys.getenv('GOOGLE_PROJECT')
WORKSPACE_NAMESPACE <- Sys.getenv('WORKSPACE_NAMESPACE')
WORKSPACE_NAME <- Sys.getenv('WORKSPACE_NAME')

# Fetch the workspace attributes through the firecloud.api Python module
fapi <- import("firecloud.api")
WORKSPACE_ATTRIBUTES <- fapi$get_workspace(WORKSPACE_NAMESPACE, WORKSPACE_NAME)$json()$workspace$attributes

# Setting the path to my workspace
PATH_MYWORKSPACE <- 'gs://fc-cd759889-2702-4f72-a832-0be756073417'
print(PATH_MYWORKSPACE)

# List the bucket root, then its notebooks/ and files/ folders, in that order
invisible(lapply(
    c('', '/notebooks', '/files'),
    function(subdir) {
        print(shell_do(str_glue('gsutil -u {BILLING_PROJECT_ID} ls {PATH_MYWORKSPACE}{subdir}')))
    }
))


## Getting the path to AMP-PD clinical data

# Release root, and the clinical sub-folder derived from it
GS_TIER1_RELEASE_PATH <- 'gs://amp-pd-data/releases/2021_v2-5release_0510'
GS_CLINICAL_RELEASE_PATH <- str_glue('{GS_TIER1_RELEASE_PATH}/clinical')

[1] "gs://fc-cd759889-2702-4f72-a832-0be756073417"
[1] "Executing:  gsutil -u terra-ed19e231 ls gs://fc-cd759889-2702-4f72-a832-0be756073417"
[1] "gs://fc-cd759889-2702-4f72-a832-0be756073417/files/"    
[2] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/"
[1] "Executing:  gsutil -u terra-ed19e231 ls gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks"
 [1] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/PD_MDSUPDRSIII.csv"                             
 [2] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/Py - 1. Exploration_Filtering.ipynb"            
 [3] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/Py - 2. data_QC.ipynb"                          
 [4] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/Py - data_QC.ipynb"                             
 [5] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/R - 0. Start here.ipynb"                        
 [6] "gs://fc-cd759889-2702-4f72-a832-0be756073417/notebooks/R - 1. MDS_UPDRSIII_im

In [4]:
# List the top level of the AMP-PD release
shell_do(
    str_glue('gsutil -u {BILLING_PROJECT_ID} ls {GS_TIER1_RELEASE_PATH}')
)

[1] "Executing:  gsutil -u terra-ed19e231 ls gs://amp-pd-data/releases/2021_v2-5release_0510"


In [5]:
# List the clinical folder of the release (GS_CLINICAL_RELEASE_PATH is
# GS_TIER1_RELEASE_PATH with '/clinical' appended)
shell_do(
    str_glue('gsutil -u {BILLING_PROJECT_ID} ls {GS_CLINICAL_RELEASE_PATH}')
)

[1] "Executing:  gsutil -u terra-ed19e231 ls gs://amp-pd-data/releases/2021_v2-5release_0510/clinical"


# Loading data

In [14]:
# Clinical tables, streamed from the release's clinical folder in GCS
demographics_df <- gcs_read_csv(file.path(GS_CLINICAL_RELEASE_PATH, 'Demographics.csv'))
# MDS-UPDRS Part III table; the Hoehn & Yahr stage column is taken from it later
UPDRSIII_HY <- gcs_read_csv(file.path(GS_CLINICAL_RELEASE_PATH, 'MDS_UPDRS_Part_III.csv'))
UPDRSII <-  gcs_read_csv(file.path(GS_CLINICAL_RELEASE_PATH, 'MDS_UPDRS_Part_II.csv'))

print('Basic information about the demographics data:')
print('')
summary(demographics_df)

# Local inputs from the working directory: the PCA eigenvectors
# (tab-separated) and the imputed outcome tables (all visits, and the 36m file)
PDBP.PC <- read_tsv("/home/jupyter/working_dir/PDBP_QCed.PCA.eigenvec")
PDBP.outcome_all <- read.csv("/home/jupyter/working_dir/PDBPoutput_imputed_allVisits.csv")
PDBP.outcome <- read.csv("/home/jupyter/working_dir/PDBPoutput_imputed_36m.csv")

# Preview the PCs and the 36m outcome table
head(PDBP.PC)
head(PDBP.outcome)

[1m[1mRows: [1m[22m[34m[34m11226[34m[39m [1m[1mColumns: [1m[22m[34m[34m9[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (7): participant_id, GUID, visit_name, sex, ethnicity, race, education_l...
[32mdbl[39m (2): visit_month, age_at_baseline


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1m[1mRows: [1m[22m[34m[34m23353[34m[39m [1m[1mColumns: [1m[22m[34m[34m77[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (41): participant_id, GUID, visit_name, upd2301_speech_problems, upd2302...
[32mdbl[39m (36): visit_month, c

[1] "Basic information about the demographics data:"
[1] ""


 participant_id         GUID            visit_name         visit_month    
 Length:11226       Length:11226       Length:11226       Min.   :-1.000  
 Class :character   Class :character   Class :character   1st Qu.: 0.000  
 Mode  :character   Mode  :character   Mode  :character   Median : 0.000  
                                                          Mean   : 1.252  
                                                          3rd Qu.: 0.000  
                                                          Max.   :72.000  
 age_at_baseline     sex             ethnicity             race          
 Min.   :16.00   Length:11226       Length:11226       Length:11226      
 1st Qu.:60.00   Class :character   Class :character   Class :character  
 Median :69.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :68.07                                                           
 3rd Qu.:77.00                                                           
 Max.   :90.00                 

[1m[1mRows: [1m[22m[34m[34m736[34m[39m [1m[1mColumns: [1m[22m[34m[34m12[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m  (2): #FID, IID
[32mdbl[39m (10): PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



#FID,IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
PD-PDAA503EF5,PD-PDAA503EF5,-0.0135112,0.0390564,-0.058038,0.0164341,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951
PD-PDAB074CYQ,PD-PDAB074CYQ,0.0169406,-0.0153572,-0.0297712,0.0188203,-0.0536203,-0.0602583,0.00141079,-0.00146921,-0.0265564,0.026531
PD-PDAB549YWB,PD-PDAB549YWB,0.0149839,-0.0155857,-0.0423481,0.0639633,-0.00291348,0.0356209,-0.00103406,-0.0198908,-0.0372961,0.0315869
PD-PDAB729HWD,PD-PDAB729HWD,-0.00104895,-0.0303585,0.0976878,0.015057,-0.000200239,-0.00496022,-0.00660119,-0.037698,0.00750216,-0.0741169
PD-PDAB762PA3,PD-PDAB762PA3,0.0299967,0.0192727,0.00312318,-0.0253144,0.0176776,0.040062,-0.0244905,0.0266556,-0.00305509,-0.0136971
PD-PDAC268KWV,PD-PDAC268KWV,-0.0758863,-0.0107498,0.0598167,0.00539207,-0.000236216,-0.0502614,-0.0145071,0.016059,-0.0105057,0.0464046


Unnamed: 0_level_0,ID,visit_month,UPDRSIIItotal_imputed,UPDRSIII_measure_total,UPDRSIIIaxial_imputed,UPDRSIII_measure_axial,UPDRSIIIlimb_imputed,UPDRSIII_measure_limb
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<chr>,<dbl>,<chr>,<lgl>,<lgl>
1,PD-PDAA503EF5,0,20,V0_UPDRS_III_total,20,V0_UPDRS_III_axial,,
2,PD-PDAA503EF5,12,31,V12_UPDRS_III_total,31,V12_UPDRS_III_axial,,
3,PD-PDAA503EF5,18,27,V18_UPDRS_III_total,27,V18_UPDRS_III_axial,,
4,PD-PDAA503EF5,24,20,V24_UPDRS_III_total,20,V24_UPDRS_III_axial,,
5,PD-PDAA503EF5,30,25,V30_UPDRS_III_total,25,V30_UPDRS_III_axial,,
6,PD-PDAA503EF5,36,34,V36_UPDRS_III_total,34,V36_UPDRS_III_axial,,


In [22]:
# Attach the genetic PCs and clinical covariates to one imputed-outcome table.
#
# Args:
#   outcome_df:   imputed outcomes; must contain `ID` and `visit_month`.
#   pcs_df:       PCA eigenvectors with `#FID`, `IID` and the PC columns.
#   demographics: table with `participant_id`, `age_at_baseline`, `sex`.
#   updrs2:       table with `participant_id`, `visit_month`,
#                 `mds_updrs_part_ii_summary_score`.
#   updrs3:       table with `participant_id`, `visit_month`,
#                 `code_upd2hy_hoehn_and_yahr_stage`.
# Returns:
#   `outcome_df` with the extra columns added. Every step is an inner join,
#   so only rows matched in all tables are kept. `sex` becomes a factor with
#   "Male" as the reference level.
join_clinical_covariates <- function(outcome_df, pcs_df, demographics,
                                     updrs2, updrs3) {
    # NOTE(review): demographics is joined on participant only. If it holds
    # more than one row per participant, visit rows would be duplicated;
    # confirm it is one row per participant.
    outcome_df %>%
        inner_join(pcs_df %>% select(-`#FID`), by = c("ID" = "IID")) %>%
        inner_join(demographics %>% select(participant_id, age_at_baseline, sex),
                   by = c("ID" = "participant_id")) %>%
        mutate(sex = relevel(as.factor(sex), ref = "Male")) %>%
        inner_join(updrs2 %>% select(participant_id, visit_month, mds_updrs_part_ii_summary_score),
                   by = c("ID" = "participant_id", "visit_month")) %>%
        inner_join(updrs3 %>% select(participant_id, visit_month, code_upd2hy_hoehn_and_yahr_stage),
                   by = c("ID" = "participant_id", "visit_month"))
}

# Table built from the 36m outcome file
clinical_PCs_df <- join_clinical_covariates(
    PDBP.outcome, PDBP.PC, demographics_df, UPDRSII, UPDRSIII_HY
)

# Table built from the all-visits outcome file
clinical_PCs_df_all <- join_clinical_covariates(
    PDBP.outcome_all, PDBP.PC, demographics_df, UPDRSII, UPDRSIII_HY
)


In [23]:
# Inspect the joined table built from the 36m outcomes:
# first rows, column types, and dimensions
head(clinical_PCs_df)
str(clinical_PCs_df)
dim(clinical_PCs_df)

# Same inspection for the all-visits table
head(clinical_PCs_df_all)
str(clinical_PCs_df_all)
dim(clinical_PCs_df_all)

Unnamed: 0_level_0,ID,visit_month,UPDRSIIItotal_imputed,UPDRSIII_measure_total,UPDRSIIIaxial_imputed,UPDRSIII_measure_axial,UPDRSIIIlimb_imputed,UPDRSIII_measure_limb,PC1,PC2,⋯,PC5,PC6,PC7,PC8,PC9,PC10,age_at_baseline,sex,mds_updrs_part_ii_summary_score,code_upd2hy_hoehn_and_yahr_stage
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<lgl>,<lgl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>
1,PD-PDAA503EF5,0,20,V0_UPDRS_III_total,20,V0_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
2,PD-PDAA503EF5,12,31,V12_UPDRS_III_total,31,V12_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,11,2
3,PD-PDAA503EF5,18,27,V18_UPDRS_III_total,27,V18_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
4,PD-PDAA503EF5,24,20,V24_UPDRS_III_total,20,V24_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
5,PD-PDAA503EF5,30,25,V30_UPDRS_III_total,25,V30_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,9,2
6,PD-PDAA503EF5,36,34,V36_UPDRS_III_total,34,V36_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,15,2


'data.frame':	2466 obs. of  22 variables:
 $ ID                              : chr  "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" ...
 $ visit_month                     : num  0 12 18 24 30 36 6 0 0 12 ...
 $ UPDRSIIItotal_imputed           : num  20 31 27 20 25 34 19 15 5 5 ...
 $ UPDRSIII_measure_total          : chr  "V0_UPDRS_III_total" "V12_UPDRS_III_total" "V18_UPDRS_III_total" "V24_UPDRS_III_total" ...
 $ UPDRSIIIaxial_imputed           : num  20 31 27 20 25 34 19 15 5 5 ...
 $ UPDRSIII_measure_axial          : chr  "V0_UPDRS_III_axial" "V12_UPDRS_III_axial" "V18_UPDRS_III_axial" "V24_UPDRS_III_axial" ...
 $ UPDRSIIIlimb_imputed            : logi  NA NA NA NA NA NA ...
 $ UPDRSIII_measure_limb           : logi  NA NA NA NA NA NA ...
 $ PC1                             : num  -0.0135 -0.0135 -0.0135 -0.0135 -0.0135 ...
 $ PC2                             : num  0.0391 0.0391 0.0391 0.0391 0.0391 ...
 $ PC3                             : num  -0.058 -0.058 -0.058 -0

Unnamed: 0_level_0,ID,visit_month,UPDRSIIItotal_imputed,UPDRSIII_measure_total,UPDRSIIIaxial_imputed,UPDRSIII_measure_axial,UPDRSIIIlimb_imputed,UPDRSIII_measure_limb,PC1,PC2,⋯,PC5,PC6,PC7,PC8,PC9,PC10,age_at_baseline,sex,mds_updrs_part_ii_summary_score,code_upd2hy_hoehn_and_yahr_stage
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<lgl>,<lgl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>,<dbl>
1,PD-PDAA503EF5,0,20,V0_UPDRS_III_total,20,V0_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
2,PD-PDAA503EF5,12,31,V12_UPDRS_III_total,31,V12_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,11,2
3,PD-PDAA503EF5,18,27,V18_UPDRS_III_total,27,V18_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
4,PD-PDAA503EF5,24,20,V24_UPDRS_III_total,20,V24_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,13,2
5,PD-PDAA503EF5,30,25,V30_UPDRS_III_total,25,V30_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,9,2
6,PD-PDAA503EF5,36,34,V36_UPDRS_III_total,34,V36_UPDRS_III_axial,,,-0.0135112,0.0390564,⋯,-0.020268,-0.0157824,0.00145366,0.0117963,-0.00368082,-0.0332951,70,Female,15,2


'data.frame':	2680 obs. of  22 variables:
 $ ID                              : chr  "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" ...
 $ visit_month                     : num  0 12 18 24 30 36 42 48 6 0 ...
 $ UPDRSIIItotal_imputed           : num  20 31 27 20 25 34 30 38 19 15 ...
 $ UPDRSIII_measure_total          : chr  "V0_UPDRS_III_total" "V12_UPDRS_III_total" "V18_UPDRS_III_total" "V24_UPDRS_III_total" ...
 $ UPDRSIIIaxial_imputed           : num  20 31 27 20 25 34 30 38 19 15 ...
 $ UPDRSIII_measure_axial          : chr  "V0_UPDRS_III_axial" "V12_UPDRS_III_axial" "V18_UPDRS_III_axial" "V24_UPDRS_III_axial" ...
 $ UPDRSIIIlimb_imputed            : logi  NA NA NA NA NA NA ...
 $ UPDRSIII_measure_limb           : logi  NA NA NA NA NA NA ...
 $ PC1                             : num  -0.0135 -0.0135 -0.0135 -0.0135 -0.0135 ...
 $ PC2                             : num  0.0391 0.0391 0.0391 0.0391 0.0391 ...
 $ PC3                             : num  -0.058 -0.058 -0.0

In [24]:
# Count missing values per column to check there are no NAs, especially in
# the outcome variables. vapply() guarantees an integer vector whatever the
# shape of the input, unlike sapply().
vapply(clinical_PCs_df, function(column) sum(is.na(column)), integer(1))

# Writing out the clinical_PCs files

In [26]:
# Save the table built from the 36m outcomes, then read it back and print
# its structure as a check on what was written
saveRDS(object = clinical_PCs_df, file = "/home/jupyter/working_dir/PDBP_final.rds")
str(readRDS(file = "/home/jupyter/working_dir/PDBP_final.rds"))

# Save the all-visits table
saveRDS(object = clinical_PCs_df_all, file = "/home/jupyter/working_dir/PDBP_final_allVisits.rds")

'data.frame':	2466 obs. of  22 variables:
 $ ID                              : chr  "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" "PD-PDAA503EF5" ...
 $ visit_month                     : num  0 12 18 24 30 36 6 0 0 12 ...
 $ UPDRSIIItotal_imputed           : num  20 31 27 20 25 34 19 15 5 5 ...
 $ UPDRSIII_measure_total          : chr  "V0_UPDRS_III_total" "V12_UPDRS_III_total" "V18_UPDRS_III_total" "V24_UPDRS_III_total" ...
 $ UPDRSIIIaxial_imputed           : num  20 31 27 20 25 34 19 15 5 5 ...
 $ UPDRSIII_measure_axial          : chr  "V0_UPDRS_III_axial" "V12_UPDRS_III_axial" "V18_UPDRS_III_axial" "V24_UPDRS_III_axial" ...
 $ UPDRSIIIlimb_imputed            : logi  NA NA NA NA NA NA ...
 $ UPDRSIII_measure_limb           : logi  NA NA NA NA NA NA ...
 $ PC1                             : num  -0.0135 -0.0135 -0.0135 -0.0135 -0.0135 ...
 $ PC2                             : num  0.0391 0.0391 0.0391 0.0391 0.0391 ...
 $ PC3                             : num  -0.058 -0.058 -0.058 -0

In [31]:
# Copy both .rds files from the working directory into the files/ folder of
# the workspace bucket (PATH_MYWORKSPACE), using BILLING_PROJECT_ID for -u
shell_do(str_glue('gsutil -mu {BILLING_PROJECT_ID} cp -r /home/jupyter/working_dir/PDBP_final.rds {PATH_MYWORKSPACE}/files'))
shell_do(str_glue('gsutil -mu {BILLING_PROJECT_ID} cp -r /home/jupyter/working_dir/PDBP_final_allVisits.rds {PATH_MYWORKSPACE}/files'))

[1] "Executing:  gsutil -mu terra-ed19e231 cp -r /home/jupyter/working_dir/PDBP_final.rds gs://fc-cd759889-2702-4f72-a832-0be756073417/files"


[1] "Executing:  gsutil -mu terra-ed19e231 cp -r /home/jupyter/working_dir/PDBP_final_allVisits.rds gs://fc-cd759889-2702-4f72-a832-0be756073417/files"
