**This Python notebook writes R function files and saves them to the workspace bucket, so that they can be sourced from any notebook in the workspace.**

# Function to save a file to bucket

In [None]:
from google.cloud import storage
import os
import pandas as pd
import subprocess

def load_to_bucket(source_filename, destination_blob_name = 'notebooks/all_x_all'):
    """Copy a local file into the workspace bucket with ``gsutil cp``.

    Parameters
    ----------
    source_filename : str
        Name of the file, relative to the current working directory.
    destination_blob_name : str
        Folder inside the bucket; the file keeps its own name beneath it.

    The bucket is read from the ``WORKSPACE_BUCKET`` environment variable.
    Nothing is returned: the gsutil messages and the outcome are printed.
    The success line is printed only when gsutil exits with code 0.
    """
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    destination = f"{my_bucket}/{destination_blob_name}/{source_filename}"

    args = ["gsutil", "cp", "-R", f"./{source_filename}", destination]
    output = subprocess.run(args, capture_output=True)

    # stderr is captured as bytes; decode it so it prints as readable text
    # instead of a b'...' repr.
    stderr_text = output.stderr.decode(errors='replace').strip()
    if stderr_text:
        print(stderr_text)

    if output.returncode != 0:
        # Do not claim success when the copy did not happen.
        print(f'\n upload FAILED (gsutil exit code {output.returncode}): '
              f'{source_filename} was not copied to {destination}')
        return

    print(f'\n file in bucket at {destination_blob_name}/{source_filename}')

In [None]:
# Workspace settings read from the environment; each is None when the variable is unset.
my_bucket = os.environ.get('WORKSPACE_BUCKET')
dataset = os.environ.get('WORKSPACE_CDR')

In [None]:
# import subprocess
# import glob

# notebooks_directory = "/path/to/your/notebooks/"
# output_directory = "/path/to/your/output/"

# for notebook in glob.glob(f"{notebooks_directory}/**/*.ipynb", recursive=True):
#     output_notebook = os.path.join(output_directory, os.path.basename(notebook))
#     command = ["jupyter", "nbconvert", "--to", "notebook", "--execute", notebook, "--output", output_notebook]
#     subprocess.run(command)

In [None]:
def read_csv_from_bucket(name_of_file_in_bucket, directory = 'notebooks/all_x_all', remove_dot = 'no'):
    """Download a CSV from the workspace bucket and load it into a DataFrame.

    Parameters
    ----------
    name_of_file_in_bucket : str
        File name inside ``directory`` in the bucket.
    directory : str
        Folder inside the bucket (no trailing slash).
    remove_dot : str
        Kept so existing calls keep working; it no longer changes the
        command. ``gsutil cp`` needs a destination, so the file is always
        copied into the current working directory.

    Returns
    -------
    pandas.DataFrame
        Contents of the downloaded CSV.

    Raises
    ------
    RuntimeError
        If ``gsutil cp`` exits with a non-zero code.
    """
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    source = f"{my_bucket}/{directory}/{name_of_file_in_bucket}"
    local_path = f"./{os.path.basename(name_of_file_in_bucket)}"

    # pandas cannot execute a shell command given as a "path": run the copy
    # explicitly, then read the local file.
    result = subprocess.run(["gsutil", "cp", source, local_path], capture_output=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"gsutil could not copy {source}: {result.stderr.decode(errors='replace').strip()}"
        )

    return pd.read_csv(local_path)

In [None]:
# Try the helper on r_drug_table.csv (default bucket folder); the result is shown as the cell output.
read_csv_from_bucket('r_drug_table.csv')

# `allxall_pm_summary_functions.R`

In [None]:
%%writefile allxall_pm_summary_functions.R

##################################################### SET UP ##############################################################
# Install whichever dependencies are missing, then attach them all quietly.
package_list <- c('bigrquery','tidyverse','dplyr','janitor','data.table')
# Compare against the installed package *names* only (the row names of the matrix),
# and query the library once instead of once per package.
for (pkg in setdiff(package_list, rownames(installed.packages()))) {install.packages(pkg, quiet = TRUE)}

library(bigrquery, warn.conflicts = FALSE, quietly = TRUE)
library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)
library(janitor, warn.conflicts = FALSE, quietly = TRUE)
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(data.table, warn.conflicts = FALSE, quietly = TRUE)
# Silence the "summarise() has grouped output by ..." messages.
options(dplyr.summarise.inform = FALSE)

read_csv_cols_from_bucket <- function(directory = 'notebooks/all_x_all/', name_of_file_in_bucket, concept_id
                                      , remove_dot = 'no'){
    # Reads only the person_id column and the `concept_id` column of a CSV (via fread).
    #
    # If the file is already in the working directory it is read from there; otherwise it is
    # first copied from the workspace bucket (WORKSPACE_BUCKET env var) with gsutil.
    #
    # directory              : bucket folder, with trailing slash
    # name_of_file_in_bucket : file name inside `directory` (also the local file name)
    # concept_id             : name of the value column to read alongside person_id
    # remove_dot             : 'no' (default) appends " ." as the gsutil destination; any other value
    #                          sends the command without a destination
    #                          NOTE(review): gsutil cp normally needs a destination - confirm this
    #                          branch is still used.
    #
    # Returns whatever fread() returns for the two selected columns.
    # Stops with an explicit message when the file is still missing after the copy attempt.
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

    if (!file.exists(name_of_file_in_bucket)){
        copy_cmd <- paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket)
        if (remove_dot == 'no'){copy_cmd <- paste0(copy_cmd, " .")}
        system(copy_cmd, intern = TRUE)

        # A failed copy only produces a warning; fail here with a message that names the source.
        if (!file.exists(name_of_file_in_bucket)){
            stop("Could not copy '", name_of_file_in_bucket, "' from ", my_bucket, "/", directory
                 , " into the working directory; check the file name and bucket folder.")
        }
    }

    my_dataframe <- fread(name_of_file_in_bucket, select = c("person_id", str_glue("{concept_id}")))
    return(my_dataframe)
}

read_csv_from_bucket <- function(directory = 'notebooks/all_x_all/', name_of_file_in_bucket, remove_dot = 'no'){
    # Reads a whole CSV (all columns) with fread and returns it as a base data.frame.
    #
    # If the file is already in the working directory it is read from there; otherwise it is
    # first copied from the workspace bucket (WORKSPACE_BUCKET env var) with gsutil.
    #
    # directory              : bucket folder, with trailing slash
    # name_of_file_in_bucket : file name inside `directory` (also the local file name)
    # remove_dot             : 'no' (default) appends " ." as the gsutil destination; any other value
    #                          sends the command without a destination
    #                          NOTE(review): gsutil cp normally needs a destination - confirm this
    #                          branch is still used.
    #
    # Stops with an explicit message when the file is still missing after the copy attempt.
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

    if (!file.exists(name_of_file_in_bucket)){
        copy_cmd <- paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket)
        if (remove_dot == 'no'){copy_cmd <- paste0(copy_cmd, " .")}
        system(copy_cmd, intern = TRUE)

        # A failed copy only produces a warning; fail here with a message that names the source.
        if (!file.exists(name_of_file_in_bucket)){
            stop("Could not copy '", name_of_file_in_bucket, "' from ", my_bucket, "/", directory
                 , " into the working directory; check the file name and bucket folder.")
        }
    }

    my_dataframe <- as.data.frame(fread(name_of_file_in_bucket))
    return(my_dataframe)
}

                      
barplot <- function(df, plot_title, n_col, X, Y = `Percentage`, Fill = "grey", Facet = "Cases"
                    , facet_nrow = NULL, facet_col = NULL, base_text_size =12, w = 14, h = 8){
    # Horizontal bar chart of percentages with "count (percent%)" labels on the bars.
    #
    # df             : data frame holding the count column `n_col` and a `numerator` column
    # plot_title     : plot title
    # n_col          : name (string) of the count column; Percentage = n_col / numerator * 100
    # X, Y           : bare column names mapped to the x and y aesthetics
    # Fill           : a single bar colour (constant, not a mapped column)
    # Facet          : name (string) of the column to facet by, or NULL for no facets; when the
    #                  column is absent from df the strip shows only the "N=... (100%)" text
    # facet_nrow/col : passed to facet_wrap
    # base_text_size : base font size the other text sizes are derived from
    # w, h           : repr plot width / height options
    #
    # Returns the ggplot object.

    df["Percentage"] <- round((df[n_col]/df['numerator'])*100,2)
    df["Label"] <- paste0(format(df[[n_col]], big.mark=","), ' (', df$Percentage, '%)')

    if (!is.null(Facet)){
        denominator_label <- paste0('N=', format(df$numerator, big.mark=","), ' (100%)')
        if (Facet %in% colnames(df)){
            # Put the denominator on its own line so the strip does not read e.g. "EURN=1,234".
            df["Facet"] <- paste0(df[[Facet]], '\n', denominator_label)
        } else {
            df["Facet"] <- denominator_label
        }
    }

    ###### Plot #####

    options(repr.plot.width = w, repr.plot.height = h) #~1.5 per bar
    p <- ggplot(data=df, aes(x={{X}}, y={{Y}})) +
            geom_bar(stat="identity", position = 'dodge2', fill = Fill) +
            geom_text(aes(label= Label), hjust="inward", vjust = 0.5, size=(base_text_size/3)+1
                      , position = position_dodge(width = 0.9)) +
            labs(x = '', y = '', title = plot_title) +
            theme_minimal()+
            theme(axis.text.x = element_blank(), axis.text.y = element_text(size = base_text_size+4)
                  , legend.title = element_blank()
                  , legend.text = element_text(size = base_text_size+5)
                  , legend.position = "top", legend.box = "horizontal"
                  , plot.title = element_text(hjust = 0.5)
                  , title = element_text(size = base_text_size+4)) +
            coord_flip()

    if (!is.null(Facet)){
        p <- p+facet_wrap(~Facet, nrow = facet_nrow, ncol = facet_col)+
                theme(strip.text.x = element_text(size = base_text_size+6))+theme(legend.position = "none")
    }
    return(p)
}

##################################################DATA ############################################################
# Copy the ancestry predictions next to the notebook, then load them with the id column renamed
# to person_id and the predicted ancestry upper-cased.
system2('gsutil'
        , args = c('cp'
                   , 'gs://fc-aou-preprod-datasets-controlled/v7/wgs/without_ext_aian_prod/vds/aux/ancestry/delta_v1_gt_no_ext_aian_gq0_prod.ancestry_preds.tsv'
                   , './ancestry.tsv'))
ancestry_df <- read_tsv('ancestry.tsv', col_types='ic-c-') %>%
    rename(person_id = research_id, ancestry = ancestry_pred) %>%
    mutate(ancestry = toupper(ancestry))

# Demographics table from the bucket; age_group is exposed as age_group_at_cdr.
demographics_df <- read_csv_from_bucket(name_of_file_in_bucket = 'demographics_table.csv') %>%
    rename(age_group_at_cdr = age_group)

################################# physical measurements specific functions #########################################
#physical_measurement_table <- read_csv_from_bucket(name_of_file_in_bucket = 'physical_measurement_table.csv')
continuous_data_summary <- function(df_measurement){
    # Prints a heading, draws a histogram and a boxplot of the single non-person_id column of
    # `df_measurement`, shows its descriptive statistics with View() and returns them.
    #
    # df_measurement : data frame with person_id plus one value column
    # Returns a one-row table: mean, min, max, selected quantiles and standard deviation
    # (all computed with na.rm = TRUE).

    value_column <- colnames(select(df_measurement, -c('person_id')))
    cat(paste0('Histogram and Descriptive Statistics of ', value_column))

    # Pull the values out once; every statistic below is computed on this vector.
    measurements <- df_measurement[[value_column]]
    pct <- function(p) quantile(measurements, p, na.rm = TRUE)

    stats_df <- df_measurement %>%
            dplyr::summarise('Mean' = mean(measurements, na.rm = TRUE)
                             ,'Min' = min(measurements, na.rm = TRUE)
                             ,'Max' = max(measurements, na.rm = TRUE)
                             ,'1% Quantile' = pct(0.01)
                             ,'2% Quantile' = pct(0.02)
                             ,'25th Quantile' = pct(0.25)
                             ,'50th Quantile (Median)' = pct(0.5)
                             ,'75th Quantile' = pct(0.75)
                             ,'98th Quantile' = pct(0.98)
                             ,'99th Quantile' = pct(0.99)
                             ,'Standard Deviation' = sd(measurements, na.rm = TRUE)
                            )

    options(repr.plot.width = 8, repr.plot.height = 6)
    hist(x = measurements, main = str_glue('Histogram of {value_column}'), xlab = "")
    boxplot(measurements, main = str_glue('Boxplot of {value_column}'), xlab = "")
    View(stats_df)

    return(stats_df)
}

wrangle_cont_data <- function(data, demog_vars, num_col = 'name'){
    # Counts distinct participants with a non-missing value, per demographic group and measurement.
    #
    # data       : data frame with person_id, the columns named in `demog_vars`, and one or more
    #              measurement columns (every remaining column is treated as a measurement)
    # demog_vars : demographic column name(s) to group by
    # num_col    : column that defines the denominator groups; 'overall' uses the number of
    #              distinct person_id in `data` for every row
    #
    # Returns the counts (n_pids) with a `numerator` column added.

    id_cols <- append('person_id', demog_vars)
    all_cols <- unlist(colnames(data))
    measure_cols <- all_cols[!(all_cols %in% id_cols)]

    # Long format: one row per person and measurement; rows without a value are dropped and the
    # value itself is not needed for counting.
    data_long <- pivot_longer(data, cols = all_of(measure_cols))
    has_value <- !is.na(data_long$value)
    data_long <- subset(data_long[has_value,], select = -c(value))

    group_cols <- unlist(append(demog_vars, 'name'))
    data_long_count <- data_long %>%
        group_by(across(all_of(group_cols))) %>%
        summarize(n_pids = n_distinct(person_id))

    if (num_col == 'overall'){
        # One denominator for every row.
        data_long_count$numerator <- n_distinct(data$person_id)
        return(data_long_count)
    }

    # Otherwise one denominator per level of num_col.
    num_df <- data_long[c('person_id',num_col)] %>%
        group_by(across(all_of(num_col))) %>%
        summarize(numerator = n_distinct(person_id))
    data_long_count <- left_join(data_long_count, num_df, by = num_col)

    return(data_long_count)
}

demographic_data_summary <- function(df_measurement){
    # Shows participant-count bar charts for the measurement(s) in `df_measurement`, broken down by
    # age group, sex at birth, ancestry, sex within ancestry and age within ancestry.
    #
    # df_measurement : data frame with person_id plus measurement column(s)
    #
    # Uses the file-level demographics_df and ancestry_df tables. Called for its side effects:
    # each chart is displayed with View().

    # Demographic lookups and the joined tables, built once and reused below.
    age_lookup <- demographics_df[c('person_id','age_group_at_cdr')]
    sex_lookup <- demographics_df[c('person_id','sex_at_birth')]
    ancestry_lookup <- ancestry_df[c('person_id','ancestry')]

    with_age <- inner_join(df_measurement, age_lookup, by = 'person_id')
    with_sex <- inner_join(df_measurement, sex_lookup, by = 'person_id')
    with_ancestry <- inner_join(df_measurement, ancestry_lookup, by = 'person_id')

    # ~~~Demographics~~~
    # Age
    age_pm_long_count <- wrangle_cont_data(data = with_age, demog_vars = 'age_group_at_cdr')
    View(barplot(df = age_pm_long_count, X = `age_group_at_cdr`, n_col = 'n_pids'
                 , plot_title = '\n\n~~~Demographics~~~\n\n\n\nAge at CDR')+ guides(fill="none"))

    # Sex at Birth
    sex_pm_long_count <- wrangle_cont_data(data = with_sex, demog_vars = 'sex_at_birth')
    View(barplot(df = sex_pm_long_count, X = `sex_at_birth`, n_col = 'n_pids'
                 , plot_title = 'Sex at Birth', h = 5)+ guides(fill="none"))

    # Ancestry
    ancestry_pm_long_count <- wrangle_cont_data(data = with_ancestry, demog_vars = 'ancestry')
    View(barplot(df = ancestry_pm_long_count, X = `ancestry`, n_col = 'n_pids'
                 , plot_title = 'Ancestry\n', h = 5)+ guides(fill="none"))

    # Sex at birth and Ancestry (denominator = participants per ancestry)
    ancestry_sex_m <- inner_join(with_sex, ancestry_lookup, by = 'person_id')
    ancestry_sex_m_long_count <- wrangle_cont_data(data = ancestry_sex_m
                                                   , demog_vars = list('sex_at_birth','ancestry')
                                                   , num_col = 'ancestry')
    View(barplot(df = ancestry_sex_m_long_count, X = `sex_at_birth`, n_col = 'n_pids', Facet = "ancestry"
                , plot_title = 'Sex at Birth & Ancestry\n\n'
                , facet_nrow = 4, facet_col = 2, h = 10))

    # Age at CDR and Ancestry (denominator = participants per ancestry)
    ancestry_age_m <- inner_join(with_age, ancestry_lookup, by = 'person_id')
    ancestry_age_m_long_count <- wrangle_cont_data(data = ancestry_age_m
                                                   , demog_vars = list('age_group_at_cdr','ancestry')
                                                   , num_col = 'ancestry')
    View(barplot(df = ancestry_age_m_long_count, X = `age_group_at_cdr`, n_col = 'n_pids', Facet = "ancestry"
                , plot_title = 'Age at CDR & Ancestry\n\n'
                , facet_nrow = 4, facet_col = 2, h = 15))
}

pm_data_summary <- function(concept_id){
    # Full summary for one physical measurement: prints a banner, then runs the descriptive
    # statistics/plots and the demographic breakdown charts.
    #
    # concept_id : name of the value column in physical_measurement_table.csv

    df_measurement <- read_csv_cols_from_bucket(name_of_file_in_bucket = 'physical_measurement_table.csv'
                                                , concept_id = concept_id)

    # Banner: the column name upper-cased, with dashes shown as spaces.
    title <- str_to_upper(gsub('-',' ', concept_id))
    banner <- str_glue('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ {title} Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    cat(banner, "\n\n", sep = "")

    continuous_data_summary(df_measurement)

    # ~~~Demographics~~~
    demographic_data_summary(df_measurement)
}
     

## Save to bucket

In [None]:
# Upload the R file written by the cell above to the default bucket folder.
r_file = 'allxall_pm_summary_functions.R'
load_to_bucket(source_filename = r_file)

# `allxall_lab_summary_functions.R`

In [None]:
%%writefile allxall_lab_summary_functions.R

# Install whichever dependencies are missing, then attach them all quietly.
package_list <- c('bigrquery','tidyverse','dplyr','janitor','data.table')
# Compare against the installed package *names* only (the row names of the matrix),
# and query the library once instead of once per package.
for (pkg in setdiff(package_list, rownames(installed.packages()))) {install.packages(pkg, quiet = TRUE)}

library(bigrquery, warn.conflicts = FALSE, quietly = TRUE)
library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)
library(janitor, warn.conflicts = FALSE, quietly = TRUE)
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(data.table, warn.conflicts = FALSE, quietly = TRUE)
# Silence the "summarise() has grouped output by ..." messages.
options(dplyr.summarise.inform = FALSE)

bq <- function(query) {
    # Runs `query` in BigQuery and downloads the result.
    #
    # query : SQL text; unqualified table names resolve against the WORKSPACE_CDR dataset
    # The job is billed to the GOOGLE_PROJECT project.
    # Returns the downloaded result table.
    tb <- bq_project_query(Sys.getenv('GOOGLE_PROJECT'), query = query
                           , default_dataset = Sys.getenv('WORKSPACE_CDR'))
    # page_size is a download option, so it belongs to bq_table_download(), not to the query call.
    bq_table_download(tb, page_size = 25000)
}


lab_data <- function(ancestor_cid){
    # Downloads measurement rows for the descendants of the given ancestor concept id(s).
    #
    # ancestor_cid : value interpolated into `ancestor_concept_id IN (...)` in the query below
    #
    # The query (run through bq()) keeps only rows whose src_id contains 'ehr' (case-insensitive)
    # and whose value_as_number is not NULL, and returns DISTINCT rows with the concept, unit,
    # value and operator names lower-cased.
    # Returns the downloaded table. This function only prints the output file name; it does not
    # write or upload a file itself.
    
    df <- bq(str_glue("
                SELECT DISTINCT person_id
                , measurement_concept_id
                , LOWER(cm.concept_name) as measurement_concept_name, LOWER(cam.concept_name) as ancestor_concept_name
                , value_as_number
                , range_low, range_high
                , unit_concept_id, LOWER(u.concept_name) as unit_concept_name
                , value_as_concept_id, LOWER(cv.concept_name) as value_as_concept_name
                , operator_concept_id, LOWER(co.concept_name) as operator_concept_name 
                , measurement_datetime --, measurement_date
                , src_id ###, MAX(measurement_date) AS most_recent_measurement_date
                
                #see note 2 - only use those when the standard concepts are not available/do not make sense
                , measurement_source_concept_id, LOWER(measurement_source_value) AS measurement_source_value
                , LOWER(unit_source_value) AS unit_source_value
                #, CASE WHEN value_as_number IS NOT NULL THEN 'numeric measurement' 
                       #WHEN value_as_number IS NULL and (value_as_concept_id IS NOT NULL OR value_as_concept_id !=0)
                            #THEN 'categorical measurement' 
                      # ELSE 'other' END as measurement_category

            FROM `measurement` m 
            JOIN `measurement_ext` m_ext ON m.measurement_id = m_ext.measurement_id
            JOIN `concept_ancestor` ON descendant_concept_id = measurement_concept_id
            JOIN `concept` as cm on cm.concept_id = measurement_concept_id
            JOIN `concept` as cam on cam.concept_id = ancestor_concept_id
            LEFT JOIN (SELECT c2.concept_name, c1.concept_id 
                    FROM `concept` c1 JOIN `concept` c2 on c1.concept_name = c2.concept_code
                    WHERE c1.domain_id = 'Meas Value') as cv on cv.concept_id = value_as_concept_id            
            LEFT JOIN `concept` as co on co.concept_id = operator_concept_id
            LEFT JOIN `concept` as u on u.concept_id = unit_concept_id
                      
            WHERE LOWER(m_ext.src_id) LIKE '%ehr%' AND ancestor_concept_id IN ({ancestor_cid})
                      AND value_as_number IS NOT NULL
            
            "))
    # The file name is used only for the message printed on the next line.
    filename = str_glue('measurement_{ancestor_cid}.csv')
    print(str_glue("\nFinal output will be saved to bucket as {filename}\n"))
    
    
    return(df)
    }
                      
simple_boxplot<- function(measurement_df, title = '', ancestor_concept_name, w = 20, h = 8){
    # Boxplot of value_as_number per EHR site.
    #
    # measurement_df        : data frame with value_as_number and src_id columns
    # title                 : plot title
    # ancestor_concept_name : used in the y-axis label ("<name> Values")
    # w, h                  : repr plot width / height options
    #
    # Returns the value of the boxplot() call.

    options(repr.plot.width = w, repr.plot.height = h)

    # Shorten "EHR site N" to "#N" in the column that is actually plotted. The original wrote the
    # result to a misspelled column (scr_id), so the shortened labels were never used.
    measurement_df$src_id <- gsub('EHR site ', '#', measurement_df$src_id)
    boxplot(value_as_number~src_id, data=measurement_df, main=title, xlab="EHR Site"
            , ylab= str_glue("{ancestor_concept_name} Values"))
}
                      

simple_histogram<- function(measurement_df, value_col = 'value_as_number'
                            ,title = '', ancestor_concept_name, w = 15, h = 8){
    # Histogram of one numeric column of `measurement_df`.
    #
    # value_col             : name of the column to plot
    # title                 : plot title
    # ancestor_concept_name : used in the x-axis label ("<name> Values")
    # w, h                  : repr plot width / height options
    #
    # Returns the value of the hist() call.

    options(repr.plot.width = w, repr.plot.height = h)

    axis_label <- str_glue("{ancestor_concept_name} Values")
    hist(measurement_df[[value_col]], main = title, xlab = axis_label)
}

stats_table <- function(df_measurement){
    # Descriptive statistics of value_as_number for each EHR site.
    #
    # df_measurement : data frame with src_id and value_as_number columns
    # Returns one row per site (column 'EHR Site') with mean, min, max, selected quantiles and
    # standard deviation, all computed with na.rm = TRUE.

    pct <- function(x, p) quantile(x, p, na.rm = TRUE)

    by_site <- dplyr::group_by(df_measurement, src_id)
    site_stats <- dplyr::summarize(by_site
                             ,'Mean' = mean(value_as_number, na.rm = TRUE)
                             ,'Min' = min(value_as_number, na.rm = TRUE)
                             ,'Max' = max(value_as_number, na.rm = TRUE)
                             ,'1% Quantile' = pct(value_as_number, 0.01)
                             ,'2% Quantile' = pct(value_as_number, 0.02)
                             ,'25th Quantile' = pct(value_as_number, 0.25)
                             ,'50th Quantile (Median)' = pct(value_as_number, 0.5)
                             ,'75th Quantile' = pct(value_as_number, 0.75)
                             ,'98th Quantile' = pct(value_as_number, 0.98)
                             ,'99th Quantile' = pct(value_as_number, 0.99)
                             ,'Standard Deviation' = sd(value_as_number, na.rm = TRUE)
                            )
    stats_df <- rename(site_stats, 'EHR Site' = src_id)

    return(stats_df)
}
                      
final_dataframe <- function(measurement_df, concept_id){
    # Builds the per-participant summary of a lab measurement, writes it to
    # measurement_<concept_id>.csv and copies that file to the workspace bucket.
    #
    # measurement_df : data frame with person_id, measurement_datetime, value_as_number and
    #                  unit_concept_name columns
    # concept_id     : used only to build the output file name
    #
    # Returns the summary: one row per person with mean, min, max, selected quantiles, standard
    # deviation, the latest value(s) with their unit(s), and the number of distinct values.

    # Value(s) and unit(s) recorded at each person's most recent measurement_datetime; ties are
    # collapsed into one comma-separated string.
    latest_df <- measurement_df[c('person_id', 'measurement_datetime', 'value_as_number', 'unit_concept_name')] %>%
                        dplyr::group_by(person_id) %>%
                        filter(measurement_datetime == max(measurement_datetime)) %>%
                        summarize(value_as_number = paste0(value_as_number,  collapse = ', ')
                                 , unit_concept_name = paste0(unit_concept_name,  collapse = ', ')) %>%
                        rename(latest_value = value_as_number, latest_value_unit = unit_concept_name)

    # Number of distinct values per person.
    count_df <- drop_na(measurement_df[c('person_id','value_as_number')]) %>%
                dplyr::group_by(person_id) %>%
                dplyr::summarize('values_count' = n_distinct(value_as_number, na.rm = TRUE))

    output_df <- measurement_df[c('person_id','value_as_number')] %>%
                dplyr::group_by(person_id) %>%
                dplyr::summarize('Mean' = mean(value_as_number, na.rm = TRUE)
                                 ,'Min' = min(value_as_number, na.rm = TRUE)
                                 ,'Max' = max(value_as_number, na.rm = TRUE)
                                 ,'1% Quantile' = quantile(value_as_number, 0.01, na.rm = TRUE)
                                 ,'2% Quantile' = quantile(value_as_number, 0.02, na.rm = TRUE)
                                 ,'25th Quantile' = quantile(value_as_number, 0.25, na.rm = TRUE)
                                 ,'50th Quantile (Median)' = quantile(value_as_number, 0.5, na.rm = TRUE)
                                 ,'75th Quantile' = quantile(value_as_number, 0.75, na.rm = TRUE)
                                 ,'98th Quantile' = quantile(value_as_number, 0.98, na.rm = TRUE)
                                 ,'99th Quantile' = quantile(value_as_number, 0.99, na.rm = TRUE)
                                 ,'Standard Deviation' = sd(value_as_number, na.rm = TRUE)
                                ) %>%
                merge(latest_df) %>%
                merge(count_df)

    #### final dataframe output
    filename <- str_glue('measurement_{concept_id}.csv')
    write_csv(output_df, filename)

    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')
    directory <- 'notebooks/all_x_all/'
    bucket_path <- paste0(my_bucket, "/", directory, filename)

    # Quote both paths: concept_id may contain spaces or commas (e.g. a list of ids), which would
    # otherwise split the shell command into extra arguments.
    system(paste("gsutil cp", shQuote(filename), shQuote(bucket_path)), intern = TRUE)
    system(paste("gsutil ls", shQuote(bucket_path)), intern = TRUE)

    return(output_df)
}
                      
# lab_summary <- function(measurement_df, ancestor_concept_name){
    
#     as.data.frame(stats_table(measurement_df))
    
#     simple_histogram(measurement_df, ancestor_concept_name = ancestor_concept_name)
    
#     simple_boxplot(measurement_df, ancestor_concept_name = ancestor_concept_name)
    
#     return(df)
#     }
                      
lab_data_summary <- function(concept_id){
    # End-to-end summary for one lab measurement: downloads the data, prints a heading and the
    # participant count, draws the histogram and per-site boxplot, writes and uploads the
    # per-participant table, and returns the per-site statistics.
    #
    # concept_id : ancestor concept id passed to lab_data() and used in the output file name

    df_measurement <- lab_data(ancestor_cid = concept_id)

    ancestor_concept_name <- unique(df_measurement$ancestor_concept_name)
    cat(str_to_upper(str_glue('\n\n\n~~~~~~~~~EHR {ancestor_concept_name} Summary & Distributions\n\n~~~~~~~~~~')))

    n_pids <- n_distinct(df_measurement$person_id)
    print(str_glue('N Pids :{n_pids}'))

    # Shorten the site labels once so every plot and table below uses the "#N" form.
    df_measurement$src_id <- gsub('EHR site ', '#', df_measurement$src_id)

    simple_histogram(df_measurement, ancestor_concept_name = ancestor_concept_name)
    simple_boxplot(df_measurement, ancestor_concept_name = ancestor_concept_name)

    stats_df <- stats_table(df_measurement)
    # Called for its side effects (CSV written locally and copied to the bucket).
    final_dataframe(measurement_df = df_measurement, concept_id = concept_id)
    stats_df
}
                      

## Save to bucket

In [None]:
# Upload the R file written by the cell above to the default bucket folder.
r_file = 'allxall_lab_summary_functions.R'
load_to_bucket(source_filename = r_file)

# `allxall_cat_data_summary_functions.R`

In [None]:
%%writefile allxall_cat_data_summary_functions.R

# print("All packages and custom functions loaded.")
# cat("                                                ")
##################################################### SET UP ##############################################################
# Install whichever dependencies are missing, then attach them all quietly.
package_list <- c('bigrquery','tidyverse','dplyr','janitor', 'data.table')
# Compare against the installed package *names* only (the row names of the matrix),
# and query the library once instead of once per package.
for (pkg in setdiff(package_list, rownames(installed.packages()))) {install.packages(pkg, quiet = TRUE)}

library(bigrquery, warn.conflicts = FALSE, quietly = TRUE)
library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)
library(janitor, warn.conflicts = FALSE, quietly = TRUE)
library(dplyr, warn.conflicts = FALSE, quietly = TRUE)
library(data.table, warn.conflicts = FALSE, quietly = TRUE)
# Silence the "summarise() has grouped output by ..." messages.
options(dplyr.summarise.inform = FALSE)

# Workspace settings read from the environment: dataset (used in queries), billing project, bucket.
dataset <- Sys.getenv('WORKSPACE_CDR')
billing_project <- Sys.getenv('GOOGLE_PROJECT')
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

# helper function to make the download easier
download_data <- function(query) {
    # Runs `query` in BigQuery and downloads the result.
    #
    # query : SQL text; unqualified table names resolve against the WORKSPACE_CDR dataset
    # The job is billed to the GOOGLE_PROJECT project.
    # Returns the downloaded result table.
    tb <- bq_project_query(Sys.getenv('GOOGLE_PROJECT'), query = query
                           , default_dataset = Sys.getenv('WORKSPACE_CDR'))
    # page_size is a download option, so it belongs to bq_table_download(), not to the query call.
    bq_table_download(tb, page_size = 25000)
}

# read_csv_from_bucket <- function(directory = 'notebooks/all_x_all/', name_of_file_in_bucket, remove_dot = 'no'){
                
#                 if (remove_dot == 'no'){
#                     system(paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket, " ."), intern=T)
#                 } else {
#                     system(paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket), intern=T)
#                     }
                    
#                 # Load the file into a dataframe
#                 my_dataframe  <- read_csv(gsub('notebooks/all_x_all/','',name_of_file_in_bucket), show_col_types = FALSE)
#                 return(my_dataframe)
#             }

read_csv_cols_from_bucket <- function(directory = 'notebooks/all_x_all/', name_of_file_in_bucket, concept_id, remove_dot = 'no'){
    # Reads only the person_id column and the `concept_id` column of a CSV (via fread) and
    # returns them as a base data.frame.
    #
    # If the file is already in the working directory it is read from there; otherwise it is
    # first copied from the workspace bucket (WORKSPACE_BUCKET env var) with gsutil.
    #
    # directory              : bucket folder, with trailing slash
    # name_of_file_in_bucket : file name inside `directory` (also the local file name)
    # concept_id             : name of the value column to read alongside person_id
    # remove_dot             : 'no' (default) appends " ." as the gsutil destination; any other value
    #                          sends the command without a destination
    #                          NOTE(review): gsutil cp normally needs a destination - confirm this
    #                          branch is still used.
    #
    # Stops with an explicit message when the file is still missing after the copy attempt.
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

    if (!file.exists(name_of_file_in_bucket)){
        copy_cmd <- paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket)
        if (remove_dot == 'no'){copy_cmd <- paste0(copy_cmd, " .")}
        system(copy_cmd, intern = TRUE)

        # A failed copy only produces a warning; fail here with a message that names the source.
        if (!file.exists(name_of_file_in_bucket)){
            stop("Could not copy '", name_of_file_in_bucket, "' from ", my_bucket, "/", directory
                 , " into the working directory; check the file name and bucket folder.")
        }
    }

    my_dataframe <- fread(name_of_file_in_bucket, select = c("person_id", str_glue("{concept_id}")))
    my_dataframe <- as.data.frame(my_dataframe)
    return(my_dataframe)
}

read_csv_from_bucket <- function(directory = 'notebooks/all_x_all/', name_of_file_in_bucket, remove_dot = 'no'){
    # Reads a whole CSV (all columns) with fread and returns it as a base data.frame.
    #
    # If the file is already in the working directory it is read from there; otherwise it is
    # first copied from the workspace bucket (WORKSPACE_BUCKET env var) with gsutil.
    #
    # directory              : bucket folder, with trailing slash
    # name_of_file_in_bucket : file name inside `directory` (also the local file name)
    # remove_dot             : 'no' (default) appends " ." as the gsutil destination; any other value
    #                          sends the command without a destination
    #                          NOTE(review): gsutil cp normally needs a destination - confirm this
    #                          branch is still used.
    #
    # Stops with an explicit message when the file is still missing after the copy attempt.
    my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

    if (!file.exists(name_of_file_in_bucket)){
        copy_cmd <- paste0("gsutil cp ", my_bucket, "/", directory, name_of_file_in_bucket)
        if (remove_dot == 'no'){copy_cmd <- paste0(copy_cmd, " .")}
        system(copy_cmd, intern = TRUE)

        # A failed copy only produces a warning; fail here with a message that names the source.
        if (!file.exists(name_of_file_in_bucket)){
            stop("Could not copy '", name_of_file_in_bucket, "' from ", my_bucket, "/", directory
                 , " into the working directory; check the file name and bucket folder.")
        }
    }

    my_dataframe <- as.data.frame(fread(name_of_file_in_bucket))
    return(my_dataframe)
}


barplot <- function(df, plot_title, n_col, X, Y = `Percentage`
                             , Fill = "Cases", Facet = "Cases"
                             , fill_palette = "Pastel1"
                             , facet_nrow = 1, facet_col = 2, base_text_size =9, w = 14, h = 5){
    # Horizontal bar chart of percentages with "count (percent%)" labels on the bars.
    #
    # df             : data frame holding the count column `n_col` and a `numerator` column
    # plot_title     : plot title
    # n_col          : name (string) of the count column; Percentage = n_col / numerator * 100
    # X, Y, Fill     : passed to aes() with {{ }}
    #                  NOTE(review): the default Fill is the string "Cases", which aes() treats as a
    #                  constant rather than a column - confirm that is intended.
    # Facet          : name (string) of the column to facet by, or NULL for no facets
    # fill_palette   : palette name for scale_fill_brewer
    # facet_nrow/col : passed to facet_wrap
    # base_text_size : base font size the other text sizes are derived from
    # w, h           : repr plot width / height options
    #
    # Returns the ggplot object.

    df["Percentage"] <- round((df[n_col]/df['numerator'])*100,2)
    df["Label"] <- paste0(format(df[[n_col]], big.mark=","), ' (', df$Percentage, '%)')

    use_facets <- !is.null(Facet)
    if (use_facets){
        # Strip text: facet value with its denominator underneath.
        df["Facet"] <- paste0('\n\n',df[[Facet]], '\nN=', format(df$numerator, big.mark=","), ' (100%)')
    }

    ###### Plot #####
    options(repr.plot.width = w, repr.plot.height = h) #~1.5 per bar

    bar_labels <- geom_text(aes(label= Label), hjust="inward", vjust = 0.5, size=(base_text_size/3)+1
                            , position = position_dodge(width = 0.9))
    text_theme <- theme(axis.text.x = element_blank(), axis.text.y = element_text(size = base_text_size+4)
                        , legend.title = element_blank()
                        , legend.text = element_text(size = base_text_size+5)
                        , legend.position = "top", legend.box = "horizontal"
                        , plot.title = element_text(hjust = 0.5)
                        , title = element_text(size = base_text_size+4))

    p <- ggplot(data=df, aes(x={{X}}, y={{Y}}, fill = {{Fill}})) +
            geom_bar(stat="identity", position = 'dodge2') +
            bar_labels +
            labs(x = '', y = '', title = plot_title) +
            theme_minimal() +
            scale_fill_brewer(palette = fill_palette) +
            text_theme +
            coord_flip()

    if (!use_facets){
        return(p)
    }

    # Faceted plots carry the group in the strip, so the legend is hidden.
    p <- p + facet_wrap(~Facet, nrow = facet_nrow, ncol = facet_col) +
            theme(strip.text.x = element_text(size = base_text_size+6)) +
            theme(legend.position = "none")
    return(p)
}

############################ Phecode AND Survey data specific (cat dat with TRUE/FALSE (CASE/CONTROLS))

wrangle_cat_data <- function(data_df){
    # Turns a two-column (person_id, flag) table into person_id plus a 'Cases'/'Controls' label.
    #
    # data_df : data frame with exactly two columns; the second is a TRUE/FALSE-like flag
    #           (logical, 0/1 numeric, or text such as "TRUE"/"FALSE")
    #
    # Returns the table with columns person_id and Cases; rows whose flag is NA are removed.
    # Values that cannot be read as a logical are kept unchanged (as text).
    colnames(data_df) <- c('person_id', 'Cases')

    raw_flag <- data_df$Cases
    # Convert explicitly instead of relying on logical-to-character coercion inside `==`,
    # which left 0/1 coded controls unlabelled.
    is_case <- as.logical(raw_flag)
    data_df$Cases <- ifelse(is.na(is_case)
                            , as.character(raw_flag)
                            , ifelse(is_case, 'Cases', 'Controls'))

    data_df <- data_df[!is.na(data_df$Cases),] # drop participants with a missing flag
    return(data_df)
}

count_by <- function(data_df, var, var_df){
    # Counts distinct participants per level of `var` and case status.
    #
    # data_df : data frame with person_id and Cases columns
    # var     : name(s) of the grouping column(s) found in `var_df`
    # var_df  : data frame with person_id and the `var` column(s)
    #
    # Returns the counts (n_pids) with a `numerator` column holding the number of distinct
    # participants that share the row's Cases value.

    lookup <- unique(var_df[c('person_id',var)])
    merged_df <- inner_join(data_df, lookup, by = 'person_id')

    counts_df <- merged_df %>%
                    dplyr::group_by(across(all_of(var)), Cases) %>%
                    dplyr::summarise('n_pids' = n_distinct(person_id))

    # Denominator per status, filled in one status at a time.
    counts_df <- cbind(counts_df, numerator=NA)
    for (status in c('Cases', 'Controls', 'NA')){
        counts_df$numerator[counts_df$Cases == status] <- n_distinct(merged_df$person_id[merged_df$Cases == status])
    }

    return(counts_df)
}

categorical_data_summary <- function(concept_id, name_of_file_in_bucket = 'pfhh_survey.csv'
                                     , datatype = 'Survey', map_concept_name = TRUE){
    # Plot participant-count summaries (overall, by sex at birth x ancestry, by age x ancestry)
    # for one categorical concept.
    # Function is for phecode, drug and survey data only (not measurements or physical measurements).
    #
    # Arguments:
    #   concept_id             : concept id (or concept code) passed to read_csv_cols_from_bucket()
    #                            and, optionally, looked up in `{dataset}.concept`.
    #   name_of_file_in_bucket : CSV in the workspace bucket that holds the concept data.
    #   datatype               : label used as title prefix (upper-cased); 'pfhh' (any case) adds a
    #                            note explaining what "Cases" means.
    #   map_concept_name       : when TRUE, append the concept name(s) found in the concept table
    #                            to the title.
    #
    # Side effects: copies the ancestry predictions to ./ancestry.tsv with gsutil and shows every
    # figure through View(). The function is called for its plots, not for a return value.
    # NOTE(review): the lookup query interpolates concept_id unquoted in `concept_id = ...`;
    # presumably callers pass numeric ids or set map_concept_name = FALSE -- confirm.

    system2('gsutil',args = c('cp','gs://fc-aou-preprod-datasets-controlled/v7/wgs/without_ext_aian_prod/vds/aux/ancestry/delta_v1_gt_no_ext_aian_gq0_prod.ancestry_preds.tsv','./ancestry.tsv'))

    # LOAD DATA
    ## Demographics Data
    demographics_df <- read_csv_from_bucket(name_of_file_in_bucket = 'demographics_table.csv')
    ## Ancestry Data
    ancestry_df = read_tsv('ancestry.tsv', col_types='ic-c-') %>% rename(person_id=research_id)
    ancestry_df$ancestry_pred = toupper(ancestry_df$ancestry_pred)
    ## Survey, drug or phecode Data
    data_df = read_csv_cols_from_bucket(name_of_file_in_bucket = name_of_file_in_bucket, concept_id = concept_id)

    # TRANSFORM DATA
    wrangled_df <- wrangle_cat_data(data_df)
    age_count_df <- count_by(wrangled_df, var= 'age_group', var_df = demographics_df)
    sex_count_df <- count_by(wrangled_df, var= 'sex_at_birth', var_df = demographics_df)
    ancestry_count_df <- count_by(wrangled_df, var= 'ancestry_pred', var_df = ancestry_df)
    sex_and_ancestry_counts_df <- count_by(wrangled_df, var= c('ancestry_pred','sex_at_birth')
                                           , var_df = inner_join(ancestry_df[c('person_id','ancestry_pred')]
                                                                 , demographics_df[c('person_id','sex_at_birth')]
                                                                 , by = 'person_id'))
    age_and_ancestry_counts_df <- count_by(wrangled_df, var= c('ancestry_pred','age_group')
                                           , var_df = inner_join(ancestry_df[c('person_id','ancestry_pred')]
                                                                 , demographics_df[c('person_id','age_group')]
                                                                 , by = 'person_id'))

    ############################################################################################
    # Build the title label: "<concept_id>" optionally followed by " - <concept name>".
    concept_name = ''
    if (map_concept_name == TRUE){
        Map <- download_data(str_glue("SELECT concept_name FROM `{dataset}.concept` \
                                        WHERE concept_id = {concept_id} or concept_code = '{concept_id}'"))
        if (nrow(Map) > 0){
            # The WHERE clause matches on id OR code, so several rows can come back; collapse
            # them into one string so every plot title stays a single value, not a vector.
            matched_names <- paste(unique(Map$concept_name), collapse = ' / ')
            concept_name <- str_glue(' - {matched_names}')
        }
    }
    concept_id = str_glue("{concept_id}{concept_name}")
    datatype = toupper(datatype)

    if (tolower(datatype) == 'pfhh'){
        concept_id = str_glue('{concept_id}\n(NB: Cases = Participants Who Reported Personally Having This Condition)')
    }

    ##################################################PLOT SUMMARIES#######################################################
    # OVERALL SUMMARIES BY AGE, SEX AT BIRTH AND ANCESTRY
    n = 1   # running figure number used in every title
    View(barplot(age_count_df, n_col = "n_pids", X = `age_group`, h = 6
                , plot_title= str_glue('{datatype} {concept_id}:\n\n\n\nOVERALL SUMMARIES BY AGE, SEX AT BIRTH AND ANCESTRY\n\n\nFigure {n}: Age at CDR')))
    n = n+1
    View(barplot(sex_count_df, n_col = "n_pids", X = `sex_at_birth`, plot_title= str_glue('\nFigure {n}: Sex at Birth')
                    , fill_palette = "Blues"))
    n = n+1
    View(barplot(ancestry_count_df, n_col = "n_pids", X = `ancestry_pred`, plot_title= str_glue('\nFigure {n}: Ancestry')
                     , fill_palette = "Greens"))


    # DETAILED SUMMARIES BY SEX AT BIRTH AND ANCESTRY
    # One figure per sex-at-birth value; only the first figure carries the section heading.
    # seq_along() makes the loop a no-op when there are no groups at all.
    sex_at_births = unique(sex_and_ancestry_counts_df$sex_at_birth)
    for (i in seq_along(sex_at_births)){
        n = n+1
        var = sex_at_births[i]
        df_var = sex_and_ancestry_counts_df[sex_and_ancestry_counts_df$sex_at_birth == var,]
        if (i == 1){
            sex_title = str_glue('DETAILED SUMMARIES BY SEX AT BIRTH AND ANCESTRY\n\n\nFigure {n}: {var} (Sex at Birth) by Ancestry')
        } else {
            sex_title = str_glue('\nFigure {n}: {var} by Ancestry')
        }
        View(barplot(df_var, n_col = "n_pids", X = `ancestry_pred`, Fill = `sex_at_birth`, fill_palette = "Purples"
                     , plot_title= sex_title
                     , facet_nrow = 1, facet_col = 2))
    }


    # DETAILED SUMMARIES BY AGE AND ANCESTRY
    # Same layout as above, one figure per age group.
    ages = unique(age_and_ancestry_counts_df$age_group)
    for (i in seq_along(ages)){
        n = n+1
        var = ages[i]
        df_var = age_and_ancestry_counts_df[age_and_ancestry_counts_df$age_group == var,]
        if (i == 1){
            age_title = str_glue('DETAILED SUMMARIES BY AGE AT CDR AND ANCESTRY\n\n\nFigure {n}: {var} Years Old by Ancestry')
        } else {
            age_title = str_glue('\nFigure {n}: {var} Years Old by Ancestry')
        }
        View(barplot(df_var, n_col = "n_pids", X = `ancestry_pred`, Fill = `age_group`, fill_palette = "Pastel2"
                     , plot_title= age_title
                     , facet_nrow = 1, facet_col = 2))
    }
}

## Save to Bucket

In [None]:
# Copy the R helper script from the working directory to the workspace bucket.
# NOTE(review): the %%writefile target visible at the top of this notebook is
# 'allxall_pm_summary_functions.R' -- confirm this filename matches the file actually written.
r_file = 'allxall_cat_data_summary_functions.R'
load_to_bucket(source_filename = r_file)

# `allxall_lab_summary_functions.py`

In [None]:
%%writefile allxall_lab_summary_functions.py

# NOTE: `%matplotlib inline` is an IPython magic and a SyntaxError inside a plain .py module,
# so it is deliberately not part of this file; run it in the notebook that uses these helpers.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from datetime import datetime
from google.cloud import storage

import subprocess

from plotnine import *
import os
dataset = os.getenv('WORKSPACE_CDR')
my_bucket = os.getenv('WORKSPACE_BUCKET')

################################## load data that all notebooks use ##########################################
def _read_csv_from_bucket(name_of_file_in_bucket, directory = 'notebooks/all_x_all'):
    """Copy one CSV from the workspace bucket into the working directory and load it with pandas.

    Defined here because this module must be importable on its own; the notebook-level
    `read_csv_from_bucket` helper is not available inside the written .py file.
    """
    subprocess.run(["gsutil", "cp", f"{my_bucket}/{directory}/{name_of_file_in_bucket}", "."], check=True)
    return pd.read_csv(name_of_file_in_bucket)


demographics_df = _read_csv_from_bucket(name_of_file_in_bucket = 'demographics_table.csv')

# pandas equivalent of the R call `read_tsv('ancestry.tsv', col_types='ic-c-')`:
# keep the 1st, 2nd and 4th columns and skip the 3rd and 5th.
# NOTE(review): assumes ./ancestry.tsv has already been copied locally -- confirm with callers.
ancestry_df = pd.read_csv('ancestry.tsv', sep='\t', usecols=[0, 1, 3]).rename(columns={'research_id': 'person_id'})
ancestry_df['ancestry_pred'] = ancestry_df['ancestry_pred'].str.upper()

#########################################################################

def load_to_bucket(source_filename, destination_blob_name = 'notebooks/all_x_all'):
    """Upload a local file (or directory) to the workspace bucket with ``gsutil cp -R``.

    Args:
        source_filename: path, relative to the working directory, of what to upload.
        destination_blob_name: folder inside the bucket that receives the upload.

    Returns:
        None. gsutil's stderr is printed, followed by a confirmation line when gsutil
        exits with code 0 or a failure line otherwise.
    """
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    destination = f"{destination_blob_name}/{source_filename}"

    args = ["gsutil", "cp", "-R", f"./{source_filename}", f"{my_bucket}/{destination}"]
    # text=True so stderr prints as readable text rather than a bytes repr.
    output = subprocess.run(args, capture_output=True, text=True)
    print(output.stderr)

    # Only claim success when gsutil actually succeeded.
    if output.returncode == 0:
        print(f'\n file in bucket at {destination}')
    else:
        print(f'\n upload FAILED (gsutil exit code {output.returncode}); nothing confirmed at {destination}')
    
    
def measurement_data(ancestor_cid):
    """Query the distinct EHR measurement rows that descend from the given ancestor concept(s).

    The query joins `measurement` to `measurement_ext` and `concept_ancestor` in the
    workspace CDR (`dataset`) and keeps rows whose `src_id` contains 'EHR'.

    Args:
        ancestor_cid: interpolated verbatim into ``ancestor_concept_id IN (...)``.

    Returns:
        The query result as a DataFrame. The elapsed query time and the number of
        distinct person_ids are printed.
    """
    started_at = datetime.now()

    query = f"""
            SELECT DISTINCT person_id
                        , measurement_concept_id
                        , measurement_source_concept_id, LOWER(measurement_source_value) AS measurement_source_value
                        , value_as_number, range_low, range_high
                        , LOWER(unit_source_value) AS unit_source_value , operator_concept_id
                        , src_id
                        , measurement_datetime
                        ---, MAX(measurement_date) AS most_recent_measurement_date

            FROM `{dataset}.measurement` m 
            JOIN `{dataset}.measurement_ext` m_ext ON m.measurement_id = m_ext.measurement_id
            JOIN `{dataset}.concept_ancestor` ON descendant_concept_id = measurement_concept_id
            AND ancestor_concept_id IN ({ancestor_cid})
            WHERE m_ext.src_id LIKE '%EHR%' --GROUP BY 1,2,3,4,5,6,7,8,9 
            
            """
    measurements = pd.read_gbq(query)

    # Report how long the query took and how many participants it covers.
    elapsed = datetime.now() - started_at
    print(elapsed)
    print(f'N Pids :{measurements.person_id.nunique()}')

    return measurements

def map_concept_names(ancestor_cid, merge = 'yes'):
    """Query the lower-cased concept, source-concept, unit and operator names for measurements.

    Reads `ds_measurement` joined to `concept_ancestor` in the workspace CDR (`dataset`).

    Args:
        ancestor_cid: interpolated verbatim into ``ancestor_concept_id IN (...)``.
        merge: accepted but not used by this function.

    Returns:
        DataFrame of distinct id/name combinations.
    """
    query = f"""
        SELECT DISTINCT measurement_concept_id, LOWER(STANDARD_CONCEPT_NAME) AS measurement_concept_name
        , measurement_source_concept_id, LOWER(SOURCE_CONCEPT_NAME) AS measurement_source_concept_name
        , unit_concept_id, LOWER(unit_concept_name) AS unit_concept_name
        , operator_concept_id, LOWER(operator_concept_name) AS operator_concept_name

        FROM `{dataset}.ds_measurement` me
        JOIN `{dataset}.concept_ancestor` ON descendant_concept_id = measurement_concept_id
        AND ancestor_concept_id IN ({ancestor_cid}) 
        """
    return pd.read_gbq(query)

def percentile(n):
    """Build an aggregation function that returns the n-th percentile of the non-NaN values.

    The returned function is named ``percentile_<n>`` so that several of them can be
    used side by side in one ``.agg([...])`` call.
    """
    def _percentile_ignoring_nan(x):
        not_missing = np.logical_not(np.isnan(x))
        return np.percentile(x[not_missing], n)

    _percentile_ignoring_nan.__name__ = 'percentile_%s' % n
    return _percentile_ignoring_nan


def boxplot(measurement_df, c_name, fill_col, n_row= None, n_col = 1, w = 10, h = 12, facet_col = None):
    """Box plot of `value_as_number` per `src_id`, coloured by `fill_col`.

    Args:
        measurement_df: frame with `src_id`, `value_as_number`, `fill_col` (and `facet_col` if given).
        c_name: measurement name inserted into the plot title.
        fill_col: column whose values (as a factor) set the box colour.
        n_row: facet rows; defaults to the number of distinct `facet_col` values.
        n_col: facet columns.
        w, h: figure width and height.
        facet_col: optional column to facet on (free y scales).

    Returns:
        The plotnine plot object.
    """
    mapping = aes(x='src_id', y='value_as_number', color =f'factor({fill_col})')
    plot = ggplot(measurement_df, mapping)
    plot = plot + geom_boxplot()
    plot = plot + labs(title = f"Comparing distributions of EHR {c_name} measures")
    plot = plot + theme(axis_text_x = element_text(angle = 90, hjust = 1))
    plot = plot + theme(figure_size=(w, h))

    if facet_col is None:
        return plot

    # One facet row per distinct value unless the caller fixed the row count.
    facet_rows = measurement_df[facet_col].nunique() if n_row is None else n_row
    return plot + facet_wrap(facet_col, ncol = n_col, nrow = facet_rows, scales = 'free_y')


def check_df(measurement_df):
    """Display and return two QC tables for a measurement frame.

    Args:
        measurement_df: frame with `person_id`, `measurement_concept_name`,
            `measurement_source_concept_name`, `unit_concept_name`,
            `operator_concept_name` and `value_as_number`.

    Returns:
        ``[qc_df1, qc_df2]`` where
        - qc_df1 counts distinct participants (`n_pids`) per combination of the name
          columns and a 'has value' / 'no value' flag, sorted ascending by count;
        - qc_df2 holds mean/median/min/max/std and the 1st/25th/75th/99th percentiles of
          the distinct values per measurement concept name and unit.
    """
    cols = ['measurement_concept_name', 'measurement_source_concept_name', 'unit_concept_name'
        , 'operator_concept_name', 'value_as_number']

    qc_df1 = measurement_df[['person_id']+cols].drop_duplicates()
    # Replace the whole column in one step: writing string labels into the numeric column
    # with .loc is an incompatible-dtype setitem that newer pandas versions reject.
    qc_df1['value_as_number'] = np.where(qc_df1['value_as_number'].notna(), 'has value', 'no value')
    qc_df1 = qc_df1.groupby(cols).nunique().sort_values('person_id')
    qc_df1.columns = ['n_pids']
    display(qc_df1)

    qc_df2 = measurement_df[['measurement_concept_name', 'value_as_number', 'unit_concept_name']].drop_duplicates()\
                .groupby(['measurement_concept_name', 'unit_concept_name'])\
                .agg({'value_as_number':['mean','median', 'min','max','std',percentile(1), percentile(25) , percentile(75), percentile(99)]})
    # Flatten the (column, statistic) MultiIndex down to the statistic names.
    qc_df2.columns = [c[1] for c in qc_df2.columns]
    qc_df2 = qc_df2.reset_index()
    display(qc_df2)

    return [qc_df1, qc_df2]


def standard_cleaning(measurement_df): 
    """Apply the cleaning steps shared by all measurements and return a new frame.

    Rows are kept only when `value_as_number` and `unit_concept_id` are present and
    `unit_concept_name` is not 'no matching concept' / 'no value'; the unit names listed
    in the harmonisation mapping are then replaced by their long form. The input frame
    is left unmodified. A description of the steps is printed.
    """
    print('''Standard Cleaning (for all measurements):\n 
- Drop rows with:
        - no value_as_number (can be done in SQL)
        - no units or 'no matching concept units' or units that do not make sense (e.g. 45666662)
 (This will most likely drop some EHR sites)\n 
- Harmonize units:
        - e.g. mg/dl = milligram per deciliter, Percent = percent''')
    
    harmonize_units_dd = {'mg/dl':'milligram per deciliter', 'Percent':'percent'
                          , 'meq/l':'milliequivalent per liter'}

    has_value_and_unit = measurement_df.value_as_number.notna() & measurement_df.unit_concept_id.notna()
    usable_unit_name = ~measurement_df.unit_concept_name.isin(['no matching concept', 'no value'])

    # .copy() gives an independent frame, so the column assignment below neither triggers
    # SettingWithCopyWarning nor risks writing through to the caller's data.
    measurement_clean_df = measurement_df[has_value_and_unit & usable_unit_name].copy()
    measurement_clean_df['unit_concept_name'] = measurement_clean_df['unit_concept_name'].replace(harmonize_units_dd)
    return measurement_clean_df

def drop_extreme_outliers(df, max_percentage_diff_threshold, max_percentile_threshold
                          , min_percentage_diff_threshold , min_percentile_threshold, c, u):
    """Drop the single largest and/or smallest measurement value when it is an extreme outlier.

    The largest value is dropped when ``round(max * 100 / second_largest)`` is at least
    `max_percentage_diff_threshold`; the smallest value is dropped when
    ``abs(round(min * 100 / second_smallest))`` is at least `min_percentage_diff_threshold`.
    Both ratios are computed on the distinct non-NaN values of the input frame.

    Args:
        df: frame with a `value_as_number` column (one concept / unit combination).
        max_percentage_diff_threshold: ratio (in %) at or above which the max is dropped.
        max_percentile_threshold: accepted for interface compatibility; not used.
        min_percentage_diff_threshold: ratio (in %) at or above which the min is dropped.
        min_percentile_threshold: accepted for interface compatibility; not used.
        c: concept name; accepted for interface compatibility; not used.
        u: unit name, shown in the "Dropping ..." message.

    Returns:
        A new DataFrame without the rows judged to be extreme outliers. When a value is
        dropped, rows with a missing `value_as_number` are removed too, because the
        filter is a strict comparison.
    """
    distinct_values = sorted(set(df.value_as_number.dropna()))

    # With fewer than two distinct values there is no runner-up to compare against
    # (this also covers an empty frame), so nothing can be called an outlier.
    if len(distinct_values) < 2:
        return df.copy()

    max_meas_value, max_values_2nd = distinct_values[-1], distinct_values[-2]
    min_meas_value, min_values_2nd = distinct_values[0], distinct_values[1]

    df_clean = df.copy()

    # Drop extremely high values.
    # A runner-up of 0 makes the ratio undefined; keep the data rather than divide by zero.
    if max_values_2nd != 0:
        diff_perc1 = round(max_meas_value*100/max_values_2nd)
        if diff_perc1 >= max_percentage_diff_threshold:
            print(f'Dropping {max_meas_value} ({u})')
            df_clean = df_clean[df_clean.value_as_number < max_meas_value]

    # Drop extremely low values (same guard for a zero runner-up).
    if min_values_2nd != 0:
        diff_perc2 = abs(round(min_meas_value*100/min_values_2nd))
        if diff_perc2 >= min_percentage_diff_threshold:
            print(f'Dropping {min_meas_value} ({u})')
            df_clean = df_clean[df_clean.value_as_number > min_meas_value]

    return df_clean

def drop_extreme_outliers_in_df(measurement_df, max_percentage_diff_threshold = 400, max_percentile_threshold = 99
                          , min_percentage_diff_threshold = 400, min_percentile_threshold = 1):
    """Run `drop_extreme_outliers` on every (measurement concept name, unit name) subset.

    Args:
        measurement_df: frame with `measurement_concept_name`, `unit_concept_name` and
            `value_as_number`.
        max_percentage_diff_threshold, max_percentile_threshold,
        min_percentage_diff_threshold, min_percentile_threshold: forwarded unchanged to
            `drop_extreme_outliers`.

    Returns:
        The cleaned subsets concatenated in order of first appearance, or an empty
        DataFrame when there is nothing to clean. Rows whose concept name or unit name
        is missing never match an equality filter and are therefore not part of the output.
    """
    # The "smaller" sentence reports min_percentage_diff_threshold (the ratio threshold),
    # mirroring the "larger" sentence above it.
    print(f'''Definition: Drop extreme outliers, which is defined as rows where:
    - the max measurement value is {max_percentage_diff_threshold}% larger than the maximum of all values at or below the {max_percentile_threshold} percentile. 
    This threshold can be adjusted by using max_percentage_diff_threshold = n. The percentile threshold can also be adjusted using max_percentile_threshold = n.
    
    - and/or the min measurement value is {min_percentage_diff_threshold}% smaller than the min of all values at or below the {min_percentile_threshold} percentile
    . This threshold can be adjusted by using min_percentage_diff_threshold = n. The percentile threshold can also be adjusted using min_percentile_threshold = n.\n\n\n''')

    # Collect the pieces and concatenate once instead of growing a frame inside the loop.
    cleaned_groups = []
    for c in measurement_df.measurement_concept_name.unique():
        DF1 = measurement_df[(measurement_df.measurement_concept_name == c)]
        for u in DF1.unit_concept_name.unique():
            DF2 = DF1[DF1.unit_concept_name ==u]
            # unique() also yields NaN, which matches no row; skip such empty subsets.
            if DF2.empty:
                continue
            cleaned_groups.append(
                drop_extreme_outliers(DF2, max_percentage_diff_threshold = max_percentage_diff_threshold
                                      , max_percentile_threshold = max_percentile_threshold
                                      , min_percentage_diff_threshold = min_percentage_diff_threshold
                                      , min_percentile_threshold = min_percentile_threshold, c= c, u = u))

    if not cleaned_groups:
        return pd.DataFrame()
    return pd.concat(cleaned_groups)

def final_data_output(clean_measurement_df):
    """Summarise cleaned measurements to one row per participant.

    Args:
        clean_measurement_df: frame with `person_id`, `value_as_number`,
            `measurement_datetime` and `unit_concept_name`.

    Returns:
        ``(df_final, df_units)`` where
        - df_final has, per person_id: min, median, max, mean and count of the non-missing
          values, the comma-joined distinct unit names (`unit(s)`) and `latest`, the value
          at the most recent `measurement_datetime`;
        - df_units is the person_id -> `unit(s)` lookup used for the merge.
    """
    base_cols = ['person_id', 'value_as_number']
    # Fresh 0..n-1 index so the labels returned by idxmax() identify exactly one row each.
    df = clean_measurement_df[base_cols+['measurement_datetime']].drop_duplicates().reset_index(drop = True)

    # Value recorded at each participant's most recent measurement time.
    df_latest = df.loc[df.groupby('person_id')['measurement_datetime'].idxmax()]
    df_latest = df_latest[base_cols].drop_duplicates()
    df_latest.columns = ['person_id','latest']
    df_latest = df_latest.reset_index(drop = True)

    df_units = clean_measurement_df[['person_id','unit_concept_name']].drop_duplicates()
    df_units['unit(s)'] = df_units.groupby('person_id')['unit_concept_name'].transform(lambda x: ', '.join(x.unique()))
    # `columns=` keyword: the positional axis argument of drop() no longer exists in pandas 2.
    df_units = df_units.drop(columns = ['unit_concept_name'])
    df_units = df_units.drop_duplicates()

    df_stats = clean_measurement_df[base_cols]
    # Keep the filter result (it used to be computed and thrown away), so participants
    # without any non-missing value do not produce an all-NaN statistics row.
    df_stats = df_stats[df_stats.value_as_number.notna()]
    df_stats = df_stats.groupby('person_id')\
                                .agg({'value_as_number':['min','median','max','mean', 'count']})
    df_stats.columns = [c[1] for c in df_stats.columns]
    df_stats = df_stats.reset_index()

    df_final = df_stats.merge(df_units, on = 'person_id').merge(df_latest, on = 'person_id').drop_duplicates()
    return df_final,df_units

## Upload file to bucket

In [None]:
# Upload a helper script from the working directory to the workspace bucket.
# NOTE(review): the cell above writes 'allxall_lab_summary_functions.py', but the file
# uploaded here is the R script 'allxall_summary_functions.R' -- confirm which file is intended.
r_file = 'allxall_summary_functions.R'
load_to_bucket(source_filename = r_file)

# Upload individual per-concept-ID CSV files to the bucket

In [None]:
pfhh_survey_table = read_csv_from_bucket(name_of_file_in_bucket = 'pfhh_survey_table.csv')

# Write one CSV per survey column (person_id + that column, de-duplicated) and upload it.
# `columns=` keyword: the positional axis argument of drop() no longer exists in pandas 2.
for c in pfhh_survey_table.drop(columns = 'person_id').columns:
    df = pfhh_survey_table[['person_id',c]].drop_duplicates()
    question_filename = f'pfhh_survey_{c}.csv'
    df.to_csv(question_filename, index = False)
    load_to_bucket(source_filename = question_filename)

In [None]:
# Preview a per-question filename. `c` is not defined in this cell: it is whatever value the
# most recently run `for c in ...` loop left behind, so this cell fails on a fresh kernel.
survey_question_filename = f'pfhh_survey_{c}.csv'
survey_question_filename

----------------

In [None]:
basefilename = 'drug'
df_table = read_csv_from_bucket(name_of_file_in_bucket = 'r_drug_table.csv')

# Write one CSV per drug column (person_id + that column, de-duplicated) and upload it.
# `columns=` keyword: the positional axis argument of drop() no longer exists in pandas 2.
for c in df_table.drop(columns = 'person_id').columns:
    df = df_table[['person_id',c]].drop_duplicates()
    # Built from `basefilename` so the prefix is defined in exactly one place.
    question_filename = f'{basefilename}_{c}.csv'
    df.to_csv(question_filename, index = False)
    load_to_bucket(source_filename = question_filename)

In [None]:
# import os
# import pandas as pd
# import dask.dataframe as dd

# #df_table = pd.read_csv('r_drug_table.csv') 
# #df_table = dd.read_csv('mcc2_phecode_table.csv')
# for c in df_table.drop('person_id',1).columns:
#     #question_filename = f'mcc2_phecode_{c}.csv'
#     nb_filename = f'phecode_{c}_summary.ipynb'
#     #os.remove(f"{question_filename}")
#     os.remove(f"{nb_filename}")
#     print(f"{question_filename} deleted.")
#     print(f"{nb_filename} deleted.")

In [None]:
# Lazily open the local copy of the drug table and show its first rows.
# NOTE(review): `dd` is only imported in the commented-out cell above
# (`import dask.dataframe as dd`), so this cell raises NameError on a fresh kernel unless
# that import is run somewhere not visible here -- confirm.
df_drug = dd.read_csv('r_drug_table.csv')
df_drug.head()

In [None]:
def generate_notebook(base_filename, phecode):
    """Write the R summary code for one phecode to ``<base_filename>_<phecode>_summary.ipynb``.

    Args:
        base_filename: prefix of the output file name.
        phecode: phecode substituted into the ``phecode_summary(...)`` call of the template.

    Returns:
        None. The file is (re)written in the working directory and a confirmation is printed.

    NOTE(review): the file content is the raw R source text, not nbformat JSON -- confirm
    that whatever consumes the file expects plain text under the .ipynb name.
    """
    code_template = ''' 
        ################################################## CODE ##############################################

        # Set Up

        ## Loading packages and custom functions for AllxAll Phenotypes Summaries

        source_code_filename <- 'allxall_summary_functions.R'
        system(paste0('gsutil cp ', Sys.getenv('WORKSPACE_BUCKET'), '/notebooks/all_x_all/', source_code_filename,  ' ./'), intern=T)
        system2('gsutil',args = c('cp','gs://fc-aou-preprod-datasets-controlled/v7/wgs/without_ext_aian_prod/vds/aux/ancestry/delta_v1_gt_no_ext_aian_gq0_prod.ancestry_preds.tsv','./ancestry.tsv'))
        source(source_code_filename)

        # ################################################## SUMMARY ##############################################
        
        phecode_summary(phecode = '{phecode}')

        '''
    # The template is a plain string, so the placeholder has to be filled in explicitly;
    # str.replace keeps working even if the R code later gains literal braces.
    code = code_template.replace('{phecode}', str(phecode))

    notebook_name = f"{base_filename}_{phecode}_summary"
    # 'w' rather than 'a': re-running must regenerate the file, not append a second copy.
    with open(f"{notebook_name}.ipynb", mode='w') as file:
        file.write(code)

    print(f'{notebook_name}.ipynb saved.')

In [None]:
# Example run: writes 'phecode_1830_summary.ipynb' into the working directory.
generate_notebook(base_filename = 'phecode', phecode = '1830')

----------