In [None]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np
import random
from itertools import chain
import hail as hl
from hail.linalg import BlockMatrix
import statsmodels.api as sm

In [None]:
# Workspace settings are taken from environment variables. A variable that is
# not set yields None here (no exception is raised).
DATASET = os.environ.get('WORKSPACE_CDR')
bucket = os.environ.get('WORKSPACE_BUCKET')

In [6]:
# Read the matrix table stored under ArrayData/ in the workspace bucket and
# key its rows by locus only, so later joins can index rows with a bare locus.
array_mt_path = f"{bucket}/ArrayData/Array_GT_QCed.mt"
mt_array = hl.read_matrix_table(array_mt_path).key_rows_by("locus")

In [4]:
def _attach_phenotypes(pheno_path, mt):
    """Import a phenotype TSV and attach its columns to matching samples.

    The table is imported with type imputation (empty strings are treated as
    missing, ``person_id`` is forced to ``str``) and keyed by ``person_id``.
    ``mt`` is then restricted to the columns whose key is present in the table,
    and every non-key field of the table is added as a column field, looked up
    through the matrix table's ``s`` column field.

    Returns the keyed phenotype table and the annotated matrix table.
    """
    pheno_ht = hl.import_table(
        pheno_path,
        missing='',
        impute=True,
        types={"person_id": "str"},
    ).key_by("person_id")
    matched = mt.semi_join_cols(pheno_ht)
    matched = matched.annotate_cols(**pheno_ht[matched.s])
    return pheno_ht, matched


# Quantitative phenotypes.
Sample_quant, mt_array_quant = _attach_phenotypes(f"{bucket}/Pheno/quant_all.tsv", mt_array)

# Binary phenotypes.
Sample_binary, mt_array_binary = _attach_phenotypes(f"{bucket}/Pheno/binary_all.tsv", mt_array)

2023-04-05 20:21:11.016 Hail: INFO: wrote table with 91490 rows in 1 partition to /tmp/persist_tablenjd78OgF2r
2023-04-05 20:21:12.732 Hail: INFO: Reading table to impute column types
2023-04-05 20:21:21.179 Hail: INFO: Loading <StructExpression of type struct{person_id: str, Age: int32, is_sex_Male: int32, is_sex_Female: int32, is_White: int32, is_Black_or_African_American: int32, is_Native_Hawaiian_or_Other_Pacific_Islander: int32, is_Asian: int32, is_Middle_Eastern_or_North_African: int32, is_gender_Male: int32, is_gender_Female: int32, is_Hispanic: int32, is_anc_pred_eur: int32, is_anc_pred_amr: int32, is_anc_pred_afr: int32, is_anc_pred_sas: int32, is_anc_pred_eas: int32, is_anc_pred_mid: int32, PC1: float64, PC2: float64, PC3: float64, PC4: float64, PC5: float64, PC6: float64, PC7: float64, PC8: float64, PC9: float64, PC10: float64, PC11: float64, PC12: float64, PC13: float64, PC14: float64, PC15: float64, PC16: float64, BMI: float64, Height: float64, DBP: float64, SBP: float64, 

In [5]:
def SumStats_Wrangle(sumstats_file, phenotype):
    """Import a summary-statistics TSV and add p-value-thresholded betas.

    Parameters
    ----------
    sumstats_file : str
        Path of the tab-separated file handed to ``hl.import_table``.
    phenotype : str
        Either ``"quant"`` or ``"binary"``. Quantitative files carry ``*_hq``
        column names, which are renamed to the names used by binary files so
        that the rest of the function handles both layouts identically.

    Returns
    -------
    hl.Table
        Table keyed by ``locus`` (the string column parsed against GRCh38)
        with ten added fields ``beta_thresh1`` .. ``beta_thresh10``.
        ``beta_thresh1`` is ``beta_meta_fix_ref_alt`` unchanged; the others
        keep the beta only where ``pval_meta`` is below the cutoff and are 0
        otherwise.

    Raises
    ------
    ValueError
        If ``phenotype`` is neither ``"quant"`` nor ``"binary"``.
    """
    # Validate first: previously an unknown phenotype fell through both
    # branches and crashed later with an UnboundLocalError on `sumstats`.
    if phenotype not in ("quant", "binary"):
        raise ValueError(
            f"phenotype must be 'quant' or 'binary', got {phenotype!r}"
        )

    ## read hail table; change key
    if phenotype == "quant":
        sumstats = hl.import_table(sumstats_file,
                                   types={"is_negative_strand": "bool",
                                          "af_meta_hq": "float",
                                          "beta_meta_hq": "float",
                                          "beta_meta_hq_fix_ref_alt": "float",
                                          "se_meta_hq": "float",
                                          "neglog10_pval_meta_hq": "float",
                                          "pval_meta_hq": "float"})

        # Harmonise the "_hq" column names with the binary layout.
        sumstats = sumstats.rename({'af_meta_hq': 'af_meta',
                                    'beta_meta_hq': 'beta_meta',
                                    'beta_meta_hq_fix_ref_alt': 'beta_meta_fix_ref_alt',
                                    'se_meta_hq': 'se_meta',
                                    'neglog10_pval_meta_hq': 'neglog10_pval_meta',
                                    'pval_meta_hq': 'pval_meta'})
    else:
        sumstats = hl.import_table(sumstats_file,
                                   types={"is_negative_strand": "bool",
                                          "beta_meta": "float",
                                          "beta_meta_fix_ref_alt": "float",
                                          "se_meta": "float",
                                          "neglog10_pval_meta": "float",
                                          "pval_meta": "float"})

    sumstats = sumstats.annotate(locus=hl.parse_locus(sumstats.locus, reference_genome='GRCh38'))
    sumstats = sumstats.key_by("locus")

    ## add beta with different thresholds
    # Cutoffs for beta_thresh2 .. beta_thresh10, in that order.
    pval_cutoffs = (0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 5e-8)
    beta = sumstats.beta_meta_fix_ref_alt
    thresholded = {"beta_thresh1": beta}
    for idx, cutoff in enumerate(pval_cutoffs, start=2):
        thresholded[f"beta_thresh{idx}"] = hl.if_else(sumstats.pval_meta < cutoff, beta, 0)
    sumstats = sumstats.annotate(**thresholded)
    return sumstats

In [6]:
def _load_clumped_sumstats(trait, phenotype):
    """Wrangle one clumped summary-statistics file and checkpoint the result.

    ``trait`` selects both the input TSV under ``Sumstats_clumped/`` and the
    checkpoint path under ``hail_checkpoints/``; ``phenotype`` is forwarded to
    ``SumStats_Wrangle``. Returns the table read back from the checkpoint.
    """
    wrangled = SumStats_Wrangle(
        f'{bucket}/Sumstats_clumped/Array_{trait}_QCed_clumped.tsv', phenotype)
    # overwrite=True: the table is always rebuilt from the TSV, so replacing an
    # existing checkpoint is safe, and without it the cell fails on any re-run
    # once the checkpoint already exists.
    return wrangled.checkpoint(
        f"{bucket}/hail_checkpoints/{trait}_sumstats_checkpoint.ht", overwrite=True)


######### quantitative sumstats #########
DBP_sumstats = _load_clumped_sumstats("DBP", "quant")
HDL_sumstats = _load_clumped_sumstats("HDL", "quant")
Height_sumstats = _load_clumped_sumstats("Height", "quant")
RBC_sumstats = _load_clumped_sumstats("RBC", "quant")
TC_sumstats = _load_clumped_sumstats("TC", "quant")
leukocyte_sumstats = _load_clumped_sumstats("leukocyte", "quant")


######### binary sumstats #########
T2D_sumstats = _load_clumped_sumstats("T2D", "binary")
Asthma_sumstats = _load_clumped_sumstats("Asthma", "binary")
Breast_Cancer_sumstats = _load_clumped_sumstats("Breast_Cancer", "binary")
Colorectal_Cancer_sumstats = _load_clumped_sumstats("Colorectal_Cancer", "binary")

2023-04-05 20:21:30.053 Hail: INFO: wrote table with 261859 rows in 1 partition to /tmp/persist_tablenC0kn3wZm7
2023-04-05 20:21:30.863 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type str (not specified)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type bool (user-supplied)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type float64 (user-supplied)
  Loading field 'beta_meta_hq' as type float64 (user-supplied)
  Loading field 'beta_meta_hq_fix_ref_alt' as type float64 (user-supplied)
  Loading field 'se_meta_hq' a

In [8]:
# Row annotations to add, in order: field name -> locus-keyed sumstats table.
_quant_row_tables = {
    "DBP_sumstats": DBP_sumstats,
    "HDL_sumstats": HDL_sumstats,
    "Height_sumstats": Height_sumstats,
    "RBC_sumstats": RBC_sumstats,
    "TC_sumstats": TC_sumstats,
    "leukocyte_sumstats": leukocyte_sumstats,
}
_binary_row_tables = {
    "T2D_sumstats": T2D_sumstats,
    "Asthma_sumstats": Asthma_sumstats,
    "Breast_Cancer_sumstats": Breast_Cancer_sumstats,
    "Colorectal_Cancer_sumstats": Colorectal_Cancer_sumstats,
}

# Each variant row receives, per trait, the struct of that trait's sumstats
# row found at the same locus.
mt_array_quant = mt_array_quant.annotate_rows(
    **{field: table[mt_array_quant.locus] for field, table in _quant_row_tables.items()}
)
mt_array_binary = mt_array_binary.annotate_rows(
    **{field: table[mt_array_binary.locus] for field, table in _binary_row_tables.items()}
)

# Write the annotated matrix tables out and continue from the stored copies.
# NOTE(review): checkpoint() is called with its default overwrite setting, so
# this presumably errors if the path already exists - confirm that is intended.
mt_array_quant = mt_array_quant.checkpoint(f"{bucket}/hail_checkpoints/mt_array_quant_checkpoint1.mt")
mt_array_binary = mt_array_binary.checkpoint(f"{bucket}/hail_checkpoints/mt_array_binary_checkpoint1.mt")

2023-04-05 20:28:19.980 Hail: INFO: wrote matrix table with 975876 rows and 91489 columns in 74 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/hail_checkpoints/mt_array_quant_checkpoint1.mt
2023-04-05 20:32:02.525 Hail: INFO: wrote matrix table with 975876 rows and 91630 columns in 74 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/hail_checkpoints/mt_array_binary_checkpoint1.mt


In [9]:
# For every quantitative trait, add a column struct <trait>_pgs holding ten
# scores pgs1 .. pgs10. Score k is the per-sample sum over variants of
# <trait>_sumstats.beta_thresh<k> * GT.
# NOTE(review): GT is multiplied directly by the beta, so it is presumably
# stored as a numeric allele count in this matrix table - confirm.
_quant_traits = ("DBP", "HDL", "Height", "RBC", "TC", "leukocyte")

mt_array_quant = mt_array_quant.annotate_cols(**{
    f"{trait}_pgs": hl.struct(**{
        f"pgs{k}": hl.agg.sum(
            mt_array_quant[f"{trait}_sumstats"][f"beta_thresh{k}"] * mt_array_quant.GT
        )
        for k in range(1, 11)
    })
    for trait in _quant_traits
})

In [11]:
# For every binary trait, add a column struct <trait>_pgs holding ten scores
# pgs1 .. pgs10. Score k is the per-sample sum over variants of
# <trait>_sumstats.beta_thresh<k> * GT.
# NOTE(review): GT is multiplied directly by the beta, so it is presumably
# stored as a numeric allele count in this matrix table - confirm.
_binary_traits = ("T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer")

mt_array_binary = mt_array_binary.annotate_cols(**{
    f"{trait}_pgs": hl.struct(**{
        f"pgs{k}": hl.agg.sum(
            mt_array_binary[f"{trait}_sumstats"][f"beta_thresh{k}"] * mt_array_binary.GT
        )
        for k in range(1, 11)
    })
    for trait in _binary_traits
})

In [12]:
def export_Scores(mt, pheno, mt_type, method):
    """Export one trait's covariates, phenotype and scores as a text file.

    Parameters
    ----------
    mt : hl.MatrixTable
        Matrix table whose column fields include ``Age``, ``is_sex_Male``,
        ``is_sex_Female``, ``PC1`` .. ``PC16``, ``<pheno>`` and the struct
        ``<pheno>_pgs``.
    pheno : str
        Trait name; selects the phenotype column and its ``_pgs`` struct.
    mt_type : str
        Sub-directory of ``Scores/`` in the workspace bucket.
    method : str
        Suffix of the output file name.

    The file is written to ``{bucket}/Scores/<mt_type>/<pheno>_<method>.bgz``.
    Nothing is returned.
    """
    destination = f"{bucket}/Scores/{mt_type}/{pheno}_{method}.bgz"
    pgs_field = pheno + "_pgs"
    covariates = ["Age", "is_sex_Male", "is_sex_Female"] + [f"PC{i}" for i in range(1, 17)]

    scores = mt.cols().select(*covariates, pgs_field, pheno)
    # Lift the members of the score struct to top-level columns, then drop the
    # now-redundant struct itself.
    scores = scores.annotate(**scores[pgs_field]).drop(pgs_field)
    scores.export(destination)

In [None]:
# Export the clumping-based scores of every trait, quantitative traits first.
for trait in ("DBP", "HDL", "Height", "RBC", "TC", "leukocyte"):
    export_Scores(mt_array_quant, trait, "Array", "clump")

for trait in ("T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"):
    export_Scores(mt_array_binary, trait, "Array", "clump")

2023-04-05 20:32:33.481 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2023-04-05 20:38:27.388 Hail: INFO: Coerced sorted dataset        (10 + 6) / 16]
2023-04-05 20:38:31.080 Hail: INFO: merging 17 files totalling 10.0M... 1) / 16]
2023-04-05 20:38:31.455 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/DBP_clump.bgz
  merge time: 373.620ms
2023-04-05 20:44:06.327 Hail: INFO: Coerced sorted dataset=>      (14 + 2) / 16]
2023-04-05 20:44:09.544 Hail: INFO: merging 17 files totalling 9.8M...+ 1) / 16]
2023-04-05 20:44:09.839 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/HDL_clump.bgz
  merge time: 294.928ms
2023-04-05 20:49:36.078 Hail: INFO: Coerced sorted dataset====>   (15 + 1) / 16]
2023-04-05 20:49:38.262 Hail: INFO: merging 17 files totalling 10.0M... 1) / 16]
2023-04-05 20:49:38.612

In [None]:
!gsutil mv f'{bucket}/Scores/Array/DBP_clump.bgz' f'{bucket}/Scores/Array/DBP_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/HDL_clump.bgz' f'{bucket}/Scores/Array/HDL_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/Height_clump.bgz' f'{bucket}/Scores/Array/Height_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/RBC_clump.bgz' f'{bucket}/Scores/Array/RBC_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/TC_clump.bgz' f'{bucket}/Scores/Array/TC_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/leukocyte_clump.bgz' f'{bucket}/Scores/Array/leukocyte_clump.gz'

!gsutil mv f'{bucket}/Scores/Array/T2D_clump.bgz' f'{bucket}/Scores/Array/T2D_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/Asthma_clump.bgz' f'{bucket}/Scores/Array/Asthma_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/Breast_Cancer_clump.bgz' f'{bucket}/Scores/Array/Breast_Cancer_clump.gz'
!gsutil mv f'{bucket}/Scores/Array/Colorectal_Cancer_clump.bgz' f'{bucket}/Scores/Array/Colorectal_Cancer_clump.gz'