In [None]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np
import random
from itertools import chain
import hail as hl
from hail.linalg import BlockMatrix

In [2]:
# Workspace-level settings injected by the Researcher Workbench environment.
# Both lookups yield None when the variable is not set.
DATASET = os.environ.get('WORKSPACE_CDR')    # not referenced in the cells shown below
bucket = os.environ.get('WORKSPACE_BUCKET')  # prefix for every table/file path used below

## QC

1. Liftover, key the sumstats by locus  
2. Restrict to biallelic variants  
3. Negative strand problems  
4. Annotate to matrix table, and swap ref:alt if necessary  

## Liftover, then key the sumstats by locus (done in Step2a)

rg37 = hl.get_reference('GRCh37')  
rg38 = hl.get_reference('GRCh38')   
rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)  

In [4]:
# QC-passed WGS variants (8,996,707 biallelic SNPs after QC), keyed by locus only
# so that the summary statistics can later be joined by position.
var_wgs = hl.read_table(f"{bucket}/WGSData/WGS_Vars_QCed.ht").key_by("locus")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-c.c.terra-vpc-sc-fd39b54c.internal:38207
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241117-0608-0.2.130.post1-c69cd67afb8b.log


### Quantitative traits

Use `beta_meta_hq`, `se_meta_hq`, `neglog10_pval_meta_hq`

Add `var_wgs = var_wgs.repartition(2000)` and `var_wgs = var_wgs.checkpoint(ht_filename_check, overwrite=True)`.

In [6]:
# Step2a_Array_SumStats_QC.ipynb performed some preliminary QC for sumstats
# The intermediate file is saved as checkpoint
# start from the checkpoint files

def sumstats_QC_quant(ht_filename_in, ht_filename_check, filename_out, var_wgs,
                      beta_field="beta_meta_hq", se_field="se_meta_hq",
                      neglog10_pval_field="neglog10_pval_meta_hq",
                      n_partitions=2000):
    """Align checkpointed summary statistics to the WGS variants and export them.

    The summary-statistics table is joined onto ``var_wgs`` by locus, rows whose
    effect size, standard error or -log10 p-value is NaN are removed, and the
    effect size is re-oriented to the WGS ref/alt alleles.

    Parameters
    ----------
    ht_filename_in : str
        Path of the Hail table checkpointed by Step2a_Array_SumStats_QC.ipynb.
        Besides the three statistic fields, the code reads the row fields
        ``rsid``, ``ref``, ``alt``, ``is_negative_strand`` and
        ``alleles_fix_neg_strand`` from it.
    ht_filename_check : str
        Path where the joined table is checkpointed (overwritten if present).
    filename_out : str
        Path of the exported text file.
    var_wgs : hl.Table
        WGS variant table with ``locus`` and ``alleles`` fields, indexable by locus.
    beta_field, se_field, neglog10_pval_field : str, optional
        Names of the summary-statistic fields to use. The defaults are the
        ``*_meta_hq`` fields used for quantitative traits.
    n_partitions : int, optional
        Number of partitions ``var_wgs`` is repartitioned to before the join.

    Returns
    -------
    None
        Results are written to ``ht_filename_check`` and ``filename_out``.
    """
    sumstats = hl.read_table(ht_filename_in)

    # Field pruning, the biallelic-SNP restriction and the negative-strand fix
    # were already applied in Step2a_Array_SumStats_QC.ipynb; this function
    # starts from that checkpoint.

    # Work on a local handle so the caller's parameter is never rebound.
    wgs = var_wgs.repartition(n_partitions)

    ############## check ref:alt, flip if necessary ############
    # Pull every sumstats row field onto the WGS row that shares its locus.
    wgs = wgs.annotate(**sumstats[wgs.locus])

    # NOTE(review): Table.filter also drops rows whose predicate is missing, so
    # WGS variants with no sumstats match are presumably removed here as well.
    wgs = wgs.filter((~hl.is_nan(wgs[beta_field])) &
                     (~hl.is_nan(wgs[se_field])) &
                     (~hl.is_nan(wgs[neglog10_pval_field])))

    wgs_ref, wgs_alt = wgs.alleles[0], wgs.alleles[1]
    ss_ref, ss_alt = wgs.alleles_fix_neg_strand[0], wgs.alleles_fix_neg_strand[1]
    same_orientation = (wgs_ref == ss_ref) & (wgs_alt == ss_alt)
    swapped_orientation = (wgs_ref == ss_alt) & (wgs_alt == ss_ref)

    # Same ref/alt -> keep beta; swapped ref/alt -> negate beta; any other
    # allele combination cannot be reconciled and is marked NaN (row is kept).
    wgs = wgs.annotate(beta_meta_fix_ref_alt = hl.case()
        .when(same_orientation, wgs[beta_field])
        .when(swapped_orientation, -wgs[beta_field])
        .default(float('nan')))

    wgs = wgs.checkpoint(ht_filename_check, overwrite=True)

    # Output column names are fixed regardless of which input fields were used.
    sumstats_QCed = wgs.select(
                        rsid = wgs.rsid,
                        alleles1_wgs = wgs.alleles[0],
                        alleles2_wgs = wgs.alleles[1],
                        alleles1_sumstats_original = wgs.ref,
                        alleles2_sumstats_original = wgs.alt,
                        is_negative_strand = wgs.is_negative_strand,
                        alleles1_sumstats_fixstrand = wgs.alleles_fix_neg_strand[0],
                        alleles2_sumstats_fixstrand = wgs.alleles_fix_neg_strand[1],
                        beta_meta = wgs[beta_field],
                        beta_meta_fix_ref_alt = wgs.beta_meta_fix_ref_alt,
                        se_meta = wgs[se_field],
                        neglog10_pval_meta = wgs[neglog10_pval_field])

    sumstats_QCed.export(filename_out)

In [7]:
# Quantitative traits to process: (label, file stem of the Step2a checkpoint).
# The label is used both in the progress line and in the exported file name.
quant_traits = [
    ("Height", "continuous-50-both_sexes-irnt"),
    ("DBP", "continuous-DBP-both_sexes-auto_medadj_irnt"),
    ("HDL", "biomarkers-30760-both_sexes-irnt"),
    ("TC", "biomarkers-30690-both_sexes-irnt"),
    ("RBC", "continuous-30010-both_sexes-irnt"),
    ("leukocyte", "continuous-30000-both_sexes-irnt"),
]

for trait_label, file_stem in quant_traits:
    # Timestamp each trait so the per-trait runtime can be read off the output.
    print(f"{trait_label} {datetime.now()}")
    sumstats_QC_quant(f"{bucket}/Sumstats/{file_stem}_checkpoint.ht",
                      f"{bucket}/Sumstats/{file_stem}_checkpoint2.ht",
                      f"{bucket}/Sumstats/WGS_{trait_label}_QCed.tsv.bgz",
                      var_wgs)

Height 2024-10-05 02:43:03.270038


2024-10-05 02:47:47.457 Hail: INFO: wrote table with 8604715 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/continuous-50-both_sexes-irnt_checkpoint2.ht
2024-10-05 02:47:57.615 Hail: INFO: merging 2001 files totalling 143.6M... 2000]
2024-10-05 02:48:05.689 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_Height_QCed.tsv.bgz
  merge time: 8.073s


DBP 2024-10-05 02:48:08.169490


2024-10-05 02:51:28.521 Hail: INFO: wrote table with 8503485 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/continuous-DBP-both_sexes-auto_medadj_irnt_checkpoint2.ht
2024-10-05 02:51:35.637 Hail: INFO: merging 2001 files totalling 141.9M... 2000]
2024-10-05 02:51:42.472 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_DBP_QCed.tsv.bgz
  merge time: 6.835s


HDL 2024-10-05 02:51:44.884103


2024-10-05 02:54:43.441 Hail: INFO: wrote table with 8484537 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/biomarkers-30760-both_sexes-irnt_checkpoint2.ht
2024-10-05 02:54:50.455 Hail: INFO: merging 2001 files totalling 141.8M... 2000]
2024-10-05 02:54:56.433 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_HDL_QCed.tsv.bgz
  merge time: 5.978s


TC 2024-10-05 02:54:58.678035


2024-10-05 02:57:51.717 Hail: INFO: wrote table with 8506131 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/biomarkers-30690-both_sexes-irnt_checkpoint2.ht
2024-10-05 02:57:58.627 Hail: INFO: merging 2001 files totalling 142.0M... 2000]
2024-10-05 02:58:04.243 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_TC_QCed.tsv.bgz
  merge time: 5.615s


RBC 2024-10-05 02:58:06.746817


2024-10-05 03:00:51.570 Hail: INFO: wrote table with 8533326 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/continuous-30010-both_sexes-irnt_checkpoint2.ht
2024-10-05 03:00:58.495 Hail: INFO: merging 2001 files totalling 142.5M... 2000]
2024-10-05 03:01:03.943 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_RBC_QCed.tsv.bgz
  merge time: 5.448s


leukocyte 2024-10-05 03:01:06.352666


2024-10-05 03:38:33.782 Hail: INFO: wrote table with 8509546 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/continuous-30000-both_sexes-irnt_checkpoint2.ht
2024-10-05 03:38:49.985 Hail: INFO: merging 2001 files totalling 141.7M... 2000]
2024-10-05 03:38:56.647 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_leukocyte_QCed.tsv.bgz
  merge time: 6.662s


### Binary traits

Use `beta_meta`, `se_meta`, `neglog10_pval_meta`

Add `var_wgs = var_wgs.repartition(2000)` and `var_wgs = var_wgs.checkpoint(ht_filename_check, overwrite=True)`.

In [5]:
# Step2a_Array_SumStats_QC.ipynb performed some preliminary QC for sumstats
# The intermediate file is saved as checkpoint
# start from the checkpoint files

def sumstats_QC_binary(ht_filename_in, ht_filename_check, filename_out, var_wgs,
                       beta_field="beta_meta", se_field="se_meta",
                       neglog10_pval_field="neglog10_pval_meta",
                       n_partitions=2000):
    """Align checkpointed summary statistics to the WGS variants and export them.

    The summary-statistics table is joined onto ``var_wgs`` by locus, rows whose
    effect size, standard error or -log10 p-value is NaN are removed, and the
    effect size is re-oriented to the WGS ref/alt alleles.

    Parameters
    ----------
    ht_filename_in : str
        Path of the Hail table checkpointed by Step2a_Array_SumStats_QC.ipynb.
        Besides the three statistic fields, the code reads the row fields
        ``rsid``, ``ref``, ``alt``, ``is_negative_strand`` and
        ``alleles_fix_neg_strand`` from it.
    ht_filename_check : str
        Path where the joined table is checkpointed (overwritten if present).
    filename_out : str
        Path of the exported text file.
    var_wgs : hl.Table
        WGS variant table with ``locus`` and ``alleles`` fields, indexable by locus.
    beta_field, se_field, neglog10_pval_field : str, optional
        Names of the summary-statistic fields to use. The defaults are the
        ``*_meta`` fields used for binary traits.
    n_partitions : int, optional
        Number of partitions ``var_wgs`` is repartitioned to before the join.

    Returns
    -------
    None
        Results are written to ``ht_filename_check`` and ``filename_out``.
    """
    sumstats = hl.read_table(ht_filename_in)

    # Field pruning, the biallelic-SNP restriction and the negative-strand fix
    # were already applied in Step2a_Array_SumStats_QC.ipynb; this function
    # starts from that checkpoint.

    # Work on a local handle so the caller's parameter is never rebound.
    wgs = var_wgs.repartition(n_partitions)

    ############## check ref:alt, flip if necessary ############
    # Pull every sumstats row field onto the WGS row that shares its locus.
    wgs = wgs.annotate(**sumstats[wgs.locus])

    # NOTE(review): Table.filter also drops rows whose predicate is missing, so
    # WGS variants with no sumstats match are presumably removed here as well.
    wgs = wgs.filter((~hl.is_nan(wgs[beta_field])) &
                     (~hl.is_nan(wgs[se_field])) &
                     (~hl.is_nan(wgs[neglog10_pval_field])))

    wgs_ref, wgs_alt = wgs.alleles[0], wgs.alleles[1]
    ss_ref, ss_alt = wgs.alleles_fix_neg_strand[0], wgs.alleles_fix_neg_strand[1]
    same_orientation = (wgs_ref == ss_ref) & (wgs_alt == ss_alt)
    swapped_orientation = (wgs_ref == ss_alt) & (wgs_alt == ss_ref)

    # Same ref/alt -> keep beta; swapped ref/alt -> negate beta; any other
    # allele combination cannot be reconciled and is marked NaN (row is kept).
    wgs = wgs.annotate(beta_meta_fix_ref_alt = hl.case()
        .when(same_orientation, wgs[beta_field])
        .when(swapped_orientation, -wgs[beta_field])
        .default(float('nan')))

    wgs = wgs.checkpoint(ht_filename_check, overwrite=True)

    # Output column names are fixed regardless of which input fields were used.
    sumstats_QCed = wgs.select(
                        rsid = wgs.rsid,
                        alleles1_wgs = wgs.alleles[0],
                        alleles2_wgs = wgs.alleles[1],
                        alleles1_sumstats_original = wgs.ref,
                        alleles2_sumstats_original = wgs.alt,
                        is_negative_strand = wgs.is_negative_strand,
                        alleles1_sumstats_fixstrand = wgs.alleles_fix_neg_strand[0],
                        alleles2_sumstats_fixstrand = wgs.alleles_fix_neg_strand[1],
                        beta_meta = wgs[beta_field],
                        beta_meta_fix_ref_alt = wgs.beta_meta_fix_ref_alt,
                        se_meta = wgs[se_field],
                        neglog10_pval_meta = wgs[neglog10_pval_field])

    sumstats_QCed.export(filename_out)

In [None]:
# Binary traits handled in this cell: (label, file stem of the Step2a checkpoint).
binary_traits = [
    ("T2D", "phecode-250.2-both_sexes"),
    ("Asthma", "phecode-495-both_sexes"),
]

for trait_label, file_stem in binary_traits:
    # Timestamp each trait so the per-trait runtime can be read off the output.
    print(f"{trait_label} {datetime.now()}")
    sumstats_QC_binary(f"{bucket}/Sumstats/{file_stem}_checkpoint.ht",
                       f"{bucket}/Sumstats/{file_stem}_checkpoint2.ht",
                       f"{bucket}/Sumstats/WGS_{trait_label}_QCed.tsv.bgz",
                       var_wgs)

T2D 2024-10-05 04:39:35.791189


2024-10-05 04:43:46.343 Hail: INFO: wrote table with 8604426 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/phecode-250.2-both_sexes_checkpoint2.ht
2024-10-05 04:43:55.818 Hail: INFO: merging 2001 files totalling 142.6M... 2000]
2024-10-05 04:44:01.980 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_T2D_QCed.tsv.bgz
  merge time: 6.160s


Asthma 2024-10-05 04:44:04.744452


2024-10-05 04:50:00.856 Hail: INFO: wrote table with 8605299 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/phecode-495-both_sexes_checkpoint2.ht
2024-10-05 04:50:21.397 Hail: INFO: merging 2001 files totalling 143.7M... 2000]
2024-10-05 04:50:26.750 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_Asthma_QCed.tsv.bgz
  merge time: 5.353s


Breast_Cancer 2024-10-05 04:50:29.165377




In [None]:
# Breast cancer is run in its own cell; its sumstats file is the females-only one.
print(f"Breast_Cancer {datetime.now()}")
breast_cancer_prefix = f"{bucket}/Sumstats/phecode-174.1-females"
sumstats_QC_binary(
    ht_filename_in=f"{breast_cancer_prefix}_checkpoint.ht",
    ht_filename_check=f"{breast_cancer_prefix}_checkpoint2.ht",
    filename_out=f"{bucket}/Sumstats/WGS_Breast_Cancer_QCed.tsv.bgz",
    var_wgs=var_wgs,
)


Breast_Cancer 2024-10-05 05:06:25.031190


2024-10-05 05:10:17.745 Hail: INFO: wrote table with 8601970 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/phecode-174.1-females_checkpoint2.ht
2024-10-05 05:10:27.145 Hail: INFO: merging 2001 files totalling 142.0M... 2000]
2024-10-05 05:10:33.362 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_Breast_Cancer_QCed.tsv.bgz
  merge time: 6.216s


Colorectal_Cancer 2024-10-05 05:10:35.998043




In [6]:
# Colorectal cancer is run in its own cell, using the both-sexes sumstats.
print(f"Colorectal_Cancer {datetime.now()}")
colorectal_prefix = f"{bucket}/Sumstats/phecode-153-both_sexes"
sumstats_QC_binary(
    ht_filename_in=f"{colorectal_prefix}_checkpoint.ht",
    ht_filename_check=f"{colorectal_prefix}_checkpoint2.ht",
    filename_out=f"{bucket}/Sumstats/WGS_Colorectal_Cancer_QCed.tsv.bgz",
    var_wgs=var_wgs,
)

Colorectal_Cancer 2024-10-05 05:27:12.723094


2024-10-05 05:37:42.557 Hail: INFO: wrote table with 8537841 rows in 2000 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/phecode-153-both_sexes_checkpoint2.ht
2024-10-05 05:37:59.045 Hail: INFO: merging 2001 files totalling 141.2M... 2000]
2024-10-05 05:38:04.686 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats/WGS_Colorectal_Cancer_QCed.tsv.bgz
  merge time: 5.640s
