In [1]:
from datetime import datetime
import os
import pandas as pd
import numpy as np
#import hail as hl

In [None]:
# Read workspace settings from environment variables; each is None when the variable is unset.
DATASET = os.environ.get('WORKSPACE_CDR')
bucket = os.environ.get('WORKSPACE_BUCKET')

### Compute overlap with HapMap3 (hm3) variants

In [4]:
# downloaded from https://zenodo.org/records/7773502
!gunzip w_hm3.snplist.gz

In [12]:
# Load the three tab-separated variant tables; trailing counts are the row totals recorded by the author.
hm3 = pd.read_table("w_hm3.snplist")  # 1217311 variants
qc_aou_array = pd.read_table("Array_Vars_QCed_dbsnp.tsv")  # 975876 variants
qc_aou_wgs = pd.read_table("WGS_Vars_QCed_dbsnp.tsv")  # 8996707 variants

In [6]:
# hm3 rows whose SNP value appears in the array table's rsid column.
overlap = hm3.loc[hm3["SNP"].isin(qc_aou_array["rsid"])]  # 297119 overlapping variants

In [8]:
# Fraction of QC'd array variants that are also in hm3.
# Computed from the frames instead of the hand-copied 297119/975876 so it tracks the data.
len(overlap) / len(qc_aou_array)

0.30446388680529085

In [9]:
# Fraction of hm3 variants that are also in the QC'd array set.
# Computed from the frames instead of the hand-copied 297119/1217311 so it tracks the data.
len(overlap) / len(hm3)

0.2440781361541956

In [14]:
# Array variants whose rsid also appears in the QC'd WGS table.
# The previous version tested qc_aou_array against itself, which trivially keeps
# every row (975876, 100%); that figure must be re-checked against this result.
overlap2 = qc_aou_array[qc_aou_array['rsid'].isin(qc_aou_wgs['rsid'])]

## Array QC'd locus

In [3]:
# Load the Hail Table of QC'd array variants from the workspace bucket.
var_array = hl.read_table(f"{bucket}/ArrayData/Array_Vars_QCed.ht")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-b.c.terra-vpc-sc-fd39b54c.internal:40693
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241128-0048-0.2.130.post1-c69cd67afb8b.log


In [6]:
# Redistribute the table into 100 partitions.
var_array = var_array.repartition(100)

In [7]:
# Add the 'dbSNP_rsid' annotation from Hail's annotation DB (GCP, us-central1).
db = hl.experimental.DB(cloud='gcp', region='us-central1')
var_array = db.annotate_rows_db(var_array, 'dbSNP_rsid')

In [None]:
# Collapse the array<struct{rsid: str}> annotation into one comma-separated string per row.
var_array = var_array.annotate(
    dbSNP_rsid=hl.delimit(
        var_array.dbSNP_rsid.map(lambda entry: entry.rsid),
        ",",
    )
)

In [11]:
# Expose the first and second entries of `alleles` as separate ref / alt fields.
var_array = var_array.annotate(
    ref=var_array.alleles[0],
    alt=var_array.alleles[1],
)

In [13]:
#var_array.count() #975876

In [14]:
# Re-key the table on locus only.
var_array = var_array.key_by('locus')

In [None]:
# Keep only rsid / ref / alt (plus the key) for the exported variant list.
varinfo_out = var_array.select(
    rsid=var_array.dbSNP_rsid,
    ref=var_array.ref,
    alt=var_array.alt,
)

In [17]:
# Write the selected array variant fields to the workspace bucket (.tsv.bgz).
varinfo_out.export(f"{bucket}/ArrayData/Array_Vars_QCed_dbsnp.tsv.bgz")

2024-11-28 00:50:50.355 Hail: INFO: merging 100 files totalling 8.8M...1) / 100]
2024-11-28 00:50:51.109 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/ArrayData/Array_Vars_QCed_dbsnp.tsv.bgz
  merge time: 752.272ms
Traceback (most recent call last):


## WGS QC'd locus

In [11]:
# Load the Hail Table of QC'd WGS variants from the workspace bucket.
var_wgs = hl.read_table(f"{bucket}/WGSData/WGS_Vars_QCed.ht")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-b.c.terra-vpc-sc-fd39b54c.internal:37287
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241128-0055-0.2.130.post1-c69cd67afb8b.log


In [14]:
# Redistribute the table into 1000 partitions.
var_wgs = var_wgs.repartition(1000)

In [15]:
# Add the 'dbSNP_rsid' annotation from Hail's annotation DB (GCP, us-central1).
db = hl.experimental.DB(cloud='gcp', region='us-central1')
var_wgs = db.annotate_rows_db(var_wgs, 'dbSNP_rsid')

In [18]:
#var_wgs.count() #8996707

8996707

In [None]:
# Collapse the array<struct{rsid: str}> annotation into one comma-separated string per row.
var_wgs = var_wgs.annotate(
    dbSNP_rsid=hl.delimit(
        var_wgs.dbSNP_rsid.map(lambda entry: entry.rsid),
        ",",
    )
)

In [20]:
# Expose the first and second entries of `alleles` as separate ref / alt fields.
var_wgs = var_wgs.annotate(
    ref=var_wgs.alleles[0],
    alt=var_wgs.alleles[1],
)

In [22]:
# Re-key the table on locus only.
var_wgs = var_wgs.key_by('locus')

In [None]:
# Keep only rsid / ref / alt (plus the key) for the exported variant list.
varinfo_out = var_wgs.select(
    rsid=var_wgs.dbSNP_rsid,
    ref=var_wgs.ref,
    alt=var_wgs.alt,
)

In [25]:
# Write the selected WGS variant fields to the workspace bucket (.tsv.bgz).
varinfo_out.export(f"{bucket}/WGSData/WGS_Vars_QCed_dbsnp.tsv.bgz")

2024-11-28 01:14:10.374 Hail: INFO: merging 1000 files totalling 76.4M.../ 1000]
2024-11-28 01:15:14.056 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/WGSData/WGS_Vars_QCed_dbsnp.tsv.bgz
  merge time: 1m3.7s


## Array QC'd sumstats

In [19]:
def sumstats_QC(filename_in, filename_out,
                beta_field='beta_meta_hq_fix_ref_alt',
                pval_field='neglog10_pval_meta_hq',
                n_partitions=100):
    """Add dbSNP rsIDs to a summary-statistics table and export selected columns.

    Parameters
    ----------
    filename_in : str
        Path of the table to import. The ``locus`` column is parsed as a
        GRCh38 locus; no type imputation is requested for the other columns.
    filename_out : str
        Destination handed to ``Table.export``.
    beta_field : str
        Name of the effect-size column to keep. Defaults to the ``*_hq``
        column; pass ``'beta_meta_fix_ref_alt'`` for files without it.
    pval_field : str
        Name of the -log10 p-value column to keep. Defaults to the ``*_hq``
        column; pass ``'neglog10_pval_meta'`` for files without it.
    n_partitions : int
        Partition count applied before the dbSNP annotation.

    Raises
    ------
    LookupError
        If ``beta_field`` or ``pval_field`` is not a field of the imported table.
    """
    sumstats = hl.import_table(filename_in,
                               types={"locus": hl.tlocus("GRCh38")})

    # Build an `alleles` array from the strand-fixed allele columns. The list
    # literal already becomes an array expression, so no extra cast is needed.
    sumstats = sumstats.annotate(
        alleles=[sumstats.alleles1_sumstats_fixstrand,
                 sumstats.alleles2_sumstats_fixstrand]
    )

    # Key on (locus, alleles) for the dbSNP annotation.
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats = sumstats.repartition(n_partitions)

    # annotate dbSNP rsID
    db = hl.experimental.DB(region='us-central1', cloud='gcp')
    sumstats = db.annotate_rows_db(sumstats, 'dbSNP_rsid')

    # Simplify array<struct{rsid: str}> to a comma-delimited string.
    sumstats = sumstats.annotate(
        dbSNP_rsid=hl.delimit(sumstats.dbSNP_rsid.map(lambda x: x.rsid), delimiter=",")
    )

    # Only keep the required columns, keyed by locus.
    sumstats = sumstats.key_by('locus')
    sumstats = sumstats.select(
        'dbSNP_rsid',
        'alleles2_sumstats_fixstrand',
        'alleles1_sumstats_fixstrand',
        beta_field,
        pval_field,
    )

    sumstats.export(filename_out)

In [20]:
# Annotate each array sumstats file in turn, printing a start timestamp per trait.
# Replaces six copy-pasted print/call pairs; names, paths and order are unchanged.
for trait in ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]:
    print(trait + " " + str(datetime.now()))
    sumstats_QC(f"{bucket}/Sumstats/Array_{trait}_QCed.tsv.bgz",
                f"{bucket}/Sumstats/Array_{trait}_QCed_dbsnp.tsv.bgz")

Height 2024-11-28 01:45:51.740745


2024-11-28 01:45:52.782 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:45:59.

DBP 2024-11-28 01:47:20.689098


2024-11-28 01:47:22.093 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:47:33.

HDL 2024-11-28 01:48:46.034473


2024-11-28 01:48:46.993 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:48:53.

TC 2024-11-28 01:50:05.097208


2024-11-28 01:50:06.058 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:50:12.

RBC 2024-11-28 01:51:24.847224


2024-11-28 01:51:25.698 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:51:32.

leukocyte 2024-11-28 01:52:44.316448


2024-11-28 01:52:45.165 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'af_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq' as type str (not specified)
  Loading field 'beta_meta_hq_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta_hq' as type str (not specified)
  Loading field 'neglog10_pval_meta_hq' as type str (not specified)
2024-11-28 01:52:51.

T2D 2024-11-28 01:54:03.077284


2024-11-28 01:54:03.961 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)


LookupError: Table instance has no field 'beta_meta_hq_fix_ref_alt'
    Did you mean:
        'beta_meta_fix_ref_alt' [row]
    Hint: use 'describe()' to show the names of all data fields.

In [23]:
def sumstats_QC(filename_in, filename_out,
                beta_field='beta_meta_fix_ref_alt',
                pval_field='neglog10_pval_meta',
                n_partitions=100):
    """Add dbSNP rsIDs to a summary-statistics table and export selected columns.

    Parameters
    ----------
    filename_in : str
        Path of the table to import. The ``locus`` column is parsed as a
        GRCh38 locus; no type imputation is requested for the other columns.
    filename_out : str
        Destination handed to ``Table.export``.
    beta_field : str
        Name of the effect-size column to keep.
    pval_field : str
        Name of the -log10 p-value column to keep.
    n_partitions : int
        Partition count applied before the dbSNP annotation.

    Raises
    ------
    LookupError
        If ``beta_field`` or ``pval_field`` is not a field of the imported table.
    """
    sumstats = hl.import_table(filename_in,
                               types={"locus": hl.tlocus("GRCh38")})

    # Build an `alleles` array from the strand-fixed allele columns. The list
    # literal already becomes an array expression, so no extra cast is needed.
    sumstats = sumstats.annotate(
        alleles=[sumstats.alleles1_sumstats_fixstrand,
                 sumstats.alleles2_sumstats_fixstrand]
    )

    # Key on (locus, alleles) for the dbSNP annotation.
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats = sumstats.repartition(n_partitions)

    # annotate dbSNP rsID
    db = hl.experimental.DB(region='us-central1', cloud='gcp')
    sumstats = db.annotate_rows_db(sumstats, 'dbSNP_rsid')

    # Simplify array<struct{rsid: str}> to a comma-delimited string.
    sumstats = sumstats.annotate(
        dbSNP_rsid=hl.delimit(sumstats.dbSNP_rsid.map(lambda x: x.rsid), delimiter=",")
    )

    # Only keep the required columns, keyed by locus.
    sumstats = sumstats.key_by('locus')
    sumstats = sumstats.select(
        'dbSNP_rsid',
        'alleles2_sumstats_fixstrand',
        'alleles1_sumstats_fixstrand',
        beta_field,
        pval_field,
    )

    sumstats.export(filename_out)

In [24]:
# Annotate each array sumstats file in turn, printing a start timestamp per trait.
# Replaces four copy-pasted print/call pairs; names, paths and order are unchanged.
for trait in ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]:
    print(trait + " " + str(datetime.now()))
    sumstats_QC(f"{bucket}/Sumstats/Array_{trait}_QCed.tsv.bgz",
                f"{bucket}/Sumstats/Array_{trait}_QCed_dbsnp.tsv.bgz")

T2D 2024-11-28 02:02:54.577540


2024-11-28 02:02:55.348 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-11-28 02:03:01.748 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-11-

Asthma 2024-11-28 02:04:21.785529


2024-11-28 02:04:22.841 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-11-28 02:04:29.368 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-11-

Breast_Cancer 2024-11-28 02:05:39.708792


2024-11-28 02:05:40.683 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-11-28 02:05:47.136 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-11-

Colorectal_Cancer 2024-11-28 02:06:57.952122


2024-11-28 02:06:58.949 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_array' as type str (not specified)
  Loading field 'alleles2_array' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-11-28 02:07:05.586 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-11-

## WGS QC'd sumstats

In [4]:
def sumstats_QC(filename_in, filename_out,
                beta_field='beta_meta_fix_ref_alt',
                pval_field='neglog10_pval_meta',
                n_partitions=1000):
    """Add dbSNP rsIDs to a summary-statistics table and export selected columns.

    Parameters
    ----------
    filename_in : str
        Path of the table to import. The ``locus`` column is parsed as a
        GRCh38 locus; no type imputation is requested for the other columns.
    filename_out : str
        Destination handed to ``Table.export``.
    beta_field : str
        Name of the effect-size column to keep.
    pval_field : str
        Name of the -log10 p-value column to keep.
    n_partitions : int
        Partition count applied before the dbSNP annotation (1000 by default
        in this section).

    Raises
    ------
    LookupError
        If ``beta_field`` or ``pval_field`` is not a field of the imported table.
    """
    sumstats = hl.import_table(filename_in,
                               types={"locus": hl.tlocus("GRCh38")})

    # Build an `alleles` array from the strand-fixed allele columns. The list
    # literal already becomes an array expression, so no extra cast is needed.
    sumstats = sumstats.annotate(
        alleles=[sumstats.alleles1_sumstats_fixstrand,
                 sumstats.alleles2_sumstats_fixstrand]
    )

    # Key on (locus, alleles) for the dbSNP annotation.
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats = sumstats.repartition(n_partitions)

    # annotate dbSNP rsID
    db = hl.experimental.DB(region='us-central1', cloud='gcp')
    sumstats = db.annotate_rows_db(sumstats, 'dbSNP_rsid')

    # Simplify array<struct{rsid: str}> to a comma-delimited string.
    sumstats = sumstats.annotate(
        dbSNP_rsid=hl.delimit(sumstats.dbSNP_rsid.map(lambda x: x.rsid), delimiter=",")
    )

    # Only keep the required columns, keyed by locus.
    sumstats = sumstats.key_by('locus')
    sumstats = sumstats.select(
        'dbSNP_rsid',
        'alleles2_sumstats_fixstrand',
        'alleles1_sumstats_fixstrand',
        beta_field,
        pval_field,
    )

    sumstats.export(filename_out)

In [5]:
# Annotate each WGS sumstats file in turn, printing a start timestamp per trait.
# Replaces ten copy-pasted print/call pairs; names, paths and order are unchanged.
for trait in ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte",
              "T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]:
    print(trait + " " + str(datetime.now()))
    sumstats_QC(f"{bucket}/Sumstats/WGS_{trait}_QCed.tsv.bgz",
                f"{bucket}/Sumstats/WGS_{trait}_QCed_dbsnp.tsv.bgz")

Initializing Hail with default parameters...


Height 2024-12-17 03:22:12.602822



Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-a.c.terra-vpc-sc-fd39b54c.internal:46487
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241217-0322-0.2.130.post1-c69cd67afb8b.log
2024-12-17 03:22:55.453 Hail: INFO: Reading table without type imputation1) / 1]
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2

DBP 2024-12-17 03:30:40.470220


2024-12-17 03:30:42.438 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:31:25.984 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

HDL 2024-12-17 03:35:38.588868


2024-12-17 03:35:39.883 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:36:23.761 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

TC 2024-12-17 03:40:40.244575


2024-12-17 03:40:42.037 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:41:26.922 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

RBC 2024-12-17 03:45:45.149549


2024-12-17 03:45:46.567 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:46:30.526 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

leukocyte 2024-12-17 03:50:49.286805


2024-12-17 03:50:50.724 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:51:34.573 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

T2D 2024-12-17 03:55:42.553279


2024-12-17 03:55:43.688 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 03:56:25.177 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

Asthma 2024-12-17 04:00:36.087674


2024-12-17 04:00:37.375 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 04:01:19.850 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

Breast_Cancer 2024-12-17 04:05:27.698792


2024-12-17 04:05:29.027 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 04:06:15.218 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

Colorectal_Cancer 2024-12-17 04:10:22.456707


2024-12-17 04:10:23.638 Hail: INFO: Reading table without type imputation
  Loading field 'locus' as type locus<GRCh38> (user-supplied)
  Loading field 'rsid' as type str (not specified)
  Loading field 'alleles1_wgs' as type str (not specified)
  Loading field 'alleles2_wgs' as type str (not specified)
  Loading field 'alleles1_sumstats_original' as type str (not specified)
  Loading field 'alleles2_sumstats_original' as type str (not specified)
  Loading field 'is_negative_strand' as type str (not specified)
  Loading field 'alleles1_sumstats_fixstrand' as type str (not specified)
  Loading field 'alleles2_sumstats_fixstrand' as type str (not specified)
  Loading field 'beta_meta' as type str (not specified)
  Loading field 'beta_meta_fix_ref_alt' as type str (not specified)
  Loading field 'se_meta' as type str (not specified)
  Loading field 'neglog10_pval_meta' as type str (not specified)
2024-12-17 04:11:09.153 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-17 0

Due to the file size limit for downloads, save the files with locus or rsID separately.

In [5]:
def _read_wgs_table(trait):
    """Load the tab-separated file ``WGS_<trait>_QCed_dbsnp.tsv.gz`` into a DataFrame.

    The path is relative to the current working directory; pandas infers the
    gzip compression from the ``.gz`` suffix.
    """
    return pd.read_csv(f"WGS_{trait}_QCed_dbsnp.tsv.gz", sep="\t")


# One DataFrame per phenotype; the variable names are used by later cells.
wgs_Height = _read_wgs_table("Height")
wgs_DBP = _read_wgs_table("DBP")
wgs_HDL = _read_wgs_table("HDL")
wgs_TC = _read_wgs_table("TC")
wgs_RBC = _read_wgs_table("RBC")
wgs_leukocyte = _read_wgs_table("leukocyte")

wgs_Asthma = _read_wgs_table("Asthma")
wgs_T2D = _read_wgs_table("T2D")
wgs_Colorectal_Cancer = _read_wgs_table("Colorectal_Cancer")
wgs_Breast_Cancer = _read_wgs_table("Breast_Cancer")

In [6]:
def _without_second_column(frame):
    """Return a new DataFrame equal to `frame` minus the column at position 1."""
    return frame.drop(columns=frame.columns[1])


# Strip the second column (index 1) from every per-phenotype table.
# NOTE(review): the column is selected by position, not by name -- presumably
# this is the rsid column; confirm against the header of the *_dbsnp.tsv.gz files.
Height = _without_second_column(wgs_Height)
DBP = _without_second_column(wgs_DBP)
HDL = _without_second_column(wgs_HDL)
TC = _without_second_column(wgs_TC)
RBC = _without_second_column(wgs_RBC)
leukocyte = _without_second_column(wgs_leukocyte)
Asthma = _without_second_column(wgs_Asthma)
T2D = _without_second_column(wgs_T2D)
Colorectal_Cancer = _without_second_column(wgs_Colorectal_Cancer)
Breast_Cancer = _without_second_column(wgs_Breast_Cancer)

In [7]:
# Explicit phenotype-name -> DataFrame mapping. Referencing the frames directly
# (instead of looking them up by string in globals()) makes the dependency on
# the previous cell visible and fails immediately, before any file is written,
# if one of the frames has not been created.
frames_by_phenotype = {
    'Height': Height,
    'DBP': DBP,
    'HDL': HDL,
    'TC': TC,
    'RBC': RBC,
    'leukocyte': leukocyte,
    'Asthma': Asthma,
    'T2D': T2D,
    'Colorectal_Cancer': Colorectal_Cancer,
    'Breast_Cancer': Breast_Cancer,
}

# List of phenotypes (same names and order as the mapping above)
phenotypes = list(frames_by_phenotype)

# Write each trimmed table to WGS_<phenotype>_QCed.tsv.gz in the working
# directory as a gzip-compressed, tab-separated file without the index column.
for phenotype, frame in frames_by_phenotype.items():
    frame.to_csv(f'WGS_{phenotype}_QCed.tsv.gz', sep="\t", index=False, compression='gzip')