To do ancestry-specific QC for AF after the current QC

In [1]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np
import random
from itertools import chain
import hail as hl
from hail.plot import show
from hail.linalg import BlockMatrix

In [2]:
DATASET = os.getenv('WORKSPACE_CDR')
bucket = os.getenv('WORKSPACE_BUCKET')

In [3]:
## set up hail
hl.init(default_reference = "GRCh38")


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.


Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-b.c.terra-vpc-sc-fd39b54c.internal:34005
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241119-0532-0.2.130.post1-c69cd67afb8b.log


In [4]:
mt_wgs_path = os.getenv("WGS_HAIL_STORAGE_PATH")
mt_wgs = hl.read_matrix_table(mt_wgs_path)

In [5]:
mt_wgs.count()

(702574937, 98590)

In [6]:
# read QC'ed variants
var_wgs = hl.read_table(f'{bucket}/WGSData/WGS_Vars_QCed.ht')
var_wgs = var_wgs.key_by("locus")
print(f'{var_wgs.count()} WGS variants passing QC') 

# read QC'ed samples
sample_inter = hl.read_table(f'{bucket}/hail_checkpoints/sample_inter.ht')
sample_inter = sample_inter.key_by(sample_inter.person_id)
print(f'{sample_inter.count()} samples passing QC')

8996707 WGS variants passing QC
95562 samples passing QC


In [7]:
# read union clumped variants
HDL_table = hl.read_table(f'{bucket}/Sumstats/continuous-DBP-both_sexes-auto_medadj_irnt_checkpoint.ht')

HDL_table_count = HDL_table.count()
print(f"HDL_table row count: {HDL_table_count}")

HDL_table row count: 21612699


In [None]:
# subset to QC'ed variants and samples
mt_wgs = mt_wgs.semi_join_rows(var_wgs)
mt_wgs = mt_wgs.semi_join_cols(sample_inter)

In [9]:
Sample_quant = hl.import_table(f"{bucket}/Pheno/quant_all.tsv", 
                         missing='',
                        impute=True,
                        types = {"person_id": "str"})
Sample_quant = Sample_quant.key_by("person_id")

2024-11-19 05:32:54.651 Hail: INFO: Reading table to impute column types 1) / 1]
2024-11-19 05:32:59.525 Hail: INFO: Loading <StructExpression of type struct{person_id: str, Age: int32, is_sex_Male: int32, is_sex_Female: int32, is_White: int32, is_Black_or_African_American: int32, is_Native_Hawaiian_or_Other_Pacific_Islander: int32, is_Asian: int32, is_Middle_Eastern_or_North_African: int32, is_gender_Male: int32, is_gender_Female: int32, is_Hispanic: int32, is_anc_pred_eur: int32, is_anc_pred_amr: int32, is_anc_pred_afr: int32, is_anc_pred_sas: int32, is_anc_pred_eas: int32, is_anc_pred_mid: int32, PC1: float64, PC2: float64, PC3: float64, PC4: float64, PC5: float64, PC6: float64, PC7: float64, PC8: float64, PC9: float64, PC10: float64, PC11: float64, PC12: float64, PC13: float64, PC14: float64, PC15: float64, PC16: float64, BMI: float64, Height: float64, DBP: float64, SBP: float64, HbA1c: float64, leukocyte: float64, Lymphocyte: float64, RBC: float64, Neutrophil: float64, Hemoglobin_

In [10]:
mt_wgs = mt_wgs.semi_join_cols(Sample_quant)
mt_wgs = mt_wgs.annotate_cols(**Sample_quant[mt_wgs.s])

## Caluculate AFR-specific allele frequency

In [13]:
# Create AFR specific MT
mt_wgs_afr = mt_wgs.filter_cols(mt_wgs.is_anc_pred_afr == 1)

In [11]:
# Annotate rows with allele frequency
mt_wgs_afr = mt_wgs_afr.annotate_rows(allele_freq = hl.agg.mean(mt_wgs_afr.GT.n_alt_alleles()) / 2)

In [12]:
# retrieve the row as a table 
varinfo_afr = mt_wgs_afr.rows()

In [13]:
# filter by alleles frequencies (MAF>0.01)
varinfo_afr = varinfo_afr.filter((varinfo_afr.allele_freq >0.01)) 

In [14]:
# select desired cols for output 
varinfo_afr_out = varinfo_afr.select(allele_freq = varinfo_afr.allele_freq)
varinfo_afr_out.write(f"{bucket}/WGSData/WGS_Vars_Ancestry_AFR_QCed.ht", overwrite=True)

2024-11-17 20:04:17.822 Hail: INFO: wrote table with 8059388 rows in 140126 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/WGSData/WGS_Vars_Ancestry_AFR_QCed.ht


## Create matrix tables with less redundant fields

In [12]:
var_wgs_afr = hl.read_table(f'{bucket}/WGSData/WGS_Vars_Ancestry_AFR_QCed.ht')

In [13]:
sample_inter = hl.import_table(f"{bucket}/WGSData/sample_inter.tsv")
sample_inter = sample_inter.key_by(sample_inter.person_id)

2024-11-18 05:00:53.296 Hail: INFO: Reading table without type imputation
  Loading field 'person_id' as type str (not specified)


In [None]:
sample_inter.count()

## HDL (run clumping first at notebook XXX and then back to this step)

In [27]:
var_wgs_HDL_afr = hl.read_table(f'{bucket}/Sumstats_clumped/WGS_HDL_afr_clumps_var.ht')
var_wgs_HDL_afr = var_wgs_HDL_afr.key_by('locus')
print(f'{var_wgs_HDL_afr.count()} WGS HDL AFR variants clumped') 

382342 WGS HDL AFR variants clumped


In [28]:
mt_wgs_afr = mt_wgs.filter_cols(mt_wgs.is_anc_pred_afr == 1)

In [29]:
mt_wgs_afr = mt_wgs_afr.semi_join_rows(var_wgs_HDL_afr)

In [30]:
mt_wgs_afr.count_cols()

[Stage 11:>                                                         (0 + 1) / 1]

20652

In [31]:
# retrieve relavent fields
mt_wgs_afr = mt_wgs_afr.select_cols()
mt_wgs_afr = mt_wgs_afr.select_rows(rsid = mt_wgs_afr.rsid)
mt_wgs_afr = mt_wgs_afr.select_entries(GT = mt_wgs_afr.GT.n_alt_alleles())

In [33]:
mt_wgs_afr.write(f"{bucket}/WGSData/WGS_GT_HDL_Ancestry_AFR_QCed.mt", overwrite = True)

2024-11-19 05:42:35.548 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-11-19 05:42:54.625 Hail: INFO: wrote table with 382342 rows in 1 partition to /tmp/__iruid_10984-cWBLkJtll0w0hB1cOvAIj0
2024-11-19 11:14:46.427 Hail: INFO: wrote matrix table with 382342 rows and 20652 columns in 140126 partitions to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/WGSData/WGS_GT_HDL_Ancestry_AFR_QCed.mt
