In [2]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np
from itertools import chain
import hail as hl
import statsmodels.api as sm

In [None]:
DATASET = os.getenv('WORKSPACE_CDR')
bucket = os.getenv('WORKSPACE_BUCKET')

## Array PGS

In [3]:
# read matrix table 
mt_array = hl.read_matrix_table(f"{bucket}/ArrayData/Array_GT_QCed.mt")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-c.c.terra-vpc-sc-fd39b54c.internal:42113
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20241223-2314-0.2.130.post1-c69cd67afb8b.log


In [6]:
## read Sample_quant 
Sample_quant = hl.import_table(f"{bucket}/Pheno/quant_all.tsv", 
                         missing='',
                        impute=True,
                        types = {"person_id": "str"})
Sample_quant = Sample_quant.key_by("person_id")
mt_array_quant = mt_array.semi_join_cols(Sample_quant)
mt_array_quant = mt_array_quant.annotate_cols(**Sample_quant[mt_array_quant.s])


#### read Sample_binary
Sample_binary = hl.import_table(f"{bucket}/Pheno/binary_all.tsv", 
                         missing='',
                        impute=True,
                        types = {"person_id": "str"})
Sample_binary = Sample_binary.key_by("person_id")
mt_array_binary = mt_array.semi_join_cols(Sample_binary)
mt_array_binary = mt_array_binary.annotate_cols(**Sample_binary[mt_array_binary.s])

2024-12-23 23:15:27.296 Hail: INFO: Reading table to impute column types
2024-12-23 23:15:31.439 Hail: INFO: Loading <StructExpression of type struct{person_id: str, Age: int32, is_sex_Male: int32, is_sex_Female: int32, is_White: int32, is_Black_or_African_American: int32, is_Native_Hawaiian_or_Other_Pacific_Islander: int32, is_Asian: int32, is_Middle_Eastern_or_North_African: int32, is_gender_Male: int32, is_gender_Female: int32, is_Hispanic: int32, is_anc_pred_eur: int32, is_anc_pred_amr: int32, is_anc_pred_afr: int32, is_anc_pred_sas: int32, is_anc_pred_eas: int32, is_anc_pred_mid: int32, PC1: float64, PC2: float64, PC3: float64, PC4: float64, PC5: float64, PC6: float64, PC7: float64, PC8: float64, PC9: float64, PC10: float64, PC11: float64, PC12: float64, PC13: float64, PC14: float64, PC15: float64, PC16: float64, BMI: float64, Height: float64, DBP: float64, SBP: float64, HbA1c: float64, leukocyte: float64, Lymphocyte: float64, RBC: float64, Neutrophil: float64, Hemoglobin_concentr

In [9]:
# List of quant phenotypes
phenotypes = ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]

# Loop through each phenotype and annotate it to the Matrix Table
for phenotype in phenotypes:
    print(f"Processing phenotype: {phenotype}")
    
    # Import the phenotype table
    phenotype_table = hl.import_table(
        f'{bucket}/PRScs/hg38/Array_{phenotype}_pst_eff_a1_b0.5_phiauto.txt',
        impute=True,
        types = {"locus": hl.tlocus("GRCh38"),
                "alleles":hl.tarray(hl.tstr)}
    )
    
    phenotype_table = phenotype_table.key_by('locus', 'alleles')
    
    # Annotate rows in the Matrix Table with the phenotype data
    mt_array_quant = mt_array_quant.annotate_rows(
        **{f"{phenotype}_PRScs_result": phenotype_table[mt_array_quant.locus, mt_array_quant.alleles]}
    )

Processing phenotype: Height


2024-12-23 19:01:47.704 Hail: INFO: Reading table to impute column types
2024-12-23 19:01:50.213 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: DBP


2024-12-23 19:01:51.061 Hail: INFO: Reading table to impute column types
2024-12-23 19:01:53.567 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: HDL


2024-12-23 19:01:54.388 Hail: INFO: Reading table to impute column types
2024-12-23 19:01:56.899 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: TC


2024-12-23 19:01:57.629 Hail: INFO: Reading table to impute column types
2024-12-23 19:02:00.152 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: RBC


2024-12-23 19:02:00.915 Hail: INFO: Reading table to impute column types
2024-12-23 19:02:03.468 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: leukocyte


2024-12-23 19:02:04.212 Hail: INFO: Reading table to impute column types
2024-12-23 19:02:06.873 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


In [7]:
# List of binary phenotypes
phenotypes = ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]

# Loop through each phenotype and annotate it to the Matrix Table
for phenotype in phenotypes:
    print(f"Processing phenotype: {phenotype}")
    
    # Import the phenotype table
    phenotype_table = hl.import_table(
        f'{bucket}/PRScs/hg38/Array_{phenotype}_pst_eff_a1_b0.5_phiauto.txt',
        impute=True,
        types = {"locus": hl.tlocus("GRCh38"),
                "alleles":hl.tarray(hl.tstr)}
    )
    
    phenotype_table = phenotype_table.key_by('locus', 'alleles')
    
    # Annotate rows in the Matrix Table with the phenotype data
    mt_array_binary = mt_array_binary.annotate_rows(
        **{f"{phenotype}_PRScs_result": phenotype_table[mt_array_binary.locus, mt_array_binary.alleles]}
    )

Processing phenotype: T2D


2024-12-23 23:15:36.519 Hail: INFO: Reading table to impute column types
2024-12-23 23:15:39.279 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Asthma


2024-12-23 23:15:40.016 Hail: INFO: Reading table to impute column types
2024-12-23 23:15:42.623 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Breast_Cancer


2024-12-23 23:15:43.329 Hail: INFO: Reading table to impute column types
2024-12-23 23:15:46.003 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Colorectal_Cancer


2024-12-23 23:15:46.778 Hail: INFO: Reading table to impute column types
2024-12-23 23:15:49.299 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


In [11]:
# Compute PGS
phenotypes = ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]

# Annotate PGS for all phenotypes
mt_array_quant = mt_array_quant.annotate_cols(**{
    f"{phenotype}_pgs": hl.struct(
        pgs=hl.agg.sum(
            mt_array_quant[f"{phenotype}_PRScs_result"]["BETA_posterior"] * mt_array_quant.GT
        )
    )
    for phenotype in phenotypes
})

In [8]:
# Compute PGS
phenotypes = ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]

# Annotate PGS for all phenotypes
mt_array_binary = mt_array_binary.annotate_cols(**{
    f"{phenotype}_pgs": hl.struct(
        pgs=hl.agg.sum(
            mt_array_binary[f"{phenotype}_PRScs_result"]["BETA_posterior"] * mt_array_binary.GT
        )
    )
    for phenotype in phenotypes
})

In [9]:
def export_Scores(mt, pheno, mt_type, method):
    export_filename = f"{bucket}/Scores/" + mt_type + "/" + pheno + "_" + method + ".bgz"
    sample_info = mt.cols().select(
        "Age", 'is_sex_Male', 'is_sex_Female', 
        "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", 
        "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", 
        pheno + "_pgs", pheno)
    sample_info = sample_info.annotate(**sample_info[pheno + "_pgs"])
    sample_info = sample_info.drop(pheno + "_pgs")
    sample_info.export(export_filename)

In [16]:
export_Scores(mt_array_quant, "DBP", "Array", "PRScs_hg38")
export_Scores(mt_array_quant, "HDL", "Array", "PRScs_hg38")
export_Scores(mt_array_quant, "Height", "Array", "PRScs_hg38")
export_Scores(mt_array_quant, "RBC", "Array", "PRScs_hg38")
export_Scores(mt_array_quant, "TC", "Array", "PRScs_hg38")
export_Scores(mt_array_quant, "leukocyte", "Array", "PRScs_hg38")

export_Scores(mt_array_binary, "T2D", "Array", "PRScs_hg38")
export_Scores(mt_array_binary, "Asthma", "Array", "PRScs_hg38")
export_Scores(mt_array_binary, "Breast_Cancer", "Array", "PRScs_hg38")
export_Scores(mt_array_binary, "Colorectal_Cancer", "Array", "PRScs_hg38")

2024-12-23 18:20:56.072 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2024-12-23 18:21:08.268 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-23 18:21:15.819 Hail: INFO: wrote table with 685739 rows in 1 partition to /tmp/__iruid_29119-PDhPgO3wBJCGdm5Zlq2NWo
2024-12-23 18:25:07.630 Hail: INFO: merging 41 files totalling 6.7M...+ 8) / 40]
2024-12-23 18:25:08.061 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/DBP_PRScs_hg38.bgz
  merge time: 430.685ms
2024-12-23 18:25:18.821 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-23 18:25:23.381 Hail: INFO: wrote table with 685737 rows in 1 partition to /tmp/__iruid_36008-rP9YPm8WyCnpoM9dmtqb8F
2024-12-23 18:29:08.289 Hail: INFO: Coerced sorted dataset          (0 + 1) / 1]
2024-12-23 18:29:09.310 Hail: INFO: merging 33 files totalling 6.6M...+ 8) / 32]
2024-12

In [3]:
%%bash

base_path="gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array"
phenotypes=("DBP" "HDL" "Height" "RBC" "TC" "leukocyte" "T2D" "Asthma" "Breast_Cancer" "Colorectal_Cancer")

for phenotype in "${phenotypes[@]}"; do
    gsutil mv "${base_path}/${phenotype}_PRScs_hg38.bgz" "${base_path}/${phenotype}_PRScs_hg38.gz"
done

CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/DBP_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/HDL_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/Height_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/RBC_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/TC_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/leukocyte_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/T2D_PRScs_hg38.bgz
CommandException: No URLs matched: gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/Array/Asthma_PRScs_hg38.bgz
CommandException: No URLs matched: gs

## WGS PGS

In [3]:
# read matrix table 
mt_wgs = hl.read_matrix_table(f"{bucket}/WGSData/WGS_GT_QCed.mt")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-b.c.terra-vpc-sc-fd39b54c.internal:34577
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20250119-0156-0.2.130.post1-c69cd67afb8b.log


In [4]:
## read Sample_quant 
Sample_quant = hl.import_table(f"{bucket}/Pheno/quant_all.tsv", 
                         missing='',
                        impute=True,
                        types = {"person_id": "str"})
Sample_quant = Sample_quant.key_by("person_id")
mt_wgs_quant = mt_wgs.semi_join_cols(Sample_quant)
mt_wgs_quant = mt_wgs_quant.annotate_cols(**Sample_quant[mt_wgs_quant.s])


#### read Sample_binary
Sample_binary = hl.import_table(f"{bucket}/Pheno/binary_all.tsv", 
                         missing='',
                        impute=True,
                        types = {"person_id": "str"})
Sample_binary = Sample_binary.key_by("person_id")
mt_wgs_binary = mt_wgs.semi_join_cols(Sample_binary)
mt_wgs_binary = mt_wgs_binary.annotate_cols(**Sample_binary[mt_wgs_binary.s])

2025-01-19 01:57:26.551 Hail: INFO: Reading table to impute column types 1) / 1]
2025-01-19 01:57:37.712 Hail: INFO: Loading <StructExpression of type struct{person_id: str, Age: int32, is_sex_Male: int32, is_sex_Female: int32, is_White: int32, is_Black_or_African_American: int32, is_Native_Hawaiian_or_Other_Pacific_Islander: int32, is_Asian: int32, is_Middle_Eastern_or_North_African: int32, is_gender_Male: int32, is_gender_Female: int32, is_Hispanic: int32, is_anc_pred_eur: int32, is_anc_pred_amr: int32, is_anc_pred_afr: int32, is_anc_pred_sas: int32, is_anc_pred_eas: int32, is_anc_pred_mid: int32, PC1: float64, PC2: float64, PC3: float64, PC4: float64, PC5: float64, PC6: float64, PC7: float64, PC8: float64, PC9: float64, PC10: float64, PC11: float64, PC12: float64, PC13: float64, PC14: float64, PC15: float64, PC16: float64, BMI: float64, Height: float64, DBP: float64, SBP: float64, HbA1c: float64, leukocyte: float64, Lymphocyte: float64, RBC: float64, Neutrophil: float64, Hemoglobin_

In [5]:
# List of quant phenotypes
phenotypes = ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]

# Loop through each phenotype and annotate it to the Matrix Table
for phenotype in phenotypes:
    print(f"Processing phenotype: {phenotype}")
    
    # Import the phenotype table
    phenotype_table = hl.import_table(
        f'{bucket}/PRScs/hg38/WGS_{phenotype}_pst_eff_a1_b0.5_phiauto.txt',
        impute=True,
        types = {"locus": hl.tlocus("GRCh38"),
                "alleles":hl.tarray(hl.tstr)}
    )
    
    phenotype_table = phenotype_table.key_by('locus', 'alleles')
    
    # Annotate rows in the Matrix Table with the phenotype data
    mt_wgs_quant = mt_wgs_quant.annotate_rows(
        **{f"{phenotype}_PRScs_result": phenotype_table[mt_wgs_quant.locus, mt_wgs_quant.alleles]}
    )

Processing phenotype: Height


2025-01-19 00:23:24.745 Hail: INFO: Reading table to impute column types
2025-01-19 00:23:31.119 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: DBP


2025-01-19 00:23:32.281 Hail: INFO: Reading table to impute column types
2025-01-19 00:23:38.474 Hail: INFO: Finished type imputation        (1 + 2) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: HDL


2025-01-19 00:23:39.605 Hail: INFO: Reading table to impute column types
2025-01-19 00:23:45.856 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: TC


2025-01-19 00:23:46.977 Hail: INFO: Reading table to impute column types
2025-01-19 00:23:53.036 Hail: INFO: Finished type imputation        (1 + 2) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: RBC


2025-01-19 00:23:53.990 Hail: INFO: Reading table to impute column types
2025-01-19 00:23:59.918 Hail: INFO: Finished type imputation        (1 + 2) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: leukocyte


2025-01-19 00:24:00.820 Hail: INFO: Reading table to impute column types
2025-01-19 00:24:06.930 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


In [5]:
# List of binary phenotypes
phenotypes = ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]

# Loop through each phenotype and annotate it to the Matrix Table
for phenotype in phenotypes:
    print(f"Processing phenotype: {phenotype}")
    
    # Import the phenotype table
    phenotype_table = hl.import_table(
        f'{bucket}/PRScs/hg38/WGS_{phenotype}_pst_eff_a1_b0.5_phiauto.txt',
        impute=True,
        types = {"locus": hl.tlocus("GRCh38"),
                "alleles":hl.tarray(hl.tstr)}
    )
    
    phenotype_table = phenotype_table.key_by('locus', 'alleles')
    
    # Annotate rows in the Matrix Table with the phenotype data
    mt_wgs_binary = mt_wgs_binary.annotate_rows(
        **{f"{phenotype}_PRScs_result": phenotype_table[mt_wgs_binary.locus, mt_wgs_binary.alleles]}
    )

Processing phenotype: T2D


2025-01-19 01:57:43.865 Hail: INFO: Reading table to impute column types
2025-01-19 01:57:50.178 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Asthma


2025-01-19 01:57:51.225 Hail: INFO: Reading table to impute column types
2025-01-19 01:57:57.252 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Breast_Cancer


2025-01-19 01:57:58.148 Hail: INFO: Reading table to impute column types
2025-01-19 01:58:04.332 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


Processing phenotype: Colorectal_Cancer


2025-01-19 01:58:05.294 Hail: INFO: Reading table to impute column types
2025-01-19 01:58:11.298 Hail: INFO: Finished type imputation        (2 + 1) / 3]
  Loading field 'CHR' as type int32 (imputed)
  Loading field 'ID' as type str (imputed)
  Loading field 'BP' as type int32 (imputed)
  Loading field 'A1_alt' as type str (imputed)
  Loading field 'A2_ref' as type str (imputed)
  Loading field 'BETA_posterior' as type float64 (imputed)
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'alleles' as type array<str> (user-supplied type)


In [7]:
# Compute PGS
phenotypes = ["Height", "DBP", "HDL", "TC", "RBC", "leukocyte"]

# Annotate PGS for all phenotypes
mt_wgs_quant = mt_wgs_quant.annotate_cols(**{
    f"{phenotype}_pgs": hl.struct(
        pgs=hl.agg.sum(
            mt_wgs_quant[f"{phenotype}_PRScs_result"]["BETA_posterior"] * mt_wgs_quant.GT
        )
    )
    for phenotype in phenotypes
})

In [6]:
# Compute PGS
phenotypes = ["T2D", "Asthma", "Breast_Cancer", "Colorectal_Cancer"]

# Annotate PGS for all phenotypes
mt_wgs_binary = mt_wgs_binary.annotate_cols(**{
    f"{phenotype}_pgs": hl.struct(
        pgs=hl.agg.sum(
            mt_wgs_binary[f"{phenotype}_PRScs_result"]["BETA_posterior"] * mt_wgs_binary.GT
        )
    )
    for phenotype in phenotypes
})

In [7]:
def export_Scores(mt, pheno, mt_type, method):
    export_filename = f"{bucket}/Scores/{mt_type}/{pheno}_{method}_checkpoint.ht"  # Changed extension to .ht (Hail table)
    sample_info = mt.cols().select(
        "Age", 'is_sex_Male', 'is_sex_Female', 
        "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", 
        "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", 
        pheno + "_pgs", pheno)
    sample_info = sample_info.annotate(**sample_info[pheno + "_pgs"])
    sample_info = sample_info.drop(pheno + "_pgs")
    
    # Write the data as a Hail table (.ht)
    sample_info = sample_info.checkpoint(export_filename, overwrite=True)  
    print(f"Wrote sample info to {export_filename}.")

In [None]:
export_Scores(mt_wgs_quant, "DBP", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_quant, "HDL", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_quant, "Height", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_quant, "RBC", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_quant, "TC", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_quant, "leukocyte", "WGS", "PRScs_hg38")

export_Scores(mt_wgs_binary, "T2D", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_binary, "Asthma", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_binary, "Breast_Cancer", "WGS", "PRScs_hg38")
export_Scores(mt_wgs_binary, "Colorectal_Cancer", "WGS", "PRScs_hg38")

In [4]:
Height_df = hl.read_table(f"{bucket}/Scores/WGS/Height_PRScs_hg38_checkpoint.ht")
DBP_df = hl.read_table(f"{bucket}/Scores/WGS/DBP_PRScs_hg38_checkpoint.ht")
HDL_df = hl.read_table(f"{bucket}/Scores/WGS/HDL_PRScs_hg38_checkpoint.ht")
TC_df = hl.read_table(f"{bucket}/Scores/WGS/TC_PRScs_hg38_checkpoint.ht")
RBC_df = hl.read_table(f"{bucket}/Scores/WGS/RBC_PRScs_hg38_checkpoint.ht")
leukocyte_df = hl.read_table(f"{bucket}/Scores/WGS/leukocyte_PRScs_hg38_checkpoint.ht")
T2D = hl.read_table(f"{bucket}/Scores/WGS/T2D_PRScs_hg38_checkpoint.ht")
Asthma = hl.read_table(f"{bucket}/Scores/WGS/Asthma_PRScs_hg38_checkpoint.ht")
Breast_Cancer = hl.read_table(f"{bucket}/Scores/WGS/Breast_Cancer_PRScs_hg38_checkpoint.ht")
Colorectal_Cancer = hl.read_table(f"{bucket}/Scores/WGS/Colorectal_Cancer_PRScs_hg38_checkpoint.ht")

Initializing Hail with default parameters...

Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11150-m.us-central1-b.c.terra-vpc-sc-fd39b54c.internal:38651
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130.post1-c69cd67afb8b
LOGGING: writing to /home/jupyter/workspaces/prswithwgsvsarraydata/hail-20250119-0235-0.2.130.post1-c69cd67afb8b.log


In [5]:
Height_df.export(f"{bucket}/Scores/WGS/Height_PRScs_hg38.gz")
DBP_df.export(f"{bucket}/Scores/WGS/DBP_PRScs_hg38.gz")
HDL_df.export(f"{bucket}/Scores/WGS/HDL_PRScs_hg38.gz")
TC_df.export(f"{bucket}/Scores/WGS/TC_PRScs_hg38.gz")
RBC_df.export(f"{bucket}/Scores/WGS/RBC_PRScs_hg38.gz")
leukocyte_df.export(f"{bucket}/Scores/WGS/leukocyte_PRScs_hg38.gz")
T2D.export(f"{bucket}/Scores/WGS/T2D_PRScs_hg38.gz")
Asthma.export(f"{bucket}/Scores/WGS/Asthma_PRScs_hg38.gz")
Breast_Cancer.export(f"{bucket}/Scores/WGS/Breast_Cancer_PRScs_hg38.gz")
Colorectal_Cancer.export(f"{bucket}/Scores/WGS/Colorectal_Cancer_PRScs_hg38.gz")

2025-01-19 02:36:41.114 Hail: INFO: merging 161 files totalling 6.4M...6) / 160]
2025-01-19 02:36:42.072 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/WGS/Height_PRScs_hg38.gz
  merge time: 955.608ms
2025-01-19 02:36:48.350 Hail: INFO: merging 161 files totalling 6.4M...8) / 160]
2025-01-19 02:36:48.899 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/WGS/DBP_PRScs_hg38.gz
  merge time: 548.911ms
2025-01-19 02:36:54.137 Hail: INFO: merging 161 files totalling 6.3M...6) / 160]
2025-01-19 02:36:54.641 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/WGS/HDL_PRScs_hg38.gz
  merge time: 503.318ms
2025-01-19 02:36:56.433 Hail: INFO: merging 161 files totalling 6.3M...1) / 160]
2025-01-19 02:36:57.054 Hail: INFO: while writing:
    gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Scores/WGS/TC_PRScs_hg38.gz
  merge time: 621.552ms
2025-01-19 02:36:58.641 Hail: INFO