In [21]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np

In [22]:
# Workspace settings supplied through environment variables; each value is
# None when the corresponding variable is not set.
DATASET = os.environ.get('WORKSPACE_CDR')
bucket = os.environ.get('WORKSPACE_BUCKET')

### Note:

To perform P+T (pruning and thresholding), add a `varid` column to the summary statistics. Its values must use the same format as the variant IDs in the reference panel (hg38).

In [3]:
def process_sumstats(ancestry):
    """Add the columns PLINK clumping needs to one ancestry's summary statistics.

    Reads ``./Sumstats/WGS_HDL_{ancestry}_QCed.tsv`` (tab-separated), adds

    * ``varid`` -- ``locus`` without its first three characters, joined by
      ``:`` with ``alleles1_sumstats_fixstrand`` and
      ``alleles2_sumstats_fixstrand``;
    * ``pval_meta`` -- ``10 ** -neglog10_pval_meta``;

    and writes the table back to the same path, so the input file is
    overwritten. Every missing value is written as the literal ``NA``.

    Parameters
    ----------
    ancestry : str
        Ancestry label embedded in the file name (this notebook uses
        ``'eur'``, ``'amr'`` and ``'afr'``).

    Returns
    -------
    None
        The result is the rewritten file on disk.
    """
    # Input and output are the same file: later cells read the augmented
    # table from this exact path.
    sumstats_path = f'./Sumstats/WGS_HDL_{ancestry}_QCed.tsv'
    sumstats = pd.read_csv(sumstats_path, sep='\t')

    # Dropping the first three characters of `locus` assumes a 'chr' prefix
    # -- TODO confirm the result matches the reference panel's variant IDs.
    sumstats['varid'] = (
        sumstats['locus'].str[3:]
        + ":" + sumstats['alleles1_sumstats_fixstrand']
        + ":" + sumstats['alleles2_sumstats_fixstrand']
    )

    # Back-transform -log10(p) to a plain p-value. In float64 this underflows
    # to 0.0 once neglog10_pval_meta exceeds roughly 323.
    sumstats['pval_meta'] = 10 ** (-sumstats['neglog10_pval_meta'])

    # na_rep fills every missing cell with 'NA' while writing. A sed
    # substitution of '\t\t' matches without overlap, so runs of adjacent
    # empty fields (and a leading empty field) would keep an empty cell.
    # Quoting is left at the pandas default (csv.QUOTE_MINIMAL).
    sumstats.to_csv(sumstats_path, sep='\t', index=False, header=True, na_rep='NA')

In [None]:
# Pre-process the summary statistics of every ancestry group in turn.
ancestries = ['eur', 'amr', 'afr']

for ancestry in ancestries:
    process_sumstats(ancestry)

In [5]:
# Single-ancestry run; 'afr' is also in the list looped over in the previous cell.
process_sumstats('afr')

In [6]:
%%bash
# LD-clump each ancestry's summary statistics against the hg38 reference panel.
# Abort the cell on the first failing command, unset variable or broken pipe.
set -euo pipefail

ancestries=("afr")

for ancestry in "${ancestries[@]}"; do
    # Shared prefix: <prefix>.tsv is the input, PLINK writes <prefix>.clumps / .log.
    sumstats_prefix="Sumstats/WGS_HDL_${ancestry}_QCed"

    # --clump-p1 1 admits every variant as an index candidate; variants within
    # 250 kb and r^2 > 0.1 of an index variant are merged into its clump.
    # --clump-id-field / --clump-p-field are the PLINK 2 names of the
    # PLINK 1.9 flags --clump-snp-field / --clump-field.
    plink2 \
        --bfile REF/all_hg38_QCed \
        --clump "${sumstats_prefix}.tsv" \
        --clump-p1 1 \
        --clump-r2 0.1 \
        --clump-kb 250 \
        --clump-id-field varid \
        --clump-p-field pval_meta \
        --out "${sumstats_prefix}"
done

PLINK v2.00a6LM 64-bit Intel (6 Aug 2024)      www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_HDL_afr_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_HDL_afr_QCed.tsv
  --clump-id-field varid
  --clump-kb 250
  --clump-p-field pval_meta
  --clump-p1 1
  --clump-r2 0.1
  --out Sumstats/WGS_HDL_afr_QCed

Start time: Sun Nov 17 20:31:24 2024
60285 MiB RAM detected, ~20202 available; reserving 20138 MiB for main
workspace.
Using up to 16 threads (change this with --threads).
2573 samples (1296 females, 1276 males, 1 ambiguous; 2561 founders) loaded from
REF/all_hg38_QCed.fam.
12542279 variants loaded from REF/all_hg38_QCed.bim.
Note: No phenotype data present.


written to Sumstats/WGS_HDL_afr_QCed.clumps.missing_id .


--clump: 0/7551588 index candidates processed.--clump: 1000/7551588 index candidates processed.--clump: 2000/7551588 index candidates processed.--clump: 3000/7551588 index candidates processed.--clump: 4000/7551588 index candidates processed.--clump: 5000/7551588 index candidates processed.--clump: 6000/7551588 index candidates processed.--clump: 7000/7551588 index candidates processed.--clump: 8000/7551588 index candidates processed.--clump: 9000/7551588 index candidates processed.--clump: 10000/7551588 index candidates processed.--clump: 11000/7551588 index candidates processed.--clump: 12000/7551588 index candidates processed.--clump: 13000/7551588 index candidates processed.--clump: 14000/7551588 index candidates processed.--clump: 15000/7551588 index candidates processed.--clump: 16000/7551588 index candidates processed.--clump: 17000/7551588 index candidates processed.--clump: 18000/7551588 index candidates processed.--clump: 19000/7551588 index candidates pro

--clump: 1258000/7551588 index candidates processed.--clump: 1259000/7551588 index candidates processed.--clump: 1260000/7551588 index candidates processed.--clump: 1261000/7551588 index candidates processed.--clump: 1262000/7551588 index candidates processed.--clump: 1263000/7551588 index candidates processed.--clump: 1264000/7551588 index candidates processed.--clump: 1265000/7551588 index candidates processed.--clump: 1266000/7551588 index candidates processed.--clump: 1267000/7551588 index candidates processed.--clump: 1268000/7551588 index candidates processed.--clump: 1269000/7551588 index candidates processed.--clump: 1270000/7551588 index candidates processed.--clump: 1271000/7551588 index candidates processed.--clump: 1272000/7551588 index candidates processed.--clump: 1273000/7551588 index candidates processed.--clump: 1274000/7551588 index candidates processed.--clump: 1275000/7551588 index candidates processed.--clump: 1276000/7551588 index candidates pro

--clump: 2495000/7551588 index candidates processed.--clump: 2496000/7551588 index candidates processed.--clump: 2497000/7551588 index candidates processed.--clump: 2498000/7551588 index candidates processed.--clump: 2499000/7551588 index candidates processed.--clump: 2500000/7551588 index candidates processed.--clump: 2501000/7551588 index candidates processed.--clump: 2502000/7551588 index candidates processed.--clump: 2503000/7551588 index candidates processed.--clump: 2504000/7551588 index candidates processed.--clump: 2505000/7551588 index candidates processed.--clump: 2506000/7551588 index candidates processed.--clump: 2507000/7551588 index candidates processed.--clump: 2508000/7551588 index candidates processed.--clump: 2509000/7551588 index candidates processed.--clump: 2510000/7551588 index candidates processed.--clump: 2511000/7551588 index candidates processed.--clump: 2512000/7551588 index candidates processed.--clump: 2513000/7551588 index candidates pro

--clump: 3732000/7551588 index candidates processed.--clump: 3733000/7551588 index candidates processed.--clump: 3734000/7551588 index candidates processed.--clump: 3735000/7551588 index candidates processed.--clump: 3736000/7551588 index candidates processed.--clump: 3737000/7551588 index candidates processed.--clump: 3738000/7551588 index candidates processed.--clump: 3739000/7551588 index candidates processed.--clump: 3740000/7551588 index candidates processed.--clump: 3741000/7551588 index candidates processed.--clump: 3742000/7551588 index candidates processed.--clump: 3743000/7551588 index candidates processed.--clump: 3744000/7551588 index candidates processed.--clump: 3745000/7551588 index candidates processed.--clump: 3746000/7551588 index candidates processed.--clump: 3747000/7551588 index candidates processed.--clump: 3748000/7551588 index candidates processed.--clump: 3749000/7551588 index candidates processed.--clump: 3750000/7551588 index candidates pro

--clump: 4969000/7551588 index candidates processed.--clump: 4970000/7551588 index candidates processed.--clump: 4971000/7551588 index candidates processed.--clump: 4972000/7551588 index candidates processed.--clump: 4973000/7551588 index candidates processed.--clump: 4974000/7551588 index candidates processed.--clump: 4975000/7551588 index candidates processed.--clump: 4976000/7551588 index candidates processed.--clump: 4977000/7551588 index candidates processed.--clump: 4978000/7551588 index candidates processed.--clump: 4979000/7551588 index candidates processed.--clump: 4980000/7551588 index candidates processed.--clump: 4981000/7551588 index candidates processed.--clump: 4982000/7551588 index candidates processed.--clump: 4983000/7551588 index candidates processed.--clump: 4984000/7551588 index candidates processed.--clump: 4985000/7551588 index candidates processed.--clump: 4986000/7551588 index candidates processed.--clump: 4987000/7551588 index candidates pro

--clump: 6206000/7551588 index candidates processed.--clump: 6207000/7551588 index candidates processed.--clump: 6208000/7551588 index candidates processed.--clump: 6209000/7551588 index candidates processed.--clump: 6210000/7551588 index candidates processed.--clump: 6211000/7551588 index candidates processed.--clump: 6212000/7551588 index candidates processed.--clump: 6213000/7551588 index candidates processed.--clump: 6214000/7551588 index candidates processed.--clump: 6215000/7551588 index candidates processed.--clump: 6216000/7551588 index candidates processed.--clump: 6217000/7551588 index candidates processed.--clump: 6218000/7551588 index candidates processed.--clump: 6219000/7551588 index candidates processed.--clump: 6220000/7551588 index candidates processed.--clump: 6221000/7551588 index candidates processed.--clump: 6222000/7551588 index candidates processed.--clump: 6223000/7551588 index candidates processed.--clump: 6224000/7551588 index candidates pro

--clump: 382342 clumps formed from 7551588 index candidates.  
Results written to Sumstats/WGS_HDL_afr_QCed.clumps .
End time: Sun Nov 17 20:35:08 2024


In [16]:
def select_clump(ancestry):
    """Keep only the clumped variants from one ancestry's summary statistics.

    Reads the PLINK clump report ``Sumstats/WGS_HDL_{ancestry}_QCed.clumps``
    and the summary statistics ``Sumstats/WGS_HDL_{ancestry}_QCed.tsv``,
    keeps the rows whose ``varid`` occurs in the report's ``ID`` column, and
    writes them to ``Sumstats/WGS_HDL_{ancestry}_QCed_clumps.tsv``
    (tab-separated, missing values written as ``NA``).

    Parameters
    ----------
    ancestry : str
        Ancestry label embedded in the file names (this notebook uses
        ``'eur'``, ``'amr'`` and ``'afr'``).

    Returns
    -------
    None
        The result is the file written to disk.
    """
    prefix = f'Sumstats/WGS_HDL_{ancestry}_QCed'

    # The clump report is whitespace-delimited and only its ID column is
    # needed. sep=r'\s+' replaces the deprecated delim_whitespace=True.
    clumped = pd.read_csv(f'{prefix}.clumps', sep=r'\s+', usecols=['ID'])
    original = pd.read_csv(f'{prefix}.tsv', sep="\t")

    # Filter the original data to keep only the clumped variants
    original_sub = original[original['varid'].isin(clumped['ID'])]

    # pandas parses 'NA' cells as missing on read; na_rep writes them back as
    # 'NA' instead of empty strings so the file keeps its missing-value marker.
    original_sub.to_csv(f'{prefix}_clumps.tsv', sep='\t', index=False, header=True, na_rep='NA')

In [17]:
# Extract the clumped variants for the 'afr' summary statistics only.
select_clump('afr')

In [11]:
# Extract the clumped variants for every ancestry group.
# NOTE(review): select_clump reads Sumstats/WGS_HDL_{ancestry}_QCed.clumps, but the
# %%bash clumping cell in this notebook lists only "afr" -- confirm the .clumps
# files for 'eur' and 'amr' exist before running this loop.
ancestries = ['eur', 'amr', 'afr']

for ancestry in ancestries:
    select_clump(ancestry)

In [None]:
# Retrieve clumped variants
# Hail is initialised with GRCh38 as its default reference genome.
import hail as hl
hl.init(default_reference = "GRCh38")

In [37]:
# Load the clumped AFR summary statistics from the workspace bucket as a Hail
# table. `locus` is parsed as a GRCh38 locus; all other column types are imputed.
# NOTE(review): select_clump writes its result to the local Sumstats/ directory,
# while this reads from Sumstats_clumped/ in the bucket -- the copy step is not
# part of this notebook; confirm the file was uploaded.
clumps_tsv = f'{bucket}/Sumstats_clumped/WGS_HDL_afr_QCed_clumps.tsv'
WGS_HDL_afr_clumps = hl.import_table(
    clumps_tsv,
    types={'locus': hl.tlocus('GRCh38')},
    impute=True,
)

2024-11-17 21:09:13.637 Hail: INFO: Reading table to impute column types
2024-11-17 21:09:17.116 Hail: INFO: Finished type imputation        (0 + 1) / 1]
  Loading field 'locus' as type locus<GRCh38> (user-supplied type)
  Loading field 'rsid' as type str (imputed)
  Loading field 'alleles1_wgs' as type str (imputed)
  Loading field 'alleles2_wgs' as type str (imputed)
  Loading field 'alleles1_sumstats_original' as type str (imputed)
  Loading field 'alleles2_sumstats_original' as type str (imputed)
  Loading field 'is_negative_strand' as type bool (imputed)
  Loading field 'alleles1_sumstats_fixstrand' as type str (imputed)
  Loading field 'alleles2_sumstats_fixstrand' as type str (imputed)
  Loading field 'beta_meta' as type float64 (imputed)
  Loading field 'beta_meta_fix_ref_alt' as type str (imputed)
  Loading field 'se_meta' as type float64 (imputed)
  Loading field 'neglog10_pval_meta' as type float64 (imputed)
  Loading field 'varid' as type str (imputed)
  Loading field 'pval

In [40]:
# Reduce the clumped table to its `locus` field.
WGS_HDL_afr_clumps_var = WGS_HDL_afr_clumps.select(WGS_HDL_afr_clumps['locus'])

In [42]:
# Persist the locus-only table to the workspace bucket, replacing any existing copy.
clumps_var_ht = f'{bucket}/Sumstats_clumped/WGS_HDL_afr_clumps_var.ht'
WGS_HDL_afr_clumps_var.write(clumps_var_ht, overwrite=True)

2024-11-17 21:11:04.134 Hail: INFO: wrote table with 382342 rows in 1 partition to gs://fc-secure-9afe7562-2fad-4781-ab60-03528a626c19/Sumstats_clumped/WGS_HDL_afr_clumps_var.ht
