In [1]:
## import packages
from datetime import datetime
import os
import pandas as pd
import numpy as np

In [2]:
# Workspace settings are injected through environment variables;
# each value is None when the corresponding variable is not set.
DATASET = os.environ.get('WORKSPACE_CDR')
bucket = os.environ.get('WORKSPACE_BUCKET')

In [4]:
%%bash

# Abort on the first failing command so a broken trait is not silently skipped.
set -euo pipefail

# Each trait's summary statistics arrive as Sumstats/WGS_<trait>_QCed.tsv.bgz.
# gunzip ignores files whose suffix it does not recognise (such as .bgz), so the
# file is renamed to .gz before it is decompressed to the plain .tsv.
for trait in Asthma Breast_Cancer Colorectal_Cancer DBP HDL Height leukocyte RBC T2D TC; do
    tsv="Sumstats/WGS_${trait}_QCed.tsv"

    # Make the cell safe to re-run: nothing to do once only the plain .tsv is left.
    if [[ -f "${tsv}" && ! -e "${tsv}.bgz" && ! -e "${tsv}.gz" ]]; then
        echo "${tsv} already decompressed; skipping"
        continue
    fi

    # The rename is conditional so a run that was interrupted between mv and
    # gunzip can be resumed.
    if [[ -e "${tsv}.bgz" ]]; then
        mv "${tsv}.bgz" "${tsv}.gz"
    fi
    gunzip "${tsv}.gz"
done

### Note:

To perform P+T, add a `varid` column to the summary statistics. Its values must use the same variant-ID format as the reference panel (hg38).

In [5]:
def process_sumstats(trait):
    """Add the ``varid`` and ``pval_meta`` columns to one trait's summary statistics.

    Reads ``./Sumstats/WGS_{trait}_QCed.tsv`` (tab-separated), adds the two
    columns and overwrites the same file. Missing values in any column are
    written as the literal string ``NA`` so that no field is left empty.

    Parameters
    ----------
    trait : str
        Trait label used in the file name, e.g. ``'Asthma'``.

    Returns
    -------
    None
        The summary statistics file is rewritten in place.

    Raises
    ------
    FileNotFoundError
        If the summary statistics file does not exist.
    KeyError
        If ``locus``, ``alleles1_sumstats_fixstrand``,
        ``alleles2_sumstats_fixstrand`` or ``neglog10_pval_meta`` is missing.
    """
    sumstats_path = f'./Sumstats/WGS_{trait}_QCed.tsv'
    sumstats = pd.read_csv(sumstats_path, sep='\t')

    # varid = <locus without its first three characters>:<allele1>:<allele2>.
    # NOTE(review): the three dropped characters are presumably a "chr" prefix,
    # so that the ID matches the reference panel -- confirm against the .bim file.
    sumstats['varid'] = (
        sumstats['locus'].str[3:]
        + ":" + sumstats['alleles1_sumstats_fixstrand']
        + ":" + sumstats['alleles2_sumstats_fixstrand']
    )

    # Convert -log10(p) back to a plain p-value.
    sumstats['pval_meta'] = 10 ** (-sumstats['neglog10_pval_meta'])

    # Write missing values as NA directly. The previous sed-based post-processing
    # replaced non-overlapping tab pairs, so two adjacent empty fields left one of
    # them empty, and it required shelling out with an interpolated path.
    sumstats.to_csv(sumstats_path, sep='\t', index=False, header=True, na_rep='NA')

In [6]:
# Traits whose summary statistics get the extra `varid` / `pval_meta` columns.
traits = [
    'Asthma',
    'Breast_Cancer',
    'Colorectal_Cancer',
    'DBP',
    'HDL',
    'Height',
    'leukocyte',
    'RBC',
    'T2D',
    'TC',
]

# Each call rewrites that trait's summary statistics file in place.
for trait in traits:
    process_sumstats(trait)

In [6]:
%%bash

# Traits to clump, in processing order.
traits=(Asthma Breast_Cancer Colorectal_Cancer T2D DBP HDL Height leukocyte RBC TC)

for trait in "${traits[@]}"; do
    # The input summary statistics and every PLINK output share this prefix.
    prefix="Sumstats/WGS_${trait}_QCed"

    # Clump against the hg38 reference panel, reading variant IDs from `varid`
    # and p-values from `pval_meta`. --clump-p1 1 places no p-value restriction
    # on index variants; clumps use r^2 >= 0.1 within 250 kb.
    plink --bfile REF/all_hg38_QCed \
          --clump "${prefix}.tsv" \
          --clump-p1 1 \
          --clump-r2 0.1 \
          --clump-kb 250 \
          --clump-snp-field varid \
          --clump-field pval_meta \
          --out "${prefix}"
done

PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_Asthma_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_Asthma_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_Asthma_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_Asthma_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.

variant.
variant.
variant.
433169 more top variant IDs missing; see log file.


--clump: 468426 clumps formed from 8172127 top variants.
Results written to Sumstats/WGS_Asthma_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_Breast_Cancer_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_Breast_Cancer_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_Breast_Cancer_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_Breast_Cancer_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 1011121314151617181920212223242526272829303132

variant.
variant.
variant.
432851 more top variant IDs missing; see log file.


--clump: 468563 clumps formed from 8169116 top variants.
Results written to Sumstats/WGS_Breast_Cancer_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_Colorectal_Cancer_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_Colorectal_Cancer_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_Colorectal_Cancer_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_Colorectal_Cancer_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202

variant.
variant.
variant.
429493 more top variant IDs missing; see log file.


--clump: 464134 clumps formed from 8108345 top variants.
Results written to Sumstats/WGS_Colorectal_Cancer_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_T2D_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_T2D_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_T2D_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_T2D_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464

variant.
variant.
variant.
433021 more top variant IDs missing; see log file.


--clump: 468161 clumps formed from 8171402 top variants.
Results written to Sumstats/WGS_T2D_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_DBP_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_DBP_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_DBP_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_DBP_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535

variant.
variant.
variant.
427532 more top variant IDs missing; see log file.


--clump: 466453 clumps formed from 8075950 top variants.
Results written to Sumstats/WGS_DBP_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_HDL_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_HDL_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_HDL_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_HDL_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535

variant.
variant.
variant.
426795 more top variant IDs missing; see log file.


--clump: 465726 clumps formed from 8057739 top variants.
Results written to Sumstats/WGS_HDL_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_Height_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_Height_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_Height_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_Height_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474

variant.
variant.
variant.
433081 more top variant IDs missing; see log file.


--clump: 468717 clumps formed from 8171631 top variants.
Results written to Sumstats/WGS_Height_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_leukocyte_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_leukocyte_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_leukocyte_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_leukocyte_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940

variant.
variant.
variant.
427682 more top variant IDs missing; see log file.


--clump: 466426 clumps formed from 8081861 top variants.
Results written to Sumstats/WGS_leukocyte_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_RBC_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_RBC_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_RBC_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_RBC_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505

variant.
variant.
variant.
428411 more top variant IDs missing; see log file.


--clump: 466830 clumps formed from 8104912 top variants.
Results written to Sumstats/WGS_RBC_QCed.clumped .
PLINK v1.90b6.22 64-bit (16 Apr 2021)          www.cog-genomics.org/plink/1.9/
(C) 2005-2021 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Sumstats/WGS_TC_QCed.log.
Options in effect:
  --bfile REF/all_hg38_QCed
  --clump Sumstats/WGS_TC_QCed.tsv
  --clump-field pval_meta
  --clump-kb 250
  --clump-p1 1
  --clump-r2 0.1
  --clump-snp-field varid
  --out Sumstats/WGS_TC_QCed

60285 MB RAM detected; reserving 30142 MB for main workspace.
12542279 variants loaded from .bim file.
2573 people (1276 males, 1296 females, 1 ambiguous) loaded from .fam.
Ambiguous sex ID written to Sumstats/WGS_TC_QCed.nosex .
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2561 founders and 12 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555

variant.
variant.
variant.
427536 more top variant IDs missing; see log file.


--clump: 466573 clumps formed from 8078592 top variants.
Results written to Sumstats/WGS_TC_QCed.clumped .


In [10]:
def select_clump(trait):
    """Keep only the clump index variants of one trait's summary statistics.

    Reads the PLINK clumping report ``Sumstats/WGS_{trait}_QCed.clumped`` and
    the summary statistics ``Sumstats/WGS_{trait}_QCed.tsv``, keeps the rows
    whose ``varid`` appears in the report's ``SNP`` column, and writes them to
    ``Sumstats/WGS_{trait}_QCed_clumped.tsv`` (tab-separated, missing values
    written as ``NA``).

    Parameters
    ----------
    trait : str
        Trait label used in the file names, e.g. ``'Asthma'``.

    Returns
    -------
    None
        The filtered table is written to disk.

    Raises
    ------
    FileNotFoundError
        If either input file does not exist.
    """
    # Only the SNP column of the whitespace-aligned report is needed, so skip
    # parsing the rest. sep=r'\s+' replaces the deprecated delim_whitespace=True.
    clumped = pd.read_csv(
        f'Sumstats/WGS_{trait}_QCed.clumped',
        sep=r'\s+',
        usecols=['SNP'],
    )
    original = pd.read_csv(f'Sumstats/WGS_{trait}_QCed.tsv', sep="\t")

    # Filter the original data to keep only the clumped (index) variants.
    original_sub = original[original['varid'].isin(clumped['SNP'])]

    # pandas parses the 'NA' markers of the input as missing values; write them
    # back as 'NA' so the output keeps the same convention as its input file.
    original_sub.to_csv(
        f'Sumstats/WGS_{trait}_QCed_clumped.tsv',
        sep='\t',
        index=False,
        header=True,
        na_rep='NA',
    )

In [11]:
# Traits to reduce to their clump index variants.
traits = [
    'Asthma', 'Breast_Cancer', 'Colorectal_Cancer', 'DBP', 'HDL',
    'Height', 'leukocyte', 'RBC', 'T2D', 'TC',
]

# Writes one Sumstats/WGS_<trait>_QCed_clumped.tsv per trait.
for trait in traits:
    select_clump(trait)