## Notebook to format and re-annotate the IPDGC Meta5 ([Nalls PD GWAS 2019](https://pubmed.ncbi.nlm.nih.gov/31701892/), [without 23andMe data](https://drive.google.com/file/d/1FZ9UL99LAqyWnyNBxxlx6qOUlfAnublN/view?usp=sharing)) summary stats pulled from pdgenetics.org 

- summary stats as downloaded have position based IDs based on hg19 reference
- annotate with dbSNP IDs
- annotate with hg38 positions
- modify effect allele and frequency info such that the minor allele is the effect allele

In [1]:
!date

Fri Jul 29 16:44:57 UTC 2022


#### import libraries

In [2]:
from pandas import read_csv, DataFrame
from os import makedirs
import concurrent.futures
from dask.dataframe import read_csv as dask_read_csv
from numpy import exp

#### set notebook variables

In [3]:
# naming
project = 'adrd'
cohort = 'ipsc'

# directories
wrk_dir = '/home/jupyter/ipsc_qtl'
public_dir = f'{wrk_dir}/public'

# in files
in_sum_stats_file = f'{public_dir}/nalls_pd_gwas/nallsEtAl2019_excluding23andMe_allVariants.tab.zip'
hrc_file = f'{public_dir}/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz'

# out files
out_file = f'{public_dir}/nalls_pd_gwas/pdmeta_sumstats_hg38_no23andme.parquet'
out_catalogue_file = f'{public_dir}/nalls_pd_gwas/pd_sumstats_no23andme_buildGRCh38.tsv.gz'

# variables
DEBUG = True
autosomes = [str(x) for x in list(range(1,23))]

### load input data

#### load the GWAS summary stats file

In [4]:
%%time
gwas_df = read_csv(in_sum_stats_file, sep='\t')
print(f'gwas summary stats shape {gwas_df.shape}')

if DEBUG:
    display(gwas_df.head())

gwas summary stats shape (17510617, 9)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls
0,chr11:88249377,T,C,0.9931,0.1575,0.1783,0.3771,7161,5356
1,chr1:60320992,A,G,0.9336,0.0605,0.0456,0.1846,26421,442271
2,chr2:18069070,T,C,0.9988,-0.6774,1.3519,0.6163,582,905
3,chr8:135908647,A,G,0.2081,-0.0358,0.0273,0.1887,26421,442271
4,chr12:3871714,A,C,0.9972,0.1489,1.0636,0.8886,749,658


CPU times: user 19.4 s, sys: 1.7 s, total: 21.1 s
Wall time: 21.1 s


#### load the HRC reference files

In [5]:
%%time
hrc_df = read_csv(hrc_file, sep='\t')
print(f'HRC reference shape {hrc_df.shape}')

if DEBUG:
    display(hrc_df.head())

  exec(code, glob, local_ns)


HRC reference shape (40405505, 12)


Unnamed: 0,#CHROM,POS,ID,REF,ALT,AC,AN,AF,AC_EXCLUDING_1000G,AN_EXCLUDING_1000G,AF_EXCLUDING_1000G,AA
0,1,13380,rs571093408,C,G,5,64940,7.7e-05,0,59950,0.0,.
1,1,16071,rs541172944,G,A,8,64940,0.000123,0,59950,0.0,G
2,1,16141,rs529651976,C,T,9,64940,0.000139,4,59950,6.7e-05,C
3,1,16280,.,T,C,43,64940,0.000662,2,59950,3.3e-05,T
4,1,49298,rs200943160,T,C,41571,64940,0.640145,38155,59950,0.636447,T


CPU times: user 51.5 s, sys: 4.68 s, total: 56.2 s
Wall time: 56.2 s


### annotate dbSNP IDs to summary stats

#### create posID type in the HRC files

In [6]:
hrc_df['posID'] = 'chr' + hrc_df['#CHROM'].astype('str') + ':' + hrc_df.POS.astype('str')
print(f'HRC reference shape {hrc_df.shape}')

if DEBUG:
    display(hrc_df.head())

HRC reference shape (40405505, 13)


Unnamed: 0,#CHROM,POS,ID,REF,ALT,AC,AN,AF,AC_EXCLUDING_1000G,AN_EXCLUDING_1000G,AF_EXCLUDING_1000G,AA,posID
0,1,13380,rs571093408,C,G,5,64940,7.7e-05,0,59950,0.0,.,chr1:13380
1,1,16071,rs541172944,G,A,8,64940,0.000123,0,59950,0.0,G,chr1:16071
2,1,16141,rs529651976,C,T,9,64940,0.000139,4,59950,6.7e-05,C,chr1:16141
3,1,16280,.,T,C,43,64940,0.000662,2,59950,3.3e-05,T,chr1:16280
4,1,49298,rs200943160,T,C,41571,64940,0.640145,38155,59950,0.636447,T,chr1:49298


#### how many of the summary stats variants are not present in HRC

In [7]:
cnt = len(set(gwas_df.SNP) & set(hrc_df.posID))
print(f'count or posID intersect between GWAS summary stats and HRC ref is {cnt}')
missing_cnt = len(set(gwas_df.SNP) - set(hrc_df.posID))
print(f'count of posID in GWAS summary stats but not in HRC ref is {missing_cnt}')

count or posID intersect between GWAS summary stats and HRC ref is 17510617
count of posID in GWAS summary stats but not in HRC ref is 0


#### merge subset of HRC columns to summary stats keyed on podID

In [8]:
hrc_cols_to_keep = ['posID', 'ID', 'REF', 'ALT']
gwas_df = gwas_df.merge(hrc_df[hrc_cols_to_keep], how='inner', left_on='SNP', right_on='posID')
print(f'gwas summary stats shape {gwas_df.shape}')

if DEBUG:
    display(gwas_df.head())

gwas summary stats shape (17579908, 13)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT
0,chr11:88249377,T,C,0.9931,0.1575,0.1783,0.3771,7161,5356,chr11:88249377,rs11020170,T,C
1,chr1:60320992,A,G,0.9336,0.0605,0.0456,0.1846,26421,442271,chr1:60320992,rs116406626,A,G
2,chr2:18069070,T,C,0.9988,-0.6774,1.3519,0.6163,582,905,chr2:18069070,.,T,C
3,chr8:135908647,A,G,0.2081,-0.0358,0.0273,0.1887,26421,442271,chr8:135908647,rs11992603,G,A
4,chr12:3871714,A,C,0.9972,0.1489,1.0636,0.8886,749,658,chr12:3871714,rs192908256,A,C


In [9]:
cnt_wo_dbsnp = gwas_df.loc[gwas_df.ID == '.'].shape[0]
frac_wo_dbsnp = cnt_wo_dbsnp/gwas_df.shape[0]
print(f'{frac_wo_dbsnp:.3f} fraction of variants without dbSNP IDs')

0.072 fraction of variants without dbSNP IDs


### annotate summary stats with hg38 variant positions

#### since annotating the hg38 positions based on dbSNP IDs drop the ones that don't have a dbSNP IDs

In [10]:
gwas_df = gwas_df.loc[gwas_df.ID != '.']
print(f'gwas summary stats shape {gwas_df.shape}')

if DEBUG:
    display(gwas_df.head())    

gwas summary stats shape (16311706, 13)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT
0,chr11:88249377,T,C,0.9931,0.1575,0.1783,0.3771,7161,5356,chr11:88249377,rs11020170,T,C
1,chr1:60320992,A,G,0.9336,0.0605,0.0456,0.1846,26421,442271,chr1:60320992,rs116406626,A,G
3,chr8:135908647,A,G,0.2081,-0.0358,0.0273,0.1887,26421,442271,chr8:135908647,rs11992603,G,A
4,chr12:3871714,A,C,0.9972,0.1489,1.0636,0.8886,749,658,chr12:3871714,rs192908256,A,C
5,chr16:77148858,A,G,0.9976,-0.1213,0.3874,0.7543,6248,4391,chr16:77148858,rs189372368,A,G


#### pull down the dbSNP annotation files

In [11]:
def run_bash_cmd(cmd: str):
    !{cmd}

#funciton to pull down the UCSC dbSNP files
def pull_ucsc_dbsnp_files(version_build_name: str, chrs_list: list, 
                          out_dir: str, threaded: bool=False, verbose: bool=False):
    snp_bed_url = f'ftp://ftp.ncbi.nih.gov/snp/organisms/{version_build_name}/BED/'
    with concurrent.futures.ProcessPoolExecutor() as ppe:
        for chrom in chrs_list:
            this_chr_snp_bed = f'bed_chr_{chrom}.bed.gz'
            snp_bed_file = snp_bed_url + this_chr_snp_bed
            this_cmd = f'curl --silent -L {snp_bed_file} --output {out_dir}/{this_chr_snp_bed}'
            if verbose:
                print(this_cmd)
            if threaded:
                ppe.submit(run_bash_cmd, this_cmd)
            else:
                run_bash_cmd(this_cmd)

In [12]:
%%time
# pull down hg38 dbSNP b151
out_dir = f'{public_dir}/dbsnp_hg38'
makedirs(out_dir, exist_ok=True)

pull_ucsc_dbsnp_files('human_9606_b151_GRCh38p7', autosomes, out_dir, 
                      threaded=True, verbose=False)

CPU times: user 64.2 ms, sys: 3.53 s, total: 3.59 s
Wall time: 1min 9s


#### load the dbSNP bed files

In [13]:
%%time

# def load_dbsnp_bed(file_path: str) -> DataFrame:
#     this_df = read_csv(file_path, skiprows=1, header=None, sep='\t')
#     this_df.drop(columns=[1, 4, 5], inplace=True)
#     this_df.columns = ['chrom', 'position', 'id']
#     return this_df

# fs_list = []
# lm_results = []
# with concurrent.futures.ThreadPoolExecutor() as tpe:
#     for chrom in autosomes:
#         this_file = f'{public_dir}/dbsnp_hg38/bed_chr_{chrom}.bed.gz'
#         fs_list.append(tpe.submit(load_dbsnp_bed, this_file))
# for future in concurrent.futures.as_completed(fs_list):
#     load_results.append(future.result())

# # flatten the list
# dbsnp_df = concat([item for item in load_results])
# print(dbsnp_df.shape)
# if DEBUG:
#     display(dbsnp_df.sample(10))

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.63 µs


In [14]:
%%time

bed_files = f'{public_dir}/dbsnp_hg38/bed_chr_*.bed.gz'
dbsnp_df = dask_read_csv(bed_files, skiprows=1, header=None, 
                         sep='\t', usecols=[0, 2, 3]).compute()
# dbsnp_df.drop(columns=[1, 4, 5], inplace=True)
dbsnp_df.columns = ['chrom', 'position', 'id']
print(dbsnp_df.shape)
if DEBUG:
    display(dbsnp_df.sample(10))

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  "Setting ``blocksize=None``" % compression


(633520791, 3)


Unnamed: 0,chrom,position,id
17785105,chr3,139990223,rs989100640
5232190,chr5,130555511,rs551330192
3519084,chr19,31157075,rs781603457
22299731,chr12,453909,rs1315965319
15871892,chr14,66200769,rs1342810072
825243,chr10,15901476,rs77388106
33749671,chr1,212281844,rs1265157675
12210849,chr6,98548145,rs939358627
3367880,chr7,64456505,rs527562885
44367662,chr3,69373073,rs1476055991


CPU times: user 14min 3s, sys: 4min 1s, total: 18min 4s
Wall time: 9min 29s


#### only need the variants that are in the summary stats

In [15]:
dbsnp_df = dbsnp_df.loc[dbsnp_df.id.isin(gwas_df.ID)]
print(dbsnp_df.shape)
if DEBUG:
    display(dbsnp_df.sample(10))

(16263779, 3)


Unnamed: 0,chrom,position,id
3222549,chr14,60098879,rs569515776
2600502,chr2,164269495,rs147574942
794877,chr6,5039553,rs62385159
408496,chr8,29591134,rs17060380
886211,chr3,98168032,rs72931982
5977525,chr5,118008414,rs562831561
1190309,chr4,101284781,rs78650301
439314,chr1,39041489,rs10888650
284481,chr5,58983004,rs7713345
310263,chr4,56483116,rs7700034


#### merget the dbSNP 151 hg38 annotations onto the gwas summary statistics

In [16]:
gwas_df = gwas_df.merge(dbsnp_df, how='left', left_on='ID', right_on='id')
print(f'gwas summary stats shape {gwas_df.shape}')

if DEBUG:
    display(gwas_df.head())

gwas summary stats shape (16311706, 16)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT,chrom,position,id
0,chr11:88249377,T,C,0.9931,0.1575,0.1783,0.3771,7161,5356,chr11:88249377,rs11020170,T,C,chr11,88516209.0,rs11020170
1,chr1:60320992,A,G,0.9336,0.0605,0.0456,0.1846,26421,442271,chr1:60320992,rs116406626,A,G,chr1,59855320.0,rs116406626
2,chr8:135908647,A,G,0.2081,-0.0358,0.0273,0.1887,26421,442271,chr8:135908647,rs11992603,G,A,chr8,134896404.0,rs11992603
3,chr12:3871714,A,C,0.9972,0.1489,1.0636,0.8886,749,658,chr12:3871714,rs192908256,A,C,chr12,3762548.0,rs192908256
4,chr16:77148858,A,G,0.9976,-0.1213,0.3874,0.7543,6248,4391,chr16:77148858,rs189372368,A,G,chr16,77114961.0,rs189372368


#### how many of the the HRC dbSNP IDs aren't in or didn't merge for some other resion in dbSNP151
note could've just used 'inner' merge to not include these but doing in two steps so I can see the count

In [17]:
missing_df = gwas_df.loc[gwas_df.ID != gwas_df.id]
print(f'number of variants that did not merge from dbSNP 151 {missing_df.shape[0]}')
if DEBUG:
    display(missing_df.head())

number of variants that did not merge from dbSNP 151 18118


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT,chrom,position,id
549,chr6:29968965,A,G,0.064,0.0493,0.036,0.1717,32505,448088,chr6:29968965,rs558185119,G,A,,,
1546,chr2:129706224,T,C,0.9989,-0.0673,0.6657,0.9195,2692,2238,chr2:129706224,rs111489065,C,T,,,
2273,chr10:107779570,A,G,0.261,-0.0558,0.0307,0.06927,7803,5852,chr10:107779570,rs112715238,G,A,,,
3790,chr14:74592024,A,G,0.8584,0.0027,0.0251,0.913,33674,449056,chr14:74592024,rs138656424,A,G,,,
3984,chr21:18617763,A,C,0.1024,0.0046,0.0422,0.9122,25252,441303,chr21:18617763,rs146412710,C,A,,,


#### go ahead and remove the ones that were missing from dbSNP 151

In [18]:
gwas_df = gwas_df.loc[~gwas_df.id.isna()]
print(f'gwas summary stats shape {gwas_df.shape}')

if DEBUG:
    display(gwas_df.head()) 

gwas summary stats shape (16293588, 16)


Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT,chrom,position,id
0,chr11:88249377,T,C,0.9931,0.1575,0.1783,0.3771,7161,5356,chr11:88249377,rs11020170,T,C,chr11,88516209.0,rs11020170
1,chr1:60320992,A,G,0.9336,0.0605,0.0456,0.1846,26421,442271,chr1:60320992,rs116406626,A,G,chr1,59855320.0,rs116406626
2,chr8:135908647,A,G,0.2081,-0.0358,0.0273,0.1887,26421,442271,chr8:135908647,rs11992603,G,A,chr8,134896404.0,rs11992603
3,chr12:3871714,A,C,0.9972,0.1489,1.0636,0.8886,749,658,chr12:3871714,rs192908256,A,C,chr12,3762548.0,rs192908256
4,chr16:77148858,A,G,0.9976,-0.1213,0.3874,0.7543,6248,4391,chr16:77148858,rs189372368,A,G,chr16,77114961.0,rs189372368


In [19]:
test_snps = ['rs114138760', 'rs35749011', 'rs76763715', 'rs6658353',
             'rs11578699', 'rs823118', 'rs11557080', 'rs4653767',
             'rs10797576', 'rs76116224']
display(gwas_df.loc[gwas_df.ID.isin(test_snps)])

Unnamed: 0,SNP,A1,A2,freq,b,se,p,N_cases,N_controls,posID,ID,REF,ALT,chrom,position,id
2764250,chr1:205723572,T,C,0.5748,0.0999,0.0171,4.941e-09,33674,449056,chr1:205723572,rs823118,C,T,chr1,205754444.0,rs823118
3539407,chr1:205737739,A,G,0.1426,0.1351,0.0241,2.122e-08,33674,449056,chr1:205737739,rs11557080,G,A,chr1,205768611.0,rs11557080
3893676,chr1:154898185,C,G,0.0112,0.3113,0.0844,0.0002246,33674,449056,chr1:154898185,rs114138760,G,C,chr1,154925709.0,rs114138760
6114374,chr1:155205634,T,C,0.993,-0.4907,0.1426,0.0005764,12989,10894,chr1:155205634,rs76763715,T,C,chr1,155235843.0,rs76763715
6568248,chr2:18147848,A,T,0.9105,0.1551,0.0403,0.000119,26948,442743,chr2:18147848,rs76116224,A,T,chr2,17966582.0,rs76116224
7573730,chr1:171719769,T,C,0.1955,-0.0781,0.0222,0.0004235,32505,448088,chr1:171719769,rs11578699,C,T,chr1,171750629.0,rs11578699
9698034,chr1:161469054,C,G,0.5014,0.0722,0.0171,2.418e-05,33674,449056,chr1:161469054,rs6658353,G,C,chr1,161499264.0,rs6658353
10185847,chr1:226916078,T,C,0.7158,0.0729,0.0186,8.667e-05,33674,449056,chr1:226916078,rs4653767,T,C,chr1,226728377.0,rs4653767
13526195,chr1:155135036,A,G,0.0191,0.7508,0.0659,5.0219999999999996e-30,33674,449056,chr1:155135036,rs35749011,G,A,chr1,155162560.0,rs35749011
15191523,chr1:232664611,T,C,0.1434,0.0998,0.0241,3.532e-05,33674,449056,chr1:232664611,rs10797576,C,T,chr1,232528865.0,rs10797576


#### cleanup, re-order, and subset gwas summary statistis to keep
format columns to be very similar to public summary stats available via NHGRI-EBI GWAS Catalog

In [20]:
gwas_df.rename(columns={'ID': 'variant_id', 'p': 'p_value', 'chrom': 'chromosome', 
                        'position': 'base_pair_location', 'A1': 'effect_allele', 
                        'A2': 'other_allele', 'freq': 'effect_allele_frequency', 
                        'b': 'beta', 'se': 'standard_error', 'N_cases': 'n_cases', 
                        'N_controls': 'n_controls', 'REF': 'ref_allele', 
                        'ALT': 'alt_allele'}, inplace=True)
# need to strip 'chr' from chromosome
gwas_df.chromosome = gwas_df.chromosome.str.replace('chr', '')
# make sure bp pos is int instead of float
gwas_df.base_pair_location = gwas_df.base_pair_location.astype('int64')

# reorder columns
gwas_df = gwas_df[['variant_id', 'p_value', 'chromosome', 'base_pair_location', 
                   'effect_allele', 'other_allele', 'effect_allele_frequency', 
                   'beta', 'standard_error', 'n_cases', 'n_controls', 
                   'ref_allele', 'alt_allele']]

print(f'gwas summary stats shape {gwas_df.shape}')
if DEBUG:
    display(gwas_df.head()) 

gwas summary stats shape (16293588, 13)


Unnamed: 0,variant_id,p_value,chromosome,base_pair_location,effect_allele,other_allele,effect_allele_frequency,beta,standard_error,n_cases,n_controls,ref_allele,alt_allele
0,rs11020170,0.3771,11,88516209,T,C,0.9931,0.1575,0.1783,7161,5356,T,C
1,rs116406626,0.1846,1,59855320,A,G,0.9336,0.0605,0.0456,26421,442271,A,G
2,rs11992603,0.1887,8,134896404,A,G,0.2081,-0.0358,0.0273,26421,442271,G,A
3,rs192908256,0.8886,12,3762548,A,C,0.9972,0.1489,1.0636,749,658,A,C
4,rs189372368,0.7543,16,77114961,A,G,0.9976,-0.1213,0.3874,6248,4391,A,G


#### compute the odds ratio values and add

In [21]:
gwas_df['odds_ratio'] = exp(gwas_df.beta)

print(f'gwas summary stats shape {gwas_df.shape}')
if DEBUG:
    display(gwas_df.head()) 

gwas summary stats shape (16293588, 14)


Unnamed: 0,variant_id,p_value,chromosome,base_pair_location,effect_allele,other_allele,effect_allele_frequency,beta,standard_error,n_cases,n_controls,ref_allele,alt_allele,odds_ratio
0,rs11020170,0.3771,11,88516209,T,C,0.9931,0.1575,0.1783,7161,5356,T,C,1.170581
1,rs116406626,0.1846,1,59855320,A,G,0.9336,0.0605,0.0456,26421,442271,A,G,1.062368
2,rs11992603,0.1887,8,134896404,A,G,0.2081,-0.0358,0.0273,26421,442271,G,A,0.964833
3,rs192908256,0.8886,12,3762548,A,C,0.9972,0.1489,1.0636,749,658,A,C,1.160557
4,rs189372368,0.7543,16,77114961,A,G,0.9976,-0.1213,0.3874,6248,4391,A,G,0.885768


In [22]:
display(gwas_df.loc[gwas_df.variant_id.isin(test_snps)])

Unnamed: 0,variant_id,p_value,chromosome,base_pair_location,effect_allele,other_allele,effect_allele_frequency,beta,standard_error,n_cases,n_controls,ref_allele,alt_allele,odds_ratio
2764250,rs823118,4.941e-09,1,205754444,T,C,0.5748,0.0999,0.0171,33674,449056,C,T,1.10506
3539407,rs11557080,2.122e-08,1,205768611,A,G,0.1426,0.1351,0.0241,33674,449056,G,A,1.144651
3893676,rs114138760,0.0002246,1,154925709,C,G,0.0112,0.3113,0.0844,33674,449056,G,C,1.365199
6114374,rs76763715,0.0005764,1,155235843,T,C,0.993,-0.4907,0.1426,12989,10894,T,C,0.612198
6568248,rs76116224,0.000119,2,17966582,A,T,0.9105,0.1551,0.0403,26948,442743,A,T,1.167775
7573730,rs11578699,0.0004235,1,171750629,T,C,0.1955,-0.0781,0.0222,32505,448088,C,T,0.924872
9698034,rs6658353,2.418e-05,1,161499264,C,G,0.5014,0.0722,0.0171,33674,449056,G,C,1.07487
10185847,rs4653767,8.667e-05,1,226728377,T,C,0.7158,0.0729,0.0186,33674,449056,T,C,1.075623
13526195,rs35749011,5.0219999999999996e-30,1,155162560,A,G,0.0191,0.7508,0.0659,33674,449056,G,A,2.118694
15191523,rs10797576,3.532e-05,1,232528865,T,C,0.1434,0.0998,0.0241,33674,449056,C,T,1.10495


### save the formatted and annotated GWAS summary statistics

#### save in file format to match NHGRI-EBI GWAS Catalog

In [23]:
gwas_df.to_csv(out_catalogue_file, index=False, sep='\t')

#### save in parquet for faster access for other projects, etc

In [24]:
gwas_df.to_parquet(out_file)

In [25]:
!date

Fri Jul 29 17:12:43 UTC 2022
