## Notebook to update the public Jerber et al eQTL results with dbSNP IDs and hg38 positions

Jerber J, Seaton DD, Cuomo ASE et al. Population-scale single-cell RNA-seq profiling across dopaminergic neuron differentiation. Nat Genet 2021;53:304–12.
https://pubmed.ncbi.nlm.nih.gov/33664506/

[Data Access](https://zenodo.org/record/4333872#.YEEjEZNKhdA)

In [1]:
# print the current date and time via the shell, as a record of when this run started
!date

Tue Jan  3 23:10:44 UTC 2023


#### import libraries

In [2]:
import concurrent.futures
import subprocess
from os import makedirs

from dask.dataframe import read_csv as dask_read_csv
from pandas import read_csv, DataFrame, concat

#### set notebook variables

In [3]:
# naming

# directories: project workspace, the public-data area beneath it, and the
# Jerber et al data set inside that
wrk_dir = '/home/jupyter/foundin_qtl'
public_dir = f'{wrk_dir}/public'
jerber_dir = f'{public_dir}/jerber_da_neuron_eqtl'
stats_dir = f'{jerber_dir}/eqtl_summary_stats_renamed'

# in files: eQTL summary statistics for the DA and pseudobulk analyses
da_qtl_file = f'{stats_dir}/D52.DA.untreated.qtl_results_all.sorted.txt.gz'
pb_qtl_file = f'{stats_dir}/D52.pseudobulk.untreated.qtl_results_all.sorted.txt.gz'

# out files: the re-annotated results, written as parquet
da_out_file = f'{jerber_dir}/hipsci_D52_DA.untreated.qtl_results_all.hg38.parquet'
pb_out_file = f'{jerber_dir}/hipsci_D52_pseudobulk.untreated.qtl_results_all.hg38.parquet'

# variables
# DEBUG switches on the extra shape prints and sample displays used throughout
DEBUG = False
# chromosome labels '1' .. '22', as strings
autosomes = list(map(str, range(1, 23)))

#### functions

In [4]:
def run_bash_cmd(cmd: str):
    """Run a shell command line and raise if it exits with a non-zero status.

    Uses the standard library instead of the IPython ``!`` shell escape, so the
    function is plain Python (it can be pickled and run in worker processes)
    and a failing command is reported rather than silently ignored.

    Args:
        cmd: the full command line, interpreted by the system shell.

    Raises:
        subprocess.CalledProcessError: if the command's exit status is not 0.
    """
    # shell=True keeps the original contract of accepting one command string
    subprocess.run(cmd, shell=True, check=True)

# function to pull down the per-chromosome dbSNP BED files
def pull_ucsc_dbsnp_files(version_build_name: str, chrs_list: list, 
                          out_dir: str, threaded: bool=False, verbose: bool=False):
    """Download one dbSNP BED file per chromosome with curl.

    Despite the function name, the files are fetched from the NCBI FTP site
    (see the URL built below). Each file is named ``bed_chr_<chrom>.bed.gz``
    and is written under ``out_dir`` with the same name.

    Args:
        version_build_name: directory name under ``snp/organisms/`` on the FTP
            site that selects the dbSNP release and genome build.
        chrs_list: chromosome labels to fetch, without any 'chr' prefix.
        out_dir: existing directory that receives the downloaded files.
        threaded: when True the downloads are submitted to a process pool and
            run concurrently; when False they run one after another.
        verbose: when True each curl command line is printed before it is run
            or submitted.

    Raises:
        Any exception raised by ``run_bash_cmd``; in the concurrent case it is
        re-raised here after all submitted downloads have finished.
    """
    snp_bed_url = f'ftp://ftp.ncbi.nih.gov/snp/organisms/{version_build_name}/BED/'
    # build every command line up front so both execution modes share them
    cmds = []
    for chrom in chrs_list:
        this_chr_snp_bed = f'bed_chr_{chrom}.bed.gz'
        snp_bed_file = snp_bed_url + this_chr_snp_bed
        cmds.append(f'curl --silent -L {snp_bed_file} --output {out_dir}/{this_chr_snp_bed}')

    if not threaded:
        # sequential mode: no worker pool is needed at all
        for this_cmd in cmds:
            if verbose:
                print(this_cmd)
            run_bash_cmd(this_cmd)
        return

    futures = []
    with concurrent.futures.ProcessPoolExecutor() as ppe:
        for this_cmd in cmds:
            if verbose:
                print(this_cmd)
            futures.append(ppe.submit(run_bash_cmd, this_cmd))
    # leaving the with-block waits for every download; asking each future for
    # its result now surfaces any worker exception instead of discarding it
    for future in futures:
        future.result()
                
def load_hg19_dbsnp_chrom_subset(chrom: str, variants: set) -> DataFrame:
    """Load one chromosome's hg19 dbSNP BED file, keeping only the wanted positions.

    Reads ``bed_chr_<chrom>.bed.gz`` from the ``dbsnp_hg19`` directory under the
    notebook-level ``public_dir``, builds a ``posID`` of the form
    ``<chrom>:<position>`` for every row, and keeps the rows whose ``posID`` is
    in ``variants``.

    Args:
        chrom: chromosome label without the 'chr' prefix (e.g. '1').
        variants: posID strings to keep, formatted like the ``chrom`` column
            value followed by ':' and the position (e.g. 'chr1:12345').

    Returns:
        DataFrame with columns ``chrom``, ``position``, ``id`` and ``posID``.
    """
    in_file = f'{public_dir}/dbsnp_hg19/bed_chr_{chrom}.bed.gz'
    # the first line is skipped - presumably a BED track header, TODO confirm.
    # File columns 0, 2 and 3 are used; column 2 is conventionally the BED end
    # coordinate, assumed here to be the 1-based variant position - verify for
    # variants longer than one base
    dbsnp_df = read_csv(in_file, skiprows=1, header=None, sep='\t', usecols=[0, 2, 3])
    dbsnp_df.columns = ['chrom', 'position', 'id']
    # go ahead and subset chrom before making posID column; the explicit copy
    # makes the subset an independent frame so adding a column below does not
    # write into a slice of the full table (SettingWithCopyWarning)
    dbsnp_df = dbsnp_df.loc[dbsnp_df.chrom == f'chr{chrom}'].copy()
    dbsnp_df['posID'] = dbsnp_df.chrom.astype('str') + ':' + dbsnp_df.position.astype('str')
    if DEBUG:
        print(f'chr{chrom} dbSNP reference shape {dbsnp_df.shape}')
    # subset to just variants of interest
    dbsnp_df = dbsnp_df.loc[dbsnp_df.posID.isin(variants)]
    if DEBUG:
        print(f'chr{chrom} kept dbSNP reference shape {dbsnp_df.shape}')
    return dbsnp_df

def load_hg38_dbsnp_chrom_subset(chrom: str, variants: set) -> DataFrame:
    """Load one chromosome's hg38 dbSNP BED file, keeping only the wanted IDs.

    Reads ``bed_chr_<chrom>.bed.gz`` from the ``dbsnp_hg38`` directory under the
    notebook-level ``public_dir`` and keeps the rows that sit on ``chr<chrom>``
    and whose ``id`` is a member of ``variants``.

    Args:
        chrom: chromosome label without the 'chr' prefix.
        variants: values of the ``id`` column to keep.

    Returns:
        DataFrame with columns ``chrom``, ``position`` and ``id``.
    """
    bed_path = f'{public_dir}/dbsnp_hg38/bed_chr_{chrom}.bed.gz'
    # first line skipped (presumably a track header - TODO confirm); only file
    # columns 0, 2 and 3 are read
    bed_df = read_csv(bed_path, skiprows=1, header=None, sep='\t', usecols=[0, 2, 3])
    bed_df.columns = ['chrom', 'position', 'id']

    # keep only the rows labelled with this chromosome
    on_this_chrom = bed_df['chrom'] == f'chr{chrom}'
    bed_df = bed_df.loc[on_this_chrom]
    if DEBUG:
        print(f'chr{chrom} dbSNP reference shape {bed_df.shape}')

    # then only the rows whose ID was asked for
    is_wanted = bed_df['id'].isin(variants)
    bed_df = bed_df.loc[is_wanted]
    if DEBUG:
        print(f'chr{chrom} kept dbSNP reference shape {bed_df.shape}')
    return bed_df

### load input data

#### load the eQTL results

In [5]:
%%time
# DA results: read the whole tab-delimited table and report (rows, columns)
da_qtl_df = read_csv(filepath_or_buffer=da_qtl_file, sep='\t')
print(f'DA eQTL summary stats shape {da_qtl_df.shape}')

# pseudobulk results: same treatment
pb_qtl_df = read_csv(filepath_or_buffer=pb_qtl_file, sep='\t')
print(f'pseudobulk eQTL summary stats shape {pb_qtl_df.shape}')

# optional peek at five random rows of each table
if DEBUG:
    display(da_qtl_df.sample(5), pb_qtl_df.sample(5))

DA eQTL summary stats shape (25586818, 17)
pseudobulk eQTL summary stats shape (20288045, 17)
CPU times: user 1min 32s, sys: 5.89 s, total: 1min 38s
Wall time: 1min 38s


#### need to pull the alleles from the 'snp_id'
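From inspecting the data, `assessed_allele` (taken here to be the effect allele) appears to be the second allele in the underscore-delimited `snp_id`, so the first allele is stored as `other_allele`.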

In [6]:
def _add_other_allele(qtl_df):
    """Add 'other_allele' from snp_id and rename assessed_allele, in place.

    snp_id is split on '_' into four fields labelled chrom, position, A1, A2;
    A1 is stored as 'other_allele' and the existing 'assessed_allele' column is
    renamed to 'effect_allele'.
    """
    id_parts = qtl_df['snp_id'].str.split('_', expand=True)
    id_parts.columns = ['chrom', 'position', 'A1', 'A2']
    qtl_df['other_allele'] = id_parts['A1']
    qtl_df.rename(columns={'assessed_allele': 'effect_allele'}, inplace=True)

# DA results
_add_other_allele(da_qtl_df)
print(f'DA eQTL summary stats shape {da_qtl_df.shape}')
# pseudobulk results
_add_other_allele(pb_qtl_df)
print(f'pseudobulk eQTL summary stats shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.sample(5), pb_qtl_df.sample(5))

DA eQTL summary stats shape (25586818, 18)
pseudobulk eQTL summary stats shape (20288045, 18)


In [7]:
# distinct snp_id values in each results table
da_variant_cnt = da_qtl_df['snp_id'].nunique()
pb_variant_cnt = pb_qtl_df['snp_id'].nunique()
print(f'number of unique variants in DA results {da_variant_cnt}')
print(f'number of unique variants in pseudobulk results {pb_variant_cnt}')

number of unique variants in DA results 4936061
number of unique variants in pseudobulk results 4416814


#### create posID type in the eQTL results

In [8]:
%%time
def _make_pos_id(qtl_df):
    """Return 'chr<snp_chromosome>:<snp_position>' strings, one per row of qtl_df."""
    chrom_txt = qtl_df['snp_chromosome'].astype('str')
    pos_txt = qtl_df['snp_position'].astype('str')
    return 'chr' + chrom_txt + ':' + pos_txt

# posID is the key later used to match the results against the dbSNP reference
da_qtl_df['posID'] = _make_pos_id(da_qtl_df)
print(f'DA eQTL results shape {da_qtl_df.shape}')
pb_qtl_df['posID'] = _make_pos_id(pb_qtl_df)
print(f'pseudobulk eQTL results shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.sample(5), pb_qtl_df.sample(5))

DA eQTL results shape (25586818, 19)
pseudobulk eQTL results shape (20288045, 19)
CPU times: user 53.3 s, sys: 65.7 ms, total: 53.4 s
Wall time: 53.2 s


#### get set of unique variants in either results set

In [9]:
# every posID seen in the DA results, the pseudobulk results, or both
variant_set = set(da_qtl_df['posID']).union(pb_qtl_df['posID'])
print(len(variant_set))

4964218


#### pull down the dbSNP annotation files

In [10]:
%%time
# hg38 dbSNP b151: one BED file per autosome, fetched concurrently into its
# own directory (created here if it does not exist yet)
out_dir = f'{public_dir}/dbsnp_hg38'
makedirs(out_dir, exist_ok=True)

pull_ucsc_dbsnp_files(version_build_name='human_9606_b151_GRCh38p7',
                      chrs_list=autosomes,
                      out_dir=out_dir,
                      threaded=True,
                      verbose=False)

CPU times: user 501 ms, sys: 4 s, total: 4.5 s
Wall time: 34.3 s


In [11]:
%%time
# hg19 dbSNP b151: one BED file per autosome, fetched concurrently into its
# own directory (created here if it does not exist yet)
out_dir = f'{public_dir}/dbsnp_hg19'
makedirs(out_dir, exist_ok=True)

pull_ucsc_dbsnp_files(version_build_name='human_9606_b151_GRCh37p13',
                      chrs_list=autosomes,
                      out_dir=out_dir,
                      threaded=True,
                      verbose=False)

CPU times: user 67.9 ms, sys: 4.08 s, total: 4.15 s
Wall time: 34 s


#### load the hg19 dbSNP bed files

In [12]:
%%time
# one loader task per autosome, run on a thread pool; each task returns only
# the reference rows whose posID is in variant_set
with concurrent.futures.ThreadPoolExecutor() as tpe:
    fs_list = [tpe.submit(load_hg19_dbsnp_chrom_subset, chrom, variant_set)
               for chrom in autosomes]
# gather the per-chromosome frames; .result() re-raises any loader error
lm_results = [future.result()
              for future in concurrent.futures.as_completed(fs_list)]

# stack the per-chromosome frames into a single reference table
dbsnp_df = concat(lm_results)
print(f'dbSNP reference shape {dbsnp_df.shape}')
if DEBUG:
    display(dbsnp_df.sample(5), dbsnp_df.chrom.value_counts())

dbSNP reference shape (5373579, 4)
CPU times: user 24min 56s, sys: 2min 30s, total: 27min 27s
Wall time: 20min 50s


#### drop duplicates from dbSNP to avoid creating duplicates and collisions on merge

In [13]:
# where several reference rows share a posID keep only the first one, so each
# posID matches at most one reference row in the merge that follows
is_repeat = dbsnp_df.duplicated(subset=['posID'], keep='first')
dbsnp_df = dbsnp_df.loc[~is_repeat]
print(f'dbSNP reference shape {dbsnp_df.shape}')
if DEBUG:
    display(dbsnp_df.sample(5), dbsnp_df.chrom.value_counts())

dbSNP reference shape (4963897, 4)


### annotate dbSNP IDs to eQTL results

#### how many of the eQTL variants are not present in dbSNP

In [14]:
# build each set of posIDs once; the columns hold millions of rows, so
# re-creating the same set for every comparison is wasted work
dbsnp_pos_ids = set(dbsnp_df.posID)
da_pos_ids = set(da_qtl_df.posID)
pb_pos_ids = set(pb_qtl_df.posID)

# DA results: posIDs found in the reference, and posIDs absent from it
cnt = len(da_pos_ids & dbsnp_pos_ids)
print(f'count of posID intersect between DA eQTL results and dbSNP ref is {cnt}')
missing_cnt = len(da_pos_ids - dbsnp_pos_ids)
print(f'count of posID in DA eQTL results but not in dbSNP ref is {missing_cnt}')

# pseudobulk results: same two counts
cnt = len(pb_pos_ids & dbsnp_pos_ids)
print(f'count of posID intersect between pseudobulk eQTL results and dbSNP ref is {cnt}')
missing_cnt = len(pb_pos_ids - dbsnp_pos_ids)
print(f'count of posID in pseudobulk eQTL results but not in dbSNP ref is {missing_cnt}')

count of posID intersect between DA eQTL results and dbSNP ref is 4935740
count of posID in DA eQTL results but not in dbSNP ref is 321
count of posID intersect between pseudobulk eQTL results and dbSNP ref is 4416525
count of posID in pseudobulk eQTL results but not in dbSNP ref is 289


### merge hg19 dbSNP columns to eQTL results based on posID

In [15]:
# left join on posID: every eQTL row is kept, and rows with no reference match
# get missing values in the columns that come from dbsnp_df
da_qtl_df = da_qtl_df.merge(right=dbsnp_df, on='posID', how='left')
print(f'DA eQTL results shape {da_qtl_df.shape}')
pb_qtl_df = pb_qtl_df.merge(right=dbsnp_df, on='posID', how='left')
print(f'pseudobulk eQTL results shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.head(), pb_qtl_df.head())

DA eQTL results shape (25586818, 22)
pseudobulk eQTL results shape (20288045, 22)


In [16]:
# rows that found no match in the left merge on posID carry a missing (NaN)
# 'id', not the string '.', so missing-ness is what has to be counted here;
# this is the same test the following cell uses to drop those rows
cnt_wo_dbsnp = int(da_qtl_df.id.isna().sum())
frac_wo_dbsnp = cnt_wo_dbsnp/da_qtl_df.shape[0]
print(f'{frac_wo_dbsnp:.3f} fraction of DA eQTL variants without dbSNP IDs')

cnt_wo_dbsnp = int(pb_qtl_df.id.isna().sum())
frac_wo_dbsnp = cnt_wo_dbsnp/pb_qtl_df.shape[0]
print(f'{frac_wo_dbsnp:.3f} fraction of pseudobulk eQTL variants without dbSNP IDs')

0.000 fraction of DA eQTL variants without dbSNP IDs
0.000 fraction of pseudobulk eQTL variants without dbSNP IDs


### annotate eQTL with hg38 variant positions

#### the hg38 positions are annotated by dbSNP ID, so drop the results that don't have a dbSNP ID

In [17]:
# keep only the rows that received a dbSNP id from the hg19 merge
da_has_id = da_qtl_df['id'].notna()
da_qtl_df = da_qtl_df.loc[da_has_id]
print(f'DA eQTL results shape {da_qtl_df.shape}')
pb_has_id = pb_qtl_df['id'].notna()
pb_qtl_df = pb_qtl_df.loc[pb_has_id]
print(f'pseudobulk eQTL results shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.head(), pb_qtl_df.head())

DA eQTL results shape (25585607, 22)
pseudobulk eQTL results shape (20287181, 22)


#### get set of unique variants, based on dbSNP IDs, in either results set

In [18]:
# every dbSNP id present in the DA results, the pseudobulk results, or both
variant_ids_set = set(da_qtl_df['id']).union(pb_qtl_df['id'])
print(len(variant_ids_set))

4963897


#### drop the hg19 position columns from the eQTL results before merging on the hg38 ones

In [19]:
# columns removed before the hg38 merge: the 'chrom'/'position' pair added by
# the hg19 dbSNP merge, plus the feature_* and snp_* columns of the results.
# NOTE(review): 'posID', which was built from snp_chromosome/snp_position, is
# not in this list and so stays in the frames - confirm that is intended
cols_to_drop = [
    'chrom', 'position',
    'feature_chromosome', 'feature_start', 'feature_end',
    'snp_chromosome', 'snp_position', 'snp_id',
]
da_qtl_df.drop(labels=cols_to_drop, axis='columns', inplace=True)
print(f'DA eQTL results shape {da_qtl_df.shape}')
pb_qtl_df.drop(labels=cols_to_drop, axis='columns', inplace=True)
print(f'pseudobulk eQTL results shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.head(), pb_qtl_df.head())

DA eQTL results shape (25585607, 14)
pseudobulk eQTL results shape (20287181, 14)


#### load the hg38 dbSNP bed files

In [20]:
%%time
# one loader task per autosome, run on a thread pool; each task returns only
# the reference rows whose id is in variant_ids_set
with concurrent.futures.ThreadPoolExecutor() as tpe:
    fs_list = [tpe.submit(load_hg38_dbsnp_chrom_subset, chrom, variant_ids_set)
               for chrom in autosomes]
# gather the per-chromosome frames; .result() re-raises any loader error
lm_results = [future.result()
              for future in concurrent.futures.as_completed(fs_list)]

# stack the per-chromosome frames into a single reference table
dbsnp_df = concat(lm_results)
print(f'hg38 dbSNP reference shape {dbsnp_df.shape}')
if DEBUG:
    display(dbsnp_df.sample(5), dbsnp_df.chrom.value_counts())

hg38 dbSNP reference shape (4963301, 3)
CPU times: user 16min 24s, sys: 2min 5s, total: 18min 29s
Wall time: 11min 59s


#### merge hg38 dbSNP columns to eQTL results based on dbSNP ID

In [21]:
# inner join on the dbSNP id: result rows whose id is absent from the hg38
# reference are dropped, and the reference's chrom/position columns are added.
# NOTE(review): the hg38 reference is not de-duplicated on 'id' before this
# merge (the hg19 one was de-duplicated on posID) - confirm ids are unique,
# otherwise a repeated id multiplies the matching result rows
da_qtl_df = da_qtl_df.merge(right=dbsnp_df, on='id', how='inner')
print(f'DA eQTL results shape {da_qtl_df.shape}')
pb_qtl_df = pb_qtl_df.merge(right=dbsnp_df, on='id', how='inner')
print(f'pseudobulk eQTL results shape {pb_qtl_df.shape}')

if DEBUG:
    display(da_qtl_df.head(), pb_qtl_df.head())

DA eQTL results shape (25581562, 16)
pseudobulk eQTL results shape (20284100, 16)


### save the updated results

In [22]:
%%time
# write each annotated table to its parquet output path, DA first
for results_df, parquet_path in ((da_qtl_df, da_out_file),
                                 (pb_qtl_df, pb_out_file)):
    results_df.to_parquet(parquet_path)

CPU times: user 24.1 s, sys: 1.95 s, total: 26 s
Wall time: 24.6 s


In [23]:
# print the current date and time via the shell, as a record of when this run finished
!date

Tue Jan  3 23:51:06 UTC 2023
