#### Notebook to format genotypes for use with tensorQTL

We typically store WGS genotypes per chromosome, as VCF or plink2 pfiles.
tensorQTL uses plink1 bfiles, so convert the pfiles to bfiles. Since this is a small cohort, also merge the per-chromosome files into a single genome-wide fileset.

In [1]:
# print the current date and time so the run is timestamped in the cell output
!date

Thu Aug  5 02:26:02 UTC 2021


#### import libraries and set notebook variables

In [2]:
import concurrent.futures
import os
import pandas as pd

In [3]:
# parameters
cohort = 'foundin'
amp_abbr = 'PP'
version = 'amppdv1'

# naming
cohort_version = f'{cohort}.{version}'

# directories
wrk_dir = f'/home/jupyter/sceqtl'
geno_dir = f'{wrk_dir}/genotypes'
quants_dir = f'{wrk_dir}/quants'
info_dir = f'{wrk_dir}/sample_info'
public_dir = f'{wrk_dir}/public'

# input files
pfiles = '{genodir}/{cohortversion}.chr{chr}'
gwas_results_file = f'{public_dir}/pd_meta5v2_cojo_results.jma.cojo.csv'

# output files
genome_bfile = f'{geno_dir}/{cohort_version}.bfile'
risk_bfile = f'{geno_dir}/{cohort_version}.risk.bfile'
chr_detected_out_file = '{exprdir}/{cohortbuild}.detected.genes.chr{chr}'
risk_variants_file = f'{public_dir}/pd.risk.variants'

# constant values
autosomes = [str(x) for x in list(range(1,23))]
max_dist = 1000000
capture_out = !(nproc)
max_threads = int(capture_out[0])
alpha_value = 0.05
max_feature_cnt_parallel_load = 20000

#### utility functions

In [4]:
def run_bash_cmd(this_cmd: str) -> None:
    """Run a shell command line through the IPython `!` shell escape.

    Args:
        this_cmd: the complete command line to execute.

    Nothing is returned and the command's exit status is not inspected here,
    so callers cannot tell from this function whether the command succeeded.
    """
    !{this_cmd}

#### convert from plink2 pfiles to plink bfiles

In [5]:
# Convert each autosome's plink2 pfile set into a plink1 bfile set. One plink2
# job is submitted per chromosome and the jobs run concurrently in worker
# processes.
with concurrent.futures.ProcessPoolExecutor() as ppe:
    convert_jobs = []
    for chrom in autosomes:
        this_pfile = pfiles.format(genodir=geno_dir, cohortversion=cohort_version, chr=chrom)
        this_cmd = f'plink2 --pfile {this_pfile} --make-bed --out {this_pfile}.bfile --silent'
        convert_jobs.append(ppe.submit(run_bash_cmd, this_cmd))
    # Retrieve every result so that an exception raised inside a worker (for
    # example a broken process pool) surfaces here instead of being dropped
    # along with the unused future. Note that a non-zero plink2 exit status is
    # not an exception, so it is still not detected by this check.
    for job in concurrent.futures.as_completed(convert_jobs):
        job.result()

In [6]:
# merge the files into a single plink binary set

def frmt_merge_list_file(geno_dir, cohort_version, autosomes):
    """Write the plink merge-list file and return its path.

    One line is written per entry of `autosomes`: the per-chromosome prefix
    built from the module-level `pfiles` template, with `.bfile` appended.
    The file is always `{geno_dir}/bfile_merge-list.txt` and any existing
    file at that path is overwritten.
    """
    list_path = f'{geno_dir}/bfile_merge-list.txt'
    with open(list_path, 'w') as list_out:
        list_out.writelines(
            pfiles.format(genodir=geno_dir, cohortversion=cohort_version, chr=chrom_id)
            + '.bfile\n'
            for chrom_id in autosomes
        )
    return list_path

def run_plink_bfile_merge(merge_file_set, genome_bfile):
    this_cmd = f'plink --merge-list {merge_file_set} --make-bed --allow-no-sex \
    --silent --out {genome_bfile} --maf 0.01 --geno 0.05 --hwe 0.000001'
    !{this_cmd}

# merge the per chrom bfiles into a genome bfile
missnp_file = f'{genome_bfile}-merge.missnp'
# a missnp file left behind by an earlier run would trigger the retry below
# even when this merge succeeds, so clear any stale copy before merging
if os.path.exists(missnp_file):
    os.remove(missnp_file)
merge_file_set = frmt_merge_list_file(geno_dir, cohort_version, autosomes)
run_plink_bfile_merge(merge_file_set, genome_bfile)

# if there was a missnp problem remove those variants and re-attempt the merge
if os.path.exists(missnp_file):
    print('removing problem variants and retrying merge')
    # redo the per chromosome pfile -> bfile conversion, this time excluding
    # the variants that plink listed in the missnp file
    with concurrent.futures.ProcessPoolExecutor() as ppe:
        retry_jobs = []
        for chrom in autosomes:
            this_pfile = pfiles.format(genodir=geno_dir, cohortversion=cohort_version, chr=chrom)
            this_cmd = (f'plink2 --pfile {this_pfile} --make-bed --out {this_pfile}.bfile '
                        f'--silent --exclude {missnp_file}')
            retry_jobs.append(ppe.submit(run_bash_cmd, this_cmd))
        # retrieve every result so an exception raised in a worker is not
        # silently dropped with the future; a non-zero plink2 exit status is
        # not an exception and is still not detected here
        for job in concurrent.futures.as_completed(retry_jobs):
            job.result()

    # try the merge again
    merge_file_set = frmt_merge_list_file(geno_dir, cohort_version, autosomes)
    run_plink_bfile_merge(merge_file_set, genome_bfile)

with matching IDs are all merged together; if this is not what you want (e.g.
you have a bunch of novel variants, all with ID "."), assign distinct IDs to
them (with e.g. --set-missing-var-ids) before rerunning this merge.
to length-80+ variant IDs; consider using a different naming scheme for long
indels and the like.
Error: 6239 variants with 3+ alleles present.
* If you believe this is due to strand inconsistency, try --flip with
  /home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile-merge.missnp.
  alleles probably remain in your data.  If LD between nearby SNPs is high,
  --flip-scan should detect them.)
* If you are dealing with genuine multiallelic variants, we recommend exporting
  that subset of the data to VCF (via e.g. '--recode vcf'), merging with
  another tool/script, and then importing the result; PLINK is not yet suited
  to handling them.
See https://www.cog-genomics.org/plink/1.9/data#merge3 for more discussion.
removing problem variants and retrying merge
to length-8

In [7]:
# list the files written under the genome bfile prefix, then show the first
# and last lines of the plink log for the merge
!ls {genome_bfile}*
!head {genome_bfile}.log
!tail {genome_bfile}.log

/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile-merge.missnp
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.bed
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.bim
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.fam
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.log
PLINK v1.90b6.21 64-bit (19 Oct 2020)
Options in effect:
  --allow-no-sex
  --geno 0.05
  --hwe 0.000001
  --maf 0.01
  --make-bed
  --merge-list /home/jupyter/sceqtl/genotypes/bfile_merge-list.txt
  --out /home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile
  --silent
--hwe: 15 variants removed due to Hardy-Weinberg exact test.
6980472 variants removed due to minor allele threshold(s)
(--maf/--max-maf/--mac/--max-mac).
8697174 variants and 119 people pass filters and QC.
Note: No phenotypes present.
--make-bed to /home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.bed +
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.bim +
/home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile.fam ... done.



#### IDs used in the analysis are prefixed 'PPMI', so replace the AMP-PD 'PP-' prefix in the sample IDs

In [None]:
# read the fam file; it has no header so columns are addressed by position.
# the separator is a raw string because '\s' is not a valid escape sequence
# in an ordinary string literal
fam_df = pd.read_csv(f'{genome_bfile}.fam', sep=r'\s+', header=None)
print(fam_df.shape)
# do the replace: swap 'PP-' for 'PPMI' in column 0 and use the corrected
# values for both ID columns (0 and 1). regex=False makes this a literal
# substring replacement regardless of the installed pandas default
ppmi_ids = fam_df[0].str.replace('PP-', 'PPMI', regex=False)
fam_df[0] = ppmi_ids
fam_df[1] = ppmi_ids
print(fam_df.shape)
# write the corrected file back over the original fam file
fam_df.to_csv(f'{genome_bfile}.fam', header=False, index=False, sep=' ')

#### subset genome bfile to just the risk index variants

In [9]:
# load the GWAS results table named by gwas_results_file, then show its
# dimensions and first rows as a quick sanity check
gwas_df = pd.read_csv(gwas_results_file)
print(gwas_df.shape)
display(gwas_df.head())

(115, 14)


Unnamed: 0,Chr,SNP,bp,refA,freq,b,se,p,n,freq_geno,bJ,bJ_se,pJ,LD_r
0,10,rs72840788,119656173,A,0.2155,0.0763,0.0113,1.45608e-11,1730210.0,0.213533,0.084061,0.01132,1.1224e-13,-0.064738
1,10,rs117896735,119776815,A,0.0166,0.4354,0.0394,2.17363e-28,1473810.0,0.015191,0.452913,0.039472,1.77734e-30,0.0
2,10,rs896435,15515407,T,0.6892,0.0735,0.0101,3.4075e-13,1715830.0,0.688791,0.0735,0.0101,3.41028e-13,0.0
3,10,rs10748818,102255522,A,0.8514,-0.079,0.013,1.2251e-09,1588080.0,0.854401,-0.079,0.013,1.22563e-09,0.0
4,11,rs3802920,133917106,T,0.2054,0.1073,0.0117,4.6891099999999996e-20,1721930.0,0.206198,0.1073,0.0117,4.6987499999999995e-20,0.0


In [10]:
# how many of the gwas variants are in the bfiles
# the bim file has no header; column 1 holds the variant IDs compared below.
# the separator is a raw string because '\s' is not a valid escape sequence
# in an ordinary string literal
bim_df = pd.read_csv(f'{genome_bfile}.bim', header=None, sep=r'\s+')
print(bim_df.shape)
display(bim_df.head())

# build each ID set once and reuse it for both the overlap and the difference
gwas_variants = set(gwas_df['SNP'])
bim_variants = set(bim_df[1])

variant_intersect = gwas_variants & bim_variants
print(f'found {len(variant_intersect)} in both bfile and gwas')
missing_variants = gwas_variants - bim_variants
print('here are the missing variants')
print(missing_variants)

(8697174, 6)


Unnamed: 0,0,1,2,3,4,5
0,1,rs201234755,0,766399,G,GAATA
1,1,rs142559957,0,769257,A,G
2,1,rs78250985,0,772506,C,G
3,1,rs958827772,0,772587,T,C
4,1,rs866924320,0,773628,A,T


found 104 in both bfile and gwas
here are the missing variants
{'rs181609621', 'rs199461', 'rs9468195', 'rs114138760', 'rs74751235', 'rs3794253', 'rs35749011', 'rs79956144', 'rs12502292', 'rs62465432', 'rs144755950'}


In [11]:
pd.DataFrame(data=variant_intersect).to_csv(risk_variants_file, index=False, header=False)
this_cmd = f'plink --bfile {genome_bfile} --make-bed --out {risk_bfile} \
--silent --extract {risk_variants_file}'
print(this_cmd)
!{this_cmd}

plink --bfile /home/jupyter/sceqtl/genotypes/foundin.amppdv1.bfile --make-bed --out /home/jupyter/sceqtl/genotypes/foundin.amppdv1.risk.bfile --silent --extract /home/jupyter/sceqtl/public/pd.risk.variants
