## Imports

In [1]:
import os
import pybedtools
import subprocess
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import plotnine as p9
import pysam

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Get ADPD SNPs

In [2]:
adpd_snps = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/snps_final/191121_ld_buddies_table_stage3.tsv', sep='\t')
display(adpd_snps.head())
display(adpd_snps.shape)

Unnamed: 0,chr,pos,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,snp_id,locus_num,number_ld_buddies,ld_tag_locus,rsid,chrom_hg19,snp_pos_hg19,file,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell
0,10,102207833,0.805869,10,102255522,Nalls_23andMe,LD,10_102207833,1,9,10_102255522,rs10883717,10.0,103967590.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,T,C,-,0.014434,False,False,none,CAUD,102225620,102226734,17780.0,False,none,ALLO,102225838,102226054,18005.0,False,none,microglia,102213693,102214072,5860.0
1,10,102245653,0.98749,10,102255522,Nalls_23andMe,LD,10_102245653,1,9,10_102255522,rs2296887,10.0,104005410.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,T,C,-,0.006977,False,True,"CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI","CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI",102244764,102246025,258.5,False,none,ALLO,102245836,102246040,183.0,True,"astrocytes,doublets,excitatory_neurons,inhibit...","astrocytes,doublets,excitatory_neurons,inhibit...",102244973,102245978,177.5
2,10,102250385,0.809348,10,102255522,Nalls_23andMe,LD,10_102250385,1,9,10_102255522,rs7913281,10.0,104010142.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,A,+,0.012859,False,False,none,CAUD,102244791,102246164,4221.0,False,none,ALLO,102245836,102246040,4345.0,False,none,opcs,102247774,102248243,2142.0
3,10,102251214,0.809348,10,102255522,Nalls_23andMe,LD,10_102251214,1,9,10_102255522,rs10883720,10.0,104010971.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,C,-,0.012617,False,False,none,CAUD,102244791,102246164,5050.0,False,none,ALLO,102245836,102246040,5174.0,False,none,opcs,102247774,102248243,2971.0
4,10,102255522,1.0,10,102255522,Nalls_23andMe,Nalls-Chang,10_102255522,1,9,10_102255522,rs10748818,10.0,104015279.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,A,+,0.006335,False,False,none,CAUD,102244791,102246164,9358.0,False,none,ALLO,102245836,102246040,9482.0,False,none,inhibitory_neurons,102255548,102256050,26.0


(13773, 38)

## Make hg38 SNP BED files

In [3]:
snps_bed = adpd_snps[['chr', 'pos']]
snps_bed['start'] = snps_bed['pos'] - 1
snps_bed = snps_bed[['chr', 'start', 'pos']]
snps_bed['chr'] = 'chr' + snps_bed['chr'].astype(str)
snps_bed.sort_values(by=['chr', 'start', 'pos'], inplace=True)
snps_bed.drop_duplicates(inplace=True)
snps_bed.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps.bed', sep='\t', index=False, header=False)
display(snps_bed.head())
display(snps_bed.shape)

Unnamed: 0,chr,start,pos
2334,chr1,39822150,39822151
2335,chr1,39822244,39822245
2336,chr1,39822259,39822260
2337,chr1,39822791,39822792
2338,chr1,39825224,39825225


(9707, 3)

## Make No-chr hg19 SNP BED files

In [4]:
# ! module load ucsc_tools/latest <- Do on command line
# ! liftOver /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps.bed \
#   /mnt/data/annotations/liftOver/hg38/hg38ToHg19.over.chain.gz \
#   /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/liftover_hg19_snps.bed \
#   unmapped_liftover_hg38_snps.bed

liftover_hg19_snps = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/liftover_hg19_snps.bed', sep='\t', header=None)
liftover_hg19_snps.columns = ['chr', 'start', 'pos']
liftover_hg19_snps['chr'] = liftover_hg19_snps['chr'].map(lambda x : x.replace('chr', ''))
liftover_hg19_snps.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/nochr_hg19_snps.bed', sep='\t', header=False, index=False)
display(liftover_hg19_snps.head())
display(liftover_hg19_snps.shape)

Unnamed: 0,chr,start,pos
0,1,40287822,40287823
1,1,40287916,40287917
2,1,40287931,40287932
3,1,40288463,40288464
4,1,40290896,40290897


(9696, 3)

## Intersect 1KG VCF with hg19 ADPD SNPs

In [5]:
#!  tabix -R /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/nochr_hg19_snps.bed /mnt/lab_data3/soumyak/refs/1KG/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/1KG_ADPD_hg19_snps.vcf

## Get hg38 Coordinates for 1KG intersected SNPs

In [6]:
hg19_vcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/1KG_ADPD_hg19_snps.vcf', header=None, sep='\t')
hg19_vcf = hg19_vcf[[2, 3, 4, 5, 6, 7]]
hg19_vcf.columns = ['rsid', 'ref', 'alt', 'score', 'qc', 'info']
hg19_vcf.sort_values(by=['rsid', 'ref'], inplace=True)
hg19_vcf.drop_duplicates(inplace=True)
display(hg19_vcf.head())
display(hg19_vcf.shape)

Unnamed: 0,rsid,ref,alt,score,qc,info
1239,.;esv3626682,T,"<CN0>,<CN2>",100,PASS,"AC=0,1;AF=0,0.000199681;AN=5008;CS=DUP_gs;END=..."
9850,DUP_uwash_chr5_60140563_60179087,A,<CN0>,100,PASS,AC=0;AF=0;AN=5008;CS=DUP_uwash;END=60179087;NS...
0,esv3585792,G,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=0,500;CIPOS=..."
45,esv3585794,C,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-23,24;CIPOS..."
64,esv3587615,T,<CN2>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-150,150;CIP..."


(9928, 6)

In [7]:
adpd_snps_hg38 = adpd_snps[['chr', 'pos', 'rsid']]
adpd_snps_hg38['chr'] = 'chr' + adpd_snps_hg38['chr'].astype('str')
adpd_snps_hg38.sort_values(by=['chr', 'pos'], inplace=True)
adpd_snps_hg38.drop_duplicates(subset=['rsid'], inplace=True)
display(adpd_snps_hg38.head())
display(adpd_snps_hg38.shape)

Unnamed: 0,chr,pos,rsid
2334,chr1,39822151,rs34640847
2335,chr1,39822245,rs36015266
2336,chr1,39822260,rs61779808
2337,chr1,39822792,rs61779809
2338,chr1,39825225,rs72666941


(9707, 3)

## Make 1KG ADPD hg38 VCF

In [8]:
merged = adpd_snps_hg38.merge(hg19_vcf, on='rsid')
merged = merged.loc[merged['ref'].apply(lambda x : len(x) < 2)]
merged = merged.loc[merged['alt'].apply(lambda x : len(x) < 2)]

hg38_vcf = merged[['chr', 'pos', 'rsid', 'ref', 'alt', 'score', 'qc', 'info']]
hg38_vcf['format'] = ['GT' for i in range(len(hg38_vcf))]
hg38_vcf = hg38_vcf.sort_values(by=['chr', 'pos'])
hg38_vcf.drop_duplicates(inplace=True)
hg38_vcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps_1KG_ADPD.vcf', header=False, index=False, sep='\t')
display(hg38_vcf.head())
display(hg38_vcf.shape)

Unnamed: 0,chr,pos,rsid,ref,alt,score,qc,info,format
0,chr1,39822151,rs34640847,G,A,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=18110;EAS...,GT
1,chr1,39822245,rs36015266,C,T,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=19296;EAS...,GT
2,chr1,39822260,rs61779808,T,A,100,PASS,AC=493;AF=0.0984425;AN=5008;NS=2504;DP=19241;E...,GT
3,chr1,39822792,rs61779809,C,T,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=19613;EAS...,GT
4,chr1,39825225,rs72666941,C,A,100,PASS,AC=477;AF=0.0952476;AN=5008;NS=2504;DP=20561;E...,GT


(9386, 9)

## Make 1KG ADPD hg38 BED

In [9]:
hg38_bed = merged[['chr', 'pos', 'rsid', 'ref', 'alt', 'info']]
hg38_bed['start'] = hg38_bed['pos'] - 1
hg38_bed = hg38_bed[['chr', 'start', 'pos', 'rsid', 'ref', 'alt', 'info']]
hg38_bed['info'] = hg38_bed['info'].apply(lambda x : x.split(';AF=')[1].split(';')[0])
hg38_bed.sort_values(by=['chr', 'pos'], inplace=True)
hg38_bed.drop_duplicates(inplace=True)
hg38_bed.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps_1KG_ADPD.bed', header=False, index=False, sep='\t')
display(hg38_bed.head())
display(hg38_bed.shape)

Unnamed: 0,chr,start,pos,rsid,ref,alt,info
0,chr1,39822150,39822151,rs34640847,G,A,0.11881
1,chr1,39822244,39822245,rs36015266,C,T,0.11881
2,chr1,39822259,39822260,rs61779808,T,A,0.0984425
3,chr1,39822791,39822792,rs61779809,C,T,0.11881
4,chr1,39825224,39825225,rs72666941,C,A,0.0952476


(9386, 7)

## Get ADPD Metadata

In [10]:
metadata = pd.read_excel('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/190215_Brain-ControlsOnly_Metadata_Merged.xlsx')
display(metadata.head())
display(metadata.shape)

Unnamed: 0,OldName,NewName,HarmonizedName,Bam,Contrast,Type,Group,Region,RegionMod,PatientID,TypeMod,Cohort,TissueCenter,TypeGroup,xxx.TechRep,Batch,xxx.Sample,xxx.Library,xxx.BioRep,xxx.TechRep.1,Pool,MAPT,xxx.race,Gender,xxx.expired_age,PMI,Mutation,MutationType,xxx.last_mmse_test_score,ApoE,xxx.last_mmse_test_date,xxx.motor_updrs_score_on,xxx.motor_updrs_score_off,xxx.motor_updrs_score_months_prior_to_death,xxx.Control,xxx.AD,xxx.PD,xxx.DLB,xxx.VAD,xxx.PSP,xxx.HS,xxx.DLDH,xxx.MND,xxx.CBD,xxx.PICKS,xxx.HD,xxx.MSA,xxx.ARG,xxx.CWMR,xxx.dementia_nos,xxx.MS,xxx.CAA,xxx.MCI,xxx.LBS,xxx.BRAIN_ABNORMAL,xxx.ACUTE_INFARCTS,xxx.PlaqueF,xxx.PlaqueT,xxx.PlaqueP,xxx.PlaqueH,xxx.PlaqueE,xxx.PlaqueTotal,xxx.Plaque density,xxx.TangleF,xxx.TangleT,xxx.TangleP,xxx.TangleE,xxx.TangleH,xxx.TangleTotal,xxx.Cerad NP,xxx.Braak score,xxx.NIA-R,xxx.Unified LB Stage,xxx.infarct_total_volume,xxx.infarct_cerebral_total_volume,xxx.obt,xxx.brain_stem_ix_x,xxx.brain_stem_lc,xxx.bf_amygdala,xxx.bf_nbm,xxx.brain_stem_sn,xxx.bf_trans,xxx.bf_cing,xxx.nctx_temporal,xxx.nctx_frontal,xxx.sum_lb_density,xxx.nctx_parietal,xxx.submondibular_gland,xxx.PathDXSummary,xxx.calc_NIA_AA,xxx.calc_thalPhase,xxx.micro_lewyBodyEvidenceTEXT,xxx.calc_HAAS_cerebralMicroinfarcts,xxx.calc_HAAS_deepGrayMicroinfarcts,xxx.AP_freshBrainWeight,xxx.CONS_CL_DX,xxx.CONS_CL_DX_UNK_OTHER,xxx.PRIMARY_NP_DX,xxx.CONTRIBUTING_NP_DX,xxx.Interval between death and last MMSE,xxx.ch_lastCasiScore,xxx.ch_lastCasiDate,xxx.micro_AmyloidAngiopathyOccipitalLobe_ID,xxx.GE_atherosclerosis_ID,xxx.calc_A,xxx.calc_B,xxx.C,xxx.CognitiveStatus
0,PD_00_38_CTRL_CAUD_X014_S01_L001_B1_T1_P025,PD_00_38_CTRL_CAUD_X014_S01_L001_B1_T1_P025,CTRL_CAUD_PD_00x38xx_X014_S01_L001_B1_T1_P025,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_CAUD,CTRL,CTRL,CAUD,CAUD,00_38,CTRL,PD,UA,CTRL_CAUD,PD_00_38_CTRL_CAUD,PD_X014,S01,L001,B1,T1,PD_P025,H2,White,Female,91,2.0,,,,2_3,,0.0,16.0,19.0,yes,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,0.0,0.0,0.0,0.0,0.0,0.0,zero,0.0,0.0,0.0,1.25,0.5,1.75,not AD,II,criteria not met,0. No Lewy bodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Diseased Control; Left hippocampal sclerosis; ...,,,,,,,,,,,,,,,,,,,
1,PD_00_38_CTRL_CAUD_X014_S01_L002_B1_T2_P028,PD_00_38_CTRL_CAUD_X014_S01_L002_B1_T2_P028,CTRL_CAUD_PD_00x38xx_X014_S01_L002_B1_T2_P028,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_CAUD,CTRL,CTRL,CAUD,CAUD,00_38,CTRL,PD,UA,CTRL_CAUD,PD_00_38_CTRL_CAUD,PD_X014,S01,L002,B1,T2,PD_P028,H2,White,Female,91,2.0,,,,2_3,,0.0,16.0,19.0,yes,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,0.0,0.0,0.0,0.0,0.0,0.0,zero,0.0,0.0,0.0,1.25,0.5,1.75,not AD,II,criteria not met,0. No Lewy bodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Diseased Control; Left hippocampal sclerosis; ...,,,,,,,,,,,,,,,,,,,
2,PD_00_38_CTRL_HIPP_X002_S11_L045_B1_T1_P002,PD_00_38_CTRL_HIPP_X002_S11_L045_B1_T1_P002,CTRL_HIPP_PD_00x38xx_X002_S11_L045_B1_T1_P002,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_HIPP,CTRL,CTRL,HIPP,HIPP,00_38,CTRL,PD,UA,CTRL_HIPP,PD_00_38_CTRL_HIPP,PD_X002,S11,L045,B1,T1,PD_P002,H2,White,Female,91,2.0,,,,2_3,,0.0,16.0,19.0,yes,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,0.0,0.0,0.0,0.0,0.0,0.0,zero,0.0,0.0,0.0,1.25,0.5,1.75,not AD,II,criteria not met,0. No Lewy bodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Diseased Control; Left hippocampal sclerosis; ...,,,,,,,,,,,,,,,,,,,
3,PD_00_38_CTRL_HIPP_X002_S11_L046_B1_T2_P003,PD_00_38_CTRL_HIPP_X002_S11_L046_B1_T2_P003,CTRL_HIPP_PD_00x38xx_X002_S11_L046_B1_T2_P003,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_HIPP,CTRL,CTRL,HIPP,HIPP,00_38,CTRL,PD,UA,CTRL_HIPP,PD_00_38_CTRL_HIPP,PD_X002,S11,L046,B1,T2,PD_P003,H2,White,Female,91,2.0,,,,2_3,,0.0,16.0,19.0,yes,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,0.0,0.0,0.0,0.0,0.0,0.0,zero,0.0,0.0,0.0,1.25,0.5,1.75,not AD,II,criteria not met,0. No Lewy bodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Diseased Control; Left hippocampal sclerosis; ...,,,,,,,,,,,,,,,,,,,
4,PD_00_38_CTRL_MDFG_X007_S04_L055_B1_T1_P014,PD_00_38_CTRL_MDFG_X007_S04_L055_B1_T1_P014,CTRL_MDFG_PD_00x38xx_X007_S04_L055_B1_T1_P014,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_MDFG,CTRL,CTRL,MDFG,MDFG,00_38,CTRL,PD,UA,CTRL_MDFG,PD_00_38_CTRL_MDFG,PD_X007,S04,L055,B1,T1,PD_P014,H2,White,Female,91,2.0,,,,2_3,,0.0,16.0,19.0,yes,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,0.0,0.0,0.0,0.0,0.0,0.0,zero,0.0,0.0,0.0,1.25,0.5,1.75,not AD,II,criteria not met,0. No Lewy bodies,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Diseased Control; Left hippocampal sclerosis; ...,,,,,,,,,,,,,,,,,,,


(268, 108)

## Get Bams for each brain region

In [11]:
display(metadata['Region'].unique())
regions = list(metadata['Region'].unique())

array(['CAUD', 'HIPP', 'MDFG', 'MDTG', 'PTMN', 'SUNI', 'SMTG', 'PARL'],
      dtype=object)

In [12]:
with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/ctrl_bams.txt') as infile:
    ctrl_bams = infile.readlines()
multi_bam_regs = []
for reg in regions:
    print(reg)
    region_bams = []
    for bam in ctrl_bams:
        if reg in bam:
            region_bams.append(bam)
    if len(region_bams) > 0:
        multi_bam_regs.append(reg)
        region_bams.sort()
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' + reg)
        with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt', 'w') as outfile:
            for bam in region_bams:
                outfile.write(bam)

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


## Make Input VCF files for each region

In [13]:
# rm_cmd = 'rm /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/run_asvcf.sh'
# print(rm_cmd)
# ! {rm_cmd}

# for reg in multi_bam_regs:
#     print(reg)
#     with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
#         reg_bams = [i.strip() for i in infile.readlines()]
#     reg_vcf = hg38_vcf.copy(deep=True)
#     for bam in reg_bams:
#         bam_name = bam.split('/')[10]
#         print(bam_name)
#         reg_vcf[bam_name] = ['./.' for i in range(len(reg_vcf))]
#     reg_vcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf', header=False, index=False, sep='\t')
#     bgzip_cmd = 'bgzip /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf'
#     ! {bgzip_cmd}
#     tabix_cmd = 'tabix -p vcf /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf.gz'
#     ! {tabix_cmd}
#     asvcf_cmd_1 = 'echo sbatch --export=ALL -n 1 -t 1-0 -p akundaje --mail-type=ALL -J ' \
#                 + reg + ' -o /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' \
#                 + reg + '/asvcf.o -e /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' \
#                 + reg + '/asvcf.e /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
#                 + reg + '/asvcf.sh >> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/run_asvcf.sh'
#     asvcf_cmd_2 = 'echo \'#!/bin/bash\' > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
#                 + reg + '/asvcf.sh'
#     asvcf_cmd_3 = 'echo /home/users/soumyak/rasqual/src/ASVCF/createASVCF.sh paired_end ' \
#                 + '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' \
#                 + reg + '_ctrl_bams.txt /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
#                 + reg + '/initial.vcf.gz /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
#                 + reg + '/asvcf.gz atac >> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
#                 + reg + '/asvcf.sh'
#     print(asvcf_cmd_1)
#     print(asvcf_cmd_2)
#     print(asvcf_cmd_3)
#     ! {asvcf_cmd_1}
#     ! {asvcf_cmd_2}
#     ! {asvcf_cmd_3}

## Load IDR Counts Matrix

In [14]:
idr_counts = pd.read_csv('/mnt/lab_data2/annashch/alzheimers_parkinsons/adpd.atac.idr.counts.txt.gz', sep='\t')
counts_header = idr_counts.columns
display(idr_counts.head())
display(idr_counts.shape)

Unnamed: 0,chrom,start,end,ADAD_CAUD_00_0281,ADAD_CAUD_00_0387,ADAD_CAUD_01_0164,ADAD_CAUD_01_1400,ADAD_CAUD_06_0194,ADAD_CAUD_06_1486,ADAD_CAUD_07_0787,ADAD_CAUD_12829xx,ADAD_CAUD_23156xx,ADAD_CAUD_24281xx,ADAD_CAUD_26436xx,ADAD_CAUD_AWQ6JZx,ADAD_CAUD_C0T7AAx,ADAD_CAUD_DCQGVNx,ADAD_CAUD_SXIONRx,ADAD_CAUD_WYYIWGx,ADAD_CAUD_YUJW97x,ADAD_HIPP_12829xx,ADAD_PARL_00_0700,ADAD_PARL_02_0621,ADAD_PARL_06_1486,ADAD_PARL_12540xx,ADAD_PARL_24281xx,ADAD_PARL_25044xx,ADAD_PARL_26436xx,ADAD_PARL_DCQGVNx,ADAD_PARL_HQFTL0x,ADAD_PARL_SXIONRx,ADAD_PARL_WYYIWGx,ADAD_PARL_YUJW97x,ADAD_SMTG_00_0387,ADAD_SMTG_00_0700,ADAD_SMTG_02_0621,ADAD_SMTG_06_1486,ADAD_SMTG_07_0787,ADAD_SMTG_12540xx,ADAD_SMTG_12829xx,ADAD_SMTG_23156xx,ADAD_SMTG_23710xx,ADAD_SMTG_24281xx,ADAD_SMTG_26436xx,ADAD_SMTG_AWQ6JZx,ADAD_SMTG_C0T7AAx,ADAD_SMTG_DCQGVNx,ADAD_SMTG_SXIONRx,ADAD_SMTG_WYYIWGx,ADAD_SMTG_YUJW97x,CTRH_CAUD_07_1144,CTRH_CAUD_07_1287,CTRH_CAUD_08_0712,CTRH_CAUD_10_0867,CTRH_CAUD_13_0962,CTRH_CAUD_14_0380,CTRH_CAUD_14_0513,CTRH_CAUD_14_1383,CTRH_CAUD_15_1023,CTRH_CAUD_15_1025,CTRH_HIPP_07_1058,CTRH_HIPP_07_1144,CTRH_HIPP_07_1287,CTRH_HIPP_08_0298,CTRH_HIPP_10_0867,CTRH_HIPP_13_0962,CTRH_HIPP_14_0380,CTRH_HIPP_14_0513,CTRH_HIPP_14_1383,CTRH_HIPP_15_1023,CTRH_HIPP_15_1025,CTRH_PARL_07_1058,CTRH_PARL_07_1144,CTRH_PARL_07_1287,CTRH_PARL_08_0298,CTRH_PARL_10_0867,CTRH_PARL_13_0962,CTRH_PARL_14_0380,CTRH_PARL_14_0513,CTRH_PARL_14_1383,CTRH_PARL_15_1023,CTRH_PARL_15_1025,CTRH_SMTG_07_1058,CTRH_SMTG_07_1144,CTRH_SMTG_07_1287,CTRH_SMTG_08_0712,CTRH_SMTG_10_0867,CTRH_SMTG_13_0962,CTRH_SMTG_14_0380,CTRH_SMTG_14_0513,CTRH_SMTG_14_1383,CTRH_SMTG_15_1023,CTRH_SMTG_15_1025,CTRL_CAUD_00_38,CTRL_CAUD_01_31,CTRL_CAUD_03_15,CTRL_CAUD_03_39,CTRL_CAUD_03_41,CTRL_CAUD_03_66,CTRL_CAUD_04_38,CTRL_CAUD_05_16,CTRL_CAUD_06_0615,CTRL_CAUD_06_1516,CTRL_CAUD_08_90,CTRL_CAUD_09_1589,CTRL_CAUD_09_35,CTRL_CAUD_11_0393,CTRL_CAUD_13_0038,CTRL_CAUD_13_1226,CTRL_CAUD_14_0586,CTRL_CAUD_14_0941,CTRL_CAUD_14_1018,CTRL_CAUD_15_78,CTRL_CAUD_16_10,CTRL_CAUD_16_32,CTRL_HIPP_00_38,CTRL_HIPP_04_38,CTRL_HIPP_05_16,CTRL_HIPP_06_1516,CTRL_HIPP_09_35,CTRL_HIPP_11_0393,CTRL_HIPP_13_0038,CTRL_HIPP_13_1226,CTRL_HIPP_14_0586,CTRL_HIPP_14_0941,CTRL_HIPP_14_1018,CTRL_MDFG_00_38,CTRL_MDFG_01_37,CTRL_MDFG_03_39,CTRL_MDFG_03_41,CTRL_MDFG_03_66,CTRL_MDFG_04_38,CTRL_MDFG_05_16,CTRL_MDFG_09_35,CTRL_MDFG_09_50,CTRL_MDFG_15_78,CTRL_MDFG_16_10,CTRL_MDFG_16_32,CTRL_MDTG_00_38,CTRL_MDTG_03_15,CTRL_MDTG_03_39,CTRL_MDTG_03_41,CTRL_MDTG_03_66,CTRL_MDTG_04_38,CTRL_MDTG_05_16,CTRL_MDTG_09_35,CTRL_MDTG_09_50,CTRL_MDTG_15_78,CTRL_MDTG_16_10,CTRL_MDTG_16_32,CTRL_PARL_09_1589,CTRL_PARL_11_0311,CTRL_PARL_11_0393,CTRL_PARL_13_0038,CTRL_PARL_13_1226,CTRL_PARL_14_0586,CTRL_PARL_14_1018,CTRL_PTMN_00_38,CTRL_PTMN_03_39,CTRL_PTMN_03_41,CTRL_PTMN_03_66,CTRL_PTMN_04_38,CTRL_PTMN_05_16,CTRL_PTMN_07_28,CTRL_PTMN_09_35,CTRL_PTMN_09_50,CTRL_PTMN_15_78,CTRL_PTMN_16_32,CTRL_SMTG_06_1516,CTRL_SMTG_09_1589,CTRL_SMTG_11_0311,CTRL_SMTG_11_0393,CTRL_SMTG_13_0038,CTRL_SMTG_13_1226,CTRL_SMTG_14_0586,CTRL_SMTG_14_1018,CTRL_SUNI_00_38,CTRL_SUNI_01_31,CTRL_SUNI_03_15,CTRL_SUNI_03_39,CTRL_SUNI_03_41,CTRL_SUNI_03_66,CTRL_SUNI_04_38,CTRL_SUNI_05_16,CTRL_SUNI_06_57,CTRL_SUNI_07_28,CTRL_SUNI_09_35,CTRL_SUNI_09_50,CTRL_SUNI_15_78,CTRL_SUNI_16_32,GBA1_CAUD_00_26,GBA1_CAUD_00_27,GBA1_CAUD_00_45,GBA1_CAUD_03_43,GBA1_CAUD_03_51,GBA1_CAUD_04_37,GBA1_CAUD_07_41,GBA1_CAUD_11_90,GBA1_CAUD_12_65,GBA1_CAUD_12_69,GBA1_CAUD_93_19,GBA1_CAUD_94_31,GBA1_CAUD_97_52,GBA1_HIPP_03_43,GBA1_HIPP_11_90,GBA1_HIPP_12_65,GBA1_HIPP_93_19,GBA1_HIPP_94_31,GBA1_HIPP_97_52,GBA1_MDFG_00_26,GBA1_MDFG_00_27,GBA1_MDFG_00_45,GBA1_MDFG_03_43,GBA1_MDFG_03_51,GBA1_MDFG_04_37,GBA1_MDFG_07_41,GBA1_MDFG_11_90,GBA1_MDFG_12_65,GBA1_MDFG_12_69,GBA1_MDFG_13_22,GBA1_MDFG_93_19,GBA1_MDFG_94_31,GBA1_MDFG_97_52,GBA1_MDTG_00_09,GBA1_MDTG_00_26,GBA1_MDTG_00_27,GBA1_MDTG_00_45,GBA1_MDTG_03_43,GBA1_MDTG_03_51,GBA1_MDTG_04_37,GBA1_MDTG_11_90,GBA1_MDTG_12_50,GBA1_MDTG_12_65,GBA1_MDTG_12_69,GBA1_MDTG_13_22,GBA1_MDTG_93_19,GBA1_MDTG_94_31,GBA1_MDTG_97_52,GBA1_PTMN_00_09,GBA1_PTMN_00_26,GBA1_PTMN_00_45,GBA1_PTMN_03_43,GBA1_PTMN_03_51,GBA1_PTMN_04_37,GBA1_PTMN_07_41,GBA1_PTMN_11_90,GBA1_PTMN_12_65,GBA1_PTMN_12_69,GBA1_PTMN_13_22,GBA1_PTMN_93_19,GBA1_PTMN_94_31,GBA1_PTMN_97_52,GBA1_SUNI_00_26,GBA1_SUNI_00_27,GBA1_SUNI_00_45,GBA1_SUNI_03_43,GBA1_SUNI_03_51,GBA1_SUNI_04_37,GBA1_SUNI_07_41,GBA1_SUNI_11_90,GBA1_SUNI_12_50,GBA1_SUNI_12_65,GBA1_SUNI_13_22,GBA1_SUNI_94_31,GBA1_SUNI_97_52,LOAD_CAUD_07_0226,LOAD_CAUD_07_0664,LOAD_CAUD_07_1112,LOAD_CAUD_09_1166,LOAD_CAUD_10_0188,LOAD_CAUD_10_1363,LOAD_CAUD_10_1770,LOAD_CAUD_10_1930,LOAD_CAUD_10_2007,LOAD_CAUD_11_0352,LOAD_CAUD_11_0773,LOAD_CAUD_11_1475,LOAD_CAUD_11_1686,LOAD_CAUD_13_0855,LOAD_CAUD_13_1300,LOAD_HIPP_06_0308,LOAD_HIPP_07_0226,LOAD_HIPP_07_0664,LOAD_HIPP_07_0997,LOAD_HIPP_07_1112,LOAD_HIPP_09_1166,LOAD_HIPP_10_0188,LOAD_HIPP_10_1363,LOAD_HIPP_10_1770,LOAD_HIPP_11_0352,LOAD_HIPP_11_0773,LOAD_HIPP_11_1475,LOAD_HIPP_12_1181,LOAD_HIPP_13_0855,LOAD_HIPP_13_1300,LOAD_PARL_06_0308,LOAD_PARL_07_0226,LOAD_PARL_07_0664,LOAD_PARL_07_0997,LOAD_PARL_07_1112,LOAD_PARL_08_0789,LOAD_PARL_09_1166,LOAD_PARL_10_0188,LOAD_PARL_10_0213,LOAD_PARL_10_1363,LOAD_PARL_10_1770,LOAD_PARL_10_2007,LOAD_PARL_11_0352,LOAD_PARL_11_1686,LOAD_PARL_12_1181,LOAD_PARL_13_0855,LOAD_PARL_13_1300,LOAD_SMTG_06_0308,LOAD_SMTG_07_0226,LOAD_SMTG_07_1112,LOAD_SMTG_09_1166,LOAD_SMTG_10_0188,LOAD_SMTG_10_0213,LOAD_SMTG_10_1770,LOAD_SMTG_10_1930,LOAD_SMTG_10_2007,LOAD_SMTG_11_0352,LOAD_SMTG_11_1475,LOAD_SMTG_12_1181,LOAD_SMTG_13_0855,LOAD_SMTG_13_1300,LOPD_CAUD_10_28,LOPD_CAUD_10_30,LOPD_CAUD_10_76,LOPD_CAUD_10_83,LOPD_CAUD_11_70,LOPD_CAUD_12_04,LOPD_CAUD_12_22,LOPD_CAUD_12_29,LOPD_CAUD_12_42,LOPD_CAUD_13_05,LOPD_CAUD_13_11,LOPD_CAUD_13_17,LOPD_CAUD_13_44,LOPD_CAUD_14_04,LOPD_HIPP_10_28,LOPD_HIPP_10_83,LOPD_HIPP_11_70,LOPD_HIPP_12_42,LOPD_HIPP_13_05,LOPD_HIPP_13_11,LOPD_HIPP_14_04,LOPD_MDFG_10_28,LOPD_MDFG_10_30,LOPD_MDFG_10_76,LOPD_MDFG_10_83,LOPD_MDFG_11_70,LOPD_MDFG_12_04,LOPD_MDFG_12_22,LOPD_MDFG_12_29,LOPD_MDFG_12_42,LOPD_MDFG_13_05,LOPD_MDFG_13_11,LOPD_MDFG_13_17,LOPD_MDFG_13_44,LOPD_MDFG_14_04,LOPD_MDTG_01_38,LOPD_MDTG_10_28,LOPD_MDTG_10_30,LOPD_MDTG_10_76,LOPD_MDTG_10_83,LOPD_MDTG_11_70,LOPD_MDTG_12_22,LOPD_MDTG_12_42,LOPD_MDTG_13_05,LOPD_MDTG_13_11,LOPD_MDTG_13_44,LOPD_MDTG_14_04,LOPD_PTMN_10_28,LOPD_PTMN_10_30,LOPD_PTMN_10_76,LOPD_PTMN_10_83,LOPD_PTMN_11_70,LOPD_PTMN_12_04,LOPD_PTMN_12_22,LOPD_PTMN_12_29,LOPD_PTMN_12_42,LOPD_PTMN_13_05,LOPD_PTMN_13_11,LOPD_PTMN_13_17,LOPD_PTMN_13_44,LOPD_PTMN_14_04,LOPD_SUNI_10_28,LOPD_SUNI_10_76,LOPD_SUNI_10_83,LOPD_SUNI_11_70,LOPD_SUNI_12_04,LOPD_SUNI_12_22,LOPD_SUNI_12_42,LOPD_SUNI_13_05,LOPD_SUNI_13_11,LOPD_SUNI_13_17,LOPD_SUNI_13_44,LOPD_SUNI_14_04,LRRK_CAUD_04_10,LRRK_CAUD_10_37,LRRK_CAUD_13_60,LRRK_HIPP_04_10,LRRK_MDFG_04_10,LRRK_MDFG_13_60,LRRK_MDTG_01_39,LRRK_MDTG_04_10,LRRK_MDTG_10_37,LRRK_MDTG_13_60,LRRK_PTMN_01_39,LRRK_PTMN_04_10,LRRK_PTMN_10_37,LRRK_PTMN_13_60,LRRK_SUNI_04_10,LRRK_SUNI_10_37
0,chr1,10015,10231,12,16,22,12,14,20,12,22,16,10,10,4,0,6,2,6,10,22,0,8,4,4,4,10,12,4,12,10,4,6,10,4,2,10,14,12,10,4,8,4,8,8,4,2,20,4,2,16,6,26,8,24,24,10,28,38,34,24,20,20,20,16,24,8,6,70,28,64,12,12,12,16,16,8,4,10,28,34,37,18,6,12,22,8,8,18,10,10,28,26,32,22,4,8,12,13,41,16,34,38,6,40,13,56,26,12,26,16,30,14,18,30,14,28,11,22,10,28,50,12,20,23,14,22,24,2,10,19,22,24,30,8,18,6,37,32,17,43,12,24,36,18,14,6,11,20,28,26,6,24,18,10,10,16,35,8,18,8,39,10,2,24,8,8,31,24,28,6,16,26,2,6,26,10,12,18,26,4,18,20,8,6,5,12,36,2,16,8,16,4,8,14,15,12,8,16,12,20,34,22,8,12,4,13,26,20,20,18,0,20,8,6,14,10,8,2,12,26,12,16,30,18,16,21,16,38,20,8,6,22,4,2,15,50,14,6,18,6,16,26,10,4,6,16,13,14,11,16,4,21,4,8,26,18,0,8,6,16,20,2,14,18,28,8,21,12,24,26,16,18,42,36,40,24,8,18,35,14,4,12,10,12,21,8,18,16,17,14,20,32,44,33,4,4,15,0,12,14,12,12,22,20,36,14,2,10,16,20,18,0,12,2,5,22,10,20,10,12,12,32,4,6,18,10,12,2,12,26,6,10,2,6,29,18,20,32,28,6,5,4,6,4,24,23,4,8,8,20,22,16,6,6,4,66,6,10,26,42,2,12,4,19,6,8,10,2,40,28,32,24,2,4,16,8,24,26,16,2,12,17,18,12,8,18,6,44,6,42,12,22,4,13,24,0,15,8,10,12,8,6,0,4,3,2,14,18,0,4,12,9,22,26
1,chr1,181363,181563,1,6,4,2,10,6,1,9,1,3,6,1,6,7,4,2,6,22,4,5,6,2,0,3,9,7,1,5,3,10,3,6,0,6,5,2,10,3,2,2,5,2,2,9,14,2,0,0,3,3,7,7,5,3,4,14,31,5,2,9,18,8,5,2,11,18,9,39,4,0,15,25,9,16,5,5,18,9,27,5,0,5,7,11,5,8,1,12,11,13,9,4,3,5,4,22,7,17,24,24,1,25,6,22,2,3,2,3,2,3,21,8,7,11,8,10,1,8,9,6,3,19,2,29,3,6,2,26,10,10,12,3,7,6,14,21,9,27,2,19,38,10,5,5,6,18,19,16,2,0,12,8,4,3,20,9,6,8,12,2,5,4,3,4,5,11,59,3,4,10,8,4,4,6,9,2,9,8,23,13,7,0,4,2,16,11,15,11,12,10,14,4,10,10,7,4,6,3,8,7,4,4,4,7,8,9,11,15,10,6,7,12,26,15,17,0,0,4,11,9,2,4,18,14,6,19,28,13,13,13,10,3,7,15,2,9,7,6,7,12,6,2,9,4,2,8,1,5,11,38,4,5,6,4,8,6,6,12,8,3,11,7,5,3,4,7,11,5,2,28,1,10,27,4,6,0,12,1,6,19,4,11,5,1,3,8,5,10,4,14,1,29,3,10,6,1,11,8,8,1,8,8,8,4,2,7,6,13,12,9,3,7,6,9,2,6,6,12,10,3,5,4,3,5,8,7,4,14,6,23,2,1,8,11,9,15,14,8,2,0,1,2,8,20,15,2,9,3,22,12,10,4,0,27,7,5,6,29,0,5,7,8,5,6,11,0,5,12,11,5,7,7,11,0,13,21,2,8,0,8,10,4,11,19,9,71,7,48,4,11,5,6,12,3,10,4,12,0,5,8,12,7,1,5,2,14,0,4,7,4,8,18
2,chr1,183716,183916,7,4,4,12,2,10,19,14,8,10,10,5,5,5,11,1,10,16,7,4,8,11,3,6,7,2,15,3,0,2,9,10,3,21,11,11,13,13,1,10,10,4,4,4,16,0,17,4,5,7,15,12,18,4,17,31,11,19,4,7,12,20,18,9,3,25,15,23,10,0,9,14,9,16,22,11,12,12,20,9,0,21,5,10,8,21,11,15,15,16,13,15,1,18,5,16,16,17,25,4,2,23,12,18,5,6,2,11,2,8,10,12,7,21,10,3,10,24,15,13,2,17,9,30,12,24,5,36,21,12,35,6,14,14,21,47,15,26,4,24,15,16,13,10,15,15,21,21,18,15,16,15,9,8,24,11,5,16,8,22,1,16,5,14,12,7,11,5,8,14,18,5,10,13,16,5,27,3,11,21,8,6,21,7,13,26,24,7,10,16,18,10,21,2,21,5,8,15,6,8,8,13,5,7,14,11,25,9,13,13,25,25,27,14,32,5,16,22,8,3,19,17,28,35,8,35,29,19,18,29,9,5,16,31,14,14,16,9,10,27,17,4,7,17,10,6,8,8,2,33,2,12,13,6,3,12,8,8,20,8,8,9,5,10,10,9,18,6,0,9,29,14,43,5,3,8,9,8,7,20,5,21,13,16,13,9,11,11,5,13,16,8,9,3,14,1,9,2,8,6,24,6,12,10,13,1,8,26,18,5,9,17,12,10,21,6,6,9,7,14,4,9,18,28,10,15,9,28,6,35,1,5,6,19,9,14,25,11,5,13,3,1,10,28,16,5,9,25,54,16,19,19,4,21,10,12,22,29,4,23,24,17,10,16,12,5,19,21,9,26,23,10,20,8,20,19,8,10,5,6,11,11,8,28,10,44,15,62,11,26,10,4,25,9,17,11,16,5,13,15,16,24,1,19,10,21,3,16,8,6,20,13
3,chr1,184083,184283,11,8,4,11,2,19,20,27,6,18,24,1,11,16,9,0,12,22,13,6,3,10,3,3,14,10,6,4,3,13,11,11,4,12,4,9,28,15,1,7,15,0,2,9,17,0,16,4,4,7,19,17,15,16,23,15,11,26,6,17,23,23,27,9,24,53,14,23,15,3,10,21,21,13,22,14,25,11,44,7,4,25,9,17,6,33,22,15,20,24,25,15,2,23,11,26,17,20,37,16,8,25,16,25,9,14,8,20,10,11,26,19,13,32,6,7,11,22,24,14,7,30,11,47,13,18,20,54,19,24,30,12,18,22,25,52,25,53,4,50,31,21,15,7,11,17,38,39,15,26,16,12,14,7,26,19,14,22,16,16,7,7,13,14,17,19,51,7,15,18,18,8,12,13,21,16,30,17,31,23,15,10,24,5,25,28,22,22,16,19,26,18,36,8,43,13,17,9,21,13,9,24,13,12,13,19,31,24,12,6,33,31,37,15,34,5,14,22,18,10,33,33,40,46,14,40,34,22,11,29,11,3,31,61,10,19,45,13,9,61,14,10,20,21,20,10,11,18,19,38,7,7,22,32,5,19,12,20,17,7,16,4,17,5,14,9,17,10,10,16,40,15,58,2,7,8,15,14,14,33,7,40,26,11,25,6,13,18,6,5,33,18,11,9,13,5,18,12,16,7,16,7,17,24,16,6,10,30,20,7,22,17,24,12,14,11,15,27,26,8,23,6,19,29,19,11,14,29,6,40,3,6,17,31,6,16,29,21,1,8,8,10,15,39,37,11,23,19,50,35,32,27,8,37,11,19,18,31,4,26,15,19,17,18,24,12,16,22,19,33,20,27,24,14,31,31,15,13,6,8,23,20,23,28,15,33,30,52,15,42,11,11,31,12,16,8,17,2,27,18,27,26,1,20,26,25,3,22,18,11,26,24
4,chr1,184370,184570,7,6,6,0,6,9,6,11,5,9,7,0,0,9,3,4,9,15,9,0,7,9,2,1,2,3,1,8,0,3,7,10,2,5,4,4,18,11,3,2,10,0,6,3,13,0,7,0,5,8,12,11,7,9,7,16,15,17,0,10,7,11,12,1,8,28,22,20,11,1,8,12,17,8,6,8,16,10,25,7,1,5,11,14,3,13,5,9,9,14,14,8,1,13,2,14,15,9,23,11,2,23,9,25,12,5,2,6,5,7,6,9,6,10,6,1,5,9,8,8,4,12,3,28,6,17,8,39,6,24,14,8,9,9,9,24,9,27,3,37,15,8,5,7,15,19,5,15,3,7,3,3,6,5,18,6,12,12,22,10,2,9,1,11,7,4,21,1,12,24,8,6,10,5,8,10,24,6,29,20,7,10,7,2,15,11,10,18,6,8,15,11,13,5,18,8,9,19,16,3,5,2,3,14,15,6,24,14,12,7,13,10,16,5,26,2,5,10,6,5,12,11,21,20,9,16,10,3,10,17,8,4,12,31,4,7,18,8,4,41,9,7,10,6,6,9,8,7,7,18,4,8,9,6,1,13,9,5,10,2,13,3,9,5,7,4,2,8,8,9,10,20,22,0,4,5,9,5,10,13,4,18,9,11,4,2,12,13,5,14,7,12,9,0,13,2,10,7,10,1,10,5,4,9,4,1,5,18,16,0,10,12,7,8,11,4,5,12,15,0,14,4,14,23,10,5,3,20,2,18,3,1,5,19,4,15,17,12,2,6,2,4,10,25,22,7,13,11,25,13,9,16,9,18,6,7,18,15,1,4,4,15,8,14,14,1,2,8,10,11,9,17,7,9,9,18,11,7,4,6,6,8,9,30,5,22,15,32,6,13,7,7,20,6,9,4,8,7,14,9,19,3,5,16,11,18,1,13,7,4,23,15


(385725, 415)

## Intersect Counts Matrix with 10kb Expanded SNPs 

In [60]:
expand_by = 0
counts_bed = pybedtools.BedTool.from_dataframe(idr_counts)
expanded_snps = hg38_bed.copy(deep=True)
expanded_snps['new_start'] = expanded_snps['start'].map(lambda x : max(0, x - expand_by))
expanded_snps['new_end'] = expanded_snps['pos'].map(lambda x : x + expand_by)
expanded_snps = expanded_snps[['chr', 'new_start', 'new_end', 'rsid']]
display(expanded_snps.head())
snps_bed = pybedtools.BedTool.from_dataframe(expanded_snps)
intersect_bed = counts_bed.intersect(snps_bed, u=True, wa=True)
intersect_counts = pybedtools.BedTool.to_dataframe(intersect_bed, header=None)
intersect_counts.columns = counts_header
final_peaks = intersect_counts['chrom'] + '_' + intersect_counts['start'].astype(str) + '_' + intersect_counts['end'].astype(str)
final_peaks = list(final_peaks)
display(len(final_peaks))

Unnamed: 0,chr,new_start,new_end,rsid
0,chr1,39822150,39822151,rs34640847
1,chr1,39822244,39822245,rs36015266
2,chr1,39822259,39822260,rs61779808
3,chr1,39822791,39822792,rs61779809
4,chr1,39825224,39825225,rs72666941


616

## Calculate GC Content

In [16]:
# ref_fasta = '/mnt/lab_data3/soumyak/refs/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta'
# ref = pysam.FastaFile(ref_fasta)

# with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/gc_content.txt', 'w') as outfile:
#     for index,row in idr_counts.iterrows():                                         
#         seq = ref.fetch(row['chrom'],int(row['start']),int(row['end']))
#         seq = seq.upper()
#         g = seq.count('G')
#         c = seq.count('C')
#         gc = (g + c) / len(seq)
#         if index % 20000 == 0:
#             print(index)
#         outfile.write(str(gc) + '\n')

## Create Region-Specific Counts Matrices

In [17]:
for reg in multi_bam_regs:
    print(reg)
    with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
        reg_bams = [i.strip() for i in infile.readlines()]
    bam_names = [bam.split('/')[10] for bam in reg_bams]
    print(bam_names)
    reg_counts = idr_counts.copy(deep=True)
    reg_counts['index'] = reg_counts['chrom'] + '_' + reg_counts['start'].astype(str) + '_' + reg_counts['end'].astype(str)
    reg_counts = reg_counts[['index'] + bam_names]
    reg_counts.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                        + reg + '/idr_counts.txt', index=False, header=False, sep='\t')

CAUD
['CTRH_CAUD_07_1144', 'CTRH_CAUD_07_1287', 'CTRH_CAUD_08_0712', 'CTRH_CAUD_10_0867', 'CTRH_CAUD_13_0962', 'CTRH_CAUD_14_0380', 'CTRH_CAUD_14_0513', 'CTRH_CAUD_14_1383', 'CTRH_CAUD_15_1023', 'CTRH_CAUD_15_1025', 'CTRL_CAUD_06_0615', 'CTRL_CAUD_06_1516', 'CTRL_CAUD_09_1589', 'CTRL_CAUD_11_0393', 'CTRL_CAUD_13_0038', 'CTRL_CAUD_13_1226', 'CTRL_CAUD_14_0586', 'CTRL_CAUD_14_0941', 'CTRL_CAUD_14_1018', 'CTRL_CAUD_00_38', 'CTRL_CAUD_01_31', 'CTRL_CAUD_03_15', 'CTRL_CAUD_03_39', 'CTRL_CAUD_03_41', 'CTRL_CAUD_03_66', 'CTRL_CAUD_04_38', 'CTRL_CAUD_05_16', 'CTRL_CAUD_08_90', 'CTRL_CAUD_09_35', 'CTRL_CAUD_15_78', 'CTRL_CAUD_16_10', 'CTRL_CAUD_16_32']
HIPP
['CTRH_HIPP_07_1058', 'CTRH_HIPP_07_1144', 'CTRH_HIPP_07_1287', 'CTRH_HIPP_08_0298', 'CTRH_HIPP_10_0867', 'CTRH_HIPP_13_0962', 'CTRH_HIPP_14_0380', 'CTRH_HIPP_14_0513', 'CTRH_HIPP_14_1383', 'CTRH_HIPP_15_1023', 'CTRH_HIPP_15_1025', 'CTRL_HIPP_06_1516', 'CTRL_HIPP_11_0393', 'CTRL_HIPP_13_0038', 'CTRL_HIPP_13_1226', 'CTRL_HIPP_14_0586', 'CTRL_

## Generate Offsets

In [18]:
# cd /home/users/soumyak/rasqual <- Run in Sherlock
# bash /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/make_offsets.sh

## Create Chromosome-Specific Inputs

In [19]:
chroms = ['chr' + str(i) for i in range(1, 23)]
chroms.append('chrX')
chroms.append('chrY')
# print(chroms)
# for reg in multi_bam_regs:
#     print(reg)
#     reg_counts = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                         + reg + '/idr_counts.txt', header=None, sep='\t')
#     reg_offset = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                         + reg + '/offset.txt',header=None, sep='\t')
#     for chrom in chroms:
#         if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                              + reg + '/' + chrom):
#             os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                      + reg + '/' + chrom)
#         print(chrom)
#         reg_chrom_counts = reg_counts.loc[reg_counts[0].apply(lambda x : x.startswith(chrom + '_'))]
#         reg_chrom_offset = reg_offset.loc[reg_offset[0].apply(lambda x : x.startswith(chrom + '_'))]
#         reg_chrom_counts.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                                  + reg + '/' + chrom + '/Y.txt', index=False, header=False, sep='\t')
#         reg_chrom_offset.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
#                                  + reg + '/' + chrom + '/K.txt', index=False, header=False, sep='\t')

## Generate Binaries

In [20]:
# cd /home/users/soumyak/rasqual <- Run in Sherlock
# bash /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/make_binary.sh

## Get Genotyped ASVCFs

In [21]:
for reg in multi_bam_regs:
    print(reg)
    with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
        reg_bams = [i.strip() for i in infile.readlines()]
    asvcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/asvcf.gz',
                        sep='\t', skiprows=1, header=None)
    bam_names = [bam.split('/')[10] for bam in reg_bams]
    bam_patients = [name.split('_')[2] + '_' + name.split('_')[3] for name in bam_names]
    header = ['chrom', 'pos', 'rsid', 'ref', 'alt', 'score', 'qc', 'info', 'format'] + bam_patients
    asvcf.columns = header
    asvcf.set_index('rsid', inplace=True)
    #print(asvcf.head())
    for patient in bam_patients:
        if os.path.isfile('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/quasar/output/'
                            + patient + '/genotypes.txt'):
            genotype = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/quasar/output/'
                                   + patient + '/genotypes.txt', sep='\t')
        genotype = genotype[['rsID', 'map.g0', 'map.g1', 'map.g2']]
        for index,row in genotype.iterrows():
            if row['map.g1'] >= row['map.g0']:
                if row['map.g1'] >= row['map.g2']:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '0/1')
                else:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '1/1')
            else:
                if row['map.g0'] >= row['map.g2']:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '0/0')
                else:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '1/1')
    asvcf.reset_index(inplace=True)
    asvcf = asvcf[header]
    #print(asvcf.head())
    asvcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/genotyped_asvcf.txt',
                    sep='\t', index=False)
    for chrom in chroms:
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                     + reg + '/' + chrom)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                     + reg + '/' + chrom)
        #print(chrom)
        chrom_asvcf = asvcf.loc[asvcf['chrom'] == chrom]
        chrom_asvcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                                 + reg + '/' + chrom + '/asvcf', index=False, header=False, sep='\t')
        bgzip_cmd = 'bgzip -f /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/' + chrom + '/asvcf'
        ! {bgzip_cmd}
        tabix_cmd = 'tabix -f -p vcf /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/' + chrom + '/asvcf.gz'
        ! {tabix_cmd}

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


In [22]:
for reg in multi_bam_regs:
    print(reg)
    for chrom in chroms:
        #print(chrom)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                     + reg + '/' + chrom)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/'
                     + reg + '/' + chrom)

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


## Generate RASQUAL commands

In [23]:
chroms_noXY = ['chr' + str(i) for i in range(1, 23)]
for reg in multi_bam_regs:
    print(reg)
    reg_counts = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                            + reg + '/idr_counts.txt', header=None, sep='\t')
    for chrom in chroms_noXY:
        print(chrom)
        asvcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                            + reg + '/' + chrom + '/asvcf.gz', header=None, sep='\t')
        reg_chrom_counts = reg_counts.loc[reg_counts[0].apply(lambda x : x.startswith(chrom + '_'))]
        #print(asvcf.head())
        #print(reg_chrom_counts.head())
        samples_n = reg_counts.shape[1] - 1
        #print("Samples (n): ", samples_n)
        with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                     + reg + '/' + chrom + '/run_rasqual.sh', 'w') as outfile, \
             open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                     + reg + '/' + chrom + '/run_rasqual_permutation.sh', 'w') as outfile_perm:
            outfile.write('#!/bin/bash\n')
            outfile_perm.write('#!/bin/bash\n')
            for ind,feature in reg_chrom_counts.iterrows():
                #print(ind)
                if feature[0] in final_peaks:
                    feat_j = ind + 1
                    feat_n = feature[0]
                    #print(feat_j)
                    feature_start = int(feature[0].split('_')[1])
                    feature_end = int(feature[0].split('_')[2])
                    testing_start = max(0, feature_start - expand_by)
                    testing_end = feature_end + expand_by
                    feature_snps = 0
                    testing_snps = 0
                    for index2,snp in asvcf.iterrows():
                        if snp[1] >= feature_start and snp[1] <= feature_end:
                            testing_snps += 1
                            feature_snps += 1
                        elif snp[1] >= testing_start and snp[1] <= testing_end:
                            testing_snps += 1
                    assert expand_by == 0
                    assert feature_snps == testing_snps
                    rasqual_cmd = 'tabix /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/' + chrom + '/asvcf.gz ' + chrom + ':' + str(testing_start) + '-' + str(testing_end) \
                                    + ' | /home/users/soumyak/rasqual/bin/rasqual -y /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/idr_counts.bin -k /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/offset.bin -n ' + str(samples_n) + ' -j ' + str(feat_j) + ' -l ' + str(testing_snps) + ' -m ' \
                                    + str(feature_snps) + ' -s ' + str(feature_start) + ' -e ' + str(feature_end) + ' -f ' + feat_n
                    rasqual_cmd_r = 'tabix /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/' + chrom + '/asvcf.gz ' + chrom + ':' + str(testing_start) + '-' + str(testing_end) \
                                    + ' | /home/users/soumyak/rasqual/bin/rasqual -y /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/idr_counts.bin -k /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                    + reg + '/offset.bin -n ' + str(samples_n) + ' -j ' + str(feat_j) + ' -l ' + str(testing_snps) + ' -m ' \
                                    + str(feature_snps) + ' -s ' + str(feature_start) + ' -e ' + str(feature_end) + ' -f ' + feat_n + ' -r'
                    outfile.write(rasqual_cmd + '\n')
                    outfile_perm.write(rasqual_cmd_r + '\n')

CAUD
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
HIPP
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
MDFG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
MDTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
PTMN
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
SUNI
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
SMTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
PARL
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18


## Concatenate RASQUAL outputs

In [24]:
for reg in multi_bam_regs:
    print(reg)
    ! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/{reg}/chr*/rasqual_output.txt | sort -k11,11rn > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/{reg}/all_{reg}_output.txt
    ! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/{reg}/chr*/rasqual_output_permutation.txt | sort -k11,11rn > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/{reg}/all_{reg}_output_permutation.txt    
! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/*/all_*_output.txt | sort -k11,11rn> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/all_regions_output.txt
! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/*/all_*_output_permutation.txt | sort -k11,11rn> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/all_regions_output_permutation.txt

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


## Get P-values

In [25]:
for region in multi_bam_regs:
    print(region)
    region_output = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                                    + region + '/all_' + region + '_output.txt', sep='\t', header=None)
    region_output.columns = ['feature', 'rsid', 'chrom', 'snp_pos', 'ref', 'alt', 'allele_freq', 'HWE_chi_square_statistic',
                             'imputation_quality_score', 'log10_bh_qvalue', 'chi_square_statistic', 'effect_size', 'mapping_error_rate',
                            'ref_allele_mapping_bias', 'overdispersion', 'snp_id_within_region', 'num_feature_snps', 'num_testing_snps',
                            'num_iterations_null_hypothesis', 'null_iterations_alt_hypothesis', 'random_location_of_ties',
                             'log_likelihood_null_hypothesis', 'convergence_status', 'squared_correlation_fsnps', 'squared_correlation_rsnps']
    region_output = region_output.loc[region_output['rsid'] != 'SKIPPED']
    in_feature = []
    for ind,test in region_output.iterrows():
        start = int(test['feature'].split('_')[1])
        end = int(test['feature'].split('_')[2])
        if int(test['snp_pos']) >= start and int(test['snp_pos']) <= end:
            in_feature.append(True)
        else:
            in_feature.append(False)
    region_output['in_feature'] = in_feature
    region_output['region'] = [region for i in range(len(region_output))]
    region_output['raw_pvalue'] = region_output['chi_square_statistic'].map(lambda x : (1 - stats.chi2.cdf(float(x), 1)))
    region_output['cis_window_qvalue'] = 10**region_output['log10_bh_qvalue']
    
    region_permutation = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                                    + region + '/all_' + region + '_output_permutation.txt', sep='\t', header=None)
    region_permutation.columns = ['feature', 'rsid', 'chrom', 'snp_pos', 'ref', 'alt', 'allele_freq', 'HWE_chi_square_statistic',
                             'imputation_quality_score', 'log10_bh_qvalue', 'chi_square_statistic', 'effect_size', 'mapping_error_rate',
                            'ref_allele_mapping_bias', 'overdispersion', 'snp_id_within_region', 'num_feature_snps', 'num_testing_snps',
                            'num_iterations_null_hypothesis', 'null_iterations_alt_hypothesis', 'random_location_of_ties',
                             'log_likelihood_null_hypothesis', 'convergence_status', 'squared_correlation_fsnps', 'squared_correlation_rsnps']
    region_permutation = region_permutation.loc[region_permutation['rsid'] != 'SKIPPED']
    region_permutation['raw_pvalue'] = region_permutation['chi_square_statistic'].map(lambda x : (1 - stats.chi2.cdf(float(x), 1)))
    region_permutation['cis_window_qvalue'] = 10**region_permutation['log10_bh_qvalue']
    
    region_output['null_cis_qvalue'] = region_permutation['cis_window_qvalue']
    region_output.sort_values(by='cis_window_qvalue', inplace=True)
    region_output['genomewide_bh_qvalue'] = sm.stats.fdrcorrection(list(region_output['cis_window_qvalue']))[1]
    region_output.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                            + region + '/final_' + region + '_output.txt', sep='\t', index=False)
    
    display(region_output.head())
    display(region_output.shape)

CAUD


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
0,chr17_78431572_78433595,rs7212019,chr17,78432472,G,A,0.171875,1.378426,1.0,-7.673398,34.864044,0.764919,0.054194,0.143244,2.808754,2.0,8,6,7,6,78432472,16.662091,0,0.993437,0.990011,True,CAUD,3.535502e-09,2.121301e-08,0.002443,1.1e-05
3,chr17_78431572_78433595,rs11077365,chr17,78432412,C,A,0.9375,1.378426,1.0,-5.68607,24.651913,0.778447,0.037254,0.127387,2.468079,1.0,8,6,6,6,78432472,11.668557,0,0.992228,0.981582,True,CAUD,6.867655e-07,2.060297e-06,0.005189,0.000398
1,chr17_45702992_45704386,rs17762308,chr17,45703582,T,C,0.359375,0.888889,1.0,-5.638462,25.425502,0.719547,0.079023,0.338486,2.867342,0.0,5,5,6,6,45703582,11.654617,0,0.989611,0.986383,True,CAUD,4.597995e-07,2.298997e-06,0.005189,0.000398
2,chr17_45818390_45819780,rs62057109,chr17,45819271,T,C,0.21875,0.235102,1.0,-5.436042,24.878265,0.705236,0.039329,0.295675,2.962826,4.0,6,6,8,9,45819271,11.40273,0,0.985955,0.993407,True,CAUD,6.106707e-07,3.664024e-06,0.005189,0.000475
5,chr17_78431572_78433595,rs692329,chr17,78431841,C,T,0.703125,2.492678,1.0,-5.167449,21.576268,0.703714,0.059994,0.179475,2.233529,0.0,8,6,6,6,78432472,9.971584,0,0.993179,0.992335,True,CAUD,3.400333e-06,6.800666e-06,0.010112,0.000664


(519, 31)

HIPP


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
0,chr17_78431572_78433595,rs7212019,chr17,78432472,G,A,0.181818,1.08642,1.0,-8.792368,39.097073,0.79128,0.06059,0.16157,3.462957,2.0,8,4,6,6,78432472,18.747591,0,0.993585,0.991026,True,HIPP,4.032481e-10,1.612992e-09,0.001692,8.500469e-07
1,chr17_78431572_78433595,rs691346,chr17,78433056,G,T,0.727273,3.414141,1.0,-5.049101,21.053873,0.710199,0.069978,0.226901,2.324247,6.0,8,4,6,6,78432472,9.743852,0,0.99309,0.990593,True,HIPP,4.465493e-06,8.930985e-06,0.001692,0.002353315
2,chr17_78431572_78433595,rs11077365,chr17,78432412,C,A,0.818182,1.08642,1.0,-4.347109,17.195408,0.744396,0.04538,0.17318,2.657648,1.0,8,4,8,6,78432472,7.90058,0,0.990224,0.990035,True,HIPP,3.372506e-05,4.496674e-05,0.001692,0.007899158
3,chr17_78431572_78433595,rs692329,chr17,78431841,C,T,0.75,2.148438,1.0,-3.366332,12.396246,0.673225,0.068775,0.231607,2.102971,0.0,8,4,6,6,78432472,5.5135,0,0.993048,0.993161,True,HIPP,0.0004301978,0.0004301978,0.001692,0.05667855
4,chr7_65960407_65960696,rs62470932,chr7,65960530,C,T,0.340909,3.32716,1.0,-2.92287,10.49907,0.306683,5.9e-05,0.595304,11.227217,0.0,1,1,5,5,65960530,4.589529,0,0.896464,0.896464,True,HIPP,0.001194346,0.001194346,0.003558,0.1258841


(527, 31)

MDFG


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
3,chr8_22599221_22600284,rs755934,chr8,22599692,T,G,0.375,0.148148,1.0,-2.385009,8.229762,0.394382,0.000146,0.498644,40.121462,0.0,1,1,5,5,22599692,2.729497,0,0.996045,0.996045,True,MDFG,0.004121,0.004121,0.02466,0.190429
0,chr8_22622374_22623929,rs11780207,chr8,22623006,G,A,0.458333,2.958189,1.0,-2.352835,9.359037,0.392856,4.8e-05,0.494381,29.332559,1.0,2,2,5,5,22623006,3.137108,0,0.996551,0.997301,True,MDFG,0.002219,0.004438,0.017543,0.190429
1,chr17_45825137_45827395,rs62057154,chr17,45827244,C,T,0.333333,4.6875,1.0,-2.225874,8.504661,0.300795,0.042145,0.321162,6.965365,1.0,3,3,6,5,45827244,3.412369,0,0.991781,0.994368,True,MDFG,0.003542,0.005945,0.02466,0.190429
2,chr17_45825137_45827395,rs62057148,chr17,45826119,G,C,0.875,7.647765,1.0,-2.225874,8.300645,0.309707,0.042193,0.321536,5.765476,0.0,3,3,5,5,45827244,3.386116,0,0.991024,0.987951,True,MDFG,0.003963,0.005945,0.02466,0.190429
18,chr11_86067972_86070115,rs2458500,chr11,86068268,A,G,0.833333,0.48,1.0,-2.112704,7.615858,0.662747,0.002169,0.343247,11.983433,2.0,4,4,7,7,86068255,2.791775,0,0.685511,0.678352,True,MDFG,0.005786,0.007714,0.088001,0.190429


(553, 31)

MDTG


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
0,chr11_60251518_60251763,rs636341,chr11,60251688,A,C,0.75,2.958189,1.0,-4.614461,17.83144,0.287161,7.9e-05,0.377744,19.795544,1.0,2,2,5,4,60251688,7.626432,0,0.996875,0.997271,True,MDTG,2.4e-05,2.4e-05,0.029677,0.004054
1,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.75,2.958189,1.0,-4.614461,17.818867,0.287154,7.9e-05,0.377505,19.789926,0.0,2,2,4,4,60251688,7.620118,0,0.997225,0.997193,True,MDTG,2.4e-05,2.4e-05,0.028981,0.004054
2,chr17_78431572_78433595,rs7212019,chr17,78432472,G,A,0.166667,0.48,1.0,-4.157169,17.630667,0.723166,0.036605,0.183028,3.364151,6.0,18,18,5,5,78432472,8.120604,0,0.992652,0.99077,True,MDTG,2.7e-05,7e-05,0.028981,0.004054
3,chr17_78431572_78433595,rs691362,chr17,78432944,T,C,0.166667,0.48,1.0,-4.157169,17.627564,0.723161,0.036603,0.183028,3.363916,9.0,18,18,5,5,78432472,8.119062,0,0.992666,0.990261,True,MDTG,2.7e-05,7e-05,0.028981,0.004054
4,chr17_78431572_78433595,rs72907463,chr17,78432579,G,A,0.166667,0.48,1.0,-4.157169,17.627417,0.72316,0.036602,0.18303,3.363953,8.0,18,18,5,5,78432472,8.119033,0,0.992666,0.990462,True,MDTG,2.7e-05,7e-05,0.028981,0.004054


(524, 31)

PTMN


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
0,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.818182,2.272727,1.0,-4.343199,16.632233,0.251451,0.000382,0.48246,21.296903,0.0,1,1,5,5,60251677,7.212099,0,0.996979,0.996979,True,PTMN,4.5e-05,4.5e-05,0.030643,0.012513
1,chr17_45825137_45827395,rs62057154,chr17,45827244,C,T,0.318182,1.517551,1.0,-3.997813,16.552042,0.236428,0.023228,0.425026,8.262196,4.0,5,4,6,6,45827244,7.408472,0,0.992381,0.994588,True,PTMN,4.7e-05,0.000101,0.016011,0.012513
2,chr17_45825137_45827395,rs62057147,chr17,45825932,G,A,0.909091,3.227431,1.0,-3.997813,15.781771,0.242174,0.024419,0.412235,8.062102,1.0,5,4,6,6,45827244,7.022824,0,0.991537,0.99626,True,PTMN,7.1e-05,0.000101,0.030718,0.012513
3,chr17_45825137_45827395,rs1912151,chr17,45825578,C,T,0.909091,3.227431,1.0,-3.997813,15.670806,0.242677,0.024599,0.413606,8.004409,0.0,5,4,6,6,45827244,6.974272,0,0.993062,0.990183,True,PTMN,7.5e-05,0.000101,0.030718,0.012513
4,chr17_45639355_45639798,rs413778,chr17,45639519,A,G,0.727273,2.090072,1.0,-3.751973,14.060507,0.74828,0.016716,0.514541,15.421235,0.0,1,1,6,6,45639519,5.765139,0,0.98705,0.98705,True,PTMN,0.000177,0.000177,0.030718,0.017631


(498, 31)

SUNI


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
1,chr17_45894052_45896463,rs11575895,chr17,45894419,A,G,0.357143,0.062222,1.0,-2.561455,11.480721,0.671862,0.010848,0.301631,7.377854,3.0,13,13,9,10,45894419,4.588737,0,0.978662,0.869372,True,SUNI,0.000703,0.002745,0.043506,0.099234
2,chr17_45894052_45896463,rs62056779,chr17,45894571,C,A,0.357143,0.062222,1.0,-2.561455,11.46703,0.671855,0.01085,0.301618,7.376435,4.0,13,13,9,10,45894419,4.582022,0,0.978731,0.869331,True,SUNI,0.000708,0.002745,0.034931,0.099234
3,chr17_45894052_45896463,rs80233201,chr17,45894115,T,C,0.357143,0.062222,1.0,-2.561455,10.719334,0.674705,0.010953,0.298847,7.208121,1.0,13,13,9,10,45894419,4.214025,0,0.978907,0.869343,True,SUNI,0.00106,0.002745,0.043506,0.099234
4,chr17_45894052_45896463,rs111972148,chr17,45895755,G,C,0.357143,0.062222,1.0,-2.561455,10.718434,0.674692,0.010955,0.298866,7.209797,8.0,13,13,9,10,45894419,4.21347,0,0.978911,0.868927,True,SUNI,0.001061,0.002745,0.047231,0.099234
5,chr17_45894052_45896463,rs74548327,chr17,45895714,A,G,0.642857,0.884383,1.0,-2.561455,10.29734,0.666734,0.011023,0.31552,6.77984,7.0,13,13,10,10,45894419,3.985714,0,0.977652,0.887832,True,SUNI,0.001332,0.002745,0.047231,0.099234


(401, 31)

SMTG


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
4,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.947368,0.120222,1.0,-2.646028,9.325969,0.349554,0.023827,0.45427,14.67685,0.0,1,1,5,5,60251677,3.764374,0,0.990307,0.990307,True,SMTG,0.002259,0.002259,0.03464,0.280309
0,chr17_45702992_45704386,rs17762308,chr17,45703582,T,C,0.342105,0.824653,1.0,-2.609435,11.141087,0.684378,0.051434,0.236782,4.800978,0.0,5,4,8,8,45703582,4.751464,0,0.988776,0.98482,True,SMTG,0.000844,0.002458,0.011683,0.280309
2,chr17_45702992_45704386,rs62056879,chr17,45704060,T,C,0.342105,0.824653,1.0,-2.609435,9.891001,0.66871,0.052973,0.242037,5.23267,2.0,5,4,10,8,45703582,4.025779,0,0.985551,0.973879,True,SMTG,0.001661,0.002458,0.047964,0.280309
3,chr17_45702992_45704386,rs968028,chr17,45703739,A,G,0.342105,0.824653,1.0,-2.609435,9.699255,0.668376,0.052029,0.243765,4.841157,1.0,5,4,8,8,45703582,3.975581,0,0.986465,0.976028,True,SMTG,0.001843,0.002458,0.032534,0.280309
1,chr17_45870069_45870629,rs56398500,chr17,45870122,A,G,0.342105,0.824653,1.0,-2.486283,9.923435,0.349655,0.133834,0.209595,34.113632,0.0,2,2,7,7,45870122,3.698863,0,0.989065,0.985043,True,SMTG,0.001632,0.003264,0.020303,0.280309


(498, 31)

PARL


Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,HWE_chi_square_statistic,imputation_quality_score,log10_bh_qvalue,chi_square_statistic,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,num_iterations_null_hypothesis,null_iterations_alt_hypothesis,random_location_of_ties,log_likelihood_null_hypothesis,convergence_status,squared_correlation_fsnps,squared_correlation_rsnps,in_feature,region,raw_pvalue,cis_window_qvalue,null_cis_qvalue,genomewide_bh_qvalue
0,chr17_45825137_45827395,rs62057155,chr17,45827307,C,T,0.333333,1.655026,1.0,-2.621835,13.396173,0.71611,0.130264,0.123697,2.667632,13.0,14,14,5,5,45827307,6.089785,0,0.990664,0.990264,True,PARL,0.000252,0.002389,0.008219,0.249884
1,chr17_45825137_45827395,rs62057154,chr17,45827244,C,T,0.333333,1.655026,1.0,-2.621835,11.757083,0.706438,0.132745,0.123332,2.540562,12.0,14,14,5,5,45827307,5.31094,0,0.99061,0.989764,True,PARL,0.000606,0.002389,0.04527,0.249884
2,chr17_45825137_45827395,rs17763050,chr17,45825970,G,A,0.055556,0.062284,1.0,-2.621835,11.541501,0.720707,0.127308,0.125002,3.177627,5.0,14,14,5,5,45827307,5.226442,0,0.990575,0.970495,True,PARL,0.000681,0.002389,0.04527,0.249884
3,chr17_45825137_45827395,rs62057148,chr17,45826119,G,C,0.055556,0.062284,1.0,-2.621835,11.536333,0.720665,0.127318,0.125014,3.174356,6.0,14,14,6,5,45827307,5.223527,0,0.9906,0.969942,True,PARL,0.000682,0.002389,0.04527,0.249884
4,chr8_22599221_22600284,rs755934,chr8,22599692,T,G,0.305556,0.571478,1.0,-2.607441,9.163267,0.37323,0.007269,0.554673,22.269325,0.0,1,1,4,4,22599692,3.459936,0,0.99374,0.99374,True,PARL,0.002469,0.002469,0.076945,0.249884


(506, 31)

In [26]:
# run run_permutation_rasqual.ipynb

In [27]:
with pd.ExcelWriter('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                    + 'rasqual_output.xlsx') as writer:  
    for region in multi_bam_regs:
        print(region)
        region_output = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                                    + region + '/final_' + region + '_output.txt', sep='\t')
        region_output.to_excel(writer, sheet_name=region, index=False)

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


In [28]:
! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/*/final_*_output.txt | grep -v ^feature > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/final_regions_output.txt
! cat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/*/final_*_output_permutation.txt | grep -v ^feature > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/final_regions_output_permutation.txt



In [89]:
ryan_bias_snps = ['rs1237999','rs3755519','rs6781790','rs55682376','rs12119232','rs147889095',
                'rs3104768','rs58392387','rs17631303','rs17762308','rs56327054','rs12150223',
                'rs2532307','rs72914882','rs72914893','rs72914895','rs600834','rs600850','rs11077365',
                'rs72907463','rs691362','rs691346','rs691331','rs691328','rs691317','rs6501212',
                'rs76516995','rs10182292','rs744373','rs12493578']
in_original = []
in_original_sig = []
combined_output = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/final_regions_output.txt', sep='\t', header=None)
display(combined_output.head())
combined_output.columns = ['feature', 'rsid', 'chrom', 'snp_pos', 'ref', 'alt', 'allele_freq', 'HWE_chi_square_statistic',
                             'imputation_quality_score', 'log10_bh_qvalue', 'chi_square_statistic', 'effect_size', 'mapping_error_rate',
                            'ref_allele_mapping_bias', 'overdispersion', 'snp_id_within_region', 'num_feature_snps', 'num_testing_snps',
                            'num_iterations_null_hypothesis', 'null_iterations_alt_hypothesis', 'random_location_of_ties',
                             'log_likelihood_null_hypothesis', 'convergence_status', 'squared_correlation_fsnps', 'squared_correlation_rsnps',
                              'in_feature', 'region', 'raw_pvalue', 'cis_window_qvalue', 'null_cis_qvalue', 'genomewide_bh_qvalue',
                               'permutation_significant', 'fdr10_threshold']
combined_output.sort_values(by=['rsid','cis_window_qvalue'], inplace=True)
sig_snps = combined_output.loc[combined_output['permutation_significant'] == True]
for ind,val in combined_output.iterrows():
    if val['rsid'] in ryan_bias_snps:
        in_original.append(True)
    else:
        in_original.append(False)
combined_output['in_original'] = in_original

for ind,val in sig_snps.iterrows():
    if val['rsid'] in ryan_bias_snps:
        in_original_sig.append(True)
    else:
        in_original_sig.append(False)
sig_snps['in_original'] = in_original_sig
unique_sig_snps = sig_snps.drop_duplicates(subset='rsid')
unique_sig_snps.sort_values(by=['cis_window_qvalue'], inplace=True)
display('All Significant SNPs: ', sig_snps.shape)
display('Unique Significant SNPs: ', unique_sig_snps.shape)
sig_snps.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/rasqual_significant_snps.csv', sep='\t', index=False)
unique_sig_snps.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/unique_rasqual_significant_snps.csv', sep='\t', index=False)

'All Significant SNPs: '

(209, 34)

'Unique Significant SNPs: '

(116, 34)

## Plot P-Value Distributions

In [32]:
# for region in multi_bam_regs:
#     print(region)
#     region_output = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
#                                     + region + '/final_' + region + '_output.txt', sep='\t')
#     print(len(region_output))
#     display(p9.ggplot(data=region_output, mapping=p9.aes(x='raw_pvalue')) + p9.geom_histogram(bins=20) + p9.labs(title='Raw P-value: ' + region))
#     display(p9.ggplot(data=region_output, mapping=p9.aes(x='cis_window_qvalue')) + p9.geom_histogram(bins=20) + p9.labs(title='Q-value: ' + region))
#     display(p9.ggplot(data=region_output, mapping=p9.aes(x='null_cis_qvalue')) + p9.geom_histogram(bins=20) + p9.labs(title='Null Q-value: ' + region))

In [33]:
adpd_stage3 = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/snps_final/191121_ld_buddies_table_stage3.tsv', sep='\t')
display(adpd_stage3.head())
display(adpd_stage3.shape)

Unnamed: 0,chr,pos,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,snp_id,locus_num,number_ld_buddies,ld_tag_locus,rsid,chrom_hg19,snp_pos_hg19,file,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell
0,10,102207833,0.805869,10,102255522,Nalls_23andMe,LD,10_102207833,1,9,10_102255522,rs10883717,10.0,103967590.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,T,C,-,0.014434,False,False,none,CAUD,102225620,102226734,17780.0,False,none,ALLO,102225838,102226054,18005.0,False,none,microglia,102213693,102214072,5860.0
1,10,102245653,0.98749,10,102255522,Nalls_23andMe,LD,10_102245653,1,9,10_102255522,rs2296887,10.0,104005410.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,T,C,-,0.006977,False,True,"CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI","CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI",102244764,102246025,258.5,False,none,ALLO,102245836,102246040,183.0,True,"astrocytes,doublets,excitatory_neurons,inhibit...","astrocytes,doublets,excitatory_neurons,inhibit...",102244973,102245978,177.5
2,10,102250385,0.809348,10,102255522,Nalls_23andMe,LD,10_102250385,1,9,10_102255522,rs7913281,10.0,104010142.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,A,+,0.012859,False,False,none,CAUD,102244791,102246164,4221.0,False,none,ALLO,102245836,102246040,4345.0,False,none,opcs,102247774,102248243,2142.0
3,10,102251214,0.809348,10,102255522,Nalls_23andMe,LD,10_102251214,1,9,10_102255522,rs10883720,10.0,104010971.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,C,-,0.012617,False,False,none,CAUD,102244791,102246164,5050.0,False,none,ALLO,102245836,102246040,5174.0,False,none,opcs,102247774,102248243,2971.0
4,10,102255522,1.0,10,102255522,Nalls_23andMe,Nalls-Chang,10_102255522,1,9,10_102255522,rs10748818,10.0,104015279.0,/users/mgloud/projects/ad-pd/data/gwas/prepare...,G,A,+,0.006335,False,False,none,CAUD,102244791,102246164,9358.0,False,none,ALLO,102245836,102246040,9482.0,False,none,inhibitory_neurons,102255548,102256050,26.0


(13773, 38)

In [34]:
trimmed_sig_snps = sig_snps[['feature', 'rsid', 'chrom', 'snp_pos', 'ref', 'alt', 'allele_freq',
                            'effect_size', 'mapping_error_rate', 'ref_allele_mapping_bias', 'overdispersion',
                            'snp_id_within_region', 'num_feature_snps', 'num_testing_snps',
                            'in_feature', 'region', 'raw_pvalue', 'cis_window_qvalue', 'genomewide_bh_qvalue',
                            'permutation_significant', 'fdr10_threshold', 'in_original']]
trimmed_adpd_stage3 = adpd_stage3[['rsid', 'r2_with_ld_tag', 'ld_tag_chr', 'ld_tag_pos', 'source_gwas', 'source',
                                    'locus_num', 'number_ld_buddies', 'ld_tag_locus', 'effect_allele', 'noneffect_allele', 'direction',
                                    'pvalue', 'has_coloc', 'direct_atac_overlap_narrow_tissue_regions',
                                    'containing_atac_tissues_narrow_tissue_regions',
                                    'nearest_atac_tissue_narrow_tissue_regions',
                                    'start_narrow_tissue_regions',
                                    'end_narrow_tissue_regions',
                                    'dist_narrow_tissue_regions',
                                    'direct_atac_overlap_broad_tissue_regions',
                                    'containing_atac_tissues_broad_tissue_regions',
                                    'nearest_atac_tissue_broad_tissue_regions',
                                    'start_broad_tissue_regions',
                                    'end_broad_tissue_regions',
                                    'dist_broad_tissue_regions',
                                    'direct_atac_overlap_single_cell',
                                    'containing_atac_tissues_single_cell',
                                    'nearest_atac_tissue_single_cell',
                                    'start_single_cell',
                                    'end_single_cell',
                                    'dist_single_cell']]
trimmed_adpd_stage3.drop_duplicates(inplace=True)
trim_sig_merge = trimmed_sig_snps.merge(trimmed_adpd_stage3, on='rsid')
trim_sig_merge.sort_values(by=['chrom', 'snp_pos', 'rsid', 'feature', 'region'], inplace=True)
display(trim_sig_merge.head())
display(trim_sig_merge.shape)

Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,in_feature,region,raw_pvalue,cis_window_qvalue,genomewide_bh_qvalue,permutation_significant,fdr10_threshold,in_original,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,locus_num,number_ld_buddies,ld_tag_locus,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell
18,chr1_161185016_161186504,rs11585858,chr1,161186243,C,A,0.552632,0.382894,4.6e-05,0.441369,40.915012,1,2,2,True,SMTG,0.009947,0.019893,0.586346,True,0.032534,False,1.0,1,161186243,Alzheimers_Jansen_2018,GWAS,129,2,1_161186243,A,C,+,5.58e-10,True,True,"HIPP,MDFG,PARL,SMTG","HIPP,MDFG,PARL,SMTG",161185040,161186430,422.0,True,ALLO,ALLO,161185897,161186291,149.0,True,microglia,microglia,161185232,161186486,384.0
0,chr1_232528515_232530038,rs10797576,chr1,232528865,C,T,0.28125,0.359946,2.8e-05,0.581322,27.696111,0,1,1,True,CAUD,0.000556,0.000556,0.016024,True,0.005189,False,1.0,1,232528865,Chang_23andMe_Parkinsons,Nalls-Chang,22,14,1_232528865,T,C,+,0.000748,False,True,"CAUD,PARL,PTMN,SMTG","CAUD,PARL,PTMN,SMTG",232528497,232530050,287.5,False,none,STRI,232528568,232528768,97.0,True,"excitatory_neurons,inhibitory_neurons,neurons_...","excitatory_neurons,inhibitory_neurons,neurons_...",232528552,232529236,29.0
1,chr1_232528515_232530038,rs10797576,chr1,232528865,C,T,0.28125,0.359946,2.8e-05,0.581322,27.696111,0,1,1,True,CAUD,0.000556,0.000556,0.016024,True,0.005189,False,1.0,1,232528865,Nalls_23andMe,Nalls-Chang,22,14,1_232528865,T,C,+,0.000748,False,True,"CAUD,PARL,PTMN,SMTG","CAUD,PARL,PTMN,SMTG",232528497,232530050,287.5,False,none,STRI,232528568,232528768,97.0,True,"excitatory_neurons,inhibitory_neurons,neurons_...","excitatory_neurons,inhibitory_neurons,neurons_...",232528552,232529236,29.0
163,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.833333,0.387836,0.007736,0.49199,18.930226,0,2,2,True,MDFG,0.008743,0.008745,0.190429,True,0.02466,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5
164,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.833333,0.387836,0.007736,0.49199,18.930226,0,2,2,True,MDFG,0.008743,0.008745,0.190429,True,0.02466,False,1.0,11,60251677,Alzheimers_Kunkle_2019,GWAS,11,261,11_60251677,T,C,+,5.91e-15,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5


(276, 53)

In [35]:
snp_dict = {}
for ind,val in trim_sig_merge.iterrows():
    if val['rsid'] in snp_dict:
        snp_dict[val['rsid']].append((val['source_gwas'],val['pvalue']))
    else:
        snp_dict[val['rsid']] = []
        snp_dict[val['rsid']].append((val['source_gwas'],val['pvalue']))
for snp in snp_dict:
    snp_dict[snp] = sorted(snp_dict[snp], key=lambda x: x[-1])
for ind,val in trim_sig_merge.iterrows():
    val['source_gwas'] = snp_dict[val['rsid']][0][0]
    val['pvalue'] = snp_dict[val['rsid']][0][1]
gwas_trim_sig_merge = trim_sig_merge.drop_duplicates(subset=['chrom', 'snp_pos', 'rsid', 'feature', 'region'])
display(gwas_trim_sig_merge.head())
display(gwas_trim_sig_merge.shape)
gwas_trim_sig_merge.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/gwas_rasqual_significant_snps.csv', sep='\t', index=False)
unique_gwas_trim_sig_merge = gwas_trim_sig_merge.sort_values(by=['rsid', 'cis_window_qvalue'])
unique_gwas_trim_sig_merge.drop_duplicates(subset='rsid', inplace=True)
unique_gwas_trim_sig_merge.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/unique_gwas_rasqual_significant_snps.csv', sep='\t', index=False)

Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,in_feature,region,raw_pvalue,cis_window_qvalue,genomewide_bh_qvalue,permutation_significant,fdr10_threshold,in_original,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,locus_num,number_ld_buddies,ld_tag_locus,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell
18,chr1_161185016_161186504,rs11585858,chr1,161186243,C,A,0.552632,0.382894,4.6e-05,0.441369,40.915012,1,2,2,True,SMTG,0.009947,0.019893,0.586346,True,0.032534,False,1.0,1,161186243,Alzheimers_Jansen_2018,GWAS,129,2,1_161186243,A,C,+,5.58e-10,True,True,"HIPP,MDFG,PARL,SMTG","HIPP,MDFG,PARL,SMTG",161185040,161186430,422.0,True,ALLO,ALLO,161185897,161186291,149.0,True,microglia,microglia,161185232,161186486,384.0
0,chr1_232528515_232530038,rs10797576,chr1,232528865,C,T,0.28125,0.359946,2.8e-05,0.581322,27.696111,0,1,1,True,CAUD,0.000556,0.000556,0.016024,True,0.005189,False,1.0,1,232528865,Chang_23andMe_Parkinsons,Nalls-Chang,22,14,1_232528865,T,C,+,0.000748,False,True,"CAUD,PARL,PTMN,SMTG","CAUD,PARL,PTMN,SMTG",232528497,232530050,287.5,False,none,STRI,232528568,232528768,97.0,True,"excitatory_neurons,inhibitory_neurons,neurons_...","excitatory_neurons,inhibitory_neurons,neurons_...",232528552,232529236,29.0
163,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.833333,0.387836,0.007736,0.49199,18.930226,0,2,2,True,MDFG,0.008743,0.008745,0.190429,True,0.02466,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5
151,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.75,0.287154,7.9e-05,0.377505,19.789926,0,2,2,True,MDTG,2.4e-05,2.4e-05,0.004054,True,0.0257,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5
154,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.818182,0.251451,0.000382,0.48246,21.296903,0,1,1,True,PTMN,4.5e-05,4.5e-05,0.012513,True,0.030718,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5


(209, 53)

In [36]:
orig_sig_snps = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/orig_sig_snps.tsv', sep='\t')
orig_sig_snps = orig_sig_snps[['rsid', 'ism_pval', 'delta_pval', 'explain_pval', 'confidence',
                              'motif', 'diff_motif', 'fc_track', 'pval_track', 'annotation', 'cluster']]
display(orig_sig_snps.head())
display(orig_sig_snps.shape)

Unnamed: 0,rsid,ism_pval,delta_pval,explain_pval,confidence,motif,diff_motif,fc_track,pval_track,annotation,cluster
0,rs2296887,0.021475,0.022689,0.125544,1,AACAGGCTCC,AACAGGC(T/C)CC,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,12411
1,rs7900536,0.029275,0.030456,0.191728,1,CCTCTAGAGGGATGATTCA,CCTCTAGAGGGATGATTC(A/C),http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,127
2,rs2856653,0.028402,0.028299,0.134918,1,TCTCCCACGG,TCTCCCA(C/T)GG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,2
3,rs11039225,0.046656,0.047686,0.999999,0,TGGAAGTCCT,TGGA(A/G)GTCCT,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,19
4,rs636317,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024


(123, 11)

In [37]:
orig_sig_merge = gwas_trim_sig_merge.merge(orig_sig_snps, on='rsid')
display(orig_sig_merge.head())
display(orig_sig_merge.shape)
orig_sig_merge.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/ml_gwas_rasqual_significant_snps.csv', sep='\t', index=False)
unique_orig_sig_merge = orig_sig_merge.sort_values(by=['rsid'])
unique_orig_sig_merge.drop_duplicates(subset='rsid', inplace=True)
unique_orig_sig_merge.sort_values(by=['confidence', 'feature'], ascending=False, inplace=True)
unique_orig_sig_merge.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/unique_ml_gwas_rasqual_significant_snps.csv', sep='\t', index=False)

Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,in_feature,region,raw_pvalue,cis_window_qvalue,genomewide_bh_qvalue,permutation_significant,fdr10_threshold,in_original,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,locus_num,number_ld_buddies,ld_tag_locus,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell,ism_pval,delta_pval,explain_pval,confidence,motif,diff_motif,fc_track,pval_track,annotation,cluster
0,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.833333,0.387836,0.007736,0.49199,18.930226,0,2,2,True,MDFG,0.008743,0.008745,0.190429,True,0.02466,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024
1,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.75,0.287154,7.9e-05,0.377505,19.789926,0,2,2,True,MDTG,2.4e-05,2.4e-05,0.004054,True,0.0257,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024
2,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.818182,0.251451,0.000382,0.48246,21.296903,0,1,1,True,PTMN,4.5e-05,4.5e-05,0.012513,True,0.030718,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024
3,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.947368,0.349554,0.023827,0.45427,14.67685,0,1,1,True,SMTG,0.002259,0.002259,0.280309,True,0.032534,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024
4,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.678571,0.318225,0.016438,0.42532,10.7544,0,2,1,True,SUNI,0.007008,0.007008,0.165306,True,0.043506,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024


(40, 63)

In [38]:
display(unique_orig_sig_merge.head(10))
display(unique_orig_sig_merge.shape)

Unnamed: 0,feature,rsid,chrom,snp_pos,ref,alt,allele_freq,effect_size,mapping_error_rate,ref_allele_mapping_bias,overdispersion,snp_id_within_region,num_feature_snps,num_testing_snps,in_feature,region,raw_pvalue,cis_window_qvalue,genomewide_bh_qvalue,permutation_significant,fdr10_threshold,in_original,r2_with_ld_tag,ld_tag_chr,ld_tag_pos,source_gwas,source,locus_num,number_ld_buddies,ld_tag_locus,effect_allele,noneffect_allele,direction,pvalue,has_coloc,direct_atac_overlap_narrow_tissue_regions,containing_atac_tissues_narrow_tissue_regions,nearest_atac_tissue_narrow_tissue_regions,start_narrow_tissue_regions,end_narrow_tissue_regions,dist_narrow_tissue_regions,direct_atac_overlap_broad_tissue_regions,containing_atac_tissues_broad_tissue_regions,nearest_atac_tissue_broad_tissue_regions,start_broad_tissue_regions,end_broad_tissue_regions,dist_broad_tissue_regions,direct_atac_overlap_single_cell,containing_atac_tissues_single_cell,nearest_atac_tissue_single_cell,start_single_cell,end_single_cell,dist_single_cell,ism_pval,delta_pval,explain_pval,confidence,motif,diff_motif,fc_track,pval_track,annotation,cluster
11,chr17_4901692_4902448,rs79436576,chr17,4901872,C,T,0.090909,0.673282,0.005003,0.51453,33.035472,0,3,3,True,PTMN,0.019599,0.019988,0.348312,True,0.030718,False,0.873516,17,4860256,Alzheimers_Kunkle_2019,LD,55,80,17_4860256,T,C,+,0.000306,True,True,"HIPP,MDFG,PARL,SMTG","HIPP,MDFG,PARL,SMTG",4901724,4903885,904.0,True,ISOC,ISOC,4901711,4901911,61.0,True,"astrocytes,microglia,oligodendrocytes","astrocytes,microglia,oligodendrocytes",4901620,4902717,296.5,0.004616,0.004449,0.041519,2,TGGCCCCGCCCC,TGGCC(C/T)CGCCCC,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,Many possible linked genes. ARRB2 - arrestin B...,6111314151920212224
16,chr17_45825137_45827395,rs62057150,chr17,45826182,C,T,0.555556,0.662942,0.135228,0.132052,2.083686,8,14,14,True,PARL,0.009645,0.015776,0.448529,True,0.031533,False,1.0,17,45826182,23andme_PD_hg38,GWAS,54,2496,17_45826182,T,C,-,1.33e-22,True,True,"CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI","CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI",45825194,45826634,268.0,False,none,ALLO,45826333,45826558,151.0,True,oligodendrocytes,oligodendrocytes,45825230,45826777,178.5,0.042619,0.045282,0.031732,2,CCATTGGTCG,CCAT(T/C)GGTCG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,MAPT Locus,23
13,chr17_45702992_45704386,rs968027,chr17,45703884,C,T,0.90625,0.663832,0.083686,0.354419,2.166129,2,5,5,True,CAUD,0.000616,0.000715,0.018553,True,0.005189,False,1.0,17,45703884,23andme_PD_hg38,GWAS,54,2496,17_45703884,T,C,-,2.07e-22,True,True,"CAUD,PARL,PTMN,SMTG","CAUD,PARL,PTMN,SMTG",45702996,45706091,659.5,False,none,ISOC,45703478,45703678,206.0,True,"excitatory_neurons,inhibitory_neurons","excitatory_neurons,inhibitory_neurons",45703290,45704925,223.5,0.007962,0.008333,0.04134,2,TGACTAA,TGAC(T/C)AA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,MAPT Locus,2
9,chr16_31117024_31117373,rs4889619,chr16,31117294,C,T,0.888889,0.611059,0.008871,0.476374,8.603434,0,2,2,True,PARL,0.022687,0.029441,0.572968,True,0.031533,False,0.919429,16,31110472,Chang_23andMe_Parkinsons,LD,48,47,16_31110472,T,C,+,0.0340138,False,True,"CAUD,HIPP,MDFG,PARL,SMTG,SUNI","CAUD,HIPP,MDFG,PARL,SMTG,SUNI",31117280,31118525,608.5,False,none,SUNI,31117841,31118041,547.0,True,"excitatory_neurons,inhibitory_neurons",microglia,31117415,31118239,121.0,0.022485,0.024622,0.044335,2,CGGAGGCGG,(C/T)GGAGGCGG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"KAT8, multiple cells, also in AD?? Im surprisd...",23
7,chr11_86103106_86104633,rs1237999,chr11,86103988,G,A,0.5,0.342093,0.005153,0.373118,9.175715,1,2,2,True,PTMN,0.008599,0.010799,0.347583,True,0.030718,True,1.0,11,86103988,Alzheimers_Jansen_2018,GWAS,16,163,11_86103988,G,A,-,7.14e-16,False,True,"CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI","CAUD,HIPP,MDFG,PARL,PTMN,SMTG,SUNI",86103138,86104596,121.0,True,STRI,STRI,86103829,86104179,16.0,True,"doublets,neurons_unknown,oligodendrocytes","doublets,neurons_unknown,oligodendrocytes",86103388,86104560,14.0,0.001043,0.001603,0.006443,2,TATGAGTCACC,TAT(G/A)AGTCACC,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"PICALM, oligo specific",13713181920212223
0,chr11_60251518_60251763,rs636317,chr11,60251677,C,T,0.833333,0.387836,0.007736,0.49199,18.930226,0,2,2,True,MDFG,0.008743,0.008745,0.190429,True,0.02466,False,1.0,11,60251677,Alzheimers_Jansen_2018,GWAS,11,385,11_60251677,C,T,-,2.83e-13,False,False,none,MDFG,60264747,60266009,13070.0,False,none,ISOC,60265062,60265262,13385.0,True,"excitatory_neurons,microglia","excitatory_neurons,microglia",60251481,60252738,432.5,0.000131,0.000235,0.027467,2,TGCTGCCATCTGCTGGGAA,TGCTGC(C/T)ATCTGCTGGGAA,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,"MS4A4A, microglia specific",14192024
39,chr8_22599221_22600284,rs755934,chr8,22599692,T,G,0.714286,0.385599,0.000215,0.428795,24.758803,0,1,1,True,SUNI,0.00297,0.00297,0.099234,True,0.043506,False,0.808324,8,22668467,Chang_23andMe_Parkinsons,LD,125,32,8_22668467,T,G,-,0.0510743,False,False,none,PARL,22597288,22597925,1751.0,False,none,ISOC,22597666,22597887,1805.0,True,"astrocytes,inhibitory_neurons,microglia,opcs","astrocytes,inhibitory_neurons,microglia,opcs",22598476,22600229,339.5,0.043331,0.032394,0.092178,1,GCATTGGCCG,GCA(T/G)TGGCCG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,7
36,chr7_100217213_100217819,rs1727139,chr7,100217744,C,T,0.583333,0.346145,0.000634,0.436404,16.179343,0,1,1,True,MDFG,0.008799,0.008799,0.190429,True,0.02466,False,1.0,7,100217744,Alzheimers_Jansen_2018,GWAS,117,109,7_100217744,C,T,-,9.85e-10,True,True,PTMN,PTMN,100217219,100217897,186.0,False,none,ALLO,100218052,100218259,308.0,True,"microglia,oligodendrocytes","microglia,oligodendrocytes",100216695,100217925,403.5,0.039411,0.0386,0.12015,1,GCGGAGAGAGAGAG,G(C/T)GGAGAGAGAGAG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,9
30,chr2_95323422_95324085,rs3755519,chr2,95323524,A,T,0.710526,0.377405,6.6e-05,0.419191,15.6021,0,1,1,True,SMTG,0.003689,0.003689,0.280309,True,0.032534,True,0.954653,2,95335195,Nalls_23andMe,LD,85,94,2_95335195,T,A,+,0.00768245,False,True,"CAUD,MDFG,PARL,SMTG","CAUD,MDFG,PARL,SMTG",95323063,95324126,64.5,True,"ISOC,STRI","ISOC,STRI",95323441,95323644,18.5,True,"excitatory_neurons,inhibitory_neurons","excitatory_neurons,inhibitory_neurons",95323307,95323985,122.0,0.011536,0.01424,0.146132,1,AGTGCTCTCTG,(A/T)GTGCTCTCTG,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,1345710141522
28,chr19_18461919_18462167,rs8103622,chr19,18462024,C,T,0.375,0.368914,7.4e-05,0.546707,16.761091,0,1,1,True,MDTG,0.020112,0.020112,0.501839,True,0.0257,False,0.988859,19,18448066,Alzheimers_Jansen_2018,LD,69,73,19_18448066,T,C,+,0.07305735,True,True,"MDFG,PTMN,SMTG","MDFG,PTMN,SMTG",18461893,18462180,11.0,True,"ISOC,STRI","ISOC,STRI",18461893,18462093,31.0,True,"excitatory_neurons,inhibitory_neurons,oligoden...","excitatory_neurons,inhibitory_neurons,oligoden...",18461930,18462272,77.0,0.036737,0.039399,0.389165,1,AGAGCGCCACCTGCT,AGAG(C/T)GCCACCTGCT,http://mitra.stanford.edu/kundaje/projects/alz...,http://mitra.stanford.edu/kundaje/projects/alz...,---,21120212224


(19, 63)

In [39]:
gwas_trim_sig_merge['feature'].value_counts()

chr17_78431572_78433595     53
chr17_45894052_45896463     24
chr17_45825137_45827395     18
chr17_45702992_45704386     13
chr5_102971168_102972471     8
chr17_45818390_45819780      7
chr11_60251518_60251763      7
chr17_45583892_45585718      4
chr17_45740360_45742185      4
chr16_31117024_31117373      4
chr11_86103106_86104633      4
chr11_86067972_86070115      4
chr8_22622374_22623929       3
chr17_4901692_4902448        3
chr8_22599221_22600284       3
chr17_45976701_45977374      2
chr4_89834926_89835137       2
chr2_127105481_127107473     2
chr17_46130574_46131153      2
chr17_45585766_45586646      2
chr4_89762368_89763049       2
chr3_48692599_48693032       2
chr2_95323422_95324085       2
chr17_45914420_45914901      2
chr4_89722620_89722820       2
chr17_45841532_45841760      1
chr19_18513115_18513639      1
chr16_70676530_70676768      1
chr8_22672754_22674048       1
chr2_101823575_101823819     1
chr17_45907812_45908508      1
chr3_48966889_48967612       1
chr3_521

In [41]:
gwas_trim_sig_merge['in_feature'].value_counts()

True    209
Name: in_feature, dtype: int64

In [95]:
print("Total Number of Unique Features:")
print(len(idr_counts))
print("Total Number of Unique SNPs:")
all_unique_snps = adpd_stage3.sort_values(by='rsid')
all_unique_snps.drop_duplicates(subset='rsid', inplace=True)
print(len(all_unique_snps))
print("Total Number of Unique SNPs in Features:")
new_intersect_bed = snps_bed.intersect(counts_bed, u=True, wa=True)
new_intersect_df = pybedtools.BedTool.to_dataframe(new_intersect_bed, header=None)
new_intersect_df.columns = ['chr', 'new_start', 'new_end', 'rsid']
new_intersect_df.sort_values(by='rsid', inplace=True)
new_intersect_df.drop_duplicates(subset='rsid', inplace=True)
print(len(new_intersect_df))
print("Total Number of Unique Features with SNPs:")
uniq_features_with_snps = intersect_counts.sort_values(by=['chrom', 'start'])
uniq_features_with_snps.drop_duplicates(subset=['chrom', 'start'], inplace=True)
print(len(uniq_features_with_snps))
print('---')
print("Number of Tests:")
print(len(combined_output))
print("Number of Unique Feature-SNP Combos Tested:")
feature_snp_combos = combined_output.sort_values(by=['rsid', 'feature'])
feature_snp_combos.drop_duplicates(subset=['rsid', 'feature'], inplace=True)
print(len(feature_snp_combos))
print("Number of Unique SNPs Tested:")
unique_snps = combined_output.sort_values(by=['rsid'])
unique_snps.drop_duplicates(subset=['rsid'], inplace=True)
print(len(unique_snps))
print("Number of Unique Features Tested:")
unique_features = combined_output.sort_values(by=['feature'])
unique_features.drop_duplicates(subset=['feature'], inplace=True)
print(len(unique_features))
print('---')
print("Number of Significant Feature-SNP Combos:")
print(len(gwas_trim_sig_merge))
print("Number of Unique Significant Feature-SNP Combos:")
sig_feature_snp_combos = gwas_trim_sig_merge.sort_values(by=['rsid', 'feature'])
sig_feature_snp_combos.drop_duplicates(subset=['rsid', 'feature'], inplace=True)
print(len(sig_feature_snp_combos))
print("Number of Unique Significant SNPs:")
unique_sig_snps = gwas_trim_sig_merge.sort_values(by=['rsid'])
unique_sig_snps.drop_duplicates(subset=['rsid'], inplace=True)
print(len(unique_sig_snps))
print("Number of Unique Significant Features:")
unique_sig_features = gwas_trim_sig_merge.sort_values(by=['feature'])
unique_sig_features.drop_duplicates(subset=['feature'], inplace=True)
print(len(unique_sig_features))
print('---')
print("Number of Significant Feature-SNP Combos in ML intersect:")
print(len(orig_sig_merge))
print("Number of Unique Significant Feature-SNP Combos in ML intersect:")
sig_feature_snp_combos_ml = orig_sig_merge.sort_values(by=['rsid', 'feature'])
sig_feature_snp_combos_ml.drop_duplicates(subset=['rsid', 'feature'], inplace=True)
print(len(sig_feature_snp_combos_ml))
print("Number of Unique Significant SNPs in ML intersect:")
unique_sig_snps_ml = orig_sig_merge.sort_values(by=['rsid'])
unique_sig_snps_ml.drop_duplicates(subset=['rsid'], inplace=True)
print(len(unique_sig_snps_ml))
print("Number of Unique Significant Features in ML intersect:")
unique_sig_features_ml = orig_sig_merge.sort_values(by=['feature'])
unique_sig_features_ml.drop_duplicates(subset=['feature'], inplace=True)
print(len(unique_sig_features_ml))
print('---')
print('Number of SNPs from Ryan\'s List that were in features')
print(len([i for i in list(new_intersect_df['rsid']) if i in ryan_bias_snps]))
print('Number of SNPs from Ryan\'s List that were tested')
print(len(unique_snps.loc[unique_snps['in_original'] == True]))
print('Intersect with Ryan\'s List')
print(len(unique_gwas_trim_sig_merge.loc[unique_gwas_trim_sig_merge['in_original'] == True]))

Total Number of Unique Features:
385725
Total Number of Unique SNPs:
9707
Total Number of Unique SNPs in Features:
1072
Total Number of Unique Features with SNPs:
616
---
Number of Tests:
4026
Number of Unique Feature-SNP Combos Tested:
685
Number of Unique SNPs Tested:
685
Number of Unique Features Tested:
398
---
Number of Significant Feature-SNP Combos:
209
Number of Unique Significant Feature-SNP Combos:
116
Number of Unique Significant SNPs:
116
Number of Unique Significant Features:
55
---
Number of Significant Feature-SNP Combos in ML intersect:
40
Number of Unique Significant Feature-SNP Combos in ML intersect:
19
Number of Unique Significant SNPs in ML intersect:
19
Number of Unique Significant Features in ML intersect:
18
---
Number of SNPs from Ryan's List that were in features
26
Number of SNPs from Ryan's List that were tested
23
Intersect with Ryan's List
14
