Requirements:
LCL data Tewhey et al., Table S1. Combined LCL Analysis for All 39,478 Ref/Alt Pairs Tested by MPRA, Related to Figure 2 (https://www.cell.com/fulltext/S0092-8674(16)30421-4)

In [None]:
import pandas as pd
import re
from collections import Counter
import scipy.stats as stats
import numpy as np
import statistics as stat
from get_config_yaml import get_config

In [None]:
def get_start_end_coord(position_sig):
    """
    LCL SNPs are centered in 150 bp sequences
    Determine start (-75 bp) and end (+75) coordinate of regulatory elements MPRA
    """ 
    start_mpra = []
    end_mpra = []
    for i in position_sig:
        start_mpra.append(i - 75)
        end_mpra.append(i + 75)
    return start_mpra, end_mpra


In [None]:
config = get_config()

In [23]:

lcl_mpra_path = (config['lcl_mpra'])

In [24]:
lcl = pd.read_csv(lcl_mpra_path, sep=';')

In [25]:
lcl

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
1,rs2016366,rs2016366,pos,ref,316.596386,258.902025,-0.281146,1.799521,0.000000,345.506770,401.928486,0.195118,0.437249,0.000000,0.344054,0.696614,0.476264,,
2,rs2016366_alt,rs2016366,pos,alt,653.148636,605.357051,-0.104744,0.364373,0.000000,627.652912,774.785548,0.287498,1.736553,0.000000,0.390497,0.395150,0.392242,,
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
4,rs646867_RC,rs646867,neg,ref,605.412960,595.896429,-0.023234,0.213685,0.000000,978.751908,806.449311,-0.270002,2.243712,0.000000,-0.287121,-0.179513,-0.246768,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39473,rs9621715_alt,rs9621715,pos,alt,650.529931,507.281685,-0.342574,2.658698,0.000000,563.523072,419.823896,-0.401997,2.811763,0.000000,-0.089697,-0.008966,-0.059423,,
39474,rs4275_RC,rs4275,neg,ref,718.716545,859.461801,0.250261,2.474741,0.000000,576.804932,802.800230,0.455674,5.422339,0.825984,0.167059,0.269337,0.205413,,
39475,rs131816_RC,rs131816,neg,ref,754.841862,634.680055,-0.245811,1.350291,0.000000,954.736488,1064.020915,0.151281,1.036517,0.000000,0.473005,0.270571,0.397092,,
39476,rs131816_RC_alt,rs131816,neg,alt,827.703506,840.290995,0.018192,0.117288,0.000000,753.191408,756.170624,0.003935,0.110077,0.000000,-0.011147,-0.019440,-0.014257,,


In [26]:
lcl['SNP'].value_counts()

rs115855724    4
rs118026199    4
rs116983424    4
rs112595714    4
rs118159794    4
              ..
rs114015819    1
rs76308922     1
rs10262443     1
rs11761517     1
rs2076041      1
Name: SNP, Length: 29173, dtype: int64

In [27]:
snp_chr = [i for i in lcl['SNP'] if i.startswith('chr')]

In [28]:
lcl[lcl['SNP'].isin(snp_chr)]['SNP'].value_counts()

chr17:44104410:D    4
chr17:44276431:I    4
chr17:44149352:D    4
chr17:44354157:I    4
chr17:44037106:I    4
                   ..
chr7:141469761:I    1
chr7:66072054:D     1
chr6:2930007:D      1
chr6:30558477:I     1
chr10:35471310:I    1
Name: SNP, Length: 2018, dtype: int64

In [29]:
len(snp_chr)

2755

Build conversion from grch37 to  grch38

In [30]:
#Extract chromosome, snp position and add an end position of snps without rsid for build conversion to grch38
snp_chr = [i for i in list(lcl['SNP'].unique()) if i.startswith('chr')]
start_pos = [int(i[5:].replace(':', '').replace('I', '').replace('D','')) for i in snp_chr]
end_pos = [i + 1 for i in start_pos]
chr_pos = [i[:5].replace(':', '') for i in list(lcl['SNP'].unique()) if i.startswith('chr')]

In [None]:
#Create input file for ensmbl
with open("total_unique_grch37_positions_input_ensmbl.txt","w") as f:
    if len(start_pos) == len(end_pos) == len(chr_pos) == len(snp_chr):
        for (chr,start,end,snp) in zip(chr_pos,start_pos,end_pos, snp_chr):
            f.write("{0} \t {1} \t {2} \t {3}\n".format(chr,start,end, snp))

In [None]:
#Read in output file of ensbml
unique_total_grch38 = pd.read_csv('data/total_unique_lcl_variants_grch38.bed', sep='\t', header=None)


In [None]:
unique_total_grch38

Unnamed: 0,0,1,2,3
0,chr1,150801063,150801064,chr1:150773539:I
1,chr1,247886531,247886532,chr1:248049833:D
2,chr1,95235667,95235668,chr1:95701223:I
3,chr1,150852051,150852052,chr1:150824527:I
4,chr1,175005506,175005507,chr1:174974642:D
...,...,...,...,...
1997,chr22,46291962,46291963,chr22:46687859:D
1998,chr22,32407055,32407056,chr22:32803042:D
1999,chr22,23969398,23969399,chr22:24311587:D
2000,chr22,49917230,49917231,chr22:50310878:I


In [35]:
#check missing snps
if len(unique_total_grch38) == len(chr_pos):
    print('True')
else:
    print('False', len(chr_pos) - len(unique_total_grch38))


False 16


In [36]:
lcl_chr = lcl[lcl['SNP'].isin(snp_chr)]

could not be converted:

In [37]:
missing_lcl_chr = lcl_chr[~lcl_chr['SNP'].isin(unique_total_grch38[3])]
missing_lcl_chr['SNP'].drop_duplicates()

11509     chr7:72197046:D
12229     chr7:74367161:D
12628     chr7:72209527:D
12865     chr7:72214746:D
15374    chr10:51583018:D
21592    chr15:22908713:D
22234    chr15:23283055:D
22635    chr15:83212280:D
24254    chr17:35254926:D
24487    chr17:34587955:D
29079    chr17:36904739:D
29411    chr17:36438743:I
30161    chr17:35598119:D
31397    chr17:36446921:D
31410    chr17:34968395:I
33858    chr17:34905449:I
Name: SNP, dtype: object

In [40]:
missing_ids = list(missing_lcl_chr['SNP'].unique())

In [41]:
len(unique_total_grch38)

2002

In [None]:
#check snps with no rsid
no_rs_id = [i for i in list(lcl['SNP']) if i.startswith('chr')]
no_rs_id = lcl[lcl['SNP'].isin(no_rs_id)]

In [None]:
no_rs_id

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
21,chr1:150773539:I_RC,chr1:150773539:I,neg,ref,333.963651,335.456529,-0.004300,0.349020,0.000000,367.746453,378.166913,0.022874,0.248667,0.000000,-0.244704,0.480306,0.027174,,
24,chr1:248049833:D_RC,chr1:248049833:D,neg,ref,101.584307,80.760099,-0.256048,0.523068,0.000000,163.008293,120.656706,-0.361262,1.444777,0.000000,0.025269,-0.322687,-0.105214,,
28,chr1:95701223:I,chr1:95701223:I,pos,ref,317.950783,339.020238,0.085936,0.110712,0.000000,312.317269,337.687680,0.099660,0.102379,0.000000,0.006836,0.025204,0.013724,,
34,chr1:150824527:I_RC,chr1:150824527:I,neg,ref,665.673312,928.793886,0.467121,7.948176,3.351821,616.995944,852.686405,0.437081,5.977152,1.380797,-0.149947,0.169805,-0.030040,0.385947,0.256515
69,chr1:174974642:D_RC,chr1:174974642:D,neg,ref,531.038174,458.535352,-0.196354,0.786907,0.000000,668.270004,726.296483,0.109224,0.502498,0.000000,0.310294,0.297718,0.305578,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39384,chr22:24311587:D_RC,chr22:24311587:D,neg,ref,981.570908,887.813493,-0.140719,0.825951,0.000000,1466.670551,1208.088993,-0.273060,3.078556,0.000000,-0.182409,-0.048896,-0.132342,,
39402,chr22:50310878:I,chr22:50310878:I,pos,ref,993.455891,850.487619,-0.218312,1.109198,0.000000,1174.904671,1006.382120,-0.217826,2.178905,0.000000,0.086053,-0.142126,0.000486,,
39403,chr22:50310878:I_alt,chr22:50310878:I,pos,alt,1126.995341,1137.968150,0.010008,0.234265,0.000000,938.036839,1040.206850,0.144965,0.732187,0.000000,0.199215,0.027862,0.134957,,
39446,chr22:32796098:D_RC,chr22:32796098:D,neg,ref,1051.218463,1032.588025,-0.025729,0.052462,0.000000,817.815702,785.592943,-0.057540,0.220910,0.000000,-0.096893,0.076659,-0.031811,,


In [43]:
no_i_or_d = [i for i in list(no_rs_id['SNP'].unique()) if not i.endswith(':I') and not i.endswith(':D')]
no_i_or_d

['chr1:25780893',
 'chr2:130952625',
 'chr4:7064219',
 'chr6:32629802',
 'chr6:32546828',
 'chr6:32605274',
 'chr6:32627992',
 'chr6:32629889',
 'chr7:56088811',
 'chr7:98741441',
 'chr7:56087474',
 'chr15:43897499',
 'chr16:74445537',
 'chr16:70187270',
 'chr16:1306981',
 'chr16:70190401',
 'chr16:70164334',
 'chr17:45214631',
 'chr17:21319121',
 'chr22:45723947']

In [44]:
chr_id = lcl[lcl['SNP'].isin(snp_chr)]
final_chr_id = chr_id[~chr_id['SNP'].isin(missing_ids)]
set_chr_id = final_chr_id.drop_duplicates(subset=['SNP'], keep='last')

In [45]:
final_chr_id['SNP'].value_counts()

chr17:44276431:I    4
chr17:44001549:I    4
chr17:44149352:D    4
chr17:44354157:I    4
chr17:44037106:I    4
                   ..
chr6:74179373:D     1
chr6:30566241:D     1
chr6:19956679:D     1
chr6:32631301:I     1
chr10:7793788:I     1
Name: SNP, Length: 2002, dtype: int64

In [None]:
id_chr = list(final_chr_id['SNP'].unique())
#set(list(set_chr_id['SNP']))

Create input file for snps without rsid

In [None]:
snp = [i for i in list(final_chr_id['SNP'].unique()) if i.startswith('chr')]
start_pos = [int(i[5:].replace(':', '').replace('I', '').replace('D','')) for i in snp]
end_pos = [i + 1 for i in start_pos]
chr_pos = [i[:5].replace(':', '') for i in list(final_chr_id['SNP'].unique()) if i.startswith('chr')]
with open("data/input_ensmbl_total_lcl_variants_grch37.txt","w") as f:
    if len(start_pos) == len(end_pos) == len(chr_pos) == len(snp):
        for (chr,start,end, snp) in zip(chr_pos,start_pos,end_pos, snp):
            f.write("{0} \t {1} \t {2} \t {3}\n".format(chr,start,end, snp))

***

output_assembly_converter file missing for reading: check and compare if worked

In [46]:
output_assembly_converter = unique_total_grch38


In [47]:
output_assembly_converter =output_assembly_converter[[3, 0 ,1]]

In [48]:
output_assembly_converter =output_assembly_converter.rename(columns={3:'snp', 0:'snp_chromosome', 1:'snp_position'})

In [49]:
final_chr_id[final_chr_id['SNP'].isin(list(output_assembly_converter['snp']))]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
21,chr1:150773539:I_RC,chr1:150773539:I,neg,ref,333.963651,335.456529,-0.004300,0.349020,0.000000,367.746453,378.166913,0.022874,0.248667,0.000000,-0.244704,0.480306,0.027174,,
24,chr1:248049833:D_RC,chr1:248049833:D,neg,ref,101.584307,80.760099,-0.256048,0.523068,0.000000,163.008293,120.656706,-0.361262,1.444777,0.000000,0.025269,-0.322687,-0.105214,,
28,chr1:95701223:I,chr1:95701223:I,pos,ref,317.950783,339.020238,0.085936,0.110712,0.000000,312.317269,337.687680,0.099660,0.102379,0.000000,0.006836,0.025204,0.013724,,
34,chr1:150824527:I_RC,chr1:150824527:I,neg,ref,665.673312,928.793886,0.467121,7.948176,3.351821,616.995944,852.686405,0.437081,5.977152,1.380797,-0.149947,0.169805,-0.030040,0.385947,0.256515
69,chr1:174974642:D_RC,chr1:174974642:D,neg,ref,531.038174,458.535352,-0.196354,0.786907,0.000000,668.270004,726.296483,0.109224,0.502498,0.000000,0.310294,0.297718,0.305578,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39384,chr22:24311587:D_RC,chr22:24311587:D,neg,ref,981.570908,887.813493,-0.140719,0.825951,0.000000,1466.670551,1208.088993,-0.273060,3.078556,0.000000,-0.182409,-0.048896,-0.132342,,
39402,chr22:50310878:I,chr22:50310878:I,pos,ref,993.455891,850.487619,-0.218312,1.109198,0.000000,1174.904671,1006.382120,-0.217826,2.178905,0.000000,0.086053,-0.142126,0.000486,,
39403,chr22:50310878:I_alt,chr22:50310878:I,pos,alt,1126.995341,1137.968150,0.010008,0.234265,0.000000,938.036839,1040.206850,0.144965,0.732187,0.000000,0.199215,0.027862,0.134957,,
39446,chr22:32796098:D_RC,chr22:32796098:D,neg,ref,1051.218463,1032.588025,-0.025729,0.052462,0.000000,817.815702,785.592943,-0.057540,0.220910,0.000000,-0.096893,0.076659,-0.031811,,


In [50]:
list(final_chr_id['SNP'].unique()) == list(output_assembly_converter['snp'])

True

In [None]:
len(output_assembly_converter)

2002

In [51]:
output_assembly_converter

Unnamed: 0,snp,snp_chromosome,snp_position
0,chr1:150773539:I,chr1,150801063
1,chr1:248049833:D,chr1,247886531
2,chr1:95701223:I,chr1,95235667
3,chr1:150824527:I,chr1,150852051
4,chr1:174974642:D,chr1,175005506
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
len(final_chr_id['SNP'].value_counts()) == len(output_assembly_converter)

True

***

### Format and combine files from biomart query and assembly converter to gain needed snp, chromosome and snp position information

In [None]:
total_biomart_syn_variants = pd.read_csv('data/biomart_total.txt')

In [None]:
total_biomart_syn_variants

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Synonym name
0,rs8768,dbSNP,1,26170849,26170849,rs79266459
1,rs66517664,dbSNP,1,2768235,2768235,rs76055539
2,rs66517664,dbSNP,HSCHR1_1_CTG3,2780219,2780219,rs76055539
3,rs259338,dbSNP,1,95272751,95272751,rs78140099
4,rs500513,dbSNP,1,234474793,234474793,rs76203336
...,...,...,...,...,...,...
13089,rs79990247,dbSNP,12,7924992,7924992,rs140493080
13090,rs77718176,dbSNP,7,100217868,100217868,rs113859809
13091,rs113177067,dbSNP,12,9965410,9965410,rs150347472
13092,rs112600168,dbSNP,9,31326631,31326631,rs141183894


In [None]:
missing_rsid_mart_export = pd.read_csv("mart_export (5).txt", sep='\t')

In [None]:
missing_rsid_mart_export =missing_rsid_mart_export.drop_duplicates(subset=['Variant name'], keep='first')
missing_rsid_mart_export

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Variant alleles,Minor allele (ALL),Synonym name
0,rs10413306,dbSNP,19,52782474,52782474,C/G,,
1,rs11279206,dbSNP,22,43120043,43120055,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG,,rs35315665
4,rs11673357,dbSNP,19,53197554,53197554,T/A/C,,NM_032559.5:c.1104G>A
13,rs35871241,dbSNP,7,72726370,72726370,G/C/T,,
14,rs3840965,dbSNP,22,50578781,50578782,CT/CTTCT,,rs140535917
15,rs59698086,dbSNP,22,21002604,21002609,AGACAG/AG,,rs148931161
16,rs61737955,dbSNP,19,54632756,54632756,C/A/G/T,,rs79640454


In [None]:
missing_rsid_mart_export = missing_rsid_mart_export[['Chromosome/scaffold position start (bp)', 'Variant alleles']]
missing_rsid_mart_export.columns = ['snp_position', 'variant_allele']
missing_rsid_mart_export

Unnamed: 0,snp_position,variant_allele
0,52782474,C/G
1,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
4,53197554,T/A/C
13,72726370,G/C/T
14,50578781,CT/CTTCT
15,21002604,AGACAG/AG
16,54632756,C/A/G/T


In [None]:
total_variants = pd.read_csv('data/martquery_1117101613_627.txt')

In [None]:
def change_column_names(dataframe):
    dataframe.columns = [i.lower().replace(' ', '_') for i in dataframe.columns]

In [None]:
change_column_names(total_biomart_syn_variants)
change_column_names(total_variants)

In [None]:
print('amount of unique variants found based on synonyms:',len(total_biomart_syn_variants['synonym_name'].value_counts()))

amount of unique variants found based on synonyms: 3497


In [None]:
chr_name = [i for i in total_biomart_syn_variants['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
syn_total_variants_unique = total_biomart_syn_variants[total_biomart_syn_variants['chromosome/scaffold_name'].isin(chr_name)]
syn_total_variants_unique = syn_total_variants_unique.drop_duplicates('synonym_name', keep='first')
syn_total_variants_unique

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs8768,dbSNP,1,26170849,26170849,rs79266459
1,rs66517664,dbSNP,1,2768235,2768235,rs76055539
3,rs259338,dbSNP,1,95272751,95272751,rs78140099
4,rs500513,dbSNP,1,234474793,234474793,rs76203336
5,rs873309,dbSNP,1,25431928,25431928,rs76583011
...,...,...,...,...,...,...
13089,rs79990247,dbSNP,12,7924992,7924992,rs140493080
13090,rs77718176,dbSNP,7,100217868,100217868,rs113859809
13091,rs113177067,dbSNP,12,9965410,9965410,rs150347472
13092,rs112600168,dbSNP,9,31326631,31326631,rs141183894


In [None]:
syn_total_variants_unique = syn_total_variants_unique.rename(columns={'synonym_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
syn_total_variants_unique = syn_total_variants_unique.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'variant_name'])
syn_total_variants_unique = syn_total_variants_unique[['snp', 'snp_chromosome', 'snp_position']]
#syn_total_variants_unique = syn_total_variants_unique.astype({'snp_chromosome': 'int64'})

In [None]:
syn_total_variants_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3495 entries, 0 to 13093
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             3495 non-null   object
 1   snp_chromosome  3495 non-null   object
 2   snp_position    3495 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 109.2+ KB


In [None]:
total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
2,rs114530232,dbSNP,1,42958380,42958380,VCV000668836
3,rs114530232,dbSNP,1,42958380,42958380,RCV000827794
4,rs114531441,dbSNP,1,37548222,37548222,
...,...,...,...,...,...,...
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426
53198,rs112262084,dbSNP,16,28884350,28884350,RCV000838015


In [None]:
chr_name = [i for i in total_variants['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_total_variants = total_variants[total_variants['chromosome/scaffold_name'].isin(chr_name)]
unique_total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
2,rs114530232,dbSNP,1,42958380,42958380,VCV000668836
3,rs114530232,dbSNP,1,42958380,42958380,RCV000827794
4,rs114531441,dbSNP,1,37548222,37548222,
...,...,...,...,...,...,...
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426
53198,rs112262084,dbSNP,16,28884350,28884350,RCV000838015


In [None]:
unique_total_variants = unique_total_variants.drop_duplicates('variant_name', keep='first')
unique_total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
4,rs114531441,dbSNP,1,37548222,37548222,
5,rs11810220,dbSNP,1,163311300,163311300,
6,rs11811181,dbSNP,1,206551409,206551409,rs58730705
...,...,...,...,...,...,...
53194,rs73484568,dbSNP,9,33132494,33132494,rs73645258
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426


In [None]:
#Select columns to keep
unique_total_variants = unique_total_variants.rename(columns={'variant_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_total_variants = unique_total_variants.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'synonym_name'])
unique_total_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,1,227334540
1,rs114530232,1,42958380
4,rs114531441,1,37548222
5,rs11810220,1,163311300
6,rs11811181,1,206551409
...,...,...,...
53194,rs73484568,9,33132494
53195,rs60115620,12,6456710
53196,rs57161853,7,72747687
53197,rs112262084,16,28884350


In [None]:
output_assembly_converter

Unnamed: 0,snp,snp_chromosome,snp_position
0,chr1:150773539:I,chr1,150801063
1,chr1:248049833:D,chr1,247886531
2,chr1:95701223:I,chr1,95235667
3,chr1:150824527:I,chr1,150852051
4,chr1:174974642:D,chr1,175005506
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
unique_total_variants['snp_chromosome'] = 'chr' + unique_total_variants['snp_chromosome']
syn_total_variants_unique['snp_chromosome'] = 'chr' + syn_total_variants_unique['snp_chromosome']

In [None]:
len(syn_total_variants_unique) +len(unique_total_variants)

27119

In [None]:
unique_total_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
53194,rs73484568,chr9,33132494
53195,rs60115620,chr12,6456710
53196,rs57161853,chr7,72747687
53197,rs112262084,chr16,28884350


In [None]:
#Concat all dataframs with variants together
final_lcl_positions = pd.concat([unique_total_variants, syn_total_variants_unique, output_assembly_converter])
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
print(len(lcl['SNP'].value_counts()), 'variants in total at beginning')

29173 variants in total at beginning


In [None]:
print(len(unique_total_variants) 
      + len(syn_total_variants_unique) 
      + len(output_assembly_converter), 'variants converted')

29121 variants where position was found for


In [None]:
variants = pd.concat([unique_total_variants, syn_total_variants_unique])

In [None]:
variants_list = list(variants['snp'])

In [None]:
with open("data/bed_files/lcl_snp_list.txt","w") as f:
    for snp in variants_list:
        f.write("{0} \n".format(snp))

***
#### Create MPRA coordinates for LCL B cell SNPS to compare with single-cell eQTLs with bedtools intersect

In [57]:
all_positions = list(final_lcl_positions['snp_position'])
start_mpra, end_mpra = get_start_end_coord(all_positions)
final_lcl_positions['start_coord'] = start_mpra
final_lcl_positions['end_coord'] = end_mpra
final_lcl_positions = final_lcl_positions.rename(columns={'snp_chromosome': 'chromosome'})

save SNPs with rsid and coordinates to file

In [None]:
rs_ids_final = final_lcl_positions.iloc[:27119]

In [None]:
rs_ids_final['snp'].to_csv('data/assembly/rs_ids_lcl.txt', header=None, index=False)

save snps without rsid and coordiantes to file

In [None]:
chr_ids = final_lcl_positions.iloc[27119:]

In [None]:
chr_ids

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,chr1:150773539:I,chr1,150801063,150800988,150801138
1,chr1:248049833:D,chr1,247886531,247886456,247886606
2,chr1:95701223:I,chr1,95235667,95235592,95235742
3,chr1:150824527:I,chr1,150852051,150851976,150852126
4,chr1:174974642:D,chr1,175005506,175005431,175005581
...,...,...,...,...,...
1997,chr22:46687859:D,chr22,46291962,46291887,46292037
1998,chr22:32803042:D,chr22,32407055,32406980,32407130
1999,chr22:24311587:D,chr22,23969398,23969323,23969473
2000,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
chr_ids_regions = chr_ids[['chromosome', 'start_coord', 'end_coord']]

In [None]:
chr_ids_regions.to_csv('data/assembly/chr_id_regions.txt', sep='\t', header=None, index=False)

Save information of all converted LCL SNPs to csv and excel

In [None]:
final_lcl_positions.to_csv('positions_all_lcl_variants.csv', index=False)
final_lcl_positions.to_excel('positions_all__lcl_variants.xlsx', index=False)

***
### Build converted LCL MPRA coordinates

In [77]:
final_lcl_positions_path = (config['lcl_mpra_positions'])

In [2]:
final_lcl_positions =pd.read_csv('positions_all_lcl_variants.csv', sep=',')

In [None]:
final_lcl_positions

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
3,rs11810220,chr1,163311300,163311225,163311375
4,rs11811181,chr1,206551409,206551334,206551484
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


Bed format coordinates of all lcl variants

In [3]:
final_lcl_positions = final_lcl_positions.rename(columns={'chromosome': 'snp_chromosome'})

In [60]:
bed_total_lcl_variants_coords = final_lcl_positions[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]
bed_total_lcl_variants_coords.to_csv('data/total_lcl_variants_coords.txt', sep='\t', header=None, index=False)

***

### Determine significant LCL MPRA regions

In [61]:
sig_variants = lcl.dropna()

In [62]:
sig_variants

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
6,rs112338151,rs112338151,pos,ref,311.022339,938.600426,1.454065,35.570382,30.974027,441.094084,1188.919426,1.333956,46.863102,42.266747,-0.149852,-0.070538,-0.120109,1.108686,0.743469
14,rs10910099_RC,rs10910099,neg,ref,1565.369298,2410.813917,0.609195,18.726474,14.130119,1597.275307,1901.533139,0.243351,4.277202,0.000000,-0.319983,-0.442277,-0.365843,2.609976,1.744575
17,rs61731104,rs61731104,pos,ref,787.310108,1106.543203,0.472118,7.158719,2.562364,851.116339,948.197074,0.137500,2.071566,0.000000,-0.426533,-0.181428,-0.334618,1.448329,0.973007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39435,rs6002380_RC,rs6002380,neg,ref,1021.939757,1393.401924,0.436551,7.957672,3.361317,1410.403292,1772.580532,0.318500,3.737057,0.000000,-0.072219,-0.194436,-0.118050,0.225938,0.152430
39454,rs12165508_RC,rs12165508,neg,ref,905.677775,1186.698469,0.370023,10.844473,6.248118,602.818906,793.453679,0.368042,5.622711,1.026356,-0.004909,0.002897,-0.001982,0.082316,0.061067
39461,rs73439311_RC,rs73439311,neg,ref,225.047471,518.727303,1.021873,15.889120,11.292765,264.268231,367.242135,0.333214,5.915880,1.319524,-0.768413,-0.555736,-0.688659,2.142339,1.440353
39467,rs2234058,rs2234058,pos,ref,673.043758,1373.481543,0.938229,34.201737,29.605382,587.687732,1299.793269,1.040252,37.364005,32.767650,0.089103,0.123557,0.102023,0.925545,0.622003


In [None]:
chr_variants = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]

build 37 to 38 for snps without a rsid

In [63]:
# Input assembly converter ensmbl: chromosome, start position, stop position
# Add an stop position just in case the position in removed in the new build
snp_sig = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
start_pos_sig = [int(i[5:].replace(':', '').replace('I','').replace('D', '')) for i in snp_sig]
end_pos_sig = [i + 1 for i in start_pos_sig]
chr_pos_sig = [i[:5].replace(':', '') for i in snp_sig if i.startswith('chr')]

In [None]:
# Create a txt file in bed format for input in assembly converter of ensmbl
with open("grch37sig_input_assembly_converter.txt","w") as f:
    for (chr_pos_sig,start_pos_sig,end_pos_sig, snp_sig) in zip(chr_pos_sig,start_pos_sig,end_pos_sig,snp_sig):
        f.write("{0} \t {1} \t {2} \t {3}\n".format(chr_pos_sig,start_pos_sig,end_pos_sig,snp_sig))

In [None]:
#Load complete file converted positions to build 38
grch38 = pd.read_csv('data/unique_lcl_sig_chr_coord.bed', sep='\t', header=None, names=['snp_chromosome', 'snp_position', 'end_coord', 'snp'])

In [None]:
grch38

Unnamed: 0,snp_chromosome,snp_position,end_coord,snp
0,chr1,150852051,150852052,chr1:150824527:I
1,chr1,205784748,205784749,chr1:205753876:D
2,chr1,172024714,172024715,chr1:171993854:D
3,chr1,41034295,41034296,chr1:41499967:I
4,chr1,22025902,22025903,chr1:22352395:D
...,...,...,...,...
208,chr20,25549209,25549210,chr20:25529845:D
209,chr21,36973064,36973065,chr21:38345364:I
210,chr21,28955410,28955411,chr21:30327732:D
211,chr22,49917233,49917234,chr22:50310881:D


In [65]:
#Missing variants build lift over
sig_variants_chr = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
sig_variants_chr = sig_variants[sig_variants['SNP'].isin(sig_variants_chr)]
sig_variants_chr[~sig_variants_chr['SNP'].isin(grch38['snp'])]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
15374,chr10:51583018:D_RC,chr10:51583018:D,neg,ref,182.744215,453.229658,1.186026,16.662832,12.066477,256.76964,320.133604,0.275901,0.678229,0.0,-0.95012,-0.843468,-0.910125,1.952377,1.313647
29079,chr17:36904739:D,chr17:36904739:D,pos,ref,1594.207418,2515.344218,0.615043,22.826099,18.229744,1589.62954,3432.485335,1.049011,54.936248,50.339893,0.409717,0.474387,0.433968,4.923863,3.157711
29080,chr17:36904739:D_RC,chr17:36904739:D,neg,ref,1546.975363,3675.040906,1.170987,69.106165,64.50981,957.421529,2705.992119,1.388305,93.211848,88.615492,0.152803,0.324841,0.217317,1.512971,1.016804
29411,chr17:36438743:I_RC,chr17:36438743:I,neg,ref,1110.254661,1702.627803,0.598501,10.226223,5.629867,888.939888,2324.063225,1.347996,60.284617,55.688262,0.788642,0.68425,0.749495,4.144482,2.724976


In [None]:
#Check if the file has the same length as the dataframe
if len(grch38) == len(chr_pos_sig):
    print('True')
else:
    print('False')
    print('missing', len(chr_variants)- len(grch38))

In [None]:
#Check in wich position coordinates are missingA
#Counter(chr_pos_sig)

In [None]:
#Check in wich position coordinates are missingA
#Counter(list(grch38['snp_chromosome']))

Positions that are missing in output from assembly converter
- chr17 	 36904739 	 36904740 - chr17:36904739:D
- chr17 	 36438743 	 36438744 - chr17:36438743:I
- chr10 	 51583018 	 51583019 - chr10:51583018:D

In [None]:
sig_variants

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
6,rs112338151,rs112338151,pos,ref,311.022339,938.600426,1.454065,35.570382,30.974027,441.094084,1188.919426,1.333956,46.863102,42.266747,-0.149852,-0.070538,-0.120109,1.108686,0.743469
14,rs10910099_RC,rs10910099,neg,ref,1565.369298,2410.813917,0.609195,18.726474,14.130119,1597.275307,1901.533139,0.243351,4.277202,0.000000,-0.319983,-0.442277,-0.365843,2.609976,1.744575
17,rs61731104,rs61731104,pos,ref,787.310108,1106.543203,0.472118,7.158719,2.562364,851.116339,948.197074,0.137500,2.071566,0.000000,-0.426533,-0.181428,-0.334618,1.448329,0.973007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39435,rs6002380_RC,rs6002380,neg,ref,1021.939757,1393.401924,0.436551,7.957672,3.361317,1410.403292,1772.580532,0.318500,3.737057,0.000000,-0.072219,-0.194436,-0.118050,0.225938,0.152430
39454,rs12165508_RC,rs12165508,neg,ref,905.677775,1186.698469,0.370023,10.844473,6.248118,602.818906,793.453679,0.368042,5.622711,1.026356,-0.004909,0.002897,-0.001982,0.082316,0.061067
39461,rs73439311_RC,rs73439311,neg,ref,225.047471,518.727303,1.021873,15.889120,11.292765,264.268231,367.242135,0.333214,5.915880,1.319524,-0.768413,-0.555736,-0.688659,2.142339,1.440353
39467,rs2234058,rs2234058,pos,ref,673.043758,1373.481543,0.938229,34.201737,29.605382,587.687732,1299.793269,1.040252,37.364005,32.767650,0.089103,0.123557,0.102023,0.925545,0.622003


In [None]:
sig_pos_not_found = ['chr17:36904739:D', 'chr17:36438743:I', 'chr10:51583018:D']
#select rows that do not contain above id's
sig_variants = sig_variants[~sig_variants['SNP'].isin(sig_pos_not_found)]
# id's to a list
sig_chr_variants =[i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
# # create new coordinates
# start_pos_sig_new = [int(i[5:].replace(':', '').replace('I','').replace('D', '')) for i in list(sig_variants['SNP'].unique()) if i.startswith('chr')]
# end_pos_sig_new = [i + 1 for i in start_pos_sig_new]
# chr_pos_sig_new = [i[:5].replace(':', '') for i in list(sig_variants['SNP'].unique()) if i.startswith('chr')]

In [None]:
#Check if the file has the same length as the dataframe
if len(grch38) == len(sig_chr_variants):
    print('True')
else:
    print('False')
    print('missing', len(chr_variants)- len(grch38))

True


In [None]:
grch38 = grch38[['snp', 'snp_chromosome', 'snp_position']]

Load biomart export/output files

In [67]:
total_synonym = pd.read_csv('mart_export_total_synonym.txt')
total_variant_name_biomart = pd.read_csv('mart_export_total_variantname.txt')

In [68]:
change_column_names(total_synonym)
change_column_names(total_variant_name_biomart)

In [69]:
len(total_variant_name_biomart['variant_name'].value_counts()) + len(total_synonym['synonym_name'].value_counts())

3374

In [70]:
chr_name = [i for i in total_synonym['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_syn = total_synonym[total_synonym['chromosome/scaffold_name'].isin(chr_name)]
unique_syn

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs2356416,dbSNP,1,45567593,45567593,rs74785550
1,rs7534581,dbSNP,1,1659114,1659114,rs9661285
2,rs112868731,dbSNP,3,41841863,41841863,rs144523572
3,rs111248130,dbSNP,4,55242231,55242231,rs145051830
4,rs55993837,dbSNP,3,41850539,41850539,rs140970237
...,...,...,...,...,...,...
1927,rs2885047,dbSNP,9,470259,470259,rs113679677
1928,rs4301823,dbSNP,12,57792811,57792811,rs56261123
1929,rs13221668,dbSNP,7,73763368,73763368,rs74539570
1930,rs4000157,dbSNP,7,32730090,32730090,rs145127609


In [71]:
unique_syn = unique_syn.rename(columns={'synonym_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_syn = unique_syn.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'variant_name'])
unique_syn[['snp', 'snp_chromosome', 'snp_position']]

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs74785550,1,45567593
1,rs9661285,1,1659114
2,rs144523572,3,41841863
3,rs145051830,4,55242231
4,rs140970237,3,41850539
...,...,...,...
1927,rs113679677,9,470259
1928,rs56261123,12,57792811
1929,rs74539570,7,73763368
1930,rs145127609,7,32730090


In [72]:
total_chr_name = [i for i in total_variant_name_biomart['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_chr_total = total_variant_name_biomart[total_variant_name_biomart['chromosome/scaffold_name'].isin(total_chr_name)]
unique_chr_total

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11810220,dbSNP,1,163311300,163311300,
1,rs11585048,dbSNP,1,2602648,2602648,rs59642996
3,rs11585844,dbSNP,1,37563668,37563668,
4,rs11587500,dbSNP,1,24190390,24190390,rs17184644
5,rs11587500,dbSNP,1,24190390,24190390,rs59459702
...,...,...,...,...,...,...
7902,rs111980103,dbSNP,16,970874,970874,RCV001667153
7903,rs111980103,dbSNP,16,970874,970874,RCV002421238
7904,rs59522292,dbSNP,12,124914185,124914185,
7905,rs56812038,dbSNP,7,32681035,32681035,


In [73]:
unique_chr_total = unique_chr_total.drop_duplicates(['variant_name'], keep='last')

In [74]:
unique_chr_total = unique_chr_total.rename(columns={'variant_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_chr_total = unique_chr_total.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'synonym_name'])

In [75]:
unique_chr_total['snp_chromosome'] = 'chr' + unique_chr_total['snp_chromosome']

In [76]:
unique_syn = unique_syn[['snp', 'snp_chromosome', 'snp_position']]

In [None]:
#unique_syn['snp_chromosome'] = 'chr' + unique_syn['snp_chromosome']

In [None]:
grch38

Unnamed: 0,snp_chromosome,snp_position,end_coord,snp
0,chr1,150852051,150852052,chr1:150824527:I
1,chr1,205784748,205784749,chr1:205753876:D
2,chr1,172024714,172024715,chr1:171993854:D
3,chr1,41034295,41034296,chr1:41499967:I
4,chr1,22025902,22025903,chr1:22352395:D
...,...,...,...,...
208,chr20,25549209,25549210,chr20:25529845:D
209,chr21,36973064,36973065,chr21:38345364:I
210,chr21,28955410,28955411,chr21:30327732:D
211,chr22,49917233,49917234,chr22:50310881:D


In [77]:
unique_syn['snp_chromosome'] = 'chr' +unique_syn['snp_chromosome']
unique_syn

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs74785550,chr1,45567593
1,rs9661285,chr1,1659114
2,rs144523572,chr3,41841863
3,rs145051830,chr4,55242231
4,rs140970237,chr3,41850539
...,...,...,...
1927,rs113679677,chr9,470259
1928,rs56261123,chr12,57792811
1929,rs74539570,chr7,73763368
1930,rs145127609,chr7,32730090


In [None]:
unique_chr_total

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11810220,chr1,163311300
1,rs11585048,chr1,2602648
3,rs11585844,chr1,37563668
5,rs11587500,chr1,24190390
6,rs11588318,chr1,200669534
...,...,...,...
7892,rs4553633,chr16,2660393
7903,rs111980103,chr16,970874
7904,rs59522292,chr12,124914185
7905,rs56812038,chr7,32681035


In [79]:
positions_sig_variants = pd.concat([unique_chr_total, unique_syn, grch38])

In [None]:
positions_sig_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11810220,chr1,163311300
1,rs11585048,chr1,2602648
3,rs11585844,chr1,37563668
5,rs11587500,chr1,24190390
6,rs11588318,chr1,200669534
...,...,...,...
208,chr20:25529845:D,chr20,25549209
209,chr21:38345364:I,chr21,36973064
210,chr21:30327732:D,chr21,28955410
211,chr22:50310881:D,chr22,49917233


In [None]:
positions_sig_variants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3585 entries, 0 to 212
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             3585 non-null   object
 1   snp_chromosome  3585 non-null   object
 2   snp_position    3585 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 112.0+ KB


With the variant of interest centered within 150 bp of genomic sequence.

In [81]:
position_sig = list(positions_sig_variants['snp_position'])

In [82]:
start_mpra, end_mpra = get_start_end_coord(position_sig)

In [83]:
positions_sig_variants['start_coord'] = start_mpra
positions_sig_variants['end_coord'] = end_mpra

In [84]:
positions_sig_variants = positions_sig_variants.rename(columns={'snp_chromosome':'chromosome'})

Save MPRA coordinate information of signifciant LCL regions + variants

In [None]:
positions_sig_variants.to_csv('positions_sig_lcl_variants.csv', index=False)
positions_sig_variants.to_excel('positions_sig_lcl_variants.xlsx', index=False)

In [None]:
#ositions_sig_variants = pd.read_csv('positions_sig_lcl_variants.csv')

In [5]:
positions_sig_variants

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11810220,chr1,163311300,163311225,163311375
1,rs11585048,chr1,2602648,2602573,2602723
2,rs11585844,chr1,37563668,37563593,37563743
3,rs11587500,chr1,24190390,24190315,24190465
4,rs11588318,chr1,200669534,200669459,200669609
...,...,...,...,...,...
3580,chr20:25529845:D,chr20,25549209,25549134,25549284
3581,chr21:38345364:I,chr21,36973064,36972989,36973139
3582,chr21:30327732:D,chr21,28955410,28955335,28955485
3583,chr22:50310881:D,chr22,49917233,49917158,49917308


In [6]:
positions_sig_variants = positions_sig_variants.rename(columns={'chromosome': 'snp_chromosome'})

Bed format txt file significant lcl variants

In [88]:
bed_format_sig_variants = positions_sig_variants[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [None]:
bed_format_sig_variants.to_csv('data/lcl_significant_variants_coords.txt', sep='\t', header=None, index=False)

In [89]:
bed_format_sig_variants.dtypes

snp_chromosome    object
start_coord        int64
end_coord          int64
snp               object
dtype: object

***
### lcl_mpra_position_overlap.txt

In [None]:
with open('lcl_mpra_position_overlap.txt', 'r') as file:
    overlap_positions = [line.strip() for line in file]

In [None]:
overlap_positions =[int(i) for i in overlap_positions]

In [None]:
final_lcl_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29121 entries, 0 to 29120
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             29121 non-null  object
 1   snp_chromosome  29121 non-null  object
 2   snp_position    29121 non-null  int64 
 3   start_coord     29121 non-null  int64 
 4   end_coord       29121 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.1+ MB


In [None]:
snp_overlap = final_lcl_positions[final_lcl_positions['snp_position'].isin(overlap_positions)]

In [None]:
len(snp_overlap)

42

In [None]:
id_overlap = list(snp_overlap['snp'].unique())

In [None]:
lcl[lcl['SNP'].isin(id_overlap)]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
655,rs74045976_RC,rs74045976,neg,ref,1306.076342,983.851139,-0.400317,8.330693,3.734337,1291.250043,1131.056900,-0.190530,3.094454,0.000000,0.251751,0.139849,0.209788,2.237498,1.511666
2987,rs113612815_RC,rs113612815,neg,ref,800.811823,882.603007,0.135318,0.475015,0.000000,634.185218,891.976620,0.478592,12.801986,8.205631,0.426001,0.205397,0.343275,1.188536,0.798784
3278,rs73153267_RC,rs73153267,neg,ref,561.648645,1675.101454,1.322058,26.541084,21.944729,582.374586,1965.788116,1.465996,29.878681,25.282326,0.138312,0.153316,0.143938,1.061545,0.713922
5759,rs6766641_RC,rs6766641,neg,ref,926.149452,1689.439863,0.843489,34.232160,29.635805,1105.476659,2033.612235,0.850945,31.809001,27.212646,0.030574,-0.031075,0.007456,0.113630,0.079122
8403,rs35188965_RC,rs35188965,neg,ref,1559.033700,3084.504088,0.873597,40.807681,36.211326,1947.007675,2445.217021,0.298748,6.243791,1.647436,-0.393967,-0.876318,-0.574849,5.300203,3.353410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38823,rs7284713_RC,rs7284713,neg,ref,1111.082823,1537.327584,0.456201,6.626448,2.030093,1350.609225,2099.631551,0.621997,14.834651,10.238296,0.126262,0.231685,0.165796,1.886733,1.271346
38824,rs7284713_RC_alt,rs7284713,neg,alt,1347.691448,1929.248091,0.506624,11.097647,6.501292,885.809304,1278.147327,0.513627,8.404860,3.808504,0.006982,0.007037,0.007003,0.006943,0.005538
39125,rs7293064_RC,rs7293064,neg,ref,1310.697897,3027.322186,1.176897,58.828213,54.231858,1382.291887,2557.973698,0.866727,27.741980,23.145625,-0.398131,-0.163569,-0.310170,2.046625,1.372637
39126,rs7293064_RC_alt,rs7293064,neg,alt,1199.390610,2625.811458,1.099562,41.503023,36.906668,1176.475150,2072.832588,0.794935,20.681986,16.085631,-0.348721,-0.231136,-0.304627,2.106100,1.414579


In [None]:
sig_overlap = sig_variants[sig_variants['SNP'].isin(id_overlap)]

In [None]:
sig_overlap.drop_duplicates('SNP').head()

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
655,rs74045976_RC,rs74045976,neg,ref,1306.076342,983.851139,-0.400317,8.330693,3.734337,1291.250043,1131.0569,-0.19053,3.094454,0.0,0.251751,0.139849,0.209788,2.237498,1.511666
2987,rs113612815_RC,rs113612815,neg,ref,800.811823,882.603007,0.135318,0.475015,0.0,634.185218,891.97662,0.478592,12.801986,8.205631,0.426001,0.205397,0.343275,1.188536,0.798784
3278,rs73153267_RC,rs73153267,neg,ref,561.648645,1675.101454,1.322058,26.541084,21.944729,582.374586,1965.788116,1.465996,29.878681,25.282326,0.138312,0.153316,0.143938,1.061545,0.713922
5759,rs6766641_RC,rs6766641,neg,ref,926.149452,1689.439863,0.843489,34.23216,29.635805,1105.476659,2033.612235,0.850945,31.809001,27.212646,0.030574,-0.031075,0.007456,0.11363,0.079122
8403,rs35188965_RC,rs35188965,neg,ref,1559.0337,3084.504088,0.873597,40.807681,36.211326,1947.007675,2445.217021,0.298748,6.243791,1.647436,-0.393967,-0.876318,-0.574849,5.300203,3.35341


***
create bedfile for checking intersect between lcl mpra and starr seq

In [None]:
final_lcl_positions

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
3,rs11810220,chr1,163311300,163311225,163311375
4,rs11811181,chr1,206551409,206551334,206551484
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
positions_sig_variants

Unnamed: 0,snp,snp_chromosome,snp_position,end_coord,start_coord
0,rs11810220,chr1,163311300,163311375,163311225
1,rs11585048,chr1,2602648,2602723,2602573
3,rs11585844,chr1,37563668,37563743,37563593
5,rs11587500,chr1,24190390,24190465,24190315
6,rs11588318,chr1,200669534,200669609,200669459
...,...,...,...,...,...
208,chr20:25529845:D,chr20,25549209,25549284,25549134
209,chr21:38345364:I,chr21,36973064,36973139,36972989
210,chr21:30327732:D,chr21,28955410,28955485,28955335
211,chr22:50310881:D,chr22,49917233,49917308,49917158


In [7]:
non_significant_lcl_positions = final_lcl_positions[~final_lcl_positions['snp'].isin(positions_sig_variants['snp'])]

In [8]:
non_significant_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
4,rs11811181,chr1,206551409,206551334,206551484
5,rs114569995,chr1,169828815,169828740,169828890
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
(len(final_lcl_positions) - len(positions_sig_variants)) == len(non_significant_lcl_positions)

True

In [9]:
positions_sig_variants = positions_sig_variants.astype({'snp_chromosome':'str'})
#positions_sig_variants['snp_chromosome'] = 'chr' + positions_sig_variants['snp_chromosome']
sig_lcl_coordinates = positions_sig_variants[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [None]:
duplicate_start_coord = list(final_lcl_positions[final_lcl_positions['start_coord'].duplicated()]['start_coord'])


In [None]:
sig_lcl_coordinates[sig_lcl_coordinates['start_coord'].isin(duplicate_start_coord)]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
5232,chr2,64917483,64917633,rs4671630


In [None]:
final_lcl_positions[(final_lcl_positions['start_coord'] == 45550611) | (final_lcl_positions['start_coord'] == 45854732)
                    | (final_lcl_positions['start_coord'] == 46021420)| (final_lcl_positions['start_coord'] == 46088639	)]

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
15489,rs78363416,chr10,45550686,45550611,45550761
24952,rs111961982,chr17,45854807,45854732,45854882
26054,rs117757814,chr17,46021495,46021420,46021570
26119,rs187481091,chr17,46088714,46088639,46088789
28103,chr10:46046134:D,chr10,45550686,45550611,45550761
28805,chr17:43932173:I,chr17,45854807,45854732,45854882
28841,chr17:44098861:D,chr17,46021495,46021420,46021570
28867,chr17:44166080:I,chr17,46088714,46088639,46088789


In [None]:
len(final_lcl_positions)

29121

In [None]:
final_lcl_positions[final_lcl_positions['start_coord'] == 35409905]

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
29052,chr20:33997783:D,chr20,35409980,35409905,35410055


In [None]:
overlap_sig_lcl_vs_total = final_lcl_positions[final_lcl_positions['start_coord'].isin(list(sig_lcl_coordinates['start_coord']))]
sig_lcl_coordinates[~sig_lcl_coordinates['start_coord'].isin(list(overlap_sig_lcl_vs_total['start_coord']))]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp


In [None]:
not_found = ['chr10:51583018:D', 'chr16:70190401', 'chr17:36438743:I', 'chr17:36904739:D', 'chr6:32546828', 'chr6:32627992', 'chr6:32629889' ]

In [None]:
sig_lcl_coordinates[sig_lcl_coordinates['snp'].isin(not_found)]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
56,chr6,32578976,32579126,chr6:32546828
60,chr6,32660140,32660290,chr6:32627992
65,chr6,32662037,32662187,chr6:32629889
132,chr16,70156423,70156573,chr16:70190401


In [None]:
lcl[lcl['SNP'].isin(not_found)]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
9211,chr6:32546828_RC,chr6:32546828,neg,ref,1751.475564,2126.262888,0.269681,8.242406,3.646051,962.859339,1318.048194,0.428452,5.508337,0.911982,0.169623,0.140685,0.158771,0.840362,0.563019
10004,chr6:32627992,chr6:32627992,pos,ref,929.803534,1035.868995,0.150967,0.908504,0.0,766.375543,749.135712,-0.031337,0.065648,0.0,-0.144168,-0.245862,-0.182303,,
10005,chr6:32627992_RC,chr6:32627992,neg,ref,488.609921,658.676346,0.405744,6.276884,1.680529,641.959949,748.224151,0.19468,1.38265,0.0,-0.22191,-0.192988,-0.211065,,
10006,chr6:32627992_alt,chr6:32627992,pos,alt,1200.106215,1189.044047,-0.013631,0.010621,0.0,1185.058096,1159.889391,-0.030602,0.058264,0.0,-0.032936,0.009636,-0.016972,,
10007,chr6:32627992_RC_alt,chr6:32627992,neg,alt,678.591264,899.653581,0.383805,3.394353,0.0,583.460405,948.598412,0.683989,23.760747,19.164392,0.291108,0.315311,0.300184,1.116858,0.747744
10483,chr6:32629889_RC,chr6:32629889,neg,ref,768.454246,813.127091,0.078635,0.158112,0.0,905.965463,950.395477,0.058967,0.32909,0.0,-0.133572,0.170173,-0.019668,,
10484,chr6:32629889_RC_alt,chr6:32629889,neg,alt,1111.677995,1519.630078,0.441915,10.166889,5.570534,858.686278,1200.966879,0.466767,9.820155,5.2238,0.137883,-0.163532,0.024852,0.345416,0.231954
15374,chr10:51583018:D_RC,chr10:51583018:D,neg,ref,182.744215,453.229658,1.186026,16.662832,12.066477,256.76964,320.133604,0.275901,0.678229,0.0,-0.95012,-0.843468,-0.910125,1.952377,1.313647
23634,chr16:70190401,chr16:70190401,pos,ref,647.729075,1106.308855,0.742134,22.68277,18.086415,641.985305,797.271487,0.294588,1.852254,0.0,-0.354863,-0.602018,-0.447546,2.368307,1.597605
23635,chr16:70190401_RC,chr16:70190401,neg,ref,1295.031274,1419.48431,0.125623,0.835485,0.0,1006.937677,1151.313527,0.183254,1.414055,0.0,0.049362,0.071414,0.057631,,


chr10	51252949	51253099	chr10:51583018:D
chr16	651829	651979	chr16:70190401
chr17	33692192	33692342	chr17:36438743:I
chr17	34158190	34158340	chr17:36904739:D
chr6	325393	325543	chr6:32546828
chr6	326204	326354	chr6:32627992
chr6	326223	326373	chr6:32629889

Final version LCL coordinates and sig LCL coordinates

In [None]:
sig_lcl_coordinates.to_csv('data/sig_lcl_coordinates.txt', header=None, index=False, sep='\t')

In [10]:
final_lcl_positions = final_lcl_positions.astype({'snp_chromosome':'str'})
#final_lcl_positions['snp_chromosome'] = 'chr' + final_lcl_positions['snp_chromosome']
lcl_coordinates = final_lcl_positions[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [17]:
lcl_coordinates

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
0,chr1,227334465,227334615,rs11809905
1,chr1,42958305,42958455,rs114530232
2,chr1,37548147,37548297,rs114531441
3,chr1,163311225,163311375,rs11810220
4,chr1,206551334,206551484,rs11811181
...,...,...,...,...
29116,chr22,46291887,46292037,chr22:46687859:D
29117,chr22,32406980,32407130,chr22:32803042:D
29118,chr22,23969323,23969473,chr22:24311587:D
29119,chr22,49917155,49917305,chr22:50310878:I


In [None]:
lcl_coordinates.to_csv('data/lcl_coordinates.txt', header=None, index=False, sep='\t')

***

In [9]:
b_qtl = pd.read_csv('eQTL\WMA_meta_B_qtl_results_fastApprox.txt', sep='\t')


In [10]:
b_qtl = b_qtl.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
b_qtl = b_qtl.astype({'snp_position': 'str'})

In [11]:
b_qtl['snp_chromosome'] = 'chr' + b_qtl['snp_chromosome']

In [12]:
b_qtl['chr_snp_pos'] = b_qtl['snp_chromosome'] + '_' +  b_qtl['snp_position']

In [98]:
b_qtl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,470835,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,1.219312e+01,211365.1,12.408954,chr12_56007301
1,1991637,SMDT1,22:42123461:A:AT,4.472720e-20,1.006116,0.252152,6.857645e-19,22,42079691,42084284,...,42123461,A,1,0.315789,1.000000,22:42123461:A:AT-SMDT1,9.176031e+00,211365.1,9.403753,chr22_42123461
2,1991535,SMDT1,22:42119191:GAGAT:G,4.763233e-20,1.078521,0.252800,6.857645e-19,22,42079691,42084284,...,42119191,GAGAT,1,0.342105,1.000000,22:42119191:GAGAT:G-SMDT1,9.169250e+00,211365.1,9.320564,chr22_42119191
3,1991538,SMDT1,22:42092156:C:A,4.933741e-20,1.073046,0.253754,6.857645e-19,22,42079691,42084284,...,42092156,C,1,0.342105,1.000000,22:42092156:C:A-SMDT1,9.165457e+00,211365.1,9.320889,chr22_42092156
4,1991555,SMDT1,22:42094636:CAAA:C,5.097736e-20,1.063314,0.251781,6.857645e-19,22,42079691,42084284,...,42094636,CAAA,1,0.342105,1.000000,22:42094636:CAAA:C-SMDT1,9.161930e+00,211365.1,9.318367,chr22_42094636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466240,1616942,CEP350,1:179674899:T:G,9.999275e-01,0.050972,0.117085,9.999998e-01,1,179954773,180114875,...,179674899,T,1,0.368421,1.000000,1:179674899:T:G-CEP350,-9.090477e-05,211365.1,0.082259,chr1_179674899
3466241,1616937,CEP350,1:179677732:C:T,9.999939e-01,0.051011,0.117116,9.999998e-01,1,179954773,180114875,...,179677732,C,1,0.368421,1.000000,1:179677732:C:T-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179677732
3466242,1616935,CEP350,1:179676283:G:T,9.999939e-01,0.051011,0.117116,9.999998e-01,1,179954773,180114875,...,179676283,G,1,0.368421,1.000000,1:179676283:G:T-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179676283
3466243,1616936,CEP350,1:179676284:G:A,9.999939e-01,0.051011,0.117116,9.999998e-01,1,179954773,180114875,...,179676284,G,1,0.368421,1.000000,1:179676284:G:A-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179676284


In [None]:
b_qtl['chr_snp_pos'].value_counts()

chr6_32685969    66
chr6_32900883    57
chr6_31457941    54
chr6_31326454    51
chr6_31223865    51
                 ..
chr8_70976554     1
chr8_70967802     1
chr8_70967929     1
chr8_70974691     1
chr12_4119931     1
Name: chr_snp_pos, Length: 1318229, dtype: int64

In [None]:
b_qtl[b_qtl['snp_id'] == '6:32446944:GA:G']

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
7025,3086312,HLA-DQA1,6:32446944:GA:G,5.5e-05,-0.556653,0.216203,0.000484,6,32628179,32647062,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-HLA-DQA1,-4.03342,211365.1,-3.789862,chr6_32446944
28814,3149613,HLA-DQB1,6:32446944:GA:G,0.005375,0.377058,0.266517,0.015222,6,32659467,32668383,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-HLA-DQB1,2.783655,211365.1,2.609782,chr6_32446944
63455,3074322,CLIC1,6:32446944:GA:G,0.002674,0.692691,0.217887,0.065953,6,31730581,31739763,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-CLIC1,3.002926,211365.1,2.72352,chr6_32446944
106378,3179780,PSMB8-AS1,6:32446944:GA:G,0.006802,0.283964,0.253415,0.134269,6,32844078,32846500,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-PSMB8-AS1,2.70638,211365.1,2.768136,chr6_32446944
290858,3130071,DDAH2,6:32446944:GA:G,0.052899,0.49458,0.300558,0.320963,6,31727038,31730617,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-DDAH2,1.935741,211365.1,1.966558,chr6_32446944
312000,3071080,CSNK2B,6:32446944:GA:G,0.035139,0.859171,0.20407,0.338213,6,31665227,31670343,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-CSNK2B,2.106755,211365.1,1.578399,chr6_32446944
318986,3255840,HLA-DRA,6:32446944:GA:G,0.099583,-0.081964,0.154794,0.344685,6,32439878,32445046,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-HLA-DRA,1.646877,211365.1,1.986274,chr6_32446944
375478,3077701,GPSM3,6:32446944:GA:G,0.028843,0.416428,0.150519,0.384564,6,32190766,32195523,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-GPSM3,2.185624,211365.1,2.186581,chr6_32446944
870219,3110893,TAPBP,6:32446944:GA:G,0.370217,0.38377,0.199995,0.628421,6,33299694,33314284,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-TAPBP,-0.896066,211365.1,-1.076158,chr6_32446944
967398,3244707,NELFE,6:32446944:GA:G,0.45561,0.145105,0.237255,0.662491,6,31952087,31959038,...,32446944,GA,1,0.25,0.190745,6:32446944:GA:G-NELFE,-0.746096,211365.1,-0.771687,chr6_32446944


In [22]:
lcl_positions = list(final_lcl_positions['snp_position'])
lcl_chrom = list(final_lcl_positions['snp_chromosome'])

In [23]:
b_qtl_position = list(b_qtl['snp_position'])

In [24]:
bqtl_chr = list(b_qtl['snp_chromosome'])

In [11]:
final_lcl_positions = final_lcl_positions.astype({'snp_position': 'str'})
final_lcl_positions['chr_snp_pos'] = final_lcl_positions['snp_chromosome'] + '_' + final_lcl_positions['snp_position']

In [None]:
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11809905,chr1,227334540,227334465,227334615,chr1_227334540
1,rs114530232,chr1,42958380,42958305,42958455,chr1_42958380
2,rs114531441,chr1,37548222,37548147,37548297,chr1_37548222
3,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
4,rs11811181,chr1,206551409,206551334,206551484,chr1_206551409
...,...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037,chr22_46291962
29117,chr22:32803042:D,chr22,32407055,32406980,32407130,chr22_32407055
29118,chr22:24311587:D,chr22,23969398,23969323,23969473,chr22_23969398
29119,chr22:50310878:I,chr22,49917230,49917155,49917305,chr22_49917230


In [51]:
non_sig_bqtl = pd.read_csv('data/bed_files/non_sig_b_qtl_bed_unique.txt', sep='\t',names=['chr','start', 'end', 'chr_snp_pos'])
sig_b_qtl = pd.read_csv('data/bed_files/sig_b_qtl_bed_unique.txt', sep='\t', names=['chr','start', 'end', 'chr_snp_pos'])
b_qtl = pd.read_csv('data/bed_files/b_qtl_bed_unique.txt', sep='\t')

In [26]:
bqtl_in_lcl = b_qtl[b_qtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
bqtl_in_lcl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,470835,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,chr12_56007301
13,1991558,SMDT1,22:42092341:A:G,5.301338e-20,1.061741,0.251468,6.857645e-19,22,42079691,42084284,...,42092341,A,1,0.342105,1.000000,22:42092341:A:G-SMDT1,9.157704,211365.1,9.313558,chr22_42092341
49,1991543,SMDT1,22:42080750:A:C,5.975150e-20,1.068933,0.252964,6.857645e-19,22,42079691,42084284,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,9.144780,211365.1,9.298145,chr22_42080750
90,1991548,SMDT1,22:42079564:A:G,6.305804e-20,1.067545,0.252661,6.857645e-19,22,42079691,42084284,...,42079564,A,1,0.342105,1.000000,22:42079564:A:G-SMDT1,9.138956,211365.1,9.291881,chr22_42079564
106,1187419,EIF5A,17:7304645:A:C,1.232962e-20,1.014169,0.231219,4.903394e-18,17,7306999,7312463,...,7304645,A,1,0.407895,1.000000,17:7304645:A:C-EIF5A,9.313837,211365.1,9.338378,chr17_7304645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465624,3745853,EDF1,9:136771753:A:C,7.851306e-01,0.104662,0.182139,9.999956e-01,9,136862119,136866308,...,136771753,A,1,0.276316,0.691117,9:136771753:A:C-EDF1,0.272639,211365.1,0.402345,chr9_136771753
3465673,3748679,EDF1,9:135927961:G:GCA,8.459497e-01,-0.004290,0.183079,9.999956e-01,9,136862119,136866308,...,135927961,G,1,0.223684,0.651845,9:135927961:G:GCA-EDF1,-0.194289,211365.1,-0.146756,chr9_135927961
3465814,3746426,EDF1,9:136788448:G:A,9.229997e-01,0.083622,0.177976,9.999956e-01,9,136862119,136866308,...,136788448,G,1,0.276316,0.691117,9:136788448:G:A-EDF1,-0.096656,211365.1,0.029169,chr9_136788448
3465821,3742055,EDF1,9:136540945:G:C,9.317760e-01,-0.204284,0.147459,9.999956e-01,9,136862119,136866308,...,136540945,G,1,0.394737,1.000000,9:136540945:G:C-EDF1,0.085611,211365.1,0.242237,chr9_136540945


In [None]:
bqtl_in_lcl[bqtl_in_lcl['chr_snp_pos'] == 'chr6_32655465']

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
937,3154686,HLA-DQB1,6:32655465:G:A,5.022681e-12,0.409783,0.300745,1.323373e-10,6,32659467,32668383,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-HLA-DQB1,6.904934,211365.1,7.394238,chr6_32655465
3307,3089763,HLA-DQA1,6:32655465:G:A,5.648797e-07,0.608445,0.242468,6.052621e-06,6,32628179,32647062,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-HLA-DQA1,5.002853,211365.1,5.228173,chr6_32655465
58197,3171630,HLA-DPB1,6:32655465:G:A,0.009191648,0.309502,0.25859,0.05767038,6,33075990,33089696,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-HLA-DPB1,2.604843,211365.1,2.761646,chr6_32655465
166645,3102995,TAP1,6:32655465:G:A,0.02802845,-0.445751,0.215244,0.2021598,6,32845209,32853816,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-TAP1,-2.196888,211365.1,-1.932974,chr6_32655465
268375,3253584,TAPBP,6:32655465:G:A,0.04964388,-0.123626,0.22568,0.3034219,6,33299694,33314284,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-TAPBP,-1.96302,211365.1,-1.758028,chr6_32655465
439523,3152174,PHF1,6:32655465:G:A,0.08003026,0.420725,0.303059,0.4269215,6,33410399,33416453,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-PHF1,1.750511,211365.1,1.446744,chr6_32655465
473038,3112421,HLA-DMA,6:32655465:G:A,0.1724782,0.34203,0.181115,0.4483543,6,32948613,32969094,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-HLA-DMA,1.364284,211365.1,1.338886,chr6_32655465
584139,3110162,HLA-DRA,6:32655465:G:A,0.2070691,0.33768,0.174675,0.5123621,6,32439878,32445046,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-HLA-DRA,1.261666,211365.1,0.969992,chr6_32655465
610922,3288816,CLIC1,6:32655465:G:A,0.1971211,0.075921,0.244356,0.5252552,6,31730581,31739763,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-CLIC1,1.289797,211365.1,1.308849,chr6_32655465
743587,3322131,GPSM3,6:32655465:G:A,0.1855821,-0.015332,0.168804,0.5812949,6,32190766,32195523,...,32655465,G,1,0.486842,0.102074,6:32655465:G:A-GPSM3,-1.323762,211365.1,-1.405406,chr6_32655465


In [None]:
bqtl_in_lcl['chr_snp_pos'].value_counts()

chr6_32655465      23
chr6_32652845      23
chr19_1491172      23
chr6_32635710      23
chr6_32647891      23
                   ..
chr7_2767614        1
chr11_126598080     1
chr8_70653121       1
chr11_59264832      1
chr1_58585904       1
Name: chr_snp_pos, Length: 8934, dtype: int64

In [None]:
final_lcl_positions.chr_snp_pos.value_counts()

chr17_46021495    2
chr17_45854807    2
chr17_46088714    2
chr10_45550686    2
chr1_227334540    1
                 ..
chr5_96913944     1
chr11_8463349     1
chr5_181277862    1
chr5_150436487    1
chr22_32400111    1
Name: chr_snp_pos, Length: 29117, dtype: int64

In [None]:
len(final_lcl_positions.chr_snp_pos.value_counts()) - len(bqtl_in_lcl['chr_snp_pos'].value_counts())

20183

In [None]:
b_qtl['chr_snp_pos'].value_counts()

chr6_32685969    66
chr6_32900883    57
chr6_31457941    54
chr6_31326454    51
chr6_31223865    51
                 ..
chr8_70976554     1
chr8_70967802     1
chr8_70967929     1
chr8_70974691     1
chr12_4119931     1
Name: chr_snp_pos, Length: 1318229, dtype: int64

In [27]:
lcl_in_bqtl = final_lcl_positions[final_lcl_positions['chr_snp_pos'].isin(b_qtl['chr_snp_pos'])]

In [None]:
lcl_in_bqtl['chr_snp_pos'].value_counts()

chr17_46021495     2
chr17_46088714     2
chr17_45854807     2
chr1_42958380      1
chr12_69358656     1
                  ..
chr10_100559638    1
chr10_100559435    1
chr10_91873305     1
chr8_70728034      1
chr21_36036310     1
Name: chr_snp_pos, Length: 8934, dtype: int64

Select the positions with more than two counts

In [105]:
bqtl_in_lcl[bqtl_in_lcl.chr_snp_pos.map(lcl_in_bqtl.chr_snp_pos.value_counts() > 1)]

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
8471,1247461,FAM215B,17:46088714:T:A,0.000453,-0.155743,0.229956,0.00074,17,46558830,46562795,...,46088714,T,1,0.25,0.664641,17:46088714:T:A-FAM215B,-3.506827,211365.1,-3.527712,chr17_46088714
8859,1245407,FAM215B,17:45854807:A:G,0.000459,-0.156379,0.223208,0.00074,17,46558830,46562795,...,45854807,A,1,0.25,0.664641,17:45854807:A:G-FAM215B,-3.503826,211365.1,-3.516766,chr17_45854807
9427,1245859,FAM215B,17:46021495:G:T,0.000465,-0.155369,0.226854,0.00074,17,46558830,46562795,...,46021495,G,1,0.25,0.664641,17:46021495:G:T-FAM215B,-3.500361,211365.1,-3.519978,chr17_46021495
13520,1237870,KANSL1,17:45854807:A:G,0.000714,-0.643158,0.222571,0.002241,17,46029916,46225389,...,45854807,A,1,0.25,0.664641,17:45854807:A:G-KANSL1,-3.384009,211365.1,-3.132424,chr17_45854807
16128,1235712,KANSL1,17:46088714:T:A,0.001674,-0.675971,0.2293,0.002481,17,46029916,46225389,...,46088714,T,1,0.25,0.664641,17:46088714:T:A-KANSL1,-3.142758,211365.1,-2.891224,chr17_46088714
16262,1235803,KANSL1,17:46021495:G:T,0.001718,-0.662717,0.226206,0.002481,17,46029916,46225389,...,46021495,G,1,0.25,0.664641,17:46021495:G:T-KANSL1,-3.135027,211365.1,-2.887482,chr17_46021495
2319231,1213205,NMT1,17:46021495:G:T,0.71986,0.407907,0.20374,0.932527,17,44957992,45109016,...,46021495,G,1,0.25,0.664641,17:46021495:G:T-NMT1,0.358646,211365.1,0.072432,chr17_46021495
2319617,1214911,NMT1,17:46088714:T:A,0.741487,0.409542,0.206501,0.932527,17,44957992,45109016,...,46088714,T,1,0.25,0.664641,17:46088714:T:A-NMT1,0.329885,211365.1,0.047115,chr17_46088714
2520931,1230148,FMNL1,17:46021495:G:T,0.78668,0.03066,0.141599,0.951061,17,45221444,45247319,...,46021495,G,1,0.25,0.664641,17:46021495:G:T-FMNL1,-0.270624,211365.1,-0.340297,chr17_46021495
2520959,1229595,FMNL1,17:45854807:A:G,0.787669,0.034282,0.139187,0.951061,17,45221444,45247319,...,45854807,A,1,0.25,0.664641,17:45854807:A:G-FMNL1,-0.269338,211365.1,-0.34119,chr17_45854807


In [None]:
#bqtl_in_lcl[(bqtl_in_lcl['chr_snp_pos'] == 'chr17_45854807') |(bqtl_in_lcl['chr_snp_pos'] == 'chr17_46021495')| (bqtl_in_lcl['chr_snp_pos'] == 'chr17_46088714')]

In [3]:
sig_bqtl = pd.read_csv('data/b_qtl.csv', sep= '\t')

In [4]:
sig_bqtl['empirical_feature_p_value'][len(sig_bqtl)-1]

0.0056469198697246

In [5]:
sig_bqtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_chromosome,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org
0,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,ENSG00000197728,...,12,56007301.0,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954
1,SMDT1,22:42123461:A:AT,4.472720e-20,1.006116,0.252152,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,22,42123461.0,A,1,0.315789,1.000000,22:42123461:A:AT-SMDT1,9.176031,211365.1,9.403753
2,SMDT1,22:42119191:GAGAT:G,4.763233e-20,1.078521,0.252800,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,22,42119191.0,GAGAT,1,0.342105,1.000000,22:42119191:GAGAT:G-SMDT1,9.169250,211365.1,9.320564
3,SMDT1,22:42092156:C:A,4.933741e-20,1.073046,0.253754,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,22,42092156.0,C,1,0.342105,1.000000,22:42092156:C:A-SMDT1,9.165457,211365.1,9.320889
4,SMDT1,22:42094636:CAAA:C,5.097736e-20,1.063314,0.251781,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,22,42094636.0,CAAA,1,0.342105,1.000000,22:42094636:CAAA:C-SMDT1,9.161930,211365.1,9.318367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20012,TUFM,16:28892446:AAC:A,9.915560e-04,0.881049,0.314535,5.637180e-03,16,28842411,28846348,ENSG00000178952,...,16,28892446.0,AAC,1,0.381579,0.490409,16:28892446:AAC:A-TUFM,3.292912,211365.1,3.366263
20013,HLA-C,6:31478769:T:C,1.863080e-03,-0.390286,0.236738,5.640952e-03,6,31268749,31272130,ENSG00000204525,...,6,31478769.0,T,1,0.355263,1.000000,6:31478769:T:C-HLA-C,-3.111233,211365.1,-3.108471
20014,HLA-C,6:31516255:T:C,1.864925e-03,-0.751529,0.316263,5.643625e-03,6,31268749,31272130,ENSG00000204525,...,6,31516255.0,T,1,0.184211,1.000000,6:31516255:T:C-HLA-C,-3.110940,211365.1,-2.973853
20015,HLA-C,6:31396930:G:A,1.865420e-03,0.383531,0.268094,5.643625e-03,6,31268749,31272130,ENSG00000204525,...,6,31396930.0,G,1,0.368421,0.507756,6:31396930:G:A-HLA-C,3.110862,211365.1,3.149272


In [None]:
#all_bqtl_overlap_sig_lcl[all_bqtl_overlap_sig_lcl.chr_snp_pos == 'chr6_32659937']

In [13]:
non_sig_bqtls = b_qtl[b_qtl['empirical_feature_p_value'] > sig_bqtl['empirical_feature_p_value'][len(sig_bqtl)-1]]
non_sig_bqtls

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
20017,3149759,HLA-DQB1,6:32233982:A:AATTATT,0.001796,0.525891,0.372480,0.005655,6,32659467,32668383,...,32233982,A,1,0.131579,1.000000,6:32233982:A:AATTATT-HLA-DQB1,3.122067e+00,211365.1,3.033587,chr6_32233982
20018,1261982,SNHG25,17:63226123:A:G,0.001078,0.557476,0.278598,0.005660,17,64145937,64146476,...,63226123,A,1,0.197368,0.609606,17:63226123:A:G-SNHG25,3.269292e+00,211365.1,3.023179,chr17_63226123
20019,1235322,KANSL1,17:46096136:A:ATTCTTT,0.004174,-0.717224,0.241079,0.005667,17,46029916,46225389,...,46096136,A,1,0.250000,0.664641,17:46096136:A:ATTCTTT-KANSL1,-2.864675e+00,211365.1,-2.637423,chr17_46096136
20020,1632868,CD55,1:207892145:GC:G,0.000999,-0.448591,0.245504,0.005669,1,207321519,207386804,...,207892145,GC,1,0.328947,1.000000,1:207892145:GC:G-CD55,-3.290811e+00,211365.1,-3.622277,chr1_207892145
20021,1632869,CD55,1:207892448:G:A,0.000999,-0.448591,0.245504,0.005669,1,207321519,207386804,...,207892448,G,1,0.328947,1.000000,1:207892448:G:A-CD55,-3.290811e+00,211365.1,-3.622277,chr1_207892448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466240,1616942,CEP350,1:179674899:T:G,0.999927,0.050972,0.117085,1.000000,1,179954773,180114875,...,179674899,T,1,0.368421,1.000000,1:179674899:T:G-CEP350,-9.090477e-05,211365.1,0.082259,chr1_179674899
3466241,1616937,CEP350,1:179677732:C:T,0.999994,0.051011,0.117116,1.000000,1,179954773,180114875,...,179677732,C,1,0.368421,1.000000,1:179677732:C:T-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179677732
3466242,1616935,CEP350,1:179676283:G:T,0.999994,0.051011,0.117116,1.000000,1,179954773,180114875,...,179676283,G,1,0.368421,1.000000,1:179676283:G:T-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179676283
3466243,1616936,CEP350,1:179676284:G:A,0.999994,0.051011,0.117116,1.000000,1,179954773,180114875,...,179676284,G,1,0.368421,1.000000,1:179676284:G:A-CEP350,-7.689936e-06,211365.1,0.082429,chr1_179676284


In [None]:
# non_sig_bqtls = non_sig_bqtls.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
# non_sig_bqtls = non_sig_bqtls.astype({'snp_position': 'str'})
#non_sig_bqtls['chr_snp_pos'] = non_sig_bqtls['snp_chromosome'] + '_' +  non_sig_bqtls['snp_position']

In [None]:
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11809905,chr1,227334540,227334465,227334615,chr1_227334540
1,rs114530232,chr1,42958380,42958305,42958455,chr1_42958380
2,rs114531441,chr1,37548222,37548147,37548297,chr1_37548222
3,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
4,rs11811181,chr1,206551409,206551334,206551484,chr1_206551409
...,...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037,chr22_46291962
29117,chr22:32803042:D,chr22,32407055,32406980,32407130,chr22_32407055
29118,chr22:24311587:D,chr22,23969398,23969323,23969473,chr22_23969398
29119,chr22:50310878:I,chr22,49917230,49917155,49917305,chr22_49917230


In [29]:
non_sig_bqtls_in_lcl = non_sig_bqtls[non_sig_bqtls['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
non_sig_bqtls_in_lcl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
20051,478995,ATP5MC2,12:53676019:A:G,0.000122,-0.408987,0.226734,0.005735,12,53632726,53677408,...,53676019,A,1,0.342105,1.000000,12:53676019:A:G-ATP5MC2,-3.842411,211365.1,-3.899213,chr12_53676019
20181,1251525,FAM215B,17:46759287:G:A,0.004538,-0.052374,0.230183,0.005781,17,46558830,46562795,...,46759287,G,1,0.263158,0.691117,17:46759287:G:A-FAM215B,2.838090,211365.1,2.974278,chr17_46759287
20264,1993356,CRELD2,22:49913292:C:T,0.000339,1.654353,0.527537,0.005833,22,49918167,49927540,...,49913292,C,1,0.078947,0.191819,22:49913292:C:T-CRELD2,3.583529,211365.1,3.606900,chr22_49913292
20265,1993363,CRELD2,22:49915798:G:A,0.000344,1.652789,0.527153,0.005833,22,49918167,49927540,...,49915798,G,1,0.078947,0.191819,22:49915798:G:A-CRELD2,3.579957,211365.1,3.600813,chr22_49915798
20266,1993446,CRELD2,22:49914284:T:C,0.000349,1.613163,0.515588,0.005833,22,49918167,49927540,...,49914284,T,1,0.078947,0.191819,22:49914284:T:C-CRELD2,3.575624,211365.1,3.597336,chr22_49914284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465624,3745853,EDF1,9:136771753:A:C,0.785131,0.104662,0.182139,0.999996,9,136862119,136866308,...,136771753,A,1,0.276316,0.691117,9:136771753:A:C-EDF1,0.272639,211365.1,0.402345,chr9_136771753
3465673,3748679,EDF1,9:135927961:G:GCA,0.845950,-0.004290,0.183079,0.999996,9,136862119,136866308,...,135927961,G,1,0.223684,0.651845,9:135927961:G:GCA-EDF1,-0.194289,211365.1,-0.146756,chr9_135927961
3465814,3746426,EDF1,9:136788448:G:A,0.923000,0.083622,0.177976,0.999996,9,136862119,136866308,...,136788448,G,1,0.276316,0.691117,9:136788448:G:A-EDF1,-0.096656,211365.1,0.029169,chr9_136788448
3465821,3742055,EDF1,9:136540945:G:C,0.931776,-0.204284,0.147459,0.999996,9,136862119,136866308,...,136540945,G,1,0.394737,1.000000,9:136540945:G:C-EDF1,0.085611,211365.1,0.242237,chr9_136540945


In [14]:
sig_bqtl = sig_bqtl.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
sig_bqtl = sig_bqtl.astype({'snp_position': 'str'})
sig_bqtl['snp_chromosome'] = 'chr' + sig_bqtl['snp_chromosome']
sig_bqtl['chr_snp_pos'] = sig_bqtl['snp_chromosome'] + '_' +  sig_bqtl['snp_position']

In [15]:
sig_bqtl.chr_snp_pos.value_counts()

chr17_45857379    4
chr6_31867824     4
chr17_45786127    4
chr6_32693360     4
chr6_32692662     4
                 ..
chr15_76812515    1
chr15_76842550    1
chr15_76807324    1
chr15_76576323    1
chr3_196979325    1
Name: chr_snp_pos, Length: 15817, dtype: int64

In [18]:
b_qtl[~b_qtl['chr_snp_pos'].isin(sig_bqtl['chr_snp_pos'])]['chr_snp_pos'].value_counts()

chr6_32900883     57
chr6_31252277     51
chr6_31566642     48
chr6_32232308     42
chr6_32244685     42
                  ..
chr1_64299935      1
chr1_78889159      1
chr1_177777704     1
chr1_78887970      1
chr12_4119931      1
Name: chr_snp_pos, Length: 1302412, dtype: int64

In [28]:
non_sig_bqtls = b_qtl[~b_qtl['chr_snp_pos'].isin(sig_bqtl['chr_snp_pos'])]
non_sig_bqtls.drop(['V1'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_sig_bqtls.drop(['V1'], axis=1, inplace=True)


In [30]:
non_sig_bqtls.to_csv('non_sig_bqtls.tsv', sep='\t', index=False)

In [30]:
sig_bqtls_in_lcl = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
sig_bqtls_in_lcl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,chr12_56007301
13,SMDT1,22:42092341:A:G,5.301338e-20,1.061741,0.251468,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42092341,A,1,0.342105,1.000000,22:42092341:A:G-SMDT1,9.157704,211365.1,9.313558,chr22_42092341
49,SMDT1,22:42080750:A:C,5.975150e-20,1.068933,0.252964,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,9.144780,211365.1,9.298145,chr22_42080750
90,SMDT1,22:42079564:A:G,6.305804e-20,1.067545,0.252661,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42079564,A,1,0.342105,1.000000,22:42079564:A:G-SMDT1,9.138956,211365.1,9.291881,chr22_42079564
106,EIF5A,17:7304645:A:C,1.232962e-20,1.014169,0.231219,4.903394e-18,17,7306999,7312463,ENSG00000132507,...,7304645,A,1,0.407895,1.000000,17:7304645:A:C-EIF5A,9.313837,211365.1,9.338378,chr17_7304645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19430,ATP5MC2,12:53667573:T:C,9.575432e-05,-0.431953,0.232685,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53667573,T,1,0.342105,1.000000,12:53667573:T:C-ATP5MC2,-3.901106,211365.1,-3.948912,chr12_53667573
19431,ATP5MC2,12:53664128:G:A,9.597239e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53664128,G,1,0.342105,1.000000,12:53664128:G:A-ATP5MC2,-3.900555,211365.1,-3.948327,chr12_53664128
19433,ATP5MC2,12:53662963:TGA:T,9.620825e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53662963,TGA,1,0.342105,1.000000,12:53662963:TGA:T-ATP5MC2,-3.899961,211365.1,-3.947774,chr12_53662963
19434,ATP5MC2,12:53662017:T:C,9.644467e-05,-0.431939,0.232684,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53662017,T,1,0.342105,1.000000,12:53662017:T:C-ATP5MC2,-3.899367,211365.1,-3.946966,chr12_53662017


In [113]:
final_lcl_positions[final_lcl_positions['chr_snp_pos'].isin(sig_bqtls_in_lcl['chr_snp_pos'])]

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
179,rs10888650,chr1,39041489,39041414,39041564,chr1_39041489
337,rs7521893,chr1,28003218,28003143,28003293,chr1_28003218
340,rs7523306,chr1,219147504,219147429,219147579,chr1_219147504
349,rs12567986,chr1,39027946,39027871,39028021,chr1_39027946
395,rs76289224,chr1,39028109,39028034,39028184,chr1_39028109
...,...,...,...,...,...,...
28876,chr17:43707838:D,chr17,45630472,45630397,45630547,chr17_45630472
29017,chr20:33753791:I,chr20,35165988,35165913,35166063,chr20_35165988
29019,chr20:33727276:D,chr20,35139473,35139398,35139548,chr20_35139473
29069,chr20:33739804:D,chr20,35152001,35151926,35152076,chr20_35152001


In [114]:
sig_bqtls_in_lcl['chr_snp_pos'].value_counts()

chr17_46038074    2
chr17_46059437    2
chr17_46054903    2
chr17_46055013    2
chr17_46055092    2
                 ..
chr15_78939274    1
chr15_78942522    1
chr15_78940358    1
chr15_78930582    1
chr12_53663283    1
Name: chr_snp_pos, Length: 2601, dtype: int64

In [None]:
positions_sig_variants

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
0,rs11810220,chr1,163311300,163311225,163311375
1,rs11585048,chr1,2602648,2602573,2602723
3,rs11585844,chr1,37563668,37563593,37563743
5,rs11587500,chr1,24190390,24190315,24190465
6,rs11588318,chr1,200669534,200669459,200669609
...,...,...,...,...,...
208,chr20:25529845:D,chr20,25549209,25549134,25549284
209,chr21:38345364:I,chr21,36973064,36972989,36973139
210,chr21:30327732:D,chr21,28955410,28955335,28955485
211,chr22:50310881:D,chr22,49917233,49917158,49917308


In [None]:
non_significant_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
4,rs11811181,chr1,206551409,206551334,206551484
5,rs114569995,chr1,169828815,169828740,169828890
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


In [46]:
sig_lcl = positions_sig_variants

In [47]:
sig_lcl = sig_lcl.astype({'snp_position': 'str'})
sig_lcl['chr_snp_pos'] = sig_lcl['snp_chromosome'] + '_' + sig_lcl['snp_position']

In [34]:
sig_lcl

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
1,rs11585048,chr1,2602648,2602573,2602723,chr1_2602648
2,rs11585844,chr1,37563668,37563593,37563743,chr1_37563668
3,rs11587500,chr1,24190390,24190315,24190465,chr1_24190390
4,rs11588318,chr1,200669534,200669459,200669609,chr1_200669534
...,...,...,...,...,...,...
3580,chr20:25529845:D,chr20,25549209,25549134,25549284,chr20_25549209
3581,chr21:38345364:I,chr21,36973064,36972989,36973139,chr21_36973064
3582,chr21:30327732:D,chr21,28955410,28955335,28955485,chr21_28955410
3583,chr22:50310881:D,chr22,49917233,49917158,49917308,chr22_49917233


In [35]:
sig_bqtls_in_sig_lcl = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
sig_bqtls_in_sig_lcl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,chr12_56007301
646,PILRB,7:100308061:A:G,5.495119e-12,-1.333647,0.266132,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100308061,A,1,0.171053,0.273867,7:100308061:A:G-PILRB,-6.892161,211365.1,-6.489465,chr7_100308061
656,PILRB,7:100382481:T:C,5.669940e-12,-1.334704,0.266742,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100382481,T,1,0.171053,0.273867,7:100382481:T:C-PILRB,-6.887706,211365.1,-6.481279,chr7_100382481
657,PILRB,7:100307702:C:T,5.671196e-12,-1.302142,0.260140,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100307702,C,1,0.171053,0.273867,7:100307702:C:T-PILRB,-6.887674,211365.1,-6.485234,chr7_100307702
660,PILRB,7:100377643:A:G,5.738188e-12,-1.334423,0.266771,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100377643,A,1,0.171053,0.273867,7:100377643:A:G-PILRB,-6.886003,211365.1,-6.479739,chr7_100377643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19222,NAGK,2:71072273:A:T,6.103800e-05,0.658381,0.295549,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71072273,A,1,0.131579,1.000000,2:71072273:A:T-NAGK,4.008762,211365.1,3.901452,chr2_71072273
19223,NAGK,2:71071208:C:T,6.157547e-05,0.658429,0.295532,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71071208,C,1,0.131579,1.000000,2:71071208:C:T-NAGK,4.006690,211365.1,3.899127,chr2_71071208
19230,NAGK,2:71066009:T:C,6.255455e-05,0.682553,0.300544,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71066009,T,1,0.131579,1.000000,2:71066009:T:C-NAGK,4.002961,211365.1,3.886025,chr2_71066009
19428,ATP5MC2,12:53669717:G:A,9.575113e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53669717,G,1,0.342105,1.000000,12:53669717:G:A-ATP5MC2,-3.901114,211365.1,-3.948934,chr12_53669717


In [None]:
sig_bqtls_in_sig_lcl.chr_snp_pos.value_counts()

In [120]:
sig_bqtls_in_sig_lcl_snp = list(final_lcl_positions.chr_snp_pos.unique())

In [None]:
sig_bqtls_in_sig_lcl[sig_bqtls_in_sig_lcl['chr_snp_pos'] == 'chr1_39027946']

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
6025,NDUFS5,1:39027946:T:C,2e-06,0.678978,0.228531,0.000206,1,39026318,39034636,ENSG00000168653,...,39027946,T,1,0.276316,0.691117,1:39027946:T:C-NDUFS5,4.726549,211365.1,4.368403,chr1_39027946


GET variant allele

In [122]:
sig_bqtl_sig_lcl_snps = list(sig_lcl[sig_lcl['chr_snp_pos'].isin(sig_bqtls_in_sig_lcl['chr_snp_pos'])]['snp'])

In [None]:
with open("data/bed_files/sig_bqtl_sig_lcl_snps.txt","w") as f:
    for snp in sig_bqtl_sig_lcl_snps:
        f.write("{0} \n".format(snp))

In [None]:
synonym_var_lcl_snps = pd.read_csv("C:/Users/annav/Downloads/martquery_0524115334_816.txt.gz", sep='\t')

In [None]:
var_lcl_snps = pd.read_csv("C:/Users/annav/Downloads/martquery_0524120923_962.txt.gz", sep='\t')
var_lcl_snps

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Minor allele (ALL),Variant alleles,Synonym name,Global minor allele frequency (all individuals)
0,rs1344,dbSNP,1,147647471,147647471,,G/A,rs698505,
1,rs1344,dbSNP,1,147647471,147647471,,G/A,rs3170854,
2,rs1344,dbSNP,1,147647471,147647471,,G/A,rs17850585,
3,rs1344,dbSNP,1,147647471,147647471,,G/A,rs60715787,
4,rs1344,dbSNP,1,147647471,147647471,,G/A,rs386528725,
...,...,...,...,...,...,...,...,...,...
50196,rs5996114,dbSNP,HSCHR22_2_CTG1,35204,35204,,C/G/T,,
50197,rs4995141,dbSNP,HSCHR14_3_CTG1,845060,845060,,C/T,,
50198,rs4995141,dbSNP,14,106353377,106353377,,C/T,,
50199,rs144711656,dbSNP,HSCHR14_3_CTG1,840975,840975,,T/C,,


In [None]:
var_lcl_snps = var_lcl_snps[var_lcl_snps['Chromosome/scaffold name'] != 'X']
var_lcl_snps = var_lcl_snps[var_lcl_snps['Chromosome/scaffold name'].str.contains(r'^\d+$')]
var_lcl_snps = var_lcl_snps.drop_duplicates(subset=['Variant name'], keep='first')


In [None]:
var_lcl_snps = var_lcl_snps[['Variant name', 'Chromosome/scaffold name', 'Chromosome/scaffold position start (bp)', 'Variant alleles']]
var_lcl_snps.columns = ['snp', 'chr', 'position', 'variant_allele']
var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele
0,rs1344,1,147647471,G/A
9,rs1496,1,169858717,G/A/T
10,rs4870,1,2556714,A/C/G
62,rs5065,1,11846011,A/G
76,rs5067,1,11845924,A/G/T
...,...,...,...,...
50187,rs5996087,22,41925587,A/C/G
50189,rs5996089,22,41936432,G/A
50190,rs5996114,22,42112860,C/G/T
50198,rs4995141,14,106353377,C/T


In [None]:
def create_snp_id(df):
    chromosome = list(df['chr'])
    position = list(df['position'])
    variant_allele = list(df['variant_allele'])
    snps_id = []
    for chr, pos, var in zip(chromosome, position, variant_allele):
        var = var.replace('/', ':')
        x = str(chr) + ':' + str(pos) +':' + var
        snps_id.append(x)
    df['snp_id'] = snps_id
    return df
    

In [None]:
var_lcl_snps = create_snp_id(var_lcl_snps)

In [None]:
var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
50187,rs5996087,22,41925587,A/C/G,22:41925587:A:C:G
50189,rs5996089,22,41936432,G/A,22:41936432:G:A
50190,rs5996114,22,42112860,C/G/T,22:42112860:C:G:T
50198,rs4995141,14,106353377,C/T,14:106353377:C:T


In [None]:
sig_bqtl[sig_bqtl['snp_id'].isin(var_lcl_snps['snp_id'])]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
13,SMDT1,22:42092341:A:G,5.301338e-20,1.061741,0.251468,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42092341,A,1,0.342105,1.000000,22:42092341:A:G-SMDT1,9.157704,211365.1,9.313558,chr22_42092341
49,SMDT1,22:42080750:A:C,5.975150e-20,1.068933,0.252964,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,9.144780,211365.1,9.298145,chr22_42080750
132,GABPB1-AS1,15:50356743:C:T,1.167074e-17,-1.313119,0.240504,7.149596e-16,15,50354944,50372202,ENSG00000244879,...,50356743,C,1,0.289474,0.459454,15:50356743:C:T-GABPB1-AS1,-8.556142,211365.1,-8.510348,chr15_50356743
646,PILRB,7:100308061:A:G,5.495119e-12,-1.333647,0.266132,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100308061,A,1,0.171053,0.273867,7:100308061:A:G-PILRB,-6.892161,211365.1,-6.489465,chr7_100308061
648,PILRB,7:100315306:A:T,5.519019e-12,-1.334578,0.266380,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100315306,A,1,0.171053,0.273867,7:100315306:A:T-PILRB,-6.891544,211365.1,-6.486483,chr7_100315306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19240,NAGK,2:71066569:A:G,6.597289e-05,0.672986,0.299214,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71066569,A,1,0.131579,1.000000,2:71066569:A:G-NAGK,3.990361,211365.1,3.875926,chr2_71066569
19424,ATP5MC2,12:53664454:C:T,8.956944e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53664454,C,1,0.342105,1.000000,12:53664454:C:T-ATP5MC2,-3.917238,211365.1,-3.969926,chr12_53664454
19426,ATP5MC2,12:53672959:T:C,9.244949e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53672959,T,1,0.342105,1.000000,12:53672959:T:C-ATP5MC2,-3.909599,211365.1,-3.959482,chr12_53672959
19427,ATP5MC2,12:53676661:A:T,9.244949e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53676661,A,1,0.342105,1.000000,12:53676661:A:T-ATP5MC2,-3.909599,211365.1,-3.959482,chr12_53676661


In [None]:
var_lcl_snps[var_lcl_snps['Variant name'] == 'rs4995141']

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Minor allele (ALL),Variant alleles,Synonym name,Global minor allele frequency (all individuals)
50198,rs4995141,dbSNP,14,106353377,106353377,,C/T,,


In [None]:
synonym_var_lcl_snps[synonym_var_lcl_snps['Synonym name'] =='rs9614690']

Unnamed: 0,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Variant alleles,Minor allele (ALL),Strand,Synonym name,Synonym source
10864,X,151150353,151150353,G/A,,1,rs9614690,Former dbSNP
10865,X,117849962,117849962,C/A,,1,rs9614690,Former dbSNP
10866,X,17053874,17053874,T/G,,1,rs9614690,Former dbSNP


In [None]:
synonym_var_lcl_snps = synonym_var_lcl_snps[synonym_var_lcl_snps['Chromosome/scaffold name'] != 'X']
synonym_var_lcl_snps = synonym_var_lcl_snps[synonym_var_lcl_snps['Chromosome/scaffold name'].str.contains(r'^\d+$')]
synonym_var_lcl_snps = synonym_var_lcl_snps.drop_duplicates(subset=['Synonym name'], keep='first')


In [None]:
synonym_var_lcl_snps = synonym_var_lcl_snps[['Synonym name', 'Chromosome/scaffold name', 'Chromosome/scaffold position start (bp)', 'Variant alleles']]
synonym_var_lcl_snps.columns = ['snp', 'chr', 'position', 'variant_allele']
synonym_var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele
0,rs116024440,1,805036,A/G
1,rs140081212,1,155215184,G/A/T
3,rs150913279,1,2568371,A/G
5,rs55803744,1,149703225,T/A/C
6,rs79266459,1,26170849,A/C/G/T
...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T
10854,rs61634242,22,49601073,G/A/C
10855,rs80020284,22,23970400,G/A
10857,rs77196310,22,45413058,GGGGGG/GGGGG


In [None]:
synonym_var_lcl_snps = create_snp_id(synonym_var_lcl_snps)

rs10627369	chr22	50578781
12936	rs71707919	chr22	43120043
12974	rs66918515	chr22	21002604
13015	rs75892697	chr19	52782474
13016	rs77764310	chr19	53197554
13022	rs139074994	chr19	54632756
13084	rs73135170	

In [None]:
synonym_var_lcl_snps[synonym_var_lcl_snps['snp']== 'rs77764310']

Unnamed: 0,snp,chr,position,variant_allele,snp_id


In [None]:
synonym_var_lcl_snps =synonym_var_lcl_snps[~synonym_var_lcl_snps['snp'].isin(var_lcl_snps['snp'])]

In [None]:
lcl_snp_ids = pd.concat([var_lcl_snps, synonym_var_lcl_snps])

In [None]:
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T,22:49917234:C:G:T
10854,rs61634242,22,49601073,G/A/C,22:49601073:G:A:C
10855,rs80020284,22,23970400,G/A,22:23970400:G:A
10857,rs77196310,22,45413058,GGGGGG/GGGGG,22:45413058:GGGGGG:GGGGG


In [None]:
lcl_snp_ids['snp'].value_counts()

rs1344         1
rs4788099      1
rs4968013      1
rs4968011      1
rs4889679      1
              ..
rs28414073     1
rs28401739     1
rs28379833     1
rs28367131     1
rs116390392    1
Name: snp, Length: 27112, dtype: int64

In [None]:
lcl_snp_ids[lcl_snp_ids.groupby('snp')['snp'].transform('size') >= 2]

Unnamed: 0,snp,chr,position,variant_allele,snp_id
618,rs863850,1,146023711,G/A,1:146023711:G:A
1077,rs2864871,1,150794488,T/A/C,1:150794488:T:A:C
2687,rs79206743,1,146017421,T/C,1:146017421:T:C
5058,rs10399931,1,203186952,T/A/C,1:203186952:T:A:C
5592,rs826542,2,108717018,A/T,2:108717018:A:T
...,...,...,...,...,...
10808,rs62054803,19,3229769,C/T,19:3229769:C:T
10810,rs62054804,19,56062256,ACTACTA/ACTA,19:56062256:ACTACTA:ACTA
10812,rs62054805,19,14596367,C/A/T,19:14596367:C:A:T
10814,rs3760532,19,49138323,T/-,19:49138323:T:-


In [None]:
lcl_snp_ids[lcl_snp_ids['snp'] == 'rs3760532']

Unnamed: 0,snp,chr,position,variant_allele,snp_id
38715,rs3760532,17,81939573,G/A,17:81939573:G:A
10814,rs3760532,19,49138323,T/-,19:49138323:T:-


In [None]:
lcl_positions_rsid = pd.concat([unique_total_variants, syn_total_variants_unique])
lcl_positions_rsid

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
13089,rs140493080,chr12,7924992
13090,rs113859809,chr7,100217868
13091,rs150347472,chr12,9965410
13092,rs141183894,chr9,31326631


In [None]:
missing_rsid_lcl = lcl_positions_rsid[~lcl_positions_rsid['snp'].isin(lcl_snp_ids['snp'])]
missing_rsid_lcl

Unnamed: 0,snp,snp_chromosome,snp_position
12908,rs10627369,chr22,50578781
12936,rs71707919,chr22,43120043
12974,rs66918515,chr22,21002604
13015,rs75892697,chr19,52782474
13016,rs77764310,chr19,53197554
13022,rs139074994,chr19,54632756
13084,rs73135170,chr7,72726370


In [None]:
missing_rsid_lcl =missing_rsid_lcl.merge(missing_rsid_mart_export, how='inner', on='snp_position')
missing_rsid_lcl

Unnamed: 0,snp,snp_chromosome,snp_position,variant_allele
0,rs10627369,chr22,50578781,CT/CTTCT
1,rs71707919,chr22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
2,rs66918515,chr22,21002604,AGACAG/AG
3,rs75892697,chr19,52782474,C/G
4,rs77764310,chr19,53197554,T/A/C
5,rs139074994,chr19,54632756,C/A/G/T
6,rs73135170,chr7,72726370,G/C/T


In [None]:
missing_rsid_lcl.columns = ['snp', 'chr', 'position', 'variant_allele']

In [None]:
chr_list = []
for chr in list(missing_rsid_lcl['chr']):
    chr = chr[3:]
    chr_list.append(chr)

In [None]:
missing_rsid_lcl['chr'] = chr_list

In [None]:
missing_rsid_lcl

Unnamed: 0,snp,chr,position,variant_allele
0,rs10627369,22,50578781,CT/CTTCT
1,rs71707919,22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
2,rs66918515,22,21002604,AGACAG/AG
3,rs75892697,19,52782474,C/G
4,rs77764310,19,53197554,T/A/C
5,rs139074994,19,54632756,C/A/G/T
6,rs73135170,7,72726370,G/C/T


In [None]:
missing_rsid_lcl = create_snp_id(missing_rsid_lcl)

In [None]:
missing_rsid_lcl

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs10627369,22,50578781,CT/CTTCT,22:50578781:CT:CTTCT
1,rs71707919,22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG,22:43120043:CTGGTGAGCTCTG:CTG:CTGGTGAGCTCTGGTG...
2,rs66918515,22,21002604,AGACAG/AG,22:21002604:AGACAG:AG
3,rs75892697,19,52782474,C/G,19:52782474:C:G
4,rs77764310,19,53197554,T/A/C,19:53197554:T:A:C
5,rs139074994,19,54632756,C/A/G/T,19:54632756:C:A:G:T
6,rs73135170,7,72726370,G/C/T,7:72726370:G:C:T


In [None]:
lcl_snp_ids = pd.concat([lcl_snp_ids, missing_rsid_lcl])
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
2,rs66918515,22,21002604,AGACAG/AG,22:21002604:AGACAG:AG
3,rs75892697,19,52782474,C/G,19:52782474:C:G
4,rs77764310,19,53197554,T/A/C,19:53197554:T:A:C
5,rs139074994,19,54632756,C/A/G/T,19:54632756:C:A:G:T


In [None]:
missing_rsid_mart_export

Unnamed: 0,snp_position,variant_allele
0,52782474,C/G
1,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
4,53197554,T/A/C
13,72726370,G/C/T
14,50578781,CT/CTTCT
15,21002604,AGACAG/AG
16,54632756,C/A/G/T


In [None]:
missing_rsid_lcl = list(lcl_positions_rsid[~lcl_positions_rsid['snp'].isin(lcl_snp_ids['snp'])]['snp'])

In [None]:
missing_rsid_lcl

['rs10627369',
 'rs71707919',
 'rs66918515',
 'rs75892697',
 'rs77764310',
 'rs139074994',
 'rs73135170']

In [None]:
len(lcl_snp_ids)

27112

In [None]:
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T,22:49917234:C:G:T
10854,rs61634242,22,49601073,G/A/C,22:49601073:G:A:C
10855,rs80020284,22,23970400,G/A,22:23970400:G:A
10857,rs77196310,22,45413058,GGGGGG/GGGGG,22:45413058:GGGGGG:GGGGG


In [None]:
lcl_snp_ids[~lcl_snp_ids['snp'].isin(final_lcl_positions['snp'])]

Unnamed: 0,snp,chr,position,variant_allele,snp_id


In [None]:
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11809905,chr1,227334540,227334465,227334615,chr1_227334540
1,rs114530232,chr1,42958380,42958305,42958455,chr1_42958380
2,rs114531441,chr1,37548222,37548147,37548297,chr1_37548222
3,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
4,rs11811181,chr1,206551409,206551334,206551484,chr1_206551409
...,...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037,chr22_46291962
29117,chr22:32803042:D,chr22,32407055,32406980,32407130,chr22_32407055
29118,chr22:24311587:D,chr22,23969398,23969323,23969473,chr22_23969398
29119,chr22:50310878:I,chr22,49917230,49917155,49917305,chr22_49917230


In [None]:
final_lcl_positions[final_lcl_positions['snp'] == 'rs9614690']

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
27033,rs116390392,chr22,42126310,42126235,42126385,chr22_42126310


In [None]:
# chromosome = list(mart_export_sig_b_sig_lcl['Chromosome/scaffold name'])
# position = list(mart_export_sig_b_sig_lcl['Chromosome/scaffold position start (bp)'])
# variant_allele = list(mart_export_sig_b_sig_lcl['Variant alleles'])

In [None]:
# snps_id = []
# for chr, pos, var in zip(chromosome, position, variant_allele):
#     var = var.replace('/', ':')
#     x = str(chr) + ':' + str(pos) +':' + var
#     snps_id.append(x)

In [None]:
# sig_bqtls_in_sig_lcl[sig_bqtls_in_sig_lcl['snp_id'].isin(snps_id)]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
646,PILRB,7:100308061:A:G,5.495119e-12,-1.333647,0.266132,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100308061,A,1,0.171053,0.273867,7:100308061:A:G-PILRB,-6.892161,211365.1,-6.489465,chr7_100308061
657,PILRB,7:100307702:C:T,5.671196e-12,-1.302142,0.260140,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100307702,C,1,0.171053,0.273867,7:100307702:C:T-PILRB,-6.887674,211365.1,-6.485234,chr7_100307702
660,PILRB,7:100377643:A:G,5.738188e-12,-1.334423,0.266771,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100377643,A,1,0.171053,0.273867,7:100377643:A:G-PILRB,-6.886003,211365.1,-6.479739,chr7_100377643
781,PILRB,7:100355205:C:T,7.301640e-12,-1.340328,0.268553,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100355205,C,1,0.171053,0.273867,7:100355205:C:T-PILRB,-6.851624,211365.1,-6.446276,chr7_100355205
1281,DNAJC15,13:43023570:C:T,4.373026e-10,1.130099,0.238631,1.483959e-08,13,43023203,43114213,ENSG00000120675,...,43023570,C,1,0.210526,1.000000,13:43023570:C:T-DNAJC15,6.240096,211365.1,6.050010,chr13_43023570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18659,CRELD2,22:49921057:G:A,2.079634e-04,1.652924,0.527252,4.584030e-03,22,49918167,49927540,ENSG00000184164,...,49921057,G,1,0.078947,0.191819,22:49921057:G:A-CRELD2,3.709141,211365.1,3.758576,chr22_49921057
18675,CRELD2,22:49917199:G:T,2.106545e-04,1.652789,0.527153,4.584030e-03,22,49918167,49927540,ENSG00000184164,...,49917199,G,1,0.078947,0.191819,22:49917199:G:T-CRELD2,3.705884,211365.1,3.755542,chr22_49917199
18685,CRELD2,22:49924774:T:G,2.237519e-04,1.652924,0.527252,4.584030e-03,22,49918167,49927540,ENSG00000184164,...,49924774,T,1,0.078947,0.191819,22:49924774:T:G-CRELD2,3.690570,211365.1,3.737588,chr22_49924774
19223,NAGK,2:71071208:C:T,6.157547e-05,0.658429,0.295532,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71071208,C,1,0.131579,1.000000,2:71071208:C:T-NAGK,4.006690,211365.1,3.899127,chr2_71071208


END VARIANT ALLELE
***

In [48]:
non_sig_lcl = non_significant_lcl_positions
non_sig_lcl = non_sig_lcl.astype({'snp_position': 'str'})
non_sig_lcl['chr_snp_pos'] = non_sig_lcl['snp_chromosome'] + '_' + non_sig_lcl['snp_position']
non_sig_lcl

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11809905,chr1,227334540,227334465,227334615,chr1_227334540
1,rs114530232,chr1,42958380,42958305,42958455,chr1_42958380
2,rs114531441,chr1,37548222,37548147,37548297,chr1_37548222
4,rs11811181,chr1,206551409,206551334,206551484,chr1_206551409
5,rs114569995,chr1,169828815,169828740,169828890,chr1_169828815
...,...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037,chr22_46291962
29117,chr22:32803042:D,chr22,32407055,32406980,32407130,chr22_32407055
29118,chr22:24311587:D,chr22,23969398,23969323,23969473,chr22_23969398
29119,chr22:50310878:I,chr22,49917230,49917155,49917305,chr22_49917230


In [37]:
non_sig_lcl.chr_snp_pos.value_counts()

chr17_46088714    2
chr10_45550686    2
chr17_46021495    2
chr17_45854807    2
chr19_3672275     1
                 ..
chr11_71476248    1
chr11_74769503    1
chr5_65737627     1
chr5_65737898     1
chr22_32400111    1
Name: chr_snp_pos, Length: 25532, dtype: int64

#### b cell in lcl

In [38]:
#move to cells below
sig_b_qtl_overlap_sig_lcl = sig_bqtl[sig_bqtl.chr_snp_pos.isin(sig_lcl.chr_snp_pos)]
sig_b_qtl_overlap_sig_lcl.chr_snp_pos.value_counts()

chr17_45741580    2
chr17_46014622    2
chr17_46054903    2
chr17_46049325    2
chr17_46036998    2
                 ..
chr7_100336385    1
chr7_100377643    1
chr7_100307702    1
chr7_100382481    1
chr12_53663283    1
Name: chr_snp_pos, Length: 436, dtype: int64

In [None]:
sig_b_qtl_overlap_sig_lcl[sig_b_qtl_overlap_sig_lcl.chr_snp_pos == 'chr6_32659937']

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
958,HLA-DQB1,6:32659937:T:G,5.146971e-12,0.405502,0.299747,1.323373e-10,6,32659467,32668383,ENSG00000179344,...,32659937,T,1,0.486842,0.102074,6:32659937:T:G-HLA-DQB1,6.901463,211365.1,7.392414,chr6_32659937
3289,HLA-DQA1,6:32659937:T:G,5.486857e-07,0.609814,0.241656,6.052621e-06,6,32628179,32647062,ENSG00000196735,...,32659937,T,1,0.486842,0.102074,6:32659937:T:G-HLA-DQA1,5.008455,211365.1,5.231579,chr6_32659937


In [39]:
all_bqtl_overlap_sig_lcl = b_qtl[b_qtl.chr_snp_pos.isin(sig_lcl.chr_snp_pos)]
all_bqtl_overlap_sig_lcl.chr_snp_pos.value_counts()

chr6_32659937     23
chr11_65184368    20
chr6_31979309     19
chr6_31973780     19
chr11_65301804    18
                  ..
chr8_70706859      1
chr11_6481550      1
chr17_18930497     1
chr6_96802660      1
chr1_58585904      1
Name: chr_snp_pos, Length: 1196, dtype: int64

In [19]:
non_sig_bqtls[non_sig_bqtls.chr_snp_pos.isin(sig_lcl.chr_snp_pos)]

NameError: name 'sig_lcl' is not defined

In [40]:
non_sig_bqtl_overlap_sig_lcl = non_sig_bqtls[non_sig_bqtls.chr_snp_pos.isin(sig_lcl.chr_snp_pos)]
non_sig_bqtl_overlap_sig_lcl.chr_snp_pos.value_counts()

chr6_32659937      21
chr11_65184368     20
chr6_31973780      19
chr6_31979309      19
chr11_65301804     18
                   ..
chr11_126598080     1
chr10_7472060       1
chr7_24719199       1
chr7_24719233       1
chr1_58585904       1
Name: chr_snp_pos, Length: 1194, dtype: int64

In [41]:
bqtl_in_sig_lcl = b_qtl[b_qtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
bqtl_in_non_sig_lcl = b_qtl[b_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [38]:
sig_bqtl_in_lcl = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
sig_bqtl_in_sig_lcl = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
sig_bqtl_in_non_sig_lcl = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

NameError: name 'sig_bqtl' is not defined

In [43]:
non_sig_bqtl_in_lcl = non_sig_bqtls[non_sig_bqtls['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
non_sig_bqtl_in_sig_lcl = non_sig_bqtls[non_sig_bqtls['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
non_sig_bqtl_in_non_sig_lcl = non_sig_bqtls[non_sig_bqtls['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [45]:
print(len(non_sig_bqtl_in_sig_lcl['chr_snp_pos'].value_counts()))
print(len(non_sig_bqtl_in_non_sig_lcl['chr_snp_pos'].value_counts()))

1194
7709


In [133]:
sig_bqtl_sig_lcl_snp = list(sig_bqtl_in_sig_lcl.chr_snp_pos.unique())
sig_bqtl_non_sig_lcl_snp = list(sig_bqtl_in_non_sig_lcl.chr_snp_pos.unique())
bqtl_sig_lcl_snp = list(bqtl_in_sig_lcl.chr_snp_pos.unique())
bqtl_non_sig_lcl_snp = list(bqtl_in_non_sig_lcl.chr_snp_pos.unique())


In [None]:
# print(f'Total bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts())} \t\t Total bqtls that overlap with significant lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
# print(f'significant bqtls overlap with all lcl snps: {len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t\t significant bqtls overlap with significant lcl snps: {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
# print(f'non-significant bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts()) - len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t non-significant bqtls that overlap with significant lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()) - len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')

In [None]:
#f'non-significant bqtls that overlap with all lcl snps: {len(non_sig_bqtl_in_lcl.chr_snp_pos.value_counts())}'

In [None]:
print(f'non-significant bqtls that overlap with all lcl snps: {len(non_sig_bqtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'non-significant bqtls that overlap with significant lcl snps: {len(non_sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'non significant bqtls overlap non significant lcl snps: {len(non_sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')

non-significant bqtls that overlap with all lcl snps: 8903
non-significant bqtls that overlap with significant lcl snps: 1194
non significant bqtls overlap non significant lcl snps: 7709


In [None]:
non_sig_bqtl_sig_lcl = len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()) - len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())
print(f'non-significant bqtls that overlap with significant lcl snps: {non_sig_bqtl_sig_lcl}')


non-significant bqtls that overlap with significant lcl snps: 760


In [None]:
f'non sig_bqtl in all lcl {8934-2601}'

'non sig_bqtl in all lcl 6333'

In [None]:
f'non sig_bqtl in non sig lcl {6333-760}'

'non sig_bqtl in non sig lcl 5573'

In [None]:
436/2165

0.20138568129330253

In [None]:
print(f'significant bqtls that overlap with all lcl snps: {len(sig_bqtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'significant bqtls overlap with significant lcl snps: {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'significant bqtls overlap with non significant lcl snps: {len(sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')

significant bqtls that overlap with all lcl snps: 2601
significant bqtls overlap with significant lcl snps: 436
significant bqtls overlap with non significant lcl snps: 2165


In [None]:
print(f'bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'bqtls overlap with significant lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'bqtls overlap with non significant lcl snps: {len(bqtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')

bqtls that overlap with all lcl snps: 8934
bqtls overlap with significant lcl snps: 1196
bqtls overlap with non significant lcl snps: 7738


In [None]:
8934 - 2601

6333

In [None]:
1196-436

760

In [None]:
2165/7709

0.2808405759501881

In [76]:
print(len(sig_bqtl['chr_snp_pos'].value_counts()))
print(len(b_qtl['chr_snp_pos'].value_counts()))
print(len(non_sig_bqtls['chr_snp_pos'].value_counts()))

15817
1318229
1316698


In [83]:
x =b_qtl[~b_qtl['chr_snp_pos'].isin(sig_bqtl['chr_snp_pos'])]

In [84]:
non_sig_bqtl_in_sig_lcl = x[x['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
non_sig_bqtl_in_non_sig_lcl = x[x['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [85]:
print(len(non_sig_bqtl_in_sig_lcl['chr_snp_pos'].value_counts()))
print(len(non_sig_bqtl_in_non_sig_lcl['chr_snp_pos'].value_counts()))

760
5573


In [77]:
1318229 - 15817

1302412

In [47]:
b_qtl_lcl = {'sig_bqtl':[436, 2165, 2601],
             'non_sig_bqtl': [1196 - 436, 7738-2165, 8934-2601],
             'total_bqtl':[436 + 760, 7738, 8934],
            #  'ratio': [(len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())/len(non_sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())),
            #            (len(sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts())/len(non_sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts()))]
                       }

In [49]:
b_qtl_lcl = pd.DataFrame(data=b_qtl_lcl, index=['sig_lcl', 'non_sig_lcl', 'total_lcl']).round({'ratio':3})

In [53]:
b_qtl_lcl

Unnamed: 0,sig_bqtl,non_sig_bqtl,total_bqtl
sig_lcl,436,760,1196
non_sig_lcl,2165,5573,7738
total_lcl,2601,6333,8934


In [52]:
data = b_qtl_lcl.iloc[:2,:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.4767400024310198 3.771509023089699e-09


In [72]:
b_qtl_lcl = {'sig_bqtl':[28, 122, 150],
             'non_sig_bqtl': [1206 - 28, 8117-122, 1178+ 7995],
             'total_bqtl':[1206, 8117, 9323],
            #  'ratio': [(len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())/len(non_sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())),
            #            (len(sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts())/len(non_sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts()))]
                       }
b_qtl_lcl = pd.DataFrame(data=b_qtl_lcl, index=['sig_lcl', 'non_sig_lcl', 'total_lcl']).round({'ratio':3})
b_qtl_lcl

Unnamed: 0,sig_bqtl,non_sig_bqtl,total_bqtl
sig_lcl,28,1178,1206
non_sig_lcl,122,7995,8117
total_lcl,150,9173,9323


In [73]:
data = b_qtl_lcl.iloc[:2,:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.557655375880208 0.04835715196319314


In [62]:
cd4_qtl_lcl = {'sig_cd4qtl':[1127, 5731, 6858],
             'non_sig_cd4qtl': [1992 - 1127, 11753-5731, 865+6022],
             'total_cd4qtl':[1992, 11753, 12945]}

cd4_qtl_lcl = pd.DataFrame(data=cd4_qtl_lcl, index=['sig_lcl', 'non_sig_lcl', 'total_lcl']).round({'ratio':3})

In [63]:
cd4_qtl_lcl

Unnamed: 0,sig_cd4qtl,non_sig_cd4qtl,total_cd4qtl
sig_lcl,1127,865,1992
non_sig_lcl,5731,6022,11753
total_lcl,6858,6887,12945


In [64]:
data = cd4_qtl_lcl.iloc[:2,:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.3690463486786697 1.1037774838388138e-10


In [68]:
cd4_qtl_lcl = {'sig_cd4qtl':[499, 3198, 3697],
             'non_sig_cd4qtl': [1364 - 499, 9220-3198, 3697+10584],
             'total_cd4qtl':[1992, 11753, 12945]}

cd4_qtl_lcl = pd.DataFrame(data=cd4_qtl_lcl, index=['sig_lcl', 'non_sig_lcl', 'total_lcl']).round({'ratio':3})
cd4_qtl_lcl

Unnamed: 0,sig_cd4qtl,non_sig_cd4qtl,total_cd4qtl
sig_lcl,499,865,1992
non_sig_lcl,3198,6022,11753
total_lcl,3697,14281,12945


In [69]:
data = cd4_qtl_lcl.iloc[:2,:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.0862923720388826 0.17106075214300348


In [None]:
b_qtl_enrichment_lcl = {'sig_bqtl': [436, 2165],
                        'total_bqtl': [1196, 7738],
                        'ratio': [(436/1196), (2165/7738)]}

In [None]:
b_qtl_enrichment_lcl =pd.DataFrame(data=b_qtl_enrichment_lcl, index=['sig_lcl', 'non_sig_lcl']).round({'ratio':2})
b_qtl_enrichment_lcl

Unnamed: 0,sig_bqtl,total_bqtl,ratio
sig_lcl,436,1196,0.36
non_sig_lcl,2165,7738,0.28


In [None]:
228/952

0.23949579831932774

In [None]:
data = b_qtl_enrichment_lcl.iloc[:,0:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.3029451520464674 2.0832426597131646e-05


In [None]:
cd4_qtl_enrichment_lcl = {'sig_cd4_qtl': [730, 4223],
                          'all_cd4_qtl': [1284, 8555],
                          'ratio':[730/1284, 4223/8555]}

In [None]:
cd4_qtl_enrichment_lcl = pd.DataFrame(data=cd4_qtl_enrichment_lcl, index=['sig_lcl', 'non_sig_lcl']).round({'ratio':2})
cd4_qtl_enrichment_lcl

Unnamed: 0,sig_cd4_qtl,all_cd4_qtl,ratio
sig_lcl,730,1284,0.57
non_sig_lcl,4223,8555,0.49


In [None]:
data = cd4_qtl_enrichment_lcl.iloc[:,0:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.1517461490738672 0.005194079001015332


significant bcell snps out of cd4 results in lcl mpra

- eqtl effecten, niet in bcellen maar andere celtypen
- mesh r -> bepaalde qtl effecten gedeeld worden tussen celtypen
- verwachting 20 -> 36% minder in qtls die niet in bcellen gevonden worden
- eQTL, niet in bcellen, maar wel tenminste in 1 ander celtype
    - bcell -> snps -> uit cd4 resultaten. 
    - zelfde berekening

***

In [31]:
cd4_qtl = pd.read_csv('eQTL\WMA_meta_CD4_T_qtl_results_fastApprox.txt', sep='\t')
sig_cd4_qtl = pd.read_csv('data/cd4_qtl.csv', sep='\t')

In [None]:
print(len(cd4_qtl))
print(len(sig_cd4_qtl))

7781522
199627


In [32]:
sig_cd4_qtl = sig_cd4_qtl.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
sig_cd4_qtl = sig_cd4_qtl.astype({'snp_position': 'str'})
sig_cd4_qtl['snp_chromosome'] = 'chr' + sig_cd4_qtl['snp_chromosome']
sig_cd4_qtl['chr_snp_pos'] = sig_cd4_qtl['snp_chromosome'] + '_' +  sig_cd4_qtl['snp_position']

In [None]:
sig_cd4_qtl['empirical_feature_p_value'][len(sig_cd4_qtl)-1]

0.0258057636183491

In [136]:
cd4_qtl = cd4_qtl.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
cd4_qtl = cd4_qtl.astype({'snp_position': 'str'})
cd4_qtl['snp_chromosome'] = 'chr' + cd4_qtl['snp_chromosome']
cd4_qtl['chr_snp_pos'] = cd4_qtl['snp_chromosome'] + '_' +  cd4_qtl['snp_position']

In [None]:
cd4_qtl['chr_snp_pos'].value_counts()

chr6_31326454     105
chr6_31252277     105
chr6_32254019     102
chr6_31457941     102
chr6_31101839      99
                 ... 
chr6_69304099       1
chr6_69568655       1
chr6_69149706       1
chr6_69149246       1
chr6_121432797      1
Name: chr_snp_pos, Length: 1652543, dtype: int64

In [None]:
len(cd4_qtl)

7781522

In [None]:
non_sig_cd4_qtl = cd4_qtl[cd4_qtl['empirical_feature_p_value']> sig_cd4_qtl['empirical_feature_p_value'][len(sig_cd4_qtl)-1]]
#non_sig_cd4_qtl

In [None]:
non_sig_cd4_qtl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
199627,6550050,ZNF638,2:71367738:CT:C,0.002652,-0.191855,0.341871,0.025806,2,71276561,71435069,...,71367738,CT,1,0.092105,0.261092,2:71367738:CT:C-ZNF638,-3.005462e+00,217721,-2.915124,chr2_71367738
199628,6526347,ZNF638,2:71299730:T:G,0.002761,-0.208300,0.249569,0.025806,2,71276561,71435069,...,71299730,T,1,0.131579,0.492859,2:71299730:T:G-ZNF638,-2.993150e+00,217721,-2.675423,chr2_71299730
199629,6525944,ZNF638,2:71366363:GA:G,0.002775,-0.210579,0.250673,0.025806,2,71276561,71435069,...,71366363,GA,1,0.131579,0.492859,2:71366363:GA:G-ZNF638,-2.991671e+00,217721,-2.680057,chr2_71366363
199630,6528217,ZNF638,2:71315825:C:T,0.002782,-0.203392,0.250604,0.025806,2,71276561,71435069,...,71315825,C,1,0.131579,0.492859,2:71315825:C:T-ZNF638,-2.990833e+00,217721,-2.661877,chr2_71315825
199631,6525896,ZNF638,2:71348340:A:T,0.002799,-0.210176,0.250076,0.025806,2,71276561,71435069,...,71348340,A,1,0.131579,0.492859,2:71348340:A:T-ZNF638,-2.988996e+00,217721,-2.673386,chr2_71348340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7781517,8579753,HSF2,6:121687706:A:C,1.000000,0.149612,0.249112,1.000000,6,122399551,122433119,...,121687706,A,1,0.486842,1.000000,6:121687706:A:C-HSF2,-3.775919e-07,217721,0.066883,chr6_121687706
7781518,2544097,MESD,15:80830629:T:A,0.999450,-0.523612,0.259139,1.000000,15,80946289,80989828,...,80830629,T,1,0.276316,0.691117,15:80830629:T:A-MESD,-6.899242e-04,217721,0.083578,chr15_80830629
7781519,2544095,MESD,15:80831169:T:C,0.999974,-0.523514,0.259068,1.000000,15,80946289,80989828,...,80831169,T,1,0.276316,0.691117,15:80831169:T:C-MESD,3.284134e-05,217721,0.084264,chr15_80831169
7781520,2544096,MESD,15:80830968:T:C,0.999974,-0.523514,0.259068,1.000000,15,80946289,80989828,...,80830968,T,1,0.276316,0.691117,15:80830968:T:C-MESD,3.284134e-05,217721,0.084264,chr15_80830968


#### cd4 in lcl

In [None]:
sig_cd4_qtl_in_lcl = sig_cd4_qtl[sig_cd4_qtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
sig_cd4_qtl_in_sig_lcl = sig_cd4_qtl[sig_cd4_qtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
sig_cd4_qtl_in_non_sig_lcl = sig_cd4_qtl[sig_cd4_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [None]:
non_sig_cd4_qtl_in_lcl = non_sig_cd4_qtl[non_sig_cd4_qtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
non_sig_cd4_qtl_in_sig_lcl = non_sig_cd4_qtl[non_sig_cd4_qtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
non_sig_cd4_qtl_in_non_sig_lcl = non_sig_cd4_qtl[non_sig_cd4_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [None]:
cd4_qtl_in_lcl = cd4_qtl[cd4_qtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
cd4_qtl_in_sig_lcl = cd4_qtl[cd4_qtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
cd4_qtl_in_non_sig_lcl = cd4_qtl[cd4_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [None]:
#cd4_qtl_in_non_sig_lcl = cd4_qtl[cd4_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]
#cd4_qtl_in_non_sig_lcl

In [None]:
#{4953 - 730}

In [None]:
print(f'significant cd4 qtls overlap with all lcl snps: {len(sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'significant cd4 qtls overlap with significant lcl snps: {len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'significant cd4 qtls overlap with non significant lcl snps: {len(sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')


significant cd4 qtls overlap with all lcl snps: 4953
significant cd4 qtls overlap with significant lcl snps: 730
significant cd4 qtls overlap with non significant lcl snps: 4223


In [None]:
print(f'non significant cd4qtls overlap with all lcl snps: {len(non_sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'non significant cd4qtls overlap with significant lcl snps: {len(non_sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'non significant cd4 qtls overlap with non significant lcl snps: {len(non_sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')


non significant cd4qtls overlap with all lcl snps: 9791
non significant cd4qtls overlap with significant lcl snps: 1278
non significant cd4 qtls overlap with non significant lcl snps: 8513


In [None]:
print(f'Total cd4qtls that overlap with all lcl snps: {len(cd4_qtl_in_lcl.chr_snp_pos.value_counts())}')
print(f'Total cd4qtls that overlap with significant lcl snps: {len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'total cd4 qtls overlap with non significant lcl snps: {len(cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())}')


Total cd4qtls that overlap with all lcl snps: 9839
Total cd4qtls that overlap with significant lcl snps: 1284
total cd4 qtls overlap with non significant lcl snps: 8555


In [None]:
cd4_qtl_lcl = {'sig_cd4_qtl':[len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()),len(sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())],
               'total_cd4_qtl':[len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()), len(cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())],
                #'non_sig_cd4_qtl':[len(non_sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()), len(non_sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())],
                'ratio': [(len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())/len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())),
                          (len(sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())/len(cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts()))] }

In [None]:
cd4_qtl_lcl = pd.DataFrame(data=cd4_qtl_lcl, index=['sig_lcl', 'non_sig_lcl']).round({'ratio':3})
cd4_qtl_lcl

Unnamed: 0,sig_cd4_qtl,total_cd4_qtl,ratio
sig_lcl,730,1284,0.569
non_sig_lcl,4223,8555,0.494


Total cd4qtls that overlap with significant lcl snps - ()

In [None]:
1284 - 428

856

In [None]:
# print(f'Total cd4qtls that overlap with all lcl snps: {len(cd4_qtl_in_lcl.chr_snp_pos.value_counts())} \t\t Total cd4qtls that overlap with significant lcl snps: {len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())} \t non significant {len(cd4_qtl_in_lcl.chr_snp_pos.value_counts()) -len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())}')
# print(f'significant cd4qtls overlap with all lcl snps: {len(sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts())} \t\t significant cd4qtls overlap with significant lcl snps: {len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())} \t non significant{4953 - 730}')
# print(f'non-significant cd4qtls that overlap with all lcl snps: {len(cd4_qtl_in_lcl.chr_snp_pos.value_counts()) - len(sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts())} \t non-significant cd4qtls that overlap with all lcl snps: {len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()) - len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts())} \t non significant{ 4886-554}')

In [None]:
print(f'Total bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts())} \t\t Total bqtls that overlap with significant lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'significant bqtls overlap with all lcl snps: {len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t\t significant bqtls overlap with significant lcl snps: {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'non-significant bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts()) - len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t non-significant bqtls that overlap with all lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()) - len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')

Total bqtls that overlap with all lcl snps: 8934 		 Total bqtls that overlap with significant lcl snps: 1196
significant bqtls overlap with all lcl snps: 2601 		 significant bqtls overlap with significant lcl snps: 436
non-significant bqtls that overlap with all lcl snps: 6333 	 non-significant bqtls that overlap with all lcl snps: 760


In [None]:
2601 - 436

2165

In [None]:
# 1284 - (730/302)

In [None]:
302/856

0.352803738317757

In [None]:
2148/ 6480

0.3314814814814815

In [None]:
len(sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts())

4223

In [None]:
cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts()

chr11_65181084     46
chr11_65179808     46
chr11_65183372     46
chr11_65183499     46
chr11_65179769     46
                   ..
chr10_100568490     1
chr10_100568513     1
chr10_100559435     1
chr10_100559989     1
chr17_35426661      1
Name: chr_snp_pos, Length: 8555, dtype: int64

In [None]:
print(f'ratio sig cd4 - lcl\t\t {len(sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts()) / len(cd4_qtl_in_lcl.chr_snp_pos.value_counts()):.2f}')
print(f'ratio sig cd4 - sig lcl\t\t {len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()) / len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()):.2f}')
print(f'ratio sig cd4 - non sig lcl\t {len(sig_cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts()) / len(cd4_qtl_in_non_sig_lcl.chr_snp_pos.value_counts()):.2f}')

print('\n')
print(f'ratio sig b - lcl \t\t {len(sig_bqtls_in_lcl.chr_snp_pos.value_counts()) / len(bqtl_in_lcl.chr_snp_pos.value_counts()):.2f}')
print(f'ratio sig b - sig lcl \t\t {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts()) / len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()):.2f}')
print(f'ratio sig b - non sig lcl \t {len(sig_bqtl_in_non_sig_lcl.chr_snp_pos.value_counts()) / len(bqtl_in_non_sig_lcl.chr_snp_pos.value_counts()):.2f}')
#print(f'{ 2165 / 6938:.2f}')



ratio sig cd4 - lcl		 0.50
ratio sig cd4 - sig lcl		 0.57
ratio sig cd4 - non sig lcl	 0.49


ratio sig b - lcl 		 0.29
ratio sig b - sig lcl 		 0.36
ratio sig b - non sig lcl 	 0.28


In [None]:
len(bqtl_in_sig_lcl.chr_snp_pos.value_counts())

1196

je vind meer qtl effecten -> meer overlap

In [None]:
len(cd4_qtl[cd4_qtl['snp_id'].isin(sig_cd4_qtl.snp_id)])

1481115

In [None]:
len(cd4_qtl[cd4_qtl['chr_snp_pos'].isin(sig_cd4_qtl.chr_snp_pos)])

1488987

In [None]:
len(sig_cd4_qtl.snp_id)

199627

In [None]:
8555-(4223-2148)

6480

b resultaten uit de cd4 t cellen

In [None]:
2148/6480

0.3314814814814815

In [None]:
sig_cd4_qtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,chr12_56007301
1,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826,chr22_42074313
2,SMDT1,22:42080766:A:T,4.780513e-59,1.449479,0.272295,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080766,A,1,0.342105,1.000000,22:42080766:A:T-SMDT1,16.203283,217721,16.682376,chr22_42080766
3,SMDT1,22:42080750:A:C,4.832899e-59,1.447290,0.271892,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,16.202613,217721,16.681792,chr22_42080750
4,SMDT1,22:42078134:C:G,4.860768e-59,1.445682,0.271640,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42078134,C,1,0.342105,1.000000,22:42078134:C:G-SMDT1,16.202260,217721,16.680695,chr22_42078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160013969,C,1,0.171053,1.000000,1:160013969:C:T-PEA15,3.240138,217721,2.913457,chr1_160013969
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160015506,GA,1,0.184211,1.000000,1:160015506:GA:G-PEA15,3.239825,217721,2.927943,chr1_160015506
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160027419,C,1,0.184211,1.000000,1:160027419:C:G-PEA15,3.235851,217721,2.932173,chr1_160027419
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160022992,T,1,0.184211,1.000000,1:160022992:T:C-PEA15,3.235851,217721,2.932173,chr1_160022992


In [None]:
sig_cd4_qtl_unique = sig_cd4_qtl[~sig_cd4_qtl['chr_snp_pos'].isin(sig_bqtl.chr_snp_pos)]
sig_cd4_qtl_unique

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
101,SH3YL1,2:224919:A:G,4.487579e-56,1.373035,0.257340,5.360115e-54,2,217730,266398,ENSG00000035115,...,224919,A,1,0.342105,1.0,2:224919:A:G-SH3YL1,15.776890,217721,16.108700,chr2_224919
102,SH3YL1,2:242426:A:G,4.964040e-56,1.403399,0.263155,5.360115e-54,2,217730,266398,ENSG00000035115,...,242426,A,1,0.342105,1.0,2:242426:A:G-SH3YL1,15.770519,217721,16.102613,chr2_242426
103,SH3YL1,2:239969:C:T,4.982439e-56,1.403082,0.263122,5.360115e-54,2,217730,266398,ENSG00000035115,...,239969,C,1,0.342105,1.0,2:239969:C:T-SH3YL1,15.770285,217721,16.102477,chr2_239969
104,SH3YL1,2:242132:G:A,4.995529e-56,1.403059,0.263114,5.360115e-54,2,217730,266398,ENSG00000035115,...,242132,G,1,0.342105,1.0,2:242132:G:A-SH3YL1,15.770119,217721,16.101949,chr2_242132
105,SH3YL1,2:239597:C:G,4.996765e-56,1.403082,0.263122,5.360115e-54,2,217730,266398,ENSG00000035115,...,239597,C,1,0.342105,1.0,2:239597:C:G-SH3YL1,15.770104,217721,16.102111,chr2_239597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160013969,C,1,0.171053,1.0,1:160013969:C:T-PEA15,3.240138,217721,2.913457,chr1_160013969
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160015506,GA,1,0.184211,1.0,1:160015506:GA:G-PEA15,3.239825,217721,2.927943,chr1_160015506
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160027419,C,1,0.184211,1.0,1:160027419:C:G-PEA15,3.235851,217721,2.932173,chr1_160027419
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160022992,T,1,0.184211,1.0,1:160022992:T:C-PEA15,3.235851,217721,2.932173,chr1_160022992


In [None]:
sig_cd4_qtl_unique.chr_snp_pos.value_counts()

chr6_30485012     8
chr11_65822572    7
chr6_30220994     7
chr6_30652239     7
chr6_30445254     7
                 ..
chr7_6419629      1
chr7_6427820      1
chr11_62659506    1
chr4_4326772      1
chr1_160021673    1
Name: chr_snp_pos, Length: 144036, dtype: int64

In [None]:
overlap_sig_b_sig_cd4 = sig_bqtl[sig_bqtl['snp_id'].isin(sig_cd4_qtl.snp_id)]


In [None]:
overlap_sig_b_sig_cd4

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365,12.408954,chr12_56007301
1,SMDT1,22:42123461:A:AT,4.472720e-20,1.006116,0.252152,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42123461,A,1,0.315789,1.000000,22:42123461:A:AT-SMDT1,9.176031,211365,9.403753,chr22_42123461
2,SMDT1,22:42119191:GAGAT:G,4.763233e-20,1.078521,0.252800,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42119191,GAGAT,1,0.342105,1.000000,22:42119191:GAGAT:G-SMDT1,9.169250,211365,9.320564,chr22_42119191
3,SMDT1,22:42092156:C:A,4.933741e-20,1.073046,0.253754,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42092156,C,1,0.342105,1.000000,22:42092156:C:A-SMDT1,9.165457,211365,9.320889,chr22_42092156
4,SMDT1,22:42094636:CAAA:C,5.097736e-20,1.063314,0.251781,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42094636,CAAA,1,0.342105,1.000000,22:42094636:CAAA:C-SMDT1,9.161930,211365,9.318367,chr22_42094636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20011,TUFM,16:28892442:CTG:C,9.915560e-04,0.881049,0.314535,5.637180e-03,16,28842411,28846348,ENSG00000178952,...,28892442,CTG,1,0.381579,0.490409,16:28892442:CTG:C-TUFM,3.292912,211365,3.366263,chr16_28892442
20012,TUFM,16:28892446:AAC:A,9.915560e-04,0.881049,0.314535,5.637180e-03,16,28842411,28846348,ENSG00000178952,...,28892446,AAC,1,0.381579,0.490409,16:28892446:AAC:A-TUFM,3.292912,211365,3.366263,chr16_28892446
20013,HLA-C,6:31478769:T:C,1.863080e-03,-0.390286,0.236738,5.640952e-03,6,31268749,31272130,ENSG00000204525,...,31478769,T,1,0.355263,1.000000,6:31478769:T:C-HLA-C,-3.111233,211365,-3.108471,chr6_31478769
20014,HLA-C,6:31516255:T:C,1.864925e-03,-0.751529,0.316263,5.643625e-03,6,31268749,31272130,ENSG00000204525,...,31516255,T,1,0.184211,1.000000,6:31516255:T:C-HLA-C,-3.110940,211365,-2.973853,chr6_31516255


In [None]:
overlap_sig_cd4_sig_b = sig_cd4_qtl[sig_cd4_qtl['chr_snp_pos'].isin(sig_bqtl.chr_snp_pos)]

In [None]:
overlap_sig_cd4_sig_b['chr_snp_pos'].value_counts()

chr6_31457941    10
chr6_31326454     9
chr6_31711840     9
chr6_31562859     8
chr6_32268789     8
                 ..
chr6_31258330     1
chr6_31257931     1
chr6_31257515     1
chr6_31257316     1
chr6_32426536     1
Name: chr_snp_pos, Length: 11068, dtype: int64

In [None]:
print(len(sig_cd4_qtl[~sig_cd4_qtl['chr_snp_pos'].isin(sig_bqtl.chr_snp_pos)]['chr_snp_pos'].value_counts()))
print(len(sig_bqtl[~sig_bqtl['chr_snp_pos'].isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts()))
print(len(sig_cd4_qtl['chr_snp_pos'].value_counts()))
print(len(sig_bqtl['chr_snp_pos'].value_counts()))

144036
4749
155104
15817


In [None]:
overlap_sig_b_sig_cd4['c'].value_counts()

6:32685359:A:G     3
6:32685900:G:T     3
6:32680358:C:T     3
6:32680359:A:G     3
6:32674599:A:T     3
                  ..
6:32303666:C:G     1
6:32303897:A:T     1
6:32285069:A:AT    1
6:32303098:C:T     1
6:31396930:G:A     1
Name: snp_id, Length: 11080, dtype: int64

In [None]:
sig_b_qtl_unique = sig_bqtl[~sig_bqtl['chr_snp_pos'].isin(sig_cd4_qtl.chr_snp_pos)]
sig_b_qtl_unique

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
206,HLA-DQB1,6:32660413:T:C,5.482612e-18,-1.517662,0.334129,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32660413,T,1,0.131579,0.492859,6:32660413:T:C-HLA-DQB1,-8.642854,211365.1,-8.879585,chr6_32660413
207,HLA-DQB1,6:32663148:A:T,6.251302e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663148,A,1,0.131579,0.492859,6:32663148:A:T-HLA-DQB1,-8.627855,211365.1,-8.852898,chr6_32663148
208,HLA-DQB1,6:32663172:A:G,6.251302e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663172,A,1,0.131579,0.492859,6:32663172:A:G-HLA-DQB1,-8.627855,211365.1,-8.852898,chr6_32663172
209,HLA-DQB1,6:32663167:G:A,6.292160e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663167,G,1,0.131579,0.492859,6:32663167:G:A-HLA-DQB1,-8.627110,211365.1,-8.851912,chr6_32663167
210,HLA-DQB1,6:32659750:C:A,6.607150e-18,-1.508293,0.333583,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32659750,C,1,0.131579,0.492859,6:32659750:C:A-HLA-DQB1,-8.621519,211365.1,-8.849241,chr6_32659750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20003,HLA-DRA,6:32661378:C:A,4.750873e-04,-0.375719,0.193752,5.602038e-03,6,32439878,32445046,ENSG00000204287,...,32661378,C,1,0.131579,0.492859,6:32661378:C:A-HLA-DRA,-3.494423,211365.1,-3.511569,chr6_32661378
20005,DDX17,22:38594733:C:T,5.309879e-04,-0.384871,0.157286,5.604411e-03,22,38483438,38507660,ENSG00000100201,...,38594733,C,1,0.394737,0.042608,22:38594733:C:T-DDX17,-3.464617,211365.1,-3.220773,chr22_38594733
20007,RPS18,6:33151135:A:G,1.886131e-04,0.344654,0.164863,5.634371e-03,6,33272075,33276511,ENSG00000231500,...,33151135,A,1,0.144737,0.569699,6:33151135:A:G-RPS18,3.733799,211365.1,3.664862,chr6_33151135
20008,DDX17,22:38597365:C:T,5.382164e-04,-0.366285,0.151527,5.636670e-03,22,38483438,38507660,ENSG00000100201,...,38597365,C,1,0.394737,0.042608,22:38597365:C:T-DDX17,-3.460978,211365.1,-3.219666,chr22_38597365


In [None]:
sig_b_qtl_unique.chr_snp_pos.value_counts()

chr6_32702410     3
chr6_32702419     3
chr6_32660413     2
chr6_32702013     2
chr6_32705155     2
                 ..
chr6_32722767     1
chr6_32704110     1
chr15_61700267    1
chr15_61700300    1
chr3_196979325    1
Name: chr_snp_pos, Length: 4749, dtype: int64

In [None]:
# unique_sig_b_sig_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(sig_lcl.chr_snp_pos)]
# unique_sig_b_non_sig_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(non_sig_lcl.chr_snp_pos)]
# unique_sig_b_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(final_lcl_positions.chr_snp_pos)]

In [None]:
# cd4_qtl[~cd4_qtl['chr_snp_pos'].isin(sig_bqtl.chr_snp_pos)]

In [None]:
# cd4_qtl.info()

In [None]:
sig_bqtl = sig_bqtl.astype({'snp_position': 'int64', 'weight': 'int64'})

In [None]:
sig_bqtl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20017 entries, 0 to 20016
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   feature_id                 20017 non-null  object 
 1   snp_id                     20017 non-null  object 
 2   p_value                    20017 non-null  float64
 3   beta                       20017 non-null  float64
 4   beta_se                    20017 non-null  float64
 5   empirical_feature_p_value  20017 non-null  float64
 6   feature_chromosome         20017 non-null  int64  
 7   feature_start              20017 non-null  int64  
 8   feature_end                20017 non-null  int64  
 9   ENSG                       20017 non-null  object 
 10  biotype                    20017 non-null  object 
 11  n_samples                  20017 non-null  int64  
 12  n_e_samples                20017 non-null  int64  
 13  alpha_param                0 non-null      flo

In [None]:
cd4_qtl = cd4_qtl.drop(['V1'], axis=1)

In [None]:
sig_bqtl['snp_position'].value_counts()

45857379     4
32693360     4
31867824     4
32692662     4
45933245     4
            ..
76842550     1
76807324     1
76576323     1
76578657     1
196979325    1
Name: snp_position, Length: 15815, dtype: int64

In [None]:
sig_bqtl.chr_snp_pos.value_counts()

chr17_45857379    4
chr6_31867824     4
chr17_45786127    4
chr6_32693360     4
chr6_32692662     4
                 ..
chr15_76812515    1
chr15_76842550    1
chr15_76807324    1
chr15_76576323    1
chr3_196979325    1
Name: chr_snp_pos, Length: 15817, dtype: int64

In [None]:
sig_cd4_qtl.chr_snp_pos.value_counts()

chr6_31457941     10
chr6_31326454      9
chr6_31711840      9
chr6_32268789      8
chr6_31562859      8
                  ..
chr13_75791007     1
chr13_75790946     1
chr13_75795610     1
chr13_75802993     1
chr1_160021673     1
Name: chr_snp_pos, Length: 155104, dtype: int64

In [None]:
sig_bqtl[sig_bqtl['snp_position'] == '45857379']

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos


In [None]:
cd4_qtl[cd4_qtl['snp_position'] == '45857379']

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
1784,KANSL1-AS1,17:45857379:GTT:G,3.971514e-49,-1.401485,0.273046,8.080704999999999e-49,17,46193566,46197842,ENSG00000214401,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-KANSL1-AS1,-14.73276,217721,-15.094948,chr17_45857379
8153,KANSL1,17:45857379:GTT:G,1.9291640000000002e-25,-1.107034,0.202017,3.860711e-25,17,46029916,46225389,ENSG00000120071,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-KANSL1,-10.423881,217721,-9.885703,chr17_45857379
12425,KANSL1-AS1,17:45857379:GT:G,3.155421e-20,0.994706,0.257094,4.0184699999999996e-20,17,46193566,46197842,ENSG00000214401,...,45857379,GT,1,0.460526,0.52802,17:45857379:GT:G-KANSL1-AS1,9.21354,217721,9.551112,chr17_45857379
13815,KANSL1,17:45857379:GT:G,4.482796e-19,0.701559,0.191047,6.197174999999999e-19,17,46029916,46225389,ENSG00000120071,...,45857379,GT,1,0.460526,0.52802,17:45857379:GT:G-KANSL1,8.924361,217721,8.800227,chr17_45857379
22269,FAM215B,17:45857379:GT:G,7.261873e-15,0.200282,0.13025,2.023736e-12,17,46558830,46562795,ENSG00000232300,...,45857379,GT,1,0.460526,0.52802,17:45857379:GT:G-FAM215B,7.779836,217721,8.354175,chr17_45857379
27012,FAM215B,17:45857379:GTT:G,1.857524e-11,-0.074496,0.138332,3.544656e-11,17,46558830,46562795,ENSG00000232300,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-FAM215B,-6.716805,217721,-7.379311,chr17_45857379
139482,ARL17A,17:45857379:GTT:G,0.003930257,0.42296,0.201678,0.006097383,17,46516702,46579691,ENSG00000185829,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-ARL17A,2.883706,217721,3.052141,chr17_45857379
324746,HEXIM1,17:45857379:GT:G,0.01866293,-0.1512,0.19129,0.08915191,17,45148475,45152099,ENSG00000186834,...,45857379,GT,1,0.460526,0.52802,17:45857379:GT:G-HEXIM1,-2.352196,217721,-1.979273,chr17_45857379
326482,HEXIM1,17:45857379:GTT:G,0.05510528,0.304687,0.203159,0.08915191,17,45148475,45152099,ENSG00000186834,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-HEXIM1,1.918045,217721,1.593218,chr17_45857379
481919,NMT1,17:45857379:GTT:G,0.1040561,0.471149,0.175555,0.172298,17,44957992,45109016,ENSG00000136448,...,45857379,GTT,1,0.25,0.664641,17:45857379:GTT:G-NMT1,1.6255,217721,1.01158,chr17_45857379


In [None]:
top_cd4_qtl = pd.read_csv('eQTL\WMA_meta_top_CD4_T_qtl_results_fastApprox.txt', sep ='\t')

In [None]:
top_cd4_qtl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,Global_pvalue
0,1551594,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,1.937548e-54
1,5727671,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,...,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826,4.436436e-54
2,5805288,SH3YL1,2:224919:A:G,4.487579e-56,1.373035,0.257340,5.360115e-54,2,217730,266398,...,224919,A,1,0.342105,1.000000,2:224919:A:G-SH3YL1,15.776890,217721,16.108700,8.931738e-51
3,2364069,GABPB1-AS1,15:50317890:C:A,9.554753e-55,-1.305453,0.250183,4.022706e-52,15,50354944,50372202,...,50317890,C,1,0.302632,0.442799,15:50317890:C:A-GABPB1-AS1,-15.582626,217721,-15.909857,5.027376e-49
4,8318608,ERAP2,5:96916728:G:A,5.016080e-54,-0.891033,0.171425,4.288749e-51,5,96875986,96919703,...,96916728,G,1,0.421053,0.504386,5:96916728:G:A-ERAP2,-15.476286,217721,-15.589916,4.287891e-48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,4790418,RABIF,1:203071170:ATT:A,1.766595e-02,0.842398,0.368235,9.999382e-01,1,202878282,202889149,...,203071170,ATT,1,0.210526,0.169695,1:203071170:ATT:A-RABIF,2.372546,217721,2.305738,9.999773e-01
4995,1672822,CCT2,12:70569606:G:A,6.616220e-03,0.752566,0.277824,9.999455e-01,12,69585426,69601570,...,70569606,G,1,0.197368,1.000000,12:70569606:G:A-CCT2,2.715568,217721,2.578182,9.999773e-01
4996,5484641,SERINC3,20:44136440:C:CCAT,8.225898e-03,0.457268,0.161446,9.999479e-01,20,44496221,44522085,...,44136440,C,1,0.447368,0.753494,20:44136440:C:CCAT-SERINC3,2.642654,217721,2.108840,9.999773e-01
4997,9922937,PLEKHA2,8:38741700:G:A,2.423626e-02,-0.687195,0.371474,9.999586e-01,8,38901235,38973912,...,38741700,G,1,0.105263,1.000000,8:38741700:G:A-PLEKHA2,-2.253363,217721,-2.202876,9.999773e-01


In [7]:
top_b_qtl = pd.read_csv('eQTL\WMA_meta_top_B_qtl_results_fastApprox.txt', sep='\t')

In [None]:
top_b_qtl

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,Global_pvalue
0,470835,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,5.029380e-28
1,1991637,SMDT1,22:42123461:A:AT,4.472720e-20,1.006116,0.252152,6.857645e-19,22,42079691,42084284,...,42123461,A,1,0.315789,1.000000,22:42123461:A:AT-SMDT1,9.176031,211365.1,9.403753,7.553696e-16
2,3657825,RPL8,8:144786875:G:A,9.394820e-21,0.636211,0.172559,2.599743e-18,8,144789765,144792587,...,144786875,G,1,0.394737,1.000000,8:144786875:G:A-RPL8,9.342655,211365.1,9.462664,1.909078e-15
3,1187421,EIF5A,17:7312217:T:C,9.416636e-21,0.966808,0.221549,4.903394e-18,17,7306999,7312463,...,7312217,T,1,0.421053,1.000000,17:7312217:T:C-EIF5A,9.342409,211365.1,9.365777,2.700544e-15
4,824517,GABPB1-AS1,15:50319839:A:AAAAT,2.628243e-18,-1.455068,0.260223,6.060572e-16,15,50354944,50372202,...,50319839,A,1,0.289474,0.128919,15:50319839:A:AAAAT-GABPB1-AS1,-8.726436,211365.1,-8.624820,2.670288e-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2198,1215001,SLC25A39,17:44857691:T:C,1.169629e-02,-0.777534,0.345729,9.997343e-01,17,44319625,44324870,...,44857691,T,1,0.197368,1.000000,17:44857691:T:C-SLC25A39,-2.521177,211365.1,-2.235546,9.999943e-01
2199,782512,SIVA1,14:104099782:C:T,2.562218e-02,0.269837,0.155489,9.998146e-01,14,104753147,104768494,...,104099782,C,1,0.315789,0.271959,14:104099782:C:T-SIVA1,2.231891,211365.1,2.139372,9.999943e-01
2200,2504376,TFRC,3:196698554:C:CA,9.289415e-03,0.225621,0.143519,9.999708e-01,3,196012511,196082153,...,196698554,C,1,0.486842,0.204138,3:196698554:C:CA-TFRC,2.601215,211365.1,2.440557,9.999943e-01
2201,1850237,MTF2,1:93916041:C:G,2.488226e-02,0.092230,0.306739,9.999713e-01,1,93079235,93139079,...,93916041,C,1,0.118421,0.414165,1:93916041:C:G-MTF2,-2.243226,211365.1,-2.376586,9.999943e-01


In [8]:
top_b_qtl = top_b_qtl.astype({'snp_position': 'int64', 'snp_chromosome': 'str'})
top_b_qtl = top_b_qtl.astype({'snp_position': 'str'})
top_b_qtl['snp_chromosome'] = 'chr' + top_b_qtl['snp_chromosome']
top_b_qtl['chr_snp_pos'] = top_b_qtl['snp_chromosome'] + '_' +  top_b_qtl['snp_position']

In [None]:
sig_lcl

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
1,rs11585048,chr1,2602648,2602573,2602723,chr1_2602648
3,rs11585844,chr1,37563668,37563593,37563743,chr1_37563668
5,rs11587500,chr1,24190390,24190315,24190465,chr1_24190390
6,rs11588318,chr1,200669534,200669459,200669609,chr1_200669534
...,...,...,...,...,...,...
208,chr20:25529845:D,chr20,25549209,25549134,25549284,chr20_25549209
209,chr21:38345364:I,chr21,36973064,36972989,36973139,chr21_36973064
210,chr21:30327732:D,chr21,28955410,28955335,28955485,chr21_28955410
211,chr22:50310881:D,chr22,49917233,49917158,49917308,chr22_49917233


In [None]:
b_qtl[b_qtl['snp_id']== '12:56007301:G:A']

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,470835,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,chr12_56007301
57964,509309,MYL6,12:56007301:G:A,0.01876291,0.080122,0.282185,0.05764479,12,56158346,56163496,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-MYL6,-2.350208,211365.1,-2.695429,chr12_56007301
412725,479047,CD63,12:56007301:G:A,0.01117293,-0.311587,0.17343,0.41115,12,55725323,55729707,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-CD63,-2.537243,211365.1,-2.328159,chr12_56007301
1378417,494001,BLOC1S1,12:56007301:G:A,0.4946432,-0.180829,0.206181,0.777066,12,55716038,55720087,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-BLOC1S1,-0.682942,211365.1,-0.486787,chr12_56007301
1409997,512437,ATP5F1B,12:56007301:G:A,0.2480961,0.036983,0.220739,0.784976,12,56638175,56645984,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-ATP5F1B,1.154986,211365.1,1.112368,chr12_56007301
1441990,484162,NACA,12:56007301:G:A,0.7339549,0.257119,0.185304,0.7930579,12,56712305,56731628,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-NACA,-0.339869,211365.1,-0.265071,chr12_56007301
1986331,493543,PTGES3,12:56007301:G:A,0.8171223,0.248972,0.278792,0.8891155,12,56663341,56688408,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-PTGES3,-0.231248,211365.1,-0.360126,chr12_56007301
2126844,482889,PA2G4,12:56007301:G:A,0.4796424,0.412721,0.282329,0.9081884,12,56104537,56113910,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-PA2G4,0.706878,211365.1,0.177496,chr12_56007301
2171552,482363,RPL41,12:56007301:G:A,0.6913728,-0.211445,0.141413,0.914223,12,56116590,56117967,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPL41,0.396993,211365.1,0.906158,chr12_56007301
2389803,486290,CNPY2,12:56007301:G:A,0.2161696,0.236857,0.189203,0.9389091,12,56309842,56316119,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-CNPY2,1.236778,211365.1,1.266168,chr12_56007301


In [None]:
sig_bqtl['snp_id'].value_counts()

6:32674599:A:T      3
6:32702419:C:T      3
6:32685900:G:T      3
6:32680359:A:G      3
6:32702410:A:ACG    3
                   ..
15:76842550:T:A     1
15:76807324:A:T     1
15:76576323:C:T     1
15:76578657:G:A     1
3:196979325:T:C     1
Name: snp_id, Length: 15871, dtype: int64

In [None]:
gene_sig_bqtl = sig_bqtl.drop_duplicates(subset='snp_id', keep='first')

In [None]:
sig_bqtl[sig_bqtl['snp_id'] == '6:32702410:A:ACG']

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
2895,HLA-DQB1,6:32702410:A:ACG,1.557463e-07,-1.089485,0.287938,3e-06,6,32659467,32668383,ENSG00000179344,...,32702410,A,1,0.157895,0.205598,6:32702410:A:ACG-HLA-DQB1,-5.245633,211365,-5.126362,chr6_32702410
7475,HLA-DRA,6:32702410:A:ACG,1.083895e-05,-0.362071,0.167236,0.000673,6,32439878,32445046,ENSG00000204287,...,32702410,A,1,0.157895,0.205598,6:32702410:A:ACG-HLA-DRA,-4.399722,211365,-4.476677,chr6_32702410
12269,HLA-DQA1,6:32702410:A:ACG,0.0001989848,0.347302,0.229643,0.001417,6,32628179,32647062,ENSG00000196735,...,32702410,A,1,0.157895,0.205598,6:32702410:A:ACG-HLA-DQA1,3.720302,211365,3.656287,chr6_32702410


In [None]:
gene_sig_bqtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365,12.408954,chr12_56007301
1,SMDT1,22:42123461:A:AT,4.472720e-20,1.006116,0.252152,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42123461,A,1,0.315789,1.000000,22:42123461:A:AT-SMDT1,9.176031,211365,9.403753,chr22_42123461
2,SMDT1,22:42119191:GAGAT:G,4.763233e-20,1.078521,0.252800,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42119191,GAGAT,1,0.342105,1.000000,22:42119191:GAGAT:G-SMDT1,9.169250,211365,9.320564,chr22_42119191
3,SMDT1,22:42092156:C:A,4.933741e-20,1.073046,0.253754,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42092156,C,1,0.342105,1.000000,22:42092156:C:A-SMDT1,9.165457,211365,9.320889,chr22_42092156
4,SMDT1,22:42094636:CAAA:C,5.097736e-20,1.063314,0.251781,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42094636,CAAA,1,0.342105,1.000000,22:42094636:CAAA:C-SMDT1,9.161930,211365,9.318367,chr22_42094636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20012,TUFM,16:28892446:AAC:A,9.915560e-04,0.881049,0.314535,5.637180e-03,16,28842411,28846348,ENSG00000178952,...,28892446,AAC,1,0.381579,0.490409,16:28892446:AAC:A-TUFM,3.292912,211365,3.366263,chr16_28892446
20013,HLA-C,6:31478769:T:C,1.863080e-03,-0.390286,0.236738,5.640952e-03,6,31268749,31272130,ENSG00000204525,...,31478769,T,1,0.355263,1.000000,6:31478769:T:C-HLA-C,-3.111233,211365,-3.108471,chr6_31478769
20014,HLA-C,6:31516255:T:C,1.864925e-03,-0.751529,0.316263,5.643625e-03,6,31268749,31272130,ENSG00000204525,...,31516255,T,1,0.184211,1.000000,6:31516255:T:C-HLA-C,-3.110940,211365,-2.973853,chr6_31516255
20015,HLA-C,6:31396930:G:A,1.865420e-03,0.383531,0.268094,5.643625e-03,6,31268749,31272130,ENSG00000204525,...,31396930,G,1,0.368421,0.507756,6:31396930:G:A-HLA-C,3.110862,211365,3.149272,chr6_31396930


In [None]:
gene_sig_bqtl = gene_sig_bqtl.astype({'snp_position':'int64'})

In [None]:
gene_sig_bqtl[gene_sig_bqtl['snp_position'] == 31457941]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
6071,HLA-C,6:31457941:TAA:T,4e-05,0.577559,0.281976,0.000228,6,31268749,31272130,ENSG00000204525,...,31457941,TAA,1,0.157895,1.0,6:31457941:TAA:T-HLA-C,4.105947,211365,4.156794,chr6_31457941
16741,HLA-C,6:31457941:TA:T,0.000713,-0.920329,0.303,0.00295,6,31268749,31272130,ENSG00000204525,...,31457941,TA,1,0.131579,0.492859,6:31457941:TA:T-HLA-C,-3.384392,211365,-3.168314,chr6_31457941
18720,HLA-C,6:31457941:T:TA,0.001474,-0.41433,0.277169,0.004625,6,31268749,31272130,ENSG00000204525,...,31457941,T,1,0.223684,0.651845,6:31457941:T:TA-HLA-C,-3.179698,211365,-3.199773,chr6_31457941


In [None]:
sig_cd4_qtl['snp_id'].value_counts()

6:30484127:T:C              6
6:31340940:G:A              5
6:31344715:A:G              5
6:31348743:G:A              5
6:31358297:A:T              5
                           ..
7:56146401:G:A              1
7:12210776:G:A              1
17:48718140:CT:C            1
13:49872796:AAAAAAAAAG:A    1
1:160021673:G:A             1
Name: snp_id, Length: 155967, dtype: int64

In [None]:
gene_sig_cd4qtl = sig_cd4_qtl.drop_duplicates(subset='snp_id', keep='first')

In [None]:
gene_sig_cd4qtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,chr12_56007301
1,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826,chr22_42074313
2,SMDT1,22:42080766:A:T,4.780513e-59,1.449479,0.272295,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080766,A,1,0.342105,1.000000,22:42080766:A:T-SMDT1,16.203283,217721,16.682376,chr22_42080766
3,SMDT1,22:42080750:A:C,4.832899e-59,1.447290,0.271892,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,16.202613,217721,16.681792,chr22_42080750
4,SMDT1,22:42078134:C:G,4.860768e-59,1.445682,0.271640,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42078134,C,1,0.342105,1.000000,22:42078134:C:G-SMDT1,16.202260,217721,16.680695,chr22_42078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160013969,C,1,0.171053,1.000000,1:160013969:C:T-PEA15,3.240138,217721,2.913457,chr1_160013969
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160015506,GA,1,0.184211,1.000000,1:160015506:GA:G-PEA15,3.239825,217721,2.927943,chr1_160015506
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160027419,C,1,0.184211,1.000000,1:160027419:C:G-PEA15,3.235851,217721,2.932173,chr1_160027419
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160022992,T,1,0.184211,1.000000,1:160022992:T:C-PEA15,3.235851,217721,2.932173,chr1_160022992


In [None]:
gene_sig_cd4qtl['feature_start'].value_counts()

46193566    3213
29941260    3059
31353872    2504
31268749    2342
11795573    1545
            ... 
668100         1
73490933       1
16197854       1
2887270        1
43369221       1
Name: feature_start, Length: 1238, dtype: int64

In [None]:
# gene_11795573 = gene_sig_cd4qtl[gene_sig_cd4qtl['feature_start'] == 11795573]
# gene_11795573['feature_id'].value_counts

In [None]:
gene_pos_cd4 = list(gene_sig_cd4qtl['feature_start'].unique())
gene_pos_b = list(gene_sig_bqtl['feature_start'].unique())

In [None]:
cd4_qtl_in_sig_lcl[cd4_qtl_in_sig_lcl['feature_start'].isin(gene_pos_cd4)]

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,1551594,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,chr12_56007301
186,8318631,ERAP2,5:96939497:G:A,7.074509e-53,-0.891009,0.171587,5.042134e-51,5,96875986,96919703,...,96939497,G,1,0.421053,0.504386,5:96939497:G:A-ERAP2,-15.305052,217721,-15.449191,chr5_96939497
208,8318648,ERAP2,5:96915971:C:A,1.276454e-52,-0.922925,0.179665,5.652886e-51,5,96875986,96919703,...,96915971,C,1,0.407895,0.737704,5:96915971:C:A-ERAP2,-15.266606,217721,-15.413412,chr5_96915971
225,8318644,ERAP2,5:97014384:G:A,3.595154e-52,0.939216,0.182012,1.229543e-50,5,96875986,96919703,...,97014384,G,1,0.421053,0.504386,5:97014384:G:A-ERAP2,15.198917,217721,15.316779,chr5_97014384
234,8318640,ERAP2,5:97027703:C:T,6.009413e-52,0.963284,0.186423,1.741711e-50,5,96875986,96919703,...,97027703,C,1,0.421053,0.504386,5:97027703:C:T-ERAP2,15.165223,217721,15.294625,chr5_97027703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7708129,3034623,NAA38,17:7234631:C:G,8.495524e-01,-0.137119,0.143667,9.999007e-01,17,7856685,7885238,...,7234631,C,1,0.315789,0.271959,17:7234631:C:G-NAA38,0.189690,217721,0.498794,chr17_7234631
7772906,2116603,MAP4K5,14:51139283:C:G,2.550815e-01,-0.020147,0.246657,9.999950e-01,14,50418501,50561126,...,51139283,C,1,0.421053,0.103485,14:51139283:C:G-MAP4K5,1.138093,217721,1.209535,chr14_51139283
7772907,2116590,MAP4K5,14:51142171:C:G,2.552379e-01,-0.020211,0.246665,9.999950e-01,14,50418501,50561126,...,51142171,C,1,0.421053,0.103485,14:51142171:C:G-MAP4K5,1.137719,217721,1.208471,chr14_51142171
7773370,2083977,MAP4K5,14:49620639:C:G,4.207199e-01,-0.247770,0.238720,9.999950e-01,14,50418501,50561126,...,49620639,C,1,0.263158,0.396942,14:49620639:C:G-MAP4K5,-0.805173,217721,-0.560557,chr14_49620639


In [None]:
test = bqtl_in_sig_lcl[bqtl_in_sig_lcl['feature_start'].isin(gene_pos_b)]

***

In [None]:
intersect_cd4_in_enh = pd.read_csv('data/bed_files/bedtools/cd4qtl_in_enh.txt', sep='\t', header=None)

In [None]:
intersect_cd4_in_enh

Unnamed: 0,0,1,2,3,4,5,6,7
0,chr1,100896711,100896885,chr1_101362267_101362441_MACS2STARRENH_indivua...,chr1,100896815,100896816,1:100896815:C:A
1,chr1,110647340,110647639,chr1_111189962_111190261_MACS2STARRENH_indivua...,chr1,110647392,110647393,1:110647392:G:A
2,chr1,110672731,110672946,chr1_111215353_111215568_MACS2STARRENH_indivua...,chr1,110672850,110672851,1:110672850:A:C
3,chr1,111225803,111226180,chr1_111768425_111768802_MACS2STARRENH_indivua...,chr1,111225954,111225955,1:111225954:G:A
4,chr1,111225803,111226180,chr1_111768425_111768802_MACS2STARRENH_indivua...,chr1,111225997,111225998,1:111225997:A:G
...,...,...,...,...,...,...,...,...
949,chr9,89604799,89605007,chr9_92219714_92219922_MACS2STARRENH_indivuall...,chr9,89605006,89605007,9:89605006:A:C
950,chr9,95875974,95876124,chr9_98638256_98638406_MACS2STARRENH_indivuall...,chr9,95876006,95876007,9:95876006:A:G
951,chr9,97905294,97905465,chr9_100667576_100667747_MACS2STARRENH_indivua...,chr9,97905317,97905318,9:97905317:C:T
952,chr9,97938321,97938520,chr9_100700603_100700802_MACS2STARRENH_indivua...,chr9,97938372,97938373,9:97938372:C:T


In [None]:
intersect_cd4_in_enh[5].value_counts()

127993221    2
36114245     2
108433927    1
59451507     1
67643056     1
            ..
91853713     1
92904401     1
12080262     1
14300987     1
98056438     1
Name: 5, Length: 952, dtype: int64

In [None]:
intersect_sig_cd4_enh = pd.read_csv('data/bed_files/bedtools/sig_cd4qtl_in_enh.txt', sep='\t', header=None)

In [None]:
intersect_sig_cd4_enh


Unnamed: 0,0,1,2,3,4,5,6,7
0,chr1,100896711,100896885,chr1_101362267_101362441_MACS2STARRENH_indivua...,chr1,100896815,100896816,1:100896815:C:A
1,chr1,172380900,172381177,chr1_172350040_172350317_MACS2STARRENH_indivua...,chr1,172381071,172381072,1:172381071:GCA:G
2,chr1,172394195,172394357,chr1_172363335_172363497_MACS2STARRENH_indivua...,chr1,172394348,172394349,1:172394348:A:G
3,chr1,19644171,19644326,chr1_19970665_19970820_MACS2STARRENH_indivuall...,chr1,19644233,19644234,1:19644233:C:G
4,chr1,204507103,204507425,chr1_204476231_204476553_MACS2STARRENH_indivua...,chr1,204507233,204507234,1:204507233:G:A
...,...,...,...,...,...,...,...,...
225,chr9,277733,277891,chr9_277733_277891_MACS2STARRENH_indivuallyrmD...,chr9,277776,277777,9:277776:C:T
226,chr9,37753703,37754040,chr9_37753700_37754037_MACS2STARRENH_indivuall...,chr9,37753795,37753796,9:37753795:C:T
227,chr9,37753703,37754040,chr9_37753700_37754037_MACS2STARRENH_indivuall...,chr9,37753801,37753802,9:37753801:C:T
228,chr9,37753703,37754040,chr9_37753700_37754037_MACS2STARRENH_indivuall...,chr9,37753849,37753850,9:37753849:T:G


In [None]:
intersect_sig_cd4_enh[5].value_counts()

36114245     2
127993221    2
103198430    1
103198600    1
75941102     1
            ..
19907665     1
3106665      1
57417280     1
667971       1
97905317     1
Name: 5, Length: 228, dtype: int64

In [None]:
intersect_sig_b_enh = pd.read_csv('data/bed_files/bedtools/sig_bqtl_in_enh.txt', sep='\t', header=None)
intersect_b_enh = pd.read_csv('data/bed_files/bedtools/bqtl_in_enh.txt', sep='\t', header=None)

In [None]:
intersect_sig_b_enh['chr_snp_pos'] = intersect_sig_b_enh[4] + '_' + [str(i) for i in intersect_sig_b_enh[5]]
intersect_b_enh['chr_snp_pos'] = intersect_b_enh[4] + '_' + [str(i) for i in intersect_b_enh[5]]

In [None]:
len(intersect_sig_b_enh['chr_snp_pos'].value_counts())

22

In [None]:
len(intersect_b_enh.chr_snp_pos.value_counts())

877

In [None]:
22/877

0.02508551881413911

In [None]:
#intersect_cd4_in_plasmid = pd.read_csv('data/bed_files/bedtools/cd4qtl_in_plasmid.txt', sep='\t', header=None)

In [None]:
#intersect_cd4_in_plasmid[5].value_counts()

In [None]:
#intersect_sig_cd4_in_plasmid = pd.read_csv('data/bed_files/bedtools/sig_cd4qtl_in_plasmid.txt', sep='\t', header=None)

In [None]:
#intersect_sig_cd4_in_plasmid[5].value_counts()

In [None]:
gene_sig_cd4qtl[gene_sig_cd4qtl['snp_id'].isin(intersect_cd4_in_enh[7])]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
2942,XRRA1,11:74949098:G:C,1.882793e-50,-1.422397,0.279969,2.530878e-48,11,74807739,74949200,ENSG00000166435,...,74949098,G,1,0.263158,0.235435,11:74949098:G:C-XRRA1,-14.937363,217721,-15.276445,chr11_74949098
5130,ELP5,17:7205187:T:G,3.190423e-32,1.192322,0.298287,1.632024e-30,17,7251416,7259940,ENSG00000170291,...,7205187,T,1,0.315789,0.717019,17:7205187:T:G-ELP5,11.816954,217721,12.091769,chr17_7205187
6431,CBR3,21:36114244:A:C,5.010141e-30,0.958961,0.294934,7.732457e-27,21,36135079,36146562,ENSG00000159231,...,36114244,A,1,0.434211,0.526619,21:36114244:A:C-CBR3,11.384243,217721,11.110350,chr21_36114244
6556,RBM43,2:151261757:G:A,2.361397e-28,0.912972,0.202870,3.576534e-26,2,151247940,151261863,ENSG00000184898,...,151261757,G,1,0.473684,0.342222,2:151261757:G:A-RBM43,11.043321,217721,10.899083,chr2_151261757
6557,RBM43,2:151261761:A:G,2.398367e-28,0.912097,0.202591,3.576534e-26,2,151247940,151261863,ENSG00000184898,...,151261761,A,1,0.473684,0.342222,2:151261761:A:G-RBM43,11.041926,217721,10.896518,chr2_151261761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190076,SENP7,3:101467181:C:A,6.474714e-04,-0.194679,0.155965,2.165347e-02,3,101324205,101513241,ENSG00000138468,...,101467181,C,1,0.342105,0.285977,3:101467181:C:A-SENP7,3.410907,217721,3.724661,chr3_101467181
190335,JMY,5:79236056:G:GCGC,9.398857e-04,-0.862850,0.253387,2.178413e-02,5,79236131,79327211,ENSG00000152409,...,79236056,G,1,0.460526,0.324738,5:79236056:G:GCGC-JMY,-3.307929,217721,-3.447409,chr5_79236056
192250,DBT,1:99808794:A:C,2.957756e-03,0.237903,0.177475,2.267389e-02,1,100186919,100249834,ENSG00000137992,...,99808794,A,1,0.276316,0.418573,1:99808794:A:C-DBT,2.972095,217721,3.030142,chr1_99808794
194763,TSPYL1,6:116254181:T:G,3.068215e-03,0.161211,0.210881,2.368815e-02,6,116267760,116279930,ENSG00000189241,...,116254181,T,1,0.421053,0.331485,6:116254181:T:G-TSPYL1,2.960820,217721,2.877099,chr6_116254181


In [None]:
top_b_qtl[top_b_qtl['chr_snp_pos'].isin(sig_lcl.chr_snp_pos)]

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,Global_pvalue,chr_snp_pos
0,470835,RPS26,12:56007301:G:A,3.382176e-34,1.443169,0.249831,2.282969e-31,12,56041351,56044697,...,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,12.193121,211365.1,12.408954,5.029380000000001e-28,chr12_56007301
40,864823,CTSH,15:78931339:G:A,8.3517e-07,0.743339,0.184601,0.00012217,15,78921058,78949574,...,G,1,0.368421,0.507756,15:78931339:G:A-CTSH,4.926964,211365.1,4.605555,0.006564402,chr15_78931339
57,300879,UCP2,11:73982376:C:G,8.251883e-07,-0.583603,0.16753,0.0005129095,11,73974672,73982843,...,C,1,0.342105,0.285977,11:73982376:C:G-UCP2,-4.929314,211365.1,-4.806935,0.01948172,chr11_73982376
144,234915,TIMM10,11:57559454:C:T,2.090312e-05,0.615796,0.278626,0.007821008,11,57528464,57530803,...,C,1,0.210526,1.0,11:57559454:C:T-TIMM10,4.255019,211365.1,4.160546,0.1175687,chr11_57559454
1310,144138,PSMD13,11:438662:A:G,0.003038967,-0.539937,0.242778,0.3467733,11,236966,252984,...,A,1,0.407895,1.0,11:438662:A:G-PSMD13,-2.963769,211365.1,-2.714154,0.5823776,chr11_438662


In [None]:
cd4_qtl[(cd4_qtl['snp_id'] == '6:31478769:T:C') & (cd4_qtl['feature_id'] == 'HLA-C')]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
53655,HLA-C,6:31478769:T:C,2.979688e-07,-0.126313,0.163421,2e-06,6,31268749,31272130,ENSG00000204525,...,31478769,T,1,0.355263,1.0,6:31478769:T:C-HLA-C,-5.124727,217721,-5.070583,chr6_31478769


In [None]:
##df = cd4_qtl.merge(sig_bqtl, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)


In [None]:
#len(cd4_qtl)

In [None]:
#len(cd4_qtl) - len(df)

In [None]:
#len(df)
print(len(sig_cd4_qtl))
print(len(sig_cd4_qtl_unique))

199627
174066


In [None]:
#len(df)
print(len(sig_bqtl))
print(len(sig_b_qtl_unique))

20017
4961


In [None]:
len(sig_bqtl) - len(sig_b_qtl_unique)

15056

In [None]:
len(sig_cd4_qtl) - len(sig_cd4_qtl_unique)

25561

In [None]:
len(sig_cd4_qtl_unique) # overlap sig lcl en niet significant

174066

In [None]:
cd4_minus_sigb = cd4_qtl[~cd4_qtl.chr_snp_pos.isin(sig_bqtl.chr_snp_pos)]

In [None]:
cd4_minus_sigb_sig_lcl = cd4_minus_sigb[cd4_minus_sigb['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]


In [None]:
len(cd4_minus_sigb_sig_lcl.chr_snp_pos.value_counts())

848

In [None]:
1284 - 428

856

In [None]:
730-302

428

### everything but sig b QTL in cd4 qtl

In [None]:
unique_sig_cd4_sig_lcl = sig_cd4_qtl_unique[sig_cd4_qtl_unique['chr_snp_pos'].isin(sig_lcl.chr_snp_pos)]

In [None]:
unique_sig_cd4_non_sig_lcl = sig_cd4_qtl_unique[sig_cd4_qtl_unique['chr_snp_pos'].isin(non_sig_lcl.chr_snp_pos)]

In [None]:
unique_sig_cd4_lcl = sig_cd4_qtl_unique[sig_cd4_qtl_unique['chr_snp_pos'].isin(final_lcl_positions.chr_snp_pos)]

In [None]:
unique_sig_cd4_sig_lcl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
186,ERAP2,5:96939497:G:A,7.074509e-53,-0.891009,0.171587,5.042134e-51,5,96875986,96919703,ENSG00000164308,...,96939497,G,1,0.421053,0.504386,5:96939497:G:A-ERAP2,-15.305052,217721,-15.449191,chr5_96939497
208,ERAP2,5:96915971:C:A,1.276454e-52,-0.922925,0.179665,5.652886e-51,5,96875986,96919703,ENSG00000164308,...,96915971,C,1,0.407895,0.737704,5:96915971:C:A-ERAP2,-15.266606,217721,-15.413412,chr5_96915971
225,ERAP2,5:97014384:G:A,3.595154e-52,0.939216,0.182012,1.229543e-50,5,96875986,96919703,ENSG00000164308,...,97014384,G,1,0.421053,0.504386,5:97014384:G:A-ERAP2,15.198917,217721,15.316779,chr5_97014384
234,ERAP2,5:97027703:C:T,6.009413e-52,0.963284,0.186423,1.741711e-50,5,96875986,96919703,ENSG00000164308,...,97027703,C,1,0.421053,0.504386,5:97027703:C:T-ERAP2,15.165223,217721,15.294625,chr5_97027703
2929,XRRA1,11:74907656:C:G,7.586901e-51,-1.428735,0.280514,2.530878e-48,11,74807739,74949200,ENSG00000166435,...,74907656,C,1,0.263158,0.235435,11:74907656:C:G-XRRA1,-14.997821,217721,-15.352010,chr11_74907656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197195,IER3,6:29763858:C:T,6.118878e-03,0.569227,0.280985,2.467528e-02,6,30743199,30744548,ENSG00000137331,...,29763858,C,1,0.184211,1.000000,6:29763858:C:T-IER3,2.741342,217721,2.429336,chr6_29763858
197245,TBCD,17:82450072:C:T,4.715114e-03,0.592896,0.251776,2.470153e-02,17,82752042,82945914,ENSG00000141556,...,82450072,C,1,0.223684,0.059519,17:82450072:C:T-TBCD,2.825879,217721,2.289512,chr17_82450072
197246,TBCD,17:82459853:C:G,4.716084e-03,0.598968,0.253364,2.470153e-02,17,82752042,82945914,ENSG00000141556,...,82459853,C,1,0.223684,0.059519,17:82459853:C:G-TBCD,2.825813,217721,2.285773,chr17_82459853
197257,TBCD,17:82445598:T:TC,4.794571e-03,0.591710,0.251580,2.470153e-02,17,82752042,82945914,ENSG00000141556,...,82445598,T,1,0.223684,0.059519,17:82445598:T:TC-TBCD,2.820521,217721,2.275475,chr17_82445598


In [None]:
len(unique_sig_cd4_sig_lcl.chr_snp_pos.value_counts())

302

In [None]:
len(unique_sig_cd4_non_sig_lcl.chr_snp_pos.value_counts())

2148

In [None]:
4223-2148

2075

In [None]:
sig_cd4_minus_sig_bqtl = {'sig_cd4': [len(unique_sig_cd4_sig_lcl.chr_snp_pos.value_counts()), len(unique_sig_cd4_non_sig_lcl.chr_snp_pos.value_counts())],
                          'total_cd4':[(1284-428), (8555-2075)],
                          'ratio': [len(unique_sig_cd4_sig_lcl.chr_snp_pos.value_counts())/(1284-428), len(unique_sig_cd4_non_sig_lcl.chr_snp_pos.value_counts())/ (8555-2075)]}
sig_cd4_minus_sig_bqtl = pd.DataFrame(data=sig_cd4_minus_sig_bqtl, index=['sig_lcl', 'non_sig_lcl']).round({'ratio':2})
sig_cd4_minus_sig_bqtl

Unnamed: 0,sig_cd4,total_cd4,ratio
sig_lcl,302,856,0.35
non_sig_lcl,2148,6480,0.33


In [None]:
data = sig_cd4_minus_sig_bqtl.iloc[:,0:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.0643241267686525 0.386064390158663


In [None]:
len(unique_sig_cd4_sig_lcl.chr_snp_pos.value_counts())

302

In [None]:
len(unique_sig_cd4_non_sig_lcl.chr_snp_pos.value_counts())

2148

In [None]:
len(unique_sig_cd4_lcl.chr_snp_pos.value_counts())

2450

In [None]:
2450 - 302

2148

In [None]:
302 /856

0.352803738317757

In [None]:
sig_b_qtl_unique

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
206,HLA-DQB1,6:32660413:T:C,5.482612e-18,-1.517662,0.334129,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32660413,T,1,0.131579,0.492859,6:32660413:T:C-HLA-DQB1,-8.642854,211365.1,-8.879585,chr6_32660413
207,HLA-DQB1,6:32663148:A:T,6.251302e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663148,A,1,0.131579,0.492859,6:32663148:A:T-HLA-DQB1,-8.627855,211365.1,-8.852898,chr6_32663148
208,HLA-DQB1,6:32663172:A:G,6.251302e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663172,A,1,0.131579,0.492859,6:32663172:A:G-HLA-DQB1,-8.627855,211365.1,-8.852898,chr6_32663172
209,HLA-DQB1,6:32663167:G:A,6.292160e-18,-1.527211,0.336776,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32663167,G,1,0.131579,0.492859,6:32663167:G:A-HLA-DQB1,-8.627110,211365.1,-8.851912,chr6_32663167
210,HLA-DQB1,6:32659750:C:A,6.607150e-18,-1.508293,0.333583,1.651824e-15,6,32659467,32668383,ENSG00000179344,...,32659750,C,1,0.131579,0.492859,6:32659750:C:A-HLA-DQB1,-8.621519,211365.1,-8.849241,chr6_32659750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20003,HLA-DRA,6:32661378:C:A,4.750873e-04,-0.375719,0.193752,5.602038e-03,6,32439878,32445046,ENSG00000204287,...,32661378,C,1,0.131579,0.492859,6:32661378:C:A-HLA-DRA,-3.494423,211365.1,-3.511569,chr6_32661378
20005,DDX17,22:38594733:C:T,5.309879e-04,-0.384871,0.157286,5.604411e-03,22,38483438,38507660,ENSG00000100201,...,38594733,C,1,0.394737,0.042608,22:38594733:C:T-DDX17,-3.464617,211365.1,-3.220773,chr22_38594733
20007,RPS18,6:33151135:A:G,1.886131e-04,0.344654,0.164863,5.634371e-03,6,33272075,33276511,ENSG00000231500,...,33151135,A,1,0.144737,0.569699,6:33151135:A:G-RPS18,3.733799,211365.1,3.664862,chr6_33151135
20008,DDX17,22:38597365:C:T,5.382164e-04,-0.366285,0.151527,5.636670e-03,22,38483438,38507660,ENSG00000100201,...,38597365,C,1,0.394737,0.042608,22:38597365:C:T-DDX17,-3.460978,211365.1,-3.219666,chr22_38597365


In [None]:
unique_sig_b_sig_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(sig_lcl.chr_snp_pos)]
unique_sig_b_non_sig_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(non_sig_lcl.chr_snp_pos)]

In [None]:
sig_b_minus_sig_cd4 = {'sig_b': [len(unique_sig_b_sig_lcl.chr_snp_pos.value_counts()), len(unique_sig_b_non_sig_lcl.chr_snp_pos.value_counts())],
                          'total_b':[(1196-(436-8)), (7738-(2165-90))],
                          'ratio': [len(unique_sig_b_sig_lcl.chr_snp_pos.value_counts())/(1196-(436-8)), len(unique_sig_b_non_sig_lcl.chr_snp_pos.value_counts())/ (7738-(2165-90))]}
sig_b_minus_sig_cd4 = pd.DataFrame(data=sig_b_minus_sig_cd4, index=['sig_lcl', 'non_sig_lcl']).round({'ratio':2})
sig_b_minus_sig_cd4

Unnamed: 0,sig_b,total_b,ratio
sig_lcl,8,768,0.01
non_sig_lcl,90,5663,0.02


In [None]:
data = sig_b_minus_sig_cd4.iloc[:,0:2]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

0.6554398148148148 0.3432655393934376


In [None]:
#len(cd4_qtl) - len(cd4_qtl_unique)

In [None]:
# sig_cd4_qtl_unique = sig_cd4_qtl[sig_cd4_qtl['chr_snp_pos'].isin(cd4_qtl_unique.chr_snp_pos)]
# sig_cd4_qtl_unique

In [None]:
len(b_qtl)

3466245

In [None]:
b_qtl_unique =b_qtl[~b_qtl['chr_snp_pos'].isin(cd4_qtl.chr_snp_pos)]
b_qtl_unique

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
470,2705833,ARHGAP24,4:85590192:GCCTTCCTTCCTTCCTT:G,1.375429e-14,1.775161,0.381658,2.673687e-12,4,85475150,86002668,...,85590192,GCCTTCCTTCCTTCCTT,1,0.302632,0.442799,4:85590192:GCCTTCCTTCCTTCCTT:G-ARHGAP24,7.698621,211365.1,7.690246,chr4_85590192
471,2705773,ARHGAP24,4:85619906:A:G,1.554262e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,...,85619906,A,1,0.328947,0.711897,4:85619906:A:G-ARHGAP24,7.682983,211365.1,7.636893,chr4_85619906
472,2705772,ARHGAP24,4:85616072:T:C,1.554312e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,...,85616072,T,1,0.328947,0.711897,4:85616072:T:C-ARHGAP24,7.682979,211365.1,7.636858,chr4_85616072
473,2705771,ARHGAP24,4:85608107:T:G,1.554957e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,...,85608107,T,1,0.328947,0.711897,4:85608107:T:G-ARHGAP24,7.682926,211365.1,7.636823,chr4_85608107
474,2705770,ARHGAP24,4:85602521:A:G,1.561854e-14,1.544259,0.322549,2.673687e-12,4,85475150,86002668,...,85602521,A,1,0.328947,0.711897,4:85602521:A:G-ARHGAP24,7.682359,211365.1,7.636308,chr4_85602521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3466163,1617337,CEP350,1:179100985:C:T,9.805646e-01,-0.045996,0.122870,9.999998e-01,1,179954773,180114875,...,179100985,C,1,0.368421,1.000000,1:179100985:C:T-CEP350,-0.024361,211365.1,0.150945,chr1_179100985
3466164,1617342,CEP350,1:179101199:C:G,9.806032e-01,-0.045982,0.122861,9.999998e-01,1,179954773,180114875,...,179101199,C,1,0.368421,1.000000,1:179101199:C:G-CEP350,-0.024313,211365.1,0.150983,chr1_179101199
3466202,1617004,CEP350,1:179134710:GCCTCTC:G,9.899706e-01,0.051232,0.120048,9.999998e-01,1,179954773,180114875,...,179134710,GCCTCTC,1,0.394737,0.735891,1:179134710:GCCTCTC:G-CEP350,-0.012570,211365.1,-0.172623,chr1_179134710
3466205,1617217,CEP350,1:179100294:C:T,9.919599e-01,-0.049173,0.124987,9.999998e-01,1,179954773,180114875,...,179100294,C,1,0.368421,1.000000,1:179100294:C:T-CEP350,0.010077,211365.1,0.194646,chr1_179100294


In [None]:
sig_bqtl_unique = sig_bqtl[~sig_bqtl['chr_snp_pos'].isin(cd4_qtl.chr_snp_pos)]
sig_bqtl_unique

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
470,ARHGAP24,4:85590192:GCCTTCCTTCCTTCCTT:G,1.375429e-14,1.775161,0.381658,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85590192,GCCTTCCTTCCTTCCTT,1,0.302632,0.442799,4:85590192:GCCTTCCTTCCTTCCTT:G-ARHGAP24,7.698621,211365,7.690246,chr4_85590192
471,ARHGAP24,4:85619906:A:G,1.554262e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85619906,A,1,0.328947,0.711897,4:85619906:A:G-ARHGAP24,7.682983,211365,7.636893,chr4_85619906
472,ARHGAP24,4:85616072:T:C,1.554312e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85616072,T,1,0.328947,0.711897,4:85616072:T:C-ARHGAP24,7.682979,211365,7.636858,chr4_85616072
473,ARHGAP24,4:85608107:T:G,1.554957e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85608107,T,1,0.328947,0.711897,4:85608107:T:G-ARHGAP24,7.682926,211365,7.636823,chr4_85608107
474,ARHGAP24,4:85602521:A:G,1.561854e-14,1.544259,0.322549,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85602521,A,1,0.328947,0.711897,4:85602521:A:G-ARHGAP24,7.682359,211365,7.636308,chr4_85602521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19490,ERICH1,8:643977:C:T,1.989414e-05,0.774380,0.236679,5.095386e-03,8,614746,738106,ENSG00000104714,...,643977,C,1,0.250000,1.000000,8:643977:C:T-ERICH1,4.266076,211365,3.920378,chr8_643977
19597,CAPZA1,1:111759995:AGAAAT:A,9.411961e-05,-0.463807,0.220524,5.134748e-03,1,112619805,112671616,ENSG00000116489,...,111759995,AGAAAT,1,0.157895,1.000000,1:111759995:AGAAAT:A-CAPZA1,-3.905272,211365,-4.018136,chr1_111759995
19648,RGS10,10:118712242:G:A,1.318374e-04,0.793548,0.288952,5.190714e-03,10,119499817,119542719,ENSG00000148908,...,118712242,G,1,0.118421,1.000000,10:118712242:G:A-RGS10,3.823001,211365,3.551248,chr10_118712242
19649,RGS10,10:118711913:G:GA,1.325683e-04,0.793548,0.288952,5.190714e-03,10,119499817,119542719,ENSG00000148908,...,118711913,G,1,0.118421,1.000000,10:118711913:G:GA-RGS10,3.821638,211365,3.549572,chr10_118711913


In [None]:
len(sig_bqtl)

20017

In [None]:
sig_b_qtl_unique = sig_bqtl[sig_bqtl['chr_snp_pos'].isin(b_qtl_unique.chr_snp_pos)]
sig_b_qtl_unique


Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
470,ARHGAP24,4:85590192:GCCTTCCTTCCTTCCTT:G,1.375429e-14,1.775161,0.381658,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85590192,GCCTTCCTTCCTTCCTT,1,0.302632,0.442799,4:85590192:GCCTTCCTTCCTTCCTT:G-ARHGAP24,7.698621,211365,7.690246,chr4_85590192
471,ARHGAP24,4:85619906:A:G,1.554262e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85619906,A,1,0.328947,0.711897,4:85619906:A:G-ARHGAP24,7.682983,211365,7.636893,chr4_85619906
472,ARHGAP24,4:85616072:T:C,1.554312e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85616072,T,1,0.328947,0.711897,4:85616072:T:C-ARHGAP24,7.682979,211365,7.636858,chr4_85616072
473,ARHGAP24,4:85608107:T:G,1.554957e-14,1.544048,0.322525,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85608107,T,1,0.328947,0.711897,4:85608107:T:G-ARHGAP24,7.682926,211365,7.636823,chr4_85608107
474,ARHGAP24,4:85602521:A:G,1.561854e-14,1.544259,0.322549,2.673687e-12,4,85475150,86002668,ENSG00000138639,...,85602521,A,1,0.328947,0.711897,4:85602521:A:G-ARHGAP24,7.682359,211365,7.636308,chr4_85602521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19490,ERICH1,8:643977:C:T,1.989414e-05,0.774380,0.236679,5.095386e-03,8,614746,738106,ENSG00000104714,...,643977,C,1,0.250000,1.000000,8:643977:C:T-ERICH1,4.266076,211365,3.920378,chr8_643977
19597,CAPZA1,1:111759995:AGAAAT:A,9.411961e-05,-0.463807,0.220524,5.134748e-03,1,112619805,112671616,ENSG00000116489,...,111759995,AGAAAT,1,0.157895,1.000000,1:111759995:AGAAAT:A-CAPZA1,-3.905272,211365,-4.018136,chr1_111759995
19648,RGS10,10:118712242:G:A,1.318374e-04,0.793548,0.288952,5.190714e-03,10,119499817,119542719,ENSG00000148908,...,118712242,G,1,0.118421,1.000000,10:118712242:G:A-RGS10,3.823001,211365,3.551248,chr10_118712242
19649,RGS10,10:118711913:G:GA,1.325683e-04,0.793548,0.288952,5.190714e-03,10,119499817,119542719,ENSG00000148908,...,118711913,G,1,0.118421,1.000000,10:118711913:G:GA-RGS10,3.821638,211365,3.549572,chr10_118711913


In [None]:
#f'ratio significante b qtls in all b qtls: {(len(sig_bqtl) /len(b_qtl)) * 100:.2f}%'

In [None]:
#print(f'unique b_qtls: {len(b_qtl_unique)} \nbqtls: {len(b_qtl)} \nratio unique b_qtls vs all b_qtls: {(len(b_qtl_unique) / len(b_qtl)) * 100:.2f}%')

In [None]:
# print(f'unique cd4_qtls: {len(cd4_qtl_unique)} \ncd4qtls: {len(cd4_qtl)} \nratio unique cd4_qtls vs all cd4_qtls: {(len(cd4_qtl_unique) / len(cd4_qtl)) * 100:.2f}%\n')
# print(f'unique significant cd4_qtls: {len(sig_cd4_qtl_unique)} \nsignificant cd4 qtls: {len(sig_cd4_qtl)} \nratio unique significant cd4_qtls vs significant cd4_qtls: {(len(sig_cd4_qtl_unique) / len(sig_cd4_qtl)) * 100:.2f}%')

In [None]:
print(f'unique b_qtls: {len(b_qtl_unique)} \nbqtls: {len(b_qtl)} \nratio unique b_qtls vs all b_qtls: {(len(b_qtl_unique) / len(b_qtl)) * 100:.2f}%\n')
print(f'unique significant bqtls: {len(sig_b_qtl_unique)}\nsignificant bqtls: {len(sig_bqtl)} \nratio unique significant b qtls vs significant b qtls: {(len(sig_b_qtl_unique) / len(sig_bqtl)) * 100:.2f}%')

unique b_qtls: 155078 
bqtls: 3466245 
ratio unique b_qtls vs all b_qtls: 4.47%

unique significant bqtls: 193
significant bqtls: 20017 
ratio unique significant b qtls vs significant b qtls: 0.96%


In [None]:
unique_bqtl_in_lcl = b_qtl_unique[b_qtl_unique['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
len(unique_bqtl_in_lcl)

567

In [None]:
unique_sig_bqtl_in_lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
len(unique_sig_bqtl_in_lcl)

0

In [None]:
unique_bqtl_in_sig_lcl = b_qtl_unique[b_qtl_unique['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
len(unique_bqtl_in_sig_lcl)

43

In [None]:
unique_sig_bqtl_insig__lcl = sig_b_qtl_unique[sig_b_qtl_unique['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
len(unique_sig_bqtl_insig__lcl)

0

In [None]:
print(f'Total bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts())} \t\t Total bqtls that overlap with significant lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'significant bqtls overlap with all lcl snps: {len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t\t significant bqtls overlap with significant lcl snps: {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')
print(f'non-significant bqtls that overlap with all lcl snps: {len(bqtl_in_lcl.chr_snp_pos.value_counts()) - len(sig_bqtls_in_lcl.chr_snp_pos.value_counts())} \t non-significant bqtls that overlap with all lcl snps: {len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()) - len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts())}')

Total bqtls that overlap with all lcl snps: 8934 		 Total bqtls that overlap with significant lcl snps: 1196
significant bqtls overlap with all lcl snps: 2601 		 significant bqtls overlap with significant lcl snps: 436
non-significant bqtls that overlap with all lcl snps: 6333 	 non-significant bqtls that overlap with all lcl snps: 760


In [None]:
print(f'ratio cd4 - lcl {len(sig_cd4_qtl_in_lcl.chr_snp_pos.value_counts()) / len(cd4_qtl_in_lcl.chr_snp_pos.value_counts()):.3f}')
print(f'ratio sig cd4 - lcl {len(sig_cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()) / len(cd4_qtl_in_sig_lcl.chr_snp_pos.value_counts()):.3f}')
print(4223 / 8555)
print('\n')
print(f'ratio b - lcl {len(sig_bqtls_in_lcl.chr_snp_pos.value_counts()) / len(bqtl_in_lcl.chr_snp_pos.value_counts()):.3f}')

print(f'ratio sig b - lcl {len(sig_bqtl_in_sig_lcl.chr_snp_pos.value_counts()) / len(bqtl_in_sig_lcl.chr_snp_pos.value_counts()):.3f}')



ratio cd4 - lcl 0.503
ratio sig cd4 - lcl 0.569
0.49362945645821155


ratio b - lcl 0.291
ratio sig b - lcl 0.365


In [None]:
bqtl_not_overlapping_cd4_qtl = sig_bqtls_in_lcl[~sig_bqtls_in_lcl['chr_snp_pos'].isin(sig_cd4_qtl_in_lcl.chr_snp_pos)]
bqtl_not_overlapping_cd4_qtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
5071,CTSH,15:78931339:G:A,8.351700e-07,0.743339,0.184601,0.000122,15,78921058,78949574,ENSG00000103811,...,78931339,G,1,0.368421,0.507756,15:78931339:G:A-CTSH,4.926964,211365.1,4.605555,chr15_78931339
5072,CTSH,15:78943104:T:C,9.642513e-07,-0.715158,0.178762,0.000122,15,78921058,78949574,ENSG00000103811,...,78943104,T,1,0.381579,0.490409,15:78943104:T:C-CTSH,-4.898797,211365.1,-4.636822,chr15_78943104
5075,CTSH,15:78939176:A:G,1.012313e-06,-0.728898,0.181433,0.000122,15,78921058,78949574,ENSG00000103811,...,78939176,A,1,0.381579,0.490409,15:78939176:A:G-CTSH,-4.889230,211365.1,-4.605988,chr15_78939176
5076,CTSH,15:78939274:A:G,1.012313e-06,-0.728898,0.181433,0.000122,15,78921058,78949574,ENSG00000103811,...,78939274,A,1,0.381579,0.490409,15:78939274:A:G-CTSH,-4.889230,211365.1,-4.605988,chr15_78939274
5077,CTSH,15:78942522:A:G,1.132315e-06,-0.731967,0.179536,0.000122,15,78921058,78949574,ENSG00000103811,...,78942522,A,1,0.381579,0.490409,15:78942522:A:G-CTSH,-4.867128,211365.1,-4.589170,chr15_78942522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18698,CRELD2,22:49926381:G:T,2.254491e-04,1.664777,0.532047,0.004584,22,49918167,49927540,ENSG00000184164,...,49926381,G,1,0.078947,0.191819,22:49926381:G:T-CRELD2,3.688648,211365.1,3.736890,chr22_49926381
18699,CRELD2,22:49925703:T:C,2.254936e-04,1.661329,0.530269,0.004584,22,49918167,49927540,ENSG00000184164,...,49925703,T,1,0.078947,0.191819,22:49925703:T:C-CRELD2,3.688598,211365.1,3.735970,chr22_49925703
18703,CRELD2,22:49923682:G:A,2.267348e-04,1.613001,0.515589,0.004584,22,49918167,49927540,ENSG00000184164,...,49923682,G,1,0.078947,0.191819,22:49923682:G:A-CRELD2,3.687200,211365.1,3.735039,chr22_49923682
18706,CRELD2,22:49925279:T:C,2.321028e-04,1.660960,0.530176,0.004584,22,49918167,49927540,ENSG00000184164,...,49925279,T,1,0.078947,0.191819,22:49925279:T:C-CRELD2,3.681240,211365.1,3.727947,chr22_49925279


In [None]:
bqtl_not_overlapping_cd4_qtl.chr_snp_pos.value_counts()

chr6_33135754     2
chr22_49919476    1
chr22_49919366    1
chr22_49920641    1
chr22_49921279    1
                 ..
chr20_35146661    1
chr20_35145275    1
chr20_35142584    1
chr20_35139473    1
chr22_49925298    1
Name: chr_snp_pos, Length: 98, dtype: int64

In [None]:
#lcl_qtl[~lcl_qtl['chr_snp_pos'].isin(d['chr_snp_pos'])]

In [None]:
#final_lcl_positions[(final_lcl_positions['snp_position'].isin(b_qtl_position)) & (final_lcl_positions['snp_chromosome'].isin(bqtl_chr))]

In [None]:
#bqtl_lcl_snp = b_qtl[(b_qtl['snp_chromosome'].isin(lcl_chrom) & b_qtl['snp_position'].isin(lcl_positions))]


***

In [None]:
enh = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/enh_cd4_coords.txt',
                   sep='\t', header=None, names=['chromosome', 'start', 'end', 'name'])

In [None]:
enh.dtypes

chromosome    object
start          int64
end            int64
name          object
dtype: object

In [None]:
snp_pos_sig_cd4 = [int(i) for i in sig_cd4_qtl['snp_position'].unique()]

In [None]:
# for i in snp_pos_sig_cd4:
#     if enh.query("'start' >= i"):
#         print('yes')

In [None]:
sig_cd4_qtl.dtypes

feature_id                    object
snp_id                        object
p_value                      float64
beta                         float64
beta_se                      float64
empirical_feature_p_value    float64
feature_chromosome             int64
feature_start                  int64
feature_end                    int64
ENSG                          object
biotype                       object
n_samples                      int64
n_e_samples                    int64
alpha_param                  float64
beta_param                   float64
snp_chromosome                object
snp_position                   int64
assessed_allele               object
call_rate                      int64
maf                          float64
hwe_p                        float64
QTL                           object
z_score                      float64
weight                         int64
z_score_org                  float64
chr_snp_pos                   object
dtype: object

In [None]:
enh['chromosome'].value_counts()

chr1     868
chr2     610
chr6     537
chr3     513
chr19    484
chr12    464
chr17    444
chr11    408
chr7     403
chr5     403
chr10    382
chr9     336
chr8     326
chr4     315
chr14    300
chr16    286
chr15    271
chr20    200
chr13    196
chr22    167
chr18    121
chr21     96
Name: chromosome, dtype: int64

In [None]:
coord = ['start', 'end']
operations = ['<=', '>=']
chrom = [str(i) for i in sig_cd4_qtl['snp_chromosome']]
values = snp_pos_sig_cd4

In [None]:
start = list(enh['start'])
end = list(enh['end'])

In [None]:
enh['start']

0          844682
1         1116100
2         1201502
3         1208078
4         1231960
          ...    
8125    137579026
8126    137590556
8127    137605159
8128    137628626
8129    137722743
Name: start, Length: 8130, dtype: int64

In [None]:
start

[844682,
 1116100,
 1201502,
 1208078,
 1231960,
 1375352,
 1399332,
 1511997,
 1630343,
 1778888,
 2133552,
 2139962,
 2189510,
 2199436,
 2300511,
 2322170,
 2391528,
 2412558,
 2526570,
 2546978,
 3772461,
 3857154,
 5482003,
 5992466,
 6199598,
 6393752,
 6497616,
 6602858,
 7783839,
 7940766,
 7949133,
 7961479,
 8152683,
 8169878,
 8182450,
 8409137,
 8431198,
 8472737,
 8510285,
 8525949,
 8900022,
 8927346,
 9428794,
 9627098,
 9652497,
 9725102,
 9728807,
 10495320,
 10507396,
 11012410,
 11060104,
 11262590,
 11681489,
 11934576,
 11980180,
 12019264,
 12150888,
 12173812,
 12178040,
 12345501,
 12449442,
 12510708,
 12618295,
 15524402,
 15579106,
 15616940,
 15697022,
 15847420,
 15949674,
 16206970,
 16367020,
 16420831,
 16613511,
 16644664,
 16896165,
 17054160,
 17182072,
 18902757,
 18913174,
 18948476,
 18952032,
 18956658,
 19463282,
 19485478,
 19606914,
 19644171,
 19715694,
 19800285,
 19882199,
 20186034,
 20786626,
 21176885,
 21255362,
 22025961,
 22275998,
 23

In [None]:
sig_cd4_qtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,chr12_56007301
1,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826,chr22_42074313
2,SMDT1,22:42080766:A:T,4.780513e-59,1.449479,0.272295,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080766,A,1,0.342105,1.000000,22:42080766:A:T-SMDT1,16.203283,217721,16.682376,chr22_42080766
3,SMDT1,22:42080750:A:C,4.832899e-59,1.447290,0.271892,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,16.202613,217721,16.681792,chr22_42080750
4,SMDT1,22:42078134:C:G,4.860768e-59,1.445682,0.271640,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42078134,C,1,0.342105,1.000000,22:42078134:C:G-SMDT1,16.202260,217721,16.680695,chr22_42078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160013969,C,1,0.171053,1.000000,1:160013969:C:T-PEA15,3.240138,217721,2.913457,chr1_160013969
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160015506,GA,1,0.184211,1.000000,1:160015506:GA:G-PEA15,3.239825,217721,2.927943,chr1_160015506
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160027419,C,1,0.184211,1.000000,1:160027419:C:G-PEA15,3.235851,217721,2.932173,chr1_160027419
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160022992,T,1,0.184211,1.000000,1:160022992:T:C-PEA15,3.235851,217721,2.932173,chr1_160022992


In [None]:
#sig_cd4_qtl.query('snp_position >= @start & snp_position <= @end'.format(start, end) for start, end in zip(start, end))

In [None]:

enh.query(' & '.join(['{} {} {} '.format(coord, operation, val) for coord, operation, val in zip(coord, operations,values)]))


Unnamed: 0,chromosome,start,end,name
225,chr1,42682213,42682403,chr1_43147884_43148074_MACS2STARRENH_indivuall...
226,chr1,42958800,42959119,chr1_43424471_43424790_MACS2STARRENH_indivuall...
227,chr1,43227238,43227413,chr1_43692909_43693084_MACS2STARRENH_indivuall...
228,chr1,43348854,43349046,chr1_43814525_43814717_MACS2STARRENH_indivuall...
229,chr1,43358791,43358950,chr1_43824462_43824621_MACS2STARRENH_indivuall...
...,...,...,...,...
7572,chr8,52008509,52008693,chr8_52921069_52921253_MACS2STARRENH_indivuall...
7573,chr8,53843263,53843529,chr8_54755823_54756089_MACS2STARRENH_indivuall...
7574,chr8,53844401,53844564,chr8_54756961_54757124_MACS2STARRENH_indivuall...
7575,chr8,54022230,54022342,chr8_54934790_54934902_MACS2STARRENH_indivuall...


In [None]:
enh.query(' & '.join(['{} {} {}'.format(f, o, v) 
                         for f, o, v in zip(coord, operations, values)
                       ])
          )

Unnamed: 0,chromosome,start,end,name
225,chr1,42682213,42682403,chr1_43147884_43148074_MACS2STARRENH_indivuall...
226,chr1,42958800,42959119,chr1_43424471_43424790_MACS2STARRENH_indivuall...
227,chr1,43227238,43227413,chr1_43692909_43693084_MACS2STARRENH_indivuall...
228,chr1,43348854,43349046,chr1_43814525_43814717_MACS2STARRENH_indivuall...
229,chr1,43358791,43358950,chr1_43824462_43824621_MACS2STARRENH_indivuall...
...,...,...,...,...
7572,chr8,52008509,52008693,chr8_52921069_52921253_MACS2STARRENH_indivuall...
7573,chr8,53843263,53843529,chr8_54755823_54756089_MACS2STARRENH_indivuall...
7574,chr8,53844401,53844564,chr8_54756961_54757124_MACS2STARRENH_indivuall...
7575,chr8,54022230,54022342,chr8_54934790_54934902_MACS2STARRENH_indivuall...


In [None]:
sig_cd4_qtl

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
0,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,ENSG00000197728,...,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385,chr12_56007301
1,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826,chr22_42074313
2,SMDT1,22:42080766:A:T,4.780513e-59,1.449479,0.272295,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080766,A,1,0.342105,1.000000,22:42080766:A:T-SMDT1,16.203283,217721,16.682376,chr22_42080766
3,SMDT1,22:42080750:A:C,4.832899e-59,1.447290,0.271892,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,16.202613,217721,16.681792,chr22_42080750
4,SMDT1,22:42078134:C:G,4.860768e-59,1.445682,0.271640,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,42078134,C,1,0.342105,1.000000,22:42078134:C:G-SMDT1,16.202260,217721,16.680695,chr22_42078134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160013969,C,1,0.171053,1.000000,1:160013969:C:T-PEA15,3.240138,217721,2.913457,chr1_160013969
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160015506,GA,1,0.184211,1.000000,1:160015506:GA:G-PEA15,3.239825,217721,2.927943,chr1_160015506
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160027419,C,1,0.184211,1.000000,1:160027419:C:G-PEA15,3.235851,217721,2.932173,chr1_160027419
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,160022992,T,1,0.184211,1.000000,1:160022992:T:C-PEA15,3.235851,217721,2.932173,chr1_160022992


Extra check amount qtls


In [None]:
sig_bqtl_non_sig_lcl = pd.read_csv('data/lcl_coordinates.txt', sep='\t', header=None)

In [None]:
sig_bqtl_non_sig_lcl

Unnamed: 0,0,1,2,3
0,chr1,227334465,227334615,rs11809905
1,chr1,42958305,42958455,rs114530232
2,chr1,37548147,37548297,rs114531441
3,chr1,163311225,163311375,rs11810220
4,chr1,206551334,206551484,rs11811181
...,...,...,...,...
29116,chr22,46291887,46292037,chr22:46687859:D
29117,chr22,32406980,32407130,chr22:32803042:D
29118,chr22,23969323,23969473,chr22:24311587:D
29119,chr22,49917155,49917305,chr22:50310878:I


In [None]:
bqtl_lcl = pd.read_csv('data/bed_files/bedtools/bqtl_lcl_1.txt', sep='\t', header=None)

In [None]:
bqtl_lcl[7].value_counts()

6:29942211:C:T       6
22:49917234:C:T      6
17:45911784:A:AC     6
17:45911785:A:AAC    6
17:45911793:G:GC     6
                    ..
17:44183921:C:A      1
17:44184642:A:AT     1
17:44188074:C:T      1
17:44188850:C:T      1
9:99312457:C:T       1
Name: 7, Length: 12023, dtype: int64

In [None]:
bqtl_sig_lcl = pd.read_csv('data/bed_files/bedtools/bqtl_sig_lcl_1.txt', sep='\t', header=None)

In [None]:
bqtl_non_sig_lcl = pd.read_csv('data/bed_files/bedtools/bqtl_non_sig_lcl_1.txt', sep='\t', header=None)

In [None]:
bqtl_non_sig_lcl[7].value_counts()

17:45714039:C:A    5
12:56432026:C:G    5
6:29942211:C:T     5
17:45822361:A:G    5
12:56431953:T:C    5
                  ..
17:45588013:T:C    1
17:45588102:T:G    1
17:45588472:A:G    1
17:45588540:C:G    1
9:99312457:C:T     1
Name: 7, Length: 10650, dtype: int64

In [None]:
bqtl_sig_lcl[7].value_counts()

17:45900929:G:C    4
17:45729747:C:T    4
17:45900954:G:A    4
17:45900942:T:C    4
17:45900940:G:A    4
                  ..
17:45741540:A:G    1
17:45740856:C:A    1
17:45740794:A:C    1
17:45740242:G:A    1
9:98598583:G:GA    1
Name: 7, Length: 1834, dtype: int64

In [None]:
missing_qtls = list(set(bqtl_sig_lcl[7]) - set(list(set(bqtl_lcl[7]) - set(bqtl_non_sig_lcl[7]))))

In [None]:
len(missing_qtls)

461

In [None]:
bqtl_sig_lcl[bqtl_sig_lcl[7].isin(missing_qtls)]

Unnamed: 0,0,1,2,3,4,5,6,7
8,chr1,150925136,150925286,rs72704685,chr1,150925201,150925202,1:150925201:A:T
9,chr1,150925136,150925286,rs72704685,chr1,150925211,150925212,1:150925211:C:G
31,chr1,169846957,169847107,rs12135057,chr1,169847041,169847042,1:169847041:G:T
42,chr1,236218542,236218692,rs1614938,chr1,236218597,236218598,1:236218597:T:A
43,chr1,236218542,236218692,rs1614938,chr1,236218617,236218618,1:236218617:T:A
...,...,...,...,...,...,...,...,...
2123,chr8,41591869,41592019,rs11994272,chr8,41591944,41591945,8:41591944:A:G
2124,chr8,41591869,41592019,rs11994272,chr8,41591980,41591981,8:41591980:A:C
2140,chr9,113290183,113290333,rs10981719,chr9,113290227,113290228,9:113290227:G:GA
2150,chr9,136377323,136377473,rs11793497,chr9,136377398,136377399,9:136377398:A:G


In [None]:
bqtl_non_sig_lcl[bqtl_non_sig_lcl[7].isin(missing_qtls)]

Unnamed: 0,0,1,2,3,4,5,6,7
65,chr1,150925126,150925276,rs72704682,chr1,150925201,150925202,1:150925201:A:T
66,chr1,150925126,150925276,rs72704682,chr1,150925211,150925212,1:150925211:C:G
172,chr1,169846985,169847135,rs12124775,chr1,169847041,169847042,1:169847041:G:T
296,chr1,236218450,236218600,rs1055851,chr1,236218597,236218598,1:236218597:T:A
298,chr1,236218522,236218672,rs1615601,chr1,236218597,236218598,1:236218597:T:A
...,...,...,...,...,...,...,...,...
12634,chr8,41591905,41592055,rs11994275,chr8,41591944,41591945,8:41591944:A:G
12635,chr8,41591905,41592055,rs11994275,chr8,41591980,41591981,8:41591980:A:C
12853,chr9,113290096,113290246,rs10981718,chr9,113290227,113290228,9:113290227:G:GA
12914,chr9,136377392,136377542,rs11791262,chr9,136377398,136377399,9:136377398:A:G


In [None]:
len(list(set(bqtl_sig_lcl[7]) & set(bqtl_non_sig_lcl[7])))

461

In [None]:
sig_bqtl_non_sig_lcl[3].value_counts()

rs17843584     10
rs17612858      8
rs9273072       8
rs74950753      8
rs78638549      7
               ..
rs117602228     1
rs117291215     1
rs116944053     1
rs117569582     1
rs57453563      1
Name: 3, Length: 10203, dtype: int64

In [None]:
sig_cd4qtl_in_sig_lcl = pd.read_csv('data/bed_files/bedtools/cd4_qtl_sig_lcl.txt', sep='\t', header=None)

In [None]:
sig_cd4qtl_in_sig_lcl[7].value_counts()

17:45900929:G:C     4
5:96896582:C:T      4
17:46025323:G:C     4
17:46025316:C:CT    4
17:45729697:C:T     4
                   ..
17:45741540:A:G     1
17:45740856:C:A     1
17:45740794:A:C     1
17:45740242:G:A     1
9:98598583:G:GA     1
Name: 7, Length: 1992, dtype: int64

In [None]:
data = [[656,2655],[1834,10650]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.4347941272511076 2.304630304938235e-12


In [None]:
data = [[1127,5731],[1992,11753]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.1602535600436994 0.0002825700322160416


In [None]:
data = [[436,2165],[1196,7738]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.3029451520464674 2.0832426597131646e-05


In [None]:
data = [[730,4223],[1284,8555]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.1517461490738672 0.005194079001015332


In [138]:
sig_bqtl_sig_lcl_snp

['chr12_56007301',
 'chr7_100308061',
 'chr7_100382481',
 'chr7_100307702',
 'chr7_100377643',
 'chr7_100336385',
 'chr7_100353692',
 'chr7_100355205',
 'chr6_32659937',
 'chr13_43023570',
 'chr15_78931339',
 'chr1_39026790',
 'chr1_39027946',
 'chr14_49620639',
 'chr13_48591739',
 'chr11_73982376',
 'chr17_45607655',
 'chr17_45610883',
 'chr17_45601755',
 'chr17_45613271',
 'chr17_45606261',
 'chr17_45706224',
 'chr17_46173250',
 'chr17_46165988',
 'chr17_46162592',
 'chr17_46168400',
 'chr17_46169948',
 'chr17_46165422',
 'chr17_46161600',
 'chr17_46149830',
 'chr17_46152620',
 'chr17_46157160',
 'chr17_45591884',
 'chr17_45631859',
 'chr17_46131998',
 'chr17_46161205',
 'chr17_46127373',
 'chr17_46129116',
 'chr17_46134136',
 'chr17_46134666',
 'chr17_46120714',
 'chr17_46126000',
 'chr17_46126047',
 'chr17_45807925',
 'chr17_45601827',
 'chr17_45587977',
 'chr17_45588013',
 'chr17_46779986',
 'chr17_45591449',
 'chr17_45632527',
 'chr17_45633018',
 'chr17_45630254',
 'chr17_4581541

In [156]:
print(len(sig_bqtl_in_sig_lcl[~sig_bqtl_in_sig_lcl.chr_snp_pos.isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts()),
      len(sig_bqtl_in_non_sig_lcl[~sig_bqtl_in_non_sig_lcl.chr_snp_pos.isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts()),
      len(bqtl_in_sig_lcl[~bqtl_in_sig_lcl.chr_snp_pos.isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts()),
      len(bqtl_in_non_sig_lcl[~bqtl_in_non_sig_lcl.chr_snp_pos.isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts()))
# sig_bqtl_non_sig_lcl_snp = list(sig_bqtl_in_non_sig_lcl.chr_snp_pos.unique())
# bqtl_sig_lcl_snp = list(bqtl_in_sig_lcl.chr_snp_pos.unique())
# bqtl_non_sig_lcl_snp = list(bqtl_in_non_sig_lcl.chr_snp_pos.unique())

8 90 488 3732


In [149]:
print(len(list(set(sig_bqtl_sig_lcl_snp) - set(list(sig_cd4_qtl.chr_snp_pos)))),
len(list(set(sig_bqtl_non_sig_lcl_snp) - set(list(sig_cd4_qtl.chr_snp_pos)))),
len(list(set(bqtl_sig_lcl_snp) - set(list(sig_cd4_qtl.chr_snp_pos)))),
len(list(set(bqtl_non_sig_lcl_snp) - set(list(sig_cd4_qtl.chr_snp_pos)))))
# sig_bqtl_non_sig_lcl_snp = list(sig_bqtl_in_non_sig_lcl.chr_snp_pos.unique())
# bqtl_sig_lcl_snp = list(bqtl_in_sig_lcl.chr_snp_pos.unique())
# bqtl_non_sig_lcl_snp = list(bqtl_in_non_sig_lcl.chr_snp_pos.unique())

8 26 27 27


In [143]:
data = [[8, 488],[90,3732]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

0.6797814207650273 0.4208544997449186


0.6553044439836893 0.3459039068006189


In [None]:
8 91 1590 11852

In [146]:
data = [[302,856],[2148,6480]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.0643241267686525 0.386064390158663


In [30]:
1364-499

865

b eqtls based on previous tables

In [21]:
data = [[656, 1178],[2655,7995]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.6769205682330484 3.1098314187228015e-21


In [29]:
data = [[28, 1178],[122,7995]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.557655375880208 0.04835715196319314


cd4 based on previous tables


In [25]:
data = [[1127, 865],[5731,6022]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.3690463486786697 1.1037774838388138e-10


In [32]:
data = [[499, 865],[3198,6022]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.0862923720388826 0.17106075214300348


In [36]:
6480-856

5624

differential snp

In [34]:
data = [[8, 480],[90,3642]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

0.6744444444444444 0.33962043321538926


In [37]:
data = [[302, 1846],[856,5624]]
odd_ratio, p_value = stats.fisher_exact(data)
print(odd_ratio, p_value)

1.074847358775225 0.32413352339616497


In [63]:
sig_cd4_qtl = pd.read_csv('data/bed_files/sig_cd4_qtl_bed_unique.txt', sep='\t', names=['chr', 'start', 'end', 'chr_snp_pos'])

In [64]:
sig_cd4_qtl

Unnamed: 0,chr,start,end,chr_snp_pos
0,chr12,56007301,56007302,chr12_56007301
1,chr22,42074313,42074314,chr22_42074313
2,chr22,42080766,42080767,chr22_42080766
3,chr22,42080750,42080751,chr22_42080750
4,chr22,42078134,42078135,chr22_42078134
...,...,...,...,...
199622,chr1,160013969,160013970,chr1_160013969
199623,chr1,160015506,160015507,chr1_160015506
199624,chr1,160027419,160027420,chr1_160027419
199625,chr1,160022992,160022993,chr1_160022992


In [49]:
sig_bqtl_in_lcl = sig_b_qtl[sig_b_qtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
sig_bqtl_in_sig_lcl = sig_b_qtl[sig_b_qtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
sig_bqtl_in_non_sig_lcl = sig_b_qtl[sig_b_qtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]

In [68]:
len(sig_bqtl_in_lcl[~sig_bqtl_in_lcl.chr_snp_pos.isin(sig_cd4_qtl.chr_snp_pos)]['chr_snp_pos'].value_counts())

98

In [65]:
non_sig_bqtl_in_lcl = non_sig_bqtl[non_sig_bqtl['chr_snp_pos'].isin(final_lcl_positions['chr_snp_pos'])]
non_sig_bqtl_in_sig_lcl = non_sig_bqtl[non_sig_bqtl['chr_snp_pos'].isin(sig_lcl['chr_snp_pos'])]
non_sig_bqtl_in_non_sig_lcl = non_sig_bqtl[non_sig_bqtl['chr_snp_pos'].isin(non_sig_lcl['chr_snp_pos'])]