Requirements:
LCL data Tewhey et al., Table S1. Combined LCL Analysis for All 39,478 Ref/Alt Pairs Tested by MPRA, Related to Figure 2 (https://www.cell.com/fulltext/S0092-8674(16)30421-4)

In [None]:
import pandas as pd
import re
from collections import Counter
import scipy.stats as stats
import numpy as np
import statistics as stat
from get_config_yaml import get_config

In [None]:
def get_start_end_coord(position_sig):
    """
    LCL SNPs are centered in 150 bp sequences
    Determine start (-75 bp) and end (+75) coordinate of regulatory elements MPRA
    """ 
    start_mpra = []
    end_mpra = []
    for i in position_sig:
        start_mpra.append(i - 75)
        end_mpra.append(i + 75)
    return start_mpra, end_mpra


In [None]:
config = get_config()

In [23]:

lcl_mpra_path = (config['lcl_mpra'])

In [24]:
lcl = pd.read_csv(lcl_mpra_path, sep=';')

In [25]:
lcl

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
1,rs2016366,rs2016366,pos,ref,316.596386,258.902025,-0.281146,1.799521,0.000000,345.506770,401.928486,0.195118,0.437249,0.000000,0.344054,0.696614,0.476264,,
2,rs2016366_alt,rs2016366,pos,alt,653.148636,605.357051,-0.104744,0.364373,0.000000,627.652912,774.785548,0.287498,1.736553,0.000000,0.390497,0.395150,0.392242,,
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
4,rs646867_RC,rs646867,neg,ref,605.412960,595.896429,-0.023234,0.213685,0.000000,978.751908,806.449311,-0.270002,2.243712,0.000000,-0.287121,-0.179513,-0.246768,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39473,rs9621715_alt,rs9621715,pos,alt,650.529931,507.281685,-0.342574,2.658698,0.000000,563.523072,419.823896,-0.401997,2.811763,0.000000,-0.089697,-0.008966,-0.059423,,
39474,rs4275_RC,rs4275,neg,ref,718.716545,859.461801,0.250261,2.474741,0.000000,576.804932,802.800230,0.455674,5.422339,0.825984,0.167059,0.269337,0.205413,,
39475,rs131816_RC,rs131816,neg,ref,754.841862,634.680055,-0.245811,1.350291,0.000000,954.736488,1064.020915,0.151281,1.036517,0.000000,0.473005,0.270571,0.397092,,
39476,rs131816_RC_alt,rs131816,neg,alt,827.703506,840.290995,0.018192,0.117288,0.000000,753.191408,756.170624,0.003935,0.110077,0.000000,-0.011147,-0.019440,-0.014257,,


In [26]:
lcl['SNP'].value_counts()

rs115855724    4
rs118026199    4
rs116983424    4
rs112595714    4
rs118159794    4
              ..
rs114015819    1
rs76308922     1
rs10262443     1
rs11761517     1
rs2076041      1
Name: SNP, Length: 29173, dtype: int64

In [27]:
snp_chr = [i for i in lcl['SNP'] if i.startswith('chr')]

In [28]:
lcl[lcl['SNP'].isin(snp_chr)]['SNP'].value_counts()

chr17:44104410:D    4
chr17:44276431:I    4
chr17:44149352:D    4
chr17:44354157:I    4
chr17:44037106:I    4
                   ..
chr7:141469761:I    1
chr7:66072054:D     1
chr6:2930007:D      1
chr6:30558477:I     1
chr10:35471310:I    1
Name: SNP, Length: 2018, dtype: int64

In [29]:
len(snp_chr)

2755

Build conversion from grch37 to  grch38

In [30]:
#Extract chromosome, snp position and add an end position of snps without rsid for build conversion to grch38
snp_chr = [i for i in list(lcl['SNP'].unique()) if i.startswith('chr')]
start_pos = [int(i[5:].replace(':', '').replace('I', '').replace('D','')) for i in snp_chr]
end_pos = [i + 1 for i in start_pos]
chr_pos = [i[:5].replace(':', '') for i in list(lcl['SNP'].unique()) if i.startswith('chr')]

In [None]:
#Create input file for ensmbl
with open("total_unique_grch37_positions_input_ensmbl.txt","w") as f:
    if len(start_pos) == len(end_pos) == len(chr_pos) == len(snp_chr):
        for (chr,start,end,snp) in zip(chr_pos,start_pos,end_pos, snp_chr):
            f.write("{0} \t {1} \t {2} \t {3}\n".format(chr,start,end, snp))

In [None]:
#Read in output file of ensbml
unique_total_grch38 = pd.read_csv('data/total_unique_lcl_variants_grch38.bed', sep='\t', header=None)


In [None]:
unique_total_grch38

Unnamed: 0,0,1,2,3
0,chr1,150801063,150801064,chr1:150773539:I
1,chr1,247886531,247886532,chr1:248049833:D
2,chr1,95235667,95235668,chr1:95701223:I
3,chr1,150852051,150852052,chr1:150824527:I
4,chr1,175005506,175005507,chr1:174974642:D
...,...,...,...,...
1997,chr22,46291962,46291963,chr22:46687859:D
1998,chr22,32407055,32407056,chr22:32803042:D
1999,chr22,23969398,23969399,chr22:24311587:D
2000,chr22,49917230,49917231,chr22:50310878:I


In [35]:
#check missing snps
if len(unique_total_grch38) == len(chr_pos):
    print('True')
else:
    print('False', len(chr_pos) - len(unique_total_grch38))


False 16


In [36]:
lcl_chr = lcl[lcl['SNP'].isin(snp_chr)]

could not be converted:

In [37]:
missing_lcl_chr = lcl_chr[~lcl_chr['SNP'].isin(unique_total_grch38[3])]
missing_lcl_chr['SNP'].drop_duplicates()

11509     chr7:72197046:D
12229     chr7:74367161:D
12628     chr7:72209527:D
12865     chr7:72214746:D
15374    chr10:51583018:D
21592    chr15:22908713:D
22234    chr15:23283055:D
22635    chr15:83212280:D
24254    chr17:35254926:D
24487    chr17:34587955:D
29079    chr17:36904739:D
29411    chr17:36438743:I
30161    chr17:35598119:D
31397    chr17:36446921:D
31410    chr17:34968395:I
33858    chr17:34905449:I
Name: SNP, dtype: object

In [40]:
missing_ids = list(missing_lcl_chr['SNP'].unique())

In [41]:
len(unique_total_grch38)

2002

In [None]:
#check snps with no rsid
no_rs_id = [i for i in list(lcl['SNP']) if i.startswith('chr')]
no_rs_id = lcl[lcl['SNP'].isin(no_rs_id)]

In [None]:
no_rs_id

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
21,chr1:150773539:I_RC,chr1:150773539:I,neg,ref,333.963651,335.456529,-0.004300,0.349020,0.000000,367.746453,378.166913,0.022874,0.248667,0.000000,-0.244704,0.480306,0.027174,,
24,chr1:248049833:D_RC,chr1:248049833:D,neg,ref,101.584307,80.760099,-0.256048,0.523068,0.000000,163.008293,120.656706,-0.361262,1.444777,0.000000,0.025269,-0.322687,-0.105214,,
28,chr1:95701223:I,chr1:95701223:I,pos,ref,317.950783,339.020238,0.085936,0.110712,0.000000,312.317269,337.687680,0.099660,0.102379,0.000000,0.006836,0.025204,0.013724,,
34,chr1:150824527:I_RC,chr1:150824527:I,neg,ref,665.673312,928.793886,0.467121,7.948176,3.351821,616.995944,852.686405,0.437081,5.977152,1.380797,-0.149947,0.169805,-0.030040,0.385947,0.256515
69,chr1:174974642:D_RC,chr1:174974642:D,neg,ref,531.038174,458.535352,-0.196354,0.786907,0.000000,668.270004,726.296483,0.109224,0.502498,0.000000,0.310294,0.297718,0.305578,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39384,chr22:24311587:D_RC,chr22:24311587:D,neg,ref,981.570908,887.813493,-0.140719,0.825951,0.000000,1466.670551,1208.088993,-0.273060,3.078556,0.000000,-0.182409,-0.048896,-0.132342,,
39402,chr22:50310878:I,chr22:50310878:I,pos,ref,993.455891,850.487619,-0.218312,1.109198,0.000000,1174.904671,1006.382120,-0.217826,2.178905,0.000000,0.086053,-0.142126,0.000486,,
39403,chr22:50310878:I_alt,chr22:50310878:I,pos,alt,1126.995341,1137.968150,0.010008,0.234265,0.000000,938.036839,1040.206850,0.144965,0.732187,0.000000,0.199215,0.027862,0.134957,,
39446,chr22:32796098:D_RC,chr22:32796098:D,neg,ref,1051.218463,1032.588025,-0.025729,0.052462,0.000000,817.815702,785.592943,-0.057540,0.220910,0.000000,-0.096893,0.076659,-0.031811,,


In [43]:
no_i_or_d = [i for i in list(no_rs_id['SNP'].unique()) if not i.endswith(':I') and not i.endswith(':D')]
no_i_or_d

['chr1:25780893',
 'chr2:130952625',
 'chr4:7064219',
 'chr6:32629802',
 'chr6:32546828',
 'chr6:32605274',
 'chr6:32627992',
 'chr6:32629889',
 'chr7:56088811',
 'chr7:98741441',
 'chr7:56087474',
 'chr15:43897499',
 'chr16:74445537',
 'chr16:70187270',
 'chr16:1306981',
 'chr16:70190401',
 'chr16:70164334',
 'chr17:45214631',
 'chr17:21319121',
 'chr22:45723947']

In [44]:
chr_id = lcl[lcl['SNP'].isin(snp_chr)]
final_chr_id = chr_id[~chr_id['SNP'].isin(missing_ids)]
set_chr_id = final_chr_id.drop_duplicates(subset=['SNP'], keep='last')

In [45]:
final_chr_id['SNP'].value_counts()

chr17:44276431:I    4
chr17:44001549:I    4
chr17:44149352:D    4
chr17:44354157:I    4
chr17:44037106:I    4
                   ..
chr6:74179373:D     1
chr6:30566241:D     1
chr6:19956679:D     1
chr6:32631301:I     1
chr10:7793788:I     1
Name: SNP, Length: 2002, dtype: int64

In [None]:
id_chr = list(final_chr_id['SNP'].unique())
#set(list(set_chr_id['SNP']))

Create input file for snps without rsid

In [None]:
snp = [i for i in list(final_chr_id['SNP'].unique()) if i.startswith('chr')]
start_pos = [int(i[5:].replace(':', '').replace('I', '').replace('D','')) for i in snp]
end_pos = [i + 1 for i in start_pos]
chr_pos = [i[:5].replace(':', '') for i in list(final_chr_id['SNP'].unique()) if i.startswith('chr')]
with open("data/input_ensmbl_total_lcl_variants_grch37.txt","w") as f:
    if len(start_pos) == len(end_pos) == len(chr_pos) == len(snp):
        for (chr,start,end, snp) in zip(chr_pos,start_pos,end_pos, snp):
            f.write("{0} \t {1} \t {2} \t {3}\n".format(chr,start,end, snp))

***

output_assembly_converter file missing for reading: check and compare if worked

In [46]:
output_assembly_converter = unique_total_grch38


In [47]:
output_assembly_converter =output_assembly_converter[[3, 0 ,1]]

In [48]:
output_assembly_converter =output_assembly_converter.rename(columns={3:'snp', 0:'snp_chromosome', 1:'snp_position'})

In [49]:
final_chr_id[final_chr_id['SNP'].isin(list(output_assembly_converter['snp']))]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
21,chr1:150773539:I_RC,chr1:150773539:I,neg,ref,333.963651,335.456529,-0.004300,0.349020,0.000000,367.746453,378.166913,0.022874,0.248667,0.000000,-0.244704,0.480306,0.027174,,
24,chr1:248049833:D_RC,chr1:248049833:D,neg,ref,101.584307,80.760099,-0.256048,0.523068,0.000000,163.008293,120.656706,-0.361262,1.444777,0.000000,0.025269,-0.322687,-0.105214,,
28,chr1:95701223:I,chr1:95701223:I,pos,ref,317.950783,339.020238,0.085936,0.110712,0.000000,312.317269,337.687680,0.099660,0.102379,0.000000,0.006836,0.025204,0.013724,,
34,chr1:150824527:I_RC,chr1:150824527:I,neg,ref,665.673312,928.793886,0.467121,7.948176,3.351821,616.995944,852.686405,0.437081,5.977152,1.380797,-0.149947,0.169805,-0.030040,0.385947,0.256515
69,chr1:174974642:D_RC,chr1:174974642:D,neg,ref,531.038174,458.535352,-0.196354,0.786907,0.000000,668.270004,726.296483,0.109224,0.502498,0.000000,0.310294,0.297718,0.305578,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39384,chr22:24311587:D_RC,chr22:24311587:D,neg,ref,981.570908,887.813493,-0.140719,0.825951,0.000000,1466.670551,1208.088993,-0.273060,3.078556,0.000000,-0.182409,-0.048896,-0.132342,,
39402,chr22:50310878:I,chr22:50310878:I,pos,ref,993.455891,850.487619,-0.218312,1.109198,0.000000,1174.904671,1006.382120,-0.217826,2.178905,0.000000,0.086053,-0.142126,0.000486,,
39403,chr22:50310878:I_alt,chr22:50310878:I,pos,alt,1126.995341,1137.968150,0.010008,0.234265,0.000000,938.036839,1040.206850,0.144965,0.732187,0.000000,0.199215,0.027862,0.134957,,
39446,chr22:32796098:D_RC,chr22:32796098:D,neg,ref,1051.218463,1032.588025,-0.025729,0.052462,0.000000,817.815702,785.592943,-0.057540,0.220910,0.000000,-0.096893,0.076659,-0.031811,,


In [50]:
list(final_chr_id['SNP'].unique()) == list(output_assembly_converter['snp'])

True

In [None]:
len(output_assembly_converter)

2002

In [51]:
output_assembly_converter

Unnamed: 0,snp,snp_chromosome,snp_position
0,chr1:150773539:I,chr1,150801063
1,chr1:248049833:D,chr1,247886531
2,chr1:95701223:I,chr1,95235667
3,chr1:150824527:I,chr1,150852051
4,chr1:174974642:D,chr1,175005506
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
len(final_chr_id['SNP'].value_counts()) == len(output_assembly_converter)

True

***

### Format and combine files from biomart query and assembly converter to gain needed snp, chromosome and snp position information

In [None]:
total_biomart_syn_variants = pd.read_csv('data/biomart_total.txt')

In [None]:
total_biomart_syn_variants

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Synonym name
0,rs8768,dbSNP,1,26170849,26170849,rs79266459
1,rs66517664,dbSNP,1,2768235,2768235,rs76055539
2,rs66517664,dbSNP,HSCHR1_1_CTG3,2780219,2780219,rs76055539
3,rs259338,dbSNP,1,95272751,95272751,rs78140099
4,rs500513,dbSNP,1,234474793,234474793,rs76203336
...,...,...,...,...,...,...
13089,rs79990247,dbSNP,12,7924992,7924992,rs140493080
13090,rs77718176,dbSNP,7,100217868,100217868,rs113859809
13091,rs113177067,dbSNP,12,9965410,9965410,rs150347472
13092,rs112600168,dbSNP,9,31326631,31326631,rs141183894


In [None]:
missing_rsid_mart_export = pd.read_csv("mart_export (5).txt", sep='\t')

In [None]:
missing_rsid_mart_export =missing_rsid_mart_export.drop_duplicates(subset=['Variant name'], keep='first')
missing_rsid_mart_export

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Variant alleles,Minor allele (ALL),Synonym name
0,rs10413306,dbSNP,19,52782474,52782474,C/G,,
1,rs11279206,dbSNP,22,43120043,43120055,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG,,rs35315665
4,rs11673357,dbSNP,19,53197554,53197554,T/A/C,,NM_032559.5:c.1104G>A
13,rs35871241,dbSNP,7,72726370,72726370,G/C/T,,
14,rs3840965,dbSNP,22,50578781,50578782,CT/CTTCT,,rs140535917
15,rs59698086,dbSNP,22,21002604,21002609,AGACAG/AG,,rs148931161
16,rs61737955,dbSNP,19,54632756,54632756,C/A/G/T,,rs79640454


In [None]:
missing_rsid_mart_export = missing_rsid_mart_export[['Chromosome/scaffold position start (bp)', 'Variant alleles']]
missing_rsid_mart_export.columns = ['snp_position', 'variant_allele']
missing_rsid_mart_export

Unnamed: 0,snp_position,variant_allele
0,52782474,C/G
1,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
4,53197554,T/A/C
13,72726370,G/C/T
14,50578781,CT/CTTCT
15,21002604,AGACAG/AG
16,54632756,C/A/G/T


In [None]:
total_variants = pd.read_csv('data/martquery_1117101613_627.txt')

In [None]:
def change_column_names(dataframe):
    dataframe.columns = [i.lower().replace(' ', '_') for i in dataframe.columns]

In [None]:
change_column_names(total_biomart_syn_variants)
change_column_names(total_variants)

In [None]:
print('amount of unique variants found based on synonyms:',len(total_biomart_syn_variants['synonym_name'].value_counts()))

amount of unique variants found based on synonyms: 3497


In [None]:
chr_name = [i for i in total_biomart_syn_variants['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
syn_total_variants_unique = total_biomart_syn_variants[total_biomart_syn_variants['chromosome/scaffold_name'].isin(chr_name)]
syn_total_variants_unique = syn_total_variants_unique.drop_duplicates('synonym_name', keep='first')
syn_total_variants_unique

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs8768,dbSNP,1,26170849,26170849,rs79266459
1,rs66517664,dbSNP,1,2768235,2768235,rs76055539
3,rs259338,dbSNP,1,95272751,95272751,rs78140099
4,rs500513,dbSNP,1,234474793,234474793,rs76203336
5,rs873309,dbSNP,1,25431928,25431928,rs76583011
...,...,...,...,...,...,...
13089,rs79990247,dbSNP,12,7924992,7924992,rs140493080
13090,rs77718176,dbSNP,7,100217868,100217868,rs113859809
13091,rs113177067,dbSNP,12,9965410,9965410,rs150347472
13092,rs112600168,dbSNP,9,31326631,31326631,rs141183894


In [None]:
syn_total_variants_unique = syn_total_variants_unique.rename(columns={'synonym_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
syn_total_variants_unique = syn_total_variants_unique.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'variant_name'])
syn_total_variants_unique = syn_total_variants_unique[['snp', 'snp_chromosome', 'snp_position']]
#syn_total_variants_unique = syn_total_variants_unique.astype({'snp_chromosome': 'int64'})

In [None]:
syn_total_variants_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3495 entries, 0 to 13093
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             3495 non-null   object
 1   snp_chromosome  3495 non-null   object
 2   snp_position    3495 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 109.2+ KB


In [None]:
total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
2,rs114530232,dbSNP,1,42958380,42958380,VCV000668836
3,rs114530232,dbSNP,1,42958380,42958380,RCV000827794
4,rs114531441,dbSNP,1,37548222,37548222,
...,...,...,...,...,...,...
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426
53198,rs112262084,dbSNP,16,28884350,28884350,RCV000838015


In [None]:
chr_name = [i for i in total_variants['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_total_variants = total_variants[total_variants['chromosome/scaffold_name'].isin(chr_name)]
unique_total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
2,rs114530232,dbSNP,1,42958380,42958380,VCV000668836
3,rs114530232,dbSNP,1,42958380,42958380,RCV000827794
4,rs114531441,dbSNP,1,37548222,37548222,
...,...,...,...,...,...,...
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426
53198,rs112262084,dbSNP,16,28884350,28884350,RCV000838015


In [None]:
unique_total_variants = unique_total_variants.drop_duplicates('variant_name', keep='first')
unique_total_variants

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11809905,dbSNP,1,227334540,227334540,rs59906147
1,rs114530232,dbSNP,1,42958380,42958380,rs118162020
4,rs114531441,dbSNP,1,37548222,37548222,
5,rs11810220,dbSNP,1,163311300,163311300,
6,rs11811181,dbSNP,1,206551409,206551409,rs58730705
...,...,...,...,...,...,...
53194,rs73484568,dbSNP,9,33132494,33132494,rs73645258
53195,rs60115620,dbSNP,12,6456710,6456710,
53196,rs57161853,dbSNP,7,72747687,72747687,
53197,rs112262084,dbSNP,16,28884350,28884350,VCV000678426


In [None]:
#Select columns to keep
unique_total_variants = unique_total_variants.rename(columns={'variant_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_total_variants = unique_total_variants.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'synonym_name'])
unique_total_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,1,227334540
1,rs114530232,1,42958380
4,rs114531441,1,37548222
5,rs11810220,1,163311300
6,rs11811181,1,206551409
...,...,...,...
53194,rs73484568,9,33132494
53195,rs60115620,12,6456710
53196,rs57161853,7,72747687
53197,rs112262084,16,28884350


In [None]:
output_assembly_converter

Unnamed: 0,snp,snp_chromosome,snp_position
0,chr1:150773539:I,chr1,150801063
1,chr1:248049833:D,chr1,247886531
2,chr1:95701223:I,chr1,95235667
3,chr1:150824527:I,chr1,150852051
4,chr1:174974642:D,chr1,175005506
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
unique_total_variants['snp_chromosome'] = 'chr' + unique_total_variants['snp_chromosome']
syn_total_variants_unique['snp_chromosome'] = 'chr' + syn_total_variants_unique['snp_chromosome']

In [None]:
len(syn_total_variants_unique) +len(unique_total_variants)

27119

In [None]:
unique_total_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
53194,rs73484568,chr9,33132494
53195,rs60115620,chr12,6456710
53196,rs57161853,chr7,72747687
53197,rs112262084,chr16,28884350


In [None]:
#Concat all dataframs with variants together
final_lcl_positions = pd.concat([unique_total_variants, syn_total_variants_unique, output_assembly_converter])
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
1997,chr22:46687859:D,chr22,46291962
1998,chr22:32803042:D,chr22,32407055
1999,chr22:24311587:D,chr22,23969398
2000,chr22:50310878:I,chr22,49917230


In [None]:
print(len(lcl['SNP'].value_counts()), 'variants in total at beginning')

29173 variants in total at beginning


In [None]:
print(len(unique_total_variants) 
      + len(syn_total_variants_unique) 
      + len(output_assembly_converter), 'variants converted')

29121 variants where position was found for


In [None]:
variants = pd.concat([unique_total_variants, syn_total_variants_unique])

In [None]:
variants_list = list(variants['snp'])

In [None]:
with open("data/bed_files/lcl_snp_list.txt","w") as f:
    for snp in variants_list:
        f.write("{0} \n".format(snp))

***
#### Create MPRA coordinates for LCL B cell SNPS to compare with single-cell eQTLs with bedtools intersect

In [57]:
all_positions = list(final_lcl_positions['snp_position'])
start_mpra, end_mpra = get_start_end_coord(all_positions)
final_lcl_positions['start_coord'] = start_mpra
final_lcl_positions['end_coord'] = end_mpra
final_lcl_positions = final_lcl_positions.rename(columns={'snp_chromosome': 'chromosome'})

save SNPs with rsid and coordinates to file

In [None]:
rs_ids_final = final_lcl_positions.iloc[:27119]

In [None]:
rs_ids_final['snp'].to_csv('data/assembly/rs_ids_lcl.txt', header=None, index=False)

save snps without rsid and coordiantes to file

In [None]:
chr_ids = final_lcl_positions.iloc[27119:]

In [None]:
chr_ids

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,chr1:150773539:I,chr1,150801063,150800988,150801138
1,chr1:248049833:D,chr1,247886531,247886456,247886606
2,chr1:95701223:I,chr1,95235667,95235592,95235742
3,chr1:150824527:I,chr1,150852051,150851976,150852126
4,chr1:174974642:D,chr1,175005506,175005431,175005581
...,...,...,...,...,...
1997,chr22:46687859:D,chr22,46291962,46291887,46292037
1998,chr22:32803042:D,chr22,32407055,32406980,32407130
1999,chr22:24311587:D,chr22,23969398,23969323,23969473
2000,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
chr_ids_regions = chr_ids[['chromosome', 'start_coord', 'end_coord']]

In [None]:
chr_ids_regions.to_csv('data/assembly/chr_id_regions.txt', sep='\t', header=None, index=False)

Save information of all converted LCL SNPs to csv and excel

In [None]:
final_lcl_positions.to_csv('positions_all_lcl_variants.csv', index=False)
final_lcl_positions.to_excel('positions_all__lcl_variants.xlsx', index=False)

***
### Build converted LCL MPRA coordinates

In [77]:
final_lcl_positions_path = (config['lcl_mpra_positions'])

In [None]:
final_lcl_positions

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
3,rs11810220,chr1,163311300,163311225,163311375
4,rs11811181,chr1,206551409,206551334,206551484
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


Bed format coordinates of all lcl variants

In [3]:
final_lcl_positions = final_lcl_positions.rename(columns={'chromosome': 'snp_chromosome'})

In [60]:
bed_total_lcl_variants_coords = final_lcl_positions[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]
bed_total_lcl_variants_coords.to_csv('data/total_lcl_variants_coords.txt', sep='\t', header=None, index=False)

***

### Determine significant LCL MPRA regions

In [61]:
sig_variants = lcl.dropna()

In [62]:
sig_variants

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
6,rs112338151,rs112338151,pos,ref,311.022339,938.600426,1.454065,35.570382,30.974027,441.094084,1188.919426,1.333956,46.863102,42.266747,-0.149852,-0.070538,-0.120109,1.108686,0.743469
14,rs10910099_RC,rs10910099,neg,ref,1565.369298,2410.813917,0.609195,18.726474,14.130119,1597.275307,1901.533139,0.243351,4.277202,0.000000,-0.319983,-0.442277,-0.365843,2.609976,1.744575
17,rs61731104,rs61731104,pos,ref,787.310108,1106.543203,0.472118,7.158719,2.562364,851.116339,948.197074,0.137500,2.071566,0.000000,-0.426533,-0.181428,-0.334618,1.448329,0.973007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39435,rs6002380_RC,rs6002380,neg,ref,1021.939757,1393.401924,0.436551,7.957672,3.361317,1410.403292,1772.580532,0.318500,3.737057,0.000000,-0.072219,-0.194436,-0.118050,0.225938,0.152430
39454,rs12165508_RC,rs12165508,neg,ref,905.677775,1186.698469,0.370023,10.844473,6.248118,602.818906,793.453679,0.368042,5.622711,1.026356,-0.004909,0.002897,-0.001982,0.082316,0.061067
39461,rs73439311_RC,rs73439311,neg,ref,225.047471,518.727303,1.021873,15.889120,11.292765,264.268231,367.242135,0.333214,5.915880,1.319524,-0.768413,-0.555736,-0.688659,2.142339,1.440353
39467,rs2234058,rs2234058,pos,ref,673.043758,1373.481543,0.938229,34.201737,29.605382,587.687732,1299.793269,1.040252,37.364005,32.767650,0.089103,0.123557,0.102023,0.925545,0.622003


In [None]:
chr_variants = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]

build 37 to 38 for snps without a rsid

In [63]:
# Input assembly converter ensmbl: chromosome, start position, stop position
# Add an stop position just in case the position in removed in the new build
snp_sig = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
start_pos_sig = [int(i[5:].replace(':', '').replace('I','').replace('D', '')) for i in snp_sig]
end_pos_sig = [i + 1 for i in start_pos_sig]
chr_pos_sig = [i[:5].replace(':', '') for i in snp_sig if i.startswith('chr')]

In [None]:
# Create a txt file in bed format for input in assembly converter of ensmbl
with open("grch37sig_input_assembly_converter.txt","w") as f:
    for (chr_pos_sig,start_pos_sig,end_pos_sig, snp_sig) in zip(chr_pos_sig,start_pos_sig,end_pos_sig,snp_sig):
        f.write("{0} \t {1} \t {2} \t {3}\n".format(chr_pos_sig,start_pos_sig,end_pos_sig,snp_sig))

In [None]:
#Load complete file converted positions to build 38
grch38 = pd.read_csv('data/unique_lcl_sig_chr_coord.bed', sep='\t', header=None, names=['snp_chromosome', 'snp_position', 'end_coord', 'snp'])

In [None]:
grch38

Unnamed: 0,snp_chromosome,snp_position,end_coord,snp
0,chr1,150852051,150852052,chr1:150824527:I
1,chr1,205784748,205784749,chr1:205753876:D
2,chr1,172024714,172024715,chr1:171993854:D
3,chr1,41034295,41034296,chr1:41499967:I
4,chr1,22025902,22025903,chr1:22352395:D
...,...,...,...,...
208,chr20,25549209,25549210,chr20:25529845:D
209,chr21,36973064,36973065,chr21:38345364:I
210,chr21,28955410,28955411,chr21:30327732:D
211,chr22,49917233,49917234,chr22:50310881:D


In [65]:
#Missing variants build lift over
sig_variants_chr = [i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
sig_variants_chr = sig_variants[sig_variants['SNP'].isin(sig_variants_chr)]
sig_variants_chr[~sig_variants_chr['SNP'].isin(grch38['snp'])]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
15374,chr10:51583018:D_RC,chr10:51583018:D,neg,ref,182.744215,453.229658,1.186026,16.662832,12.066477,256.76964,320.133604,0.275901,0.678229,0.0,-0.95012,-0.843468,-0.910125,1.952377,1.313647
29079,chr17:36904739:D,chr17:36904739:D,pos,ref,1594.207418,2515.344218,0.615043,22.826099,18.229744,1589.62954,3432.485335,1.049011,54.936248,50.339893,0.409717,0.474387,0.433968,4.923863,3.157711
29080,chr17:36904739:D_RC,chr17:36904739:D,neg,ref,1546.975363,3675.040906,1.170987,69.106165,64.50981,957.421529,2705.992119,1.388305,93.211848,88.615492,0.152803,0.324841,0.217317,1.512971,1.016804
29411,chr17:36438743:I_RC,chr17:36438743:I,neg,ref,1110.254661,1702.627803,0.598501,10.226223,5.629867,888.939888,2324.063225,1.347996,60.284617,55.688262,0.788642,0.68425,0.749495,4.144482,2.724976


In [None]:
#Check if the file has the same length as the dataframe
if len(grch38) == len(chr_pos_sig):
    print('True')
else:
    print('False')
    print('missing', len(chr_variants)- len(grch38))

In [None]:
#Check in wich position coordinates are missingA
#Counter(chr_pos_sig)

In [None]:
#Check in wich position coordinates are missingA
#Counter(list(grch38['snp_chromosome']))

Positions that are missing in output from assembly converter
- chr17 	 36904739 	 36904740 - chr17:36904739:D
- chr17 	 36438743 	 36438744 - chr17:36438743:I
- chr10 	 51583018 	 51583019 - chr10:51583018:D

In [None]:
sig_variants

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
0,rs11548103_RC,rs11548103,neg,ref,893.147913,1403.234147,0.637129,20.726159,16.129804,985.132571,1413.418102,0.512199,24.239298,19.642943,-0.157649,-0.070399,-0.124930,1.197350,0.803895
3,rs11102212_RC,rs11102212,neg,ref,272.682393,724.187989,1.276380,20.903079,16.306723,270.606119,663.387712,1.183784,18.276201,13.679846,-0.182603,0.057417,-0.092596,0.151356,0.102546
6,rs112338151,rs112338151,pos,ref,311.022339,938.600426,1.454065,35.570382,30.974027,441.094084,1188.919426,1.333956,46.863102,42.266747,-0.149852,-0.070538,-0.120109,1.108686,0.743469
14,rs10910099_RC,rs10910099,neg,ref,1565.369298,2410.813917,0.609195,18.726474,14.130119,1597.275307,1901.533139,0.243351,4.277202,0.000000,-0.319983,-0.442277,-0.365843,2.609976,1.744575
17,rs61731104,rs61731104,pos,ref,787.310108,1106.543203,0.472118,7.158719,2.562364,851.116339,948.197074,0.137500,2.071566,0.000000,-0.426533,-0.181428,-0.334618,1.448329,0.973007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39435,rs6002380_RC,rs6002380,neg,ref,1021.939757,1393.401924,0.436551,7.957672,3.361317,1410.403292,1772.580532,0.318500,3.737057,0.000000,-0.072219,-0.194436,-0.118050,0.225938,0.152430
39454,rs12165508_RC,rs12165508,neg,ref,905.677775,1186.698469,0.370023,10.844473,6.248118,602.818906,793.453679,0.368042,5.622711,1.026356,-0.004909,0.002897,-0.001982,0.082316,0.061067
39461,rs73439311_RC,rs73439311,neg,ref,225.047471,518.727303,1.021873,15.889120,11.292765,264.268231,367.242135,0.333214,5.915880,1.319524,-0.768413,-0.555736,-0.688659,2.142339,1.440353
39467,rs2234058,rs2234058,pos,ref,673.043758,1373.481543,0.938229,34.201737,29.605382,587.687732,1299.793269,1.040252,37.364005,32.767650,0.089103,0.123557,0.102023,0.925545,0.622003


In [None]:
sig_pos_not_found = ['chr17:36904739:D', 'chr17:36438743:I', 'chr10:51583018:D']
#select rows that do not contain above id's
sig_variants = sig_variants[~sig_variants['SNP'].isin(sig_pos_not_found)]
# id's to a list
sig_chr_variants =[i for i in sig_variants['SNP'].unique() if i.startswith('chr')]
# # create new coordinates
# start_pos_sig_new = [int(i[5:].replace(':', '').replace('I','').replace('D', '')) for i in list(sig_variants['SNP'].unique()) if i.startswith('chr')]
# end_pos_sig_new = [i + 1 for i in start_pos_sig_new]
# chr_pos_sig_new = [i[:5].replace(':', '') for i in list(sig_variants['SNP'].unique()) if i.startswith('chr')]

In [None]:
#Check if the file has the same length as the dataframe
if len(grch38) == len(sig_chr_variants):
    print('True')
else:
    print('False')
    print('missing', len(chr_variants)- len(grch38))

True


In [None]:
grch38 = grch38[['snp', 'snp_chromosome', 'snp_position']]

Load biomart export/output files

In [67]:
total_synonym = pd.read_csv('mart_export_total_synonym.txt')
total_variant_name_biomart = pd.read_csv('mart_export_total_variantname.txt')

In [68]:
change_column_names(total_synonym)
change_column_names(total_variant_name_biomart)

In [69]:
len(total_variant_name_biomart['variant_name'].value_counts()) + len(total_synonym['synonym_name'].value_counts())

3374

In [70]:
chr_name = [i for i in total_synonym['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_syn = total_synonym[total_synonym['chromosome/scaffold_name'].isin(chr_name)]
unique_syn

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs2356416,dbSNP,1,45567593,45567593,rs74785550
1,rs7534581,dbSNP,1,1659114,1659114,rs9661285
2,rs112868731,dbSNP,3,41841863,41841863,rs144523572
3,rs111248130,dbSNP,4,55242231,55242231,rs145051830
4,rs55993837,dbSNP,3,41850539,41850539,rs140970237
...,...,...,...,...,...,...
1927,rs2885047,dbSNP,9,470259,470259,rs113679677
1928,rs4301823,dbSNP,12,57792811,57792811,rs56261123
1929,rs13221668,dbSNP,7,73763368,73763368,rs74539570
1930,rs4000157,dbSNP,7,32730090,32730090,rs145127609


In [71]:
unique_syn = unique_syn.rename(columns={'synonym_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_syn = unique_syn.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'variant_name'])
unique_syn[['snp', 'snp_chromosome', 'snp_position']]

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs74785550,1,45567593
1,rs9661285,1,1659114
2,rs144523572,3,41841863
3,rs145051830,4,55242231
4,rs140970237,3,41850539
...,...,...,...
1927,rs113679677,9,470259
1928,rs56261123,12,57792811
1929,rs74539570,7,73763368
1930,rs145127609,7,32730090


In [72]:
total_chr_name = [i for i in total_variant_name_biomart['chromosome/scaffold_name'] if re.match('^(\d{1,2})', i)]
unique_chr_total = total_variant_name_biomart[total_variant_name_biomart['chromosome/scaffold_name'].isin(total_chr_name)]
unique_chr_total

Unnamed: 0,variant_name,variant_source,chromosome/scaffold_name,chromosome/scaffold_position_start_(bp),chromosome/scaffold_position_end_(bp),synonym_name
0,rs11810220,dbSNP,1,163311300,163311300,
1,rs11585048,dbSNP,1,2602648,2602648,rs59642996
3,rs11585844,dbSNP,1,37563668,37563668,
4,rs11587500,dbSNP,1,24190390,24190390,rs17184644
5,rs11587500,dbSNP,1,24190390,24190390,rs59459702
...,...,...,...,...,...,...
7902,rs111980103,dbSNP,16,970874,970874,RCV001667153
7903,rs111980103,dbSNP,16,970874,970874,RCV002421238
7904,rs59522292,dbSNP,12,124914185,124914185,
7905,rs56812038,dbSNP,7,32681035,32681035,


In [73]:
unique_chr_total = unique_chr_total.drop_duplicates(['variant_name'], keep='last')

In [74]:
unique_chr_total = unique_chr_total.rename(columns={'variant_name': 'snp','chromosome/scaffold_name': 'snp_chromosome', 'chromosome/scaffold_position_start_(bp)': 'snp_position'})
unique_chr_total = unique_chr_total.drop(columns=['variant_source', 'chromosome/scaffold_position_end_(bp)', 'synonym_name'])

In [75]:
unique_chr_total['snp_chromosome'] = 'chr' + unique_chr_total['snp_chromosome']

In [76]:
unique_syn = unique_syn[['snp', 'snp_chromosome', 'snp_position']]

In [None]:
#unique_syn['snp_chromosome'] = 'chr' + unique_syn['snp_chromosome']

In [None]:
grch38

Unnamed: 0,snp_chromosome,snp_position,end_coord,snp
0,chr1,150852051,150852052,chr1:150824527:I
1,chr1,205784748,205784749,chr1:205753876:D
2,chr1,172024714,172024715,chr1:171993854:D
3,chr1,41034295,41034296,chr1:41499967:I
4,chr1,22025902,22025903,chr1:22352395:D
...,...,...,...,...
208,chr20,25549209,25549210,chr20:25529845:D
209,chr21,36973064,36973065,chr21:38345364:I
210,chr21,28955410,28955411,chr21:30327732:D
211,chr22,49917233,49917234,chr22:50310881:D


In [77]:
unique_syn['snp_chromosome'] = 'chr' +unique_syn['snp_chromosome']
unique_syn

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs74785550,chr1,45567593
1,rs9661285,chr1,1659114
2,rs144523572,chr3,41841863
3,rs145051830,chr4,55242231
4,rs140970237,chr3,41850539
...,...,...,...
1927,rs113679677,chr9,470259
1928,rs56261123,chr12,57792811
1929,rs74539570,chr7,73763368
1930,rs145127609,chr7,32730090


In [None]:
unique_chr_total

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11810220,chr1,163311300
1,rs11585048,chr1,2602648
3,rs11585844,chr1,37563668
5,rs11587500,chr1,24190390
6,rs11588318,chr1,200669534
...,...,...,...
7892,rs4553633,chr16,2660393
7903,rs111980103,chr16,970874
7904,rs59522292,chr12,124914185
7905,rs56812038,chr7,32681035


In [79]:
positions_sig_variants = pd.concat([unique_chr_total, unique_syn, grch38])

In [None]:
positions_sig_variants

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11810220,chr1,163311300
1,rs11585048,chr1,2602648
3,rs11585844,chr1,37563668
5,rs11587500,chr1,24190390
6,rs11588318,chr1,200669534
...,...,...,...
208,chr20:25529845:D,chr20,25549209
209,chr21:38345364:I,chr21,36973064
210,chr21:30327732:D,chr21,28955410
211,chr22:50310881:D,chr22,49917233


In [None]:
positions_sig_variants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3585 entries, 0 to 212
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             3585 non-null   object
 1   snp_chromosome  3585 non-null   object
 2   snp_position    3585 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 112.0+ KB


With the variant of interest centered within 150 bp of genomic sequence.

In [81]:
position_sig = list(positions_sig_variants['snp_position'])

In [82]:
start_mpra, end_mpra = get_start_end_coord(position_sig)

In [83]:
positions_sig_variants['start_coord'] = start_mpra
positions_sig_variants['end_coord'] = end_mpra

In [84]:
positions_sig_variants = positions_sig_variants.rename(columns={'snp_chromosome':'chromosome'})

Save MPRA coordinate information of signifciant LCL regions + variants

In [None]:
positions_sig_variants.to_csv('positions_sig_lcl_variants.csv', index=False)
positions_sig_variants.to_excel('positions_sig_lcl_variants.xlsx', index=False)

In [None]:
#ositions_sig_variants = pd.read_csv('positions_sig_lcl_variants.csv')

In [5]:
positions_sig_variants

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11810220,chr1,163311300,163311225,163311375
1,rs11585048,chr1,2602648,2602573,2602723
2,rs11585844,chr1,37563668,37563593,37563743
3,rs11587500,chr1,24190390,24190315,24190465
4,rs11588318,chr1,200669534,200669459,200669609
...,...,...,...,...,...
3580,chr20:25529845:D,chr20,25549209,25549134,25549284
3581,chr21:38345364:I,chr21,36973064,36972989,36973139
3582,chr21:30327732:D,chr21,28955410,28955335,28955485
3583,chr22:50310881:D,chr22,49917233,49917158,49917308


In [6]:
positions_sig_variants = positions_sig_variants.rename(columns={'chromosome': 'snp_chromosome'})

Bed format txt file significant lcl variants

In [88]:
bed_format_sig_variants = positions_sig_variants[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [None]:
bed_format_sig_variants.to_csv('data/lcl_significant_variants_coords.txt', sep='\t', header=None, index=False)

In [89]:
bed_format_sig_variants.dtypes

snp_chromosome    object
start_coord        int64
end_coord          int64
snp               object
dtype: object

***
### lcl_mpra_position_overlap.txt

In [None]:
with open('lcl_mpra_position_overlap.txt', 'r') as file:
    overlap_positions = [line.strip() for line in file]

In [None]:
overlap_positions =[int(i) for i in overlap_positions]

In [None]:
final_lcl_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29121 entries, 0 to 29120
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   snp             29121 non-null  object
 1   snp_chromosome  29121 non-null  object
 2   snp_position    29121 non-null  int64 
 3   start_coord     29121 non-null  int64 
 4   end_coord       29121 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.1+ MB


In [None]:
snp_overlap = final_lcl_positions[final_lcl_positions['snp_position'].isin(overlap_positions)]

In [None]:
len(snp_overlap)

42

In [None]:
id_overlap = list(snp_overlap['snp'].unique())

In [None]:
lcl[lcl['SNP'].isin(id_overlap)]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
655,rs74045976_RC,rs74045976,neg,ref,1306.076342,983.851139,-0.400317,8.330693,3.734337,1291.250043,1131.056900,-0.190530,3.094454,0.000000,0.251751,0.139849,0.209788,2.237498,1.511666
2987,rs113612815_RC,rs113612815,neg,ref,800.811823,882.603007,0.135318,0.475015,0.000000,634.185218,891.976620,0.478592,12.801986,8.205631,0.426001,0.205397,0.343275,1.188536,0.798784
3278,rs73153267_RC,rs73153267,neg,ref,561.648645,1675.101454,1.322058,26.541084,21.944729,582.374586,1965.788116,1.465996,29.878681,25.282326,0.138312,0.153316,0.143938,1.061545,0.713922
5759,rs6766641_RC,rs6766641,neg,ref,926.149452,1689.439863,0.843489,34.232160,29.635805,1105.476659,2033.612235,0.850945,31.809001,27.212646,0.030574,-0.031075,0.007456,0.113630,0.079122
8403,rs35188965_RC,rs35188965,neg,ref,1559.033700,3084.504088,0.873597,40.807681,36.211326,1947.007675,2445.217021,0.298748,6.243791,1.647436,-0.393967,-0.876318,-0.574849,5.300203,3.353410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38823,rs7284713_RC,rs7284713,neg,ref,1111.082823,1537.327584,0.456201,6.626448,2.030093,1350.609225,2099.631551,0.621997,14.834651,10.238296,0.126262,0.231685,0.165796,1.886733,1.271346
38824,rs7284713_RC_alt,rs7284713,neg,alt,1347.691448,1929.248091,0.506624,11.097647,6.501292,885.809304,1278.147327,0.513627,8.404860,3.808504,0.006982,0.007037,0.007003,0.006943,0.005538
39125,rs7293064_RC,rs7293064,neg,ref,1310.697897,3027.322186,1.176897,58.828213,54.231858,1382.291887,2557.973698,0.866727,27.741980,23.145625,-0.398131,-0.163569,-0.310170,2.046625,1.372637
39126,rs7293064_RC_alt,rs7293064,neg,alt,1199.390610,2625.811458,1.099562,41.503023,36.906668,1176.475150,2072.832588,0.794935,20.681986,16.085631,-0.348721,-0.231136,-0.304627,2.106100,1.414579


In [None]:
sig_overlap = sig_variants[sig_variants['SNP'].isin(id_overlap)]

In [None]:
sig_overlap.drop_duplicates('SNP').head()

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
655,rs74045976_RC,rs74045976,neg,ref,1306.076342,983.851139,-0.400317,8.330693,3.734337,1291.250043,1131.0569,-0.19053,3.094454,0.0,0.251751,0.139849,0.209788,2.237498,1.511666
2987,rs113612815_RC,rs113612815,neg,ref,800.811823,882.603007,0.135318,0.475015,0.0,634.185218,891.97662,0.478592,12.801986,8.205631,0.426001,0.205397,0.343275,1.188536,0.798784
3278,rs73153267_RC,rs73153267,neg,ref,561.648645,1675.101454,1.322058,26.541084,21.944729,582.374586,1965.788116,1.465996,29.878681,25.282326,0.138312,0.153316,0.143938,1.061545,0.713922
5759,rs6766641_RC,rs6766641,neg,ref,926.149452,1689.439863,0.843489,34.23216,29.635805,1105.476659,2033.612235,0.850945,31.809001,27.212646,0.030574,-0.031075,0.007456,0.11363,0.079122
8403,rs35188965_RC,rs35188965,neg,ref,1559.0337,3084.504088,0.873597,40.807681,36.211326,1947.007675,2445.217021,0.298748,6.243791,1.647436,-0.393967,-0.876318,-0.574849,5.300203,3.35341


***
create bedfile for checking intersect between lcl mpra and starr seq

In [None]:
final_lcl_positions

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
3,rs11810220,chr1,163311300,163311225,163311375
4,rs11811181,chr1,206551409,206551334,206551484
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
positions_sig_variants

Unnamed: 0,snp,snp_chromosome,snp_position,end_coord,start_coord
0,rs11810220,chr1,163311300,163311375,163311225
1,rs11585048,chr1,2602648,2602723,2602573
3,rs11585844,chr1,37563668,37563743,37563593
5,rs11587500,chr1,24190390,24190465,24190315
6,rs11588318,chr1,200669534,200669609,200669459
...,...,...,...,...,...
208,chr20:25529845:D,chr20,25549209,25549284,25549134
209,chr21:38345364:I,chr21,36973064,36973139,36972989
210,chr21:30327732:D,chr21,28955410,28955485,28955335
211,chr22:50310881:D,chr22,49917233,49917308,49917158


In [7]:
non_significant_lcl_positions = final_lcl_positions[~final_lcl_positions['snp'].isin(positions_sig_variants['snp'])]

In [8]:
non_significant_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
0,rs11809905,chr1,227334540,227334465,227334615
1,rs114530232,chr1,42958380,42958305,42958455
2,rs114531441,chr1,37548222,37548147,37548297
4,rs11811181,chr1,206551409,206551334,206551484
5,rs114569995,chr1,169828815,169828740,169828890
...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037
29117,chr22:32803042:D,chr22,32407055,32406980,32407130
29118,chr22:24311587:D,chr22,23969398,23969323,23969473
29119,chr22:50310878:I,chr22,49917230,49917155,49917305


In [None]:
(len(final_lcl_positions) - len(positions_sig_variants)) == len(non_significant_lcl_positions)

True

In [9]:
positions_sig_variants = positions_sig_variants.astype({'snp_chromosome':'str'})
#positions_sig_variants['snp_chromosome'] = 'chr' + positions_sig_variants['snp_chromosome']
sig_lcl_coordinates = positions_sig_variants[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [None]:
duplicate_start_coord = list(final_lcl_positions[final_lcl_positions['start_coord'].duplicated()]['start_coord'])


In [None]:
sig_lcl_coordinates[sig_lcl_coordinates['start_coord'].isin(duplicate_start_coord)]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
5232,chr2,64917483,64917633,rs4671630


In [None]:
final_lcl_positions[(final_lcl_positions['start_coord'] == 45550611) | (final_lcl_positions['start_coord'] == 45854732)
                    | (final_lcl_positions['start_coord'] == 46021420)| (final_lcl_positions['start_coord'] == 46088639	)]

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
15489,rs78363416,chr10,45550686,45550611,45550761
24952,rs111961982,chr17,45854807,45854732,45854882
26054,rs117757814,chr17,46021495,46021420,46021570
26119,rs187481091,chr17,46088714,46088639,46088789
28103,chr10:46046134:D,chr10,45550686,45550611,45550761
28805,chr17:43932173:I,chr17,45854807,45854732,45854882
28841,chr17:44098861:D,chr17,46021495,46021420,46021570
28867,chr17:44166080:I,chr17,46088714,46088639,46088789


In [None]:
len(final_lcl_positions)

29121

In [None]:
final_lcl_positions[final_lcl_positions['start_coord'] == 35409905]

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord
29052,chr20:33997783:D,chr20,35409980,35409905,35410055


In [None]:
overlap_sig_lcl_vs_total = final_lcl_positions[final_lcl_positions['start_coord'].isin(list(sig_lcl_coordinates['start_coord']))]
sig_lcl_coordinates[~sig_lcl_coordinates['start_coord'].isin(list(overlap_sig_lcl_vs_total['start_coord']))]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp


In [None]:
not_found = ['chr10:51583018:D', 'chr16:70190401', 'chr17:36438743:I', 'chr17:36904739:D', 'chr6:32546828', 'chr6:32627992', 'chr6:32629889' ]

In [None]:
sig_lcl_coordinates[sig_lcl_coordinates['snp'].isin(not_found)]

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
56,chr6,32578976,32579126,chr6:32546828
60,chr6,32660140,32660290,chr6:32627992
65,chr6,32662037,32662187,chr6:32629889
132,chr16,70156423,70156573,chr16:70190401


In [None]:
lcl[lcl['SNP'].isin(not_found)]

Unnamed: 0,ID,SNP,Direction,Haplotype,C.A.ctrl.mean,C.A.exp.mean,C.A.log2FC,C.A.logP,C.A.logPadj,C.B.ctrl.mean,C.B.exp.mean,C.B.log2FC,C.B.logP,C.B.logPadj,LogSkew.12878,LogSkew.19239,LogSkew.Comb,C.Skew.logP,C.Skew.fdr
9211,chr6:32546828_RC,chr6:32546828,neg,ref,1751.475564,2126.262888,0.269681,8.242406,3.646051,962.859339,1318.048194,0.428452,5.508337,0.911982,0.169623,0.140685,0.158771,0.840362,0.563019
10004,chr6:32627992,chr6:32627992,pos,ref,929.803534,1035.868995,0.150967,0.908504,0.0,766.375543,749.135712,-0.031337,0.065648,0.0,-0.144168,-0.245862,-0.182303,,
10005,chr6:32627992_RC,chr6:32627992,neg,ref,488.609921,658.676346,0.405744,6.276884,1.680529,641.959949,748.224151,0.19468,1.38265,0.0,-0.22191,-0.192988,-0.211065,,
10006,chr6:32627992_alt,chr6:32627992,pos,alt,1200.106215,1189.044047,-0.013631,0.010621,0.0,1185.058096,1159.889391,-0.030602,0.058264,0.0,-0.032936,0.009636,-0.016972,,
10007,chr6:32627992_RC_alt,chr6:32627992,neg,alt,678.591264,899.653581,0.383805,3.394353,0.0,583.460405,948.598412,0.683989,23.760747,19.164392,0.291108,0.315311,0.300184,1.116858,0.747744
10483,chr6:32629889_RC,chr6:32629889,neg,ref,768.454246,813.127091,0.078635,0.158112,0.0,905.965463,950.395477,0.058967,0.32909,0.0,-0.133572,0.170173,-0.019668,,
10484,chr6:32629889_RC_alt,chr6:32629889,neg,alt,1111.677995,1519.630078,0.441915,10.166889,5.570534,858.686278,1200.966879,0.466767,9.820155,5.2238,0.137883,-0.163532,0.024852,0.345416,0.231954
15374,chr10:51583018:D_RC,chr10:51583018:D,neg,ref,182.744215,453.229658,1.186026,16.662832,12.066477,256.76964,320.133604,0.275901,0.678229,0.0,-0.95012,-0.843468,-0.910125,1.952377,1.313647
23634,chr16:70190401,chr16:70190401,pos,ref,647.729075,1106.308855,0.742134,22.68277,18.086415,641.985305,797.271487,0.294588,1.852254,0.0,-0.354863,-0.602018,-0.447546,2.368307,1.597605
23635,chr16:70190401_RC,chr16:70190401,neg,ref,1295.031274,1419.48431,0.125623,0.835485,0.0,1006.937677,1151.313527,0.183254,1.414055,0.0,0.049362,0.071414,0.057631,,


chr10	51252949	51253099	chr10:51583018:D
chr16	651829	651979	chr16:70190401
chr17	33692192	33692342	chr17:36438743:I
chr17	34158190	34158340	chr17:36904739:D
chr6	325393	325543	chr6:32546828
chr6	326204	326354	chr6:32627992
chr6	326223	326373	chr6:32629889

Final version LCL coordinates and sig LCL coordinates

In [None]:
sig_lcl_coordinates.to_csv('data/sig_lcl_coordinates.txt', header=None, index=False, sep='\t')

In [10]:
final_lcl_positions = final_lcl_positions.astype({'snp_chromosome':'str'})
#final_lcl_positions['snp_chromosome'] = 'chr' + final_lcl_positions['snp_chromosome']
lcl_coordinates = final_lcl_positions[['snp_chromosome', 'start_coord', 'end_coord', 'snp']]

In [17]:
lcl_coordinates

Unnamed: 0,snp_chromosome,start_coord,end_coord,snp
0,chr1,227334465,227334615,rs11809905
1,chr1,42958305,42958455,rs114530232
2,chr1,37548147,37548297,rs114531441
3,chr1,163311225,163311375,rs11810220
4,chr1,206551334,206551484,rs11811181
...,...,...,...,...
29116,chr22,46291887,46292037,chr22:46687859:D
29117,chr22,32406980,32407130,chr22:32803042:D
29118,chr22,23969323,23969473,chr22:24311587:D
29119,chr22,49917155,49917305,chr22:50310878:I


In [None]:
lcl_coordinates.to_csv('data/lcl_coordinates.txt', header=None, index=False, sep='\t')

***

Extra: determine allele changes of SNPs/variants for possible downstream analysis (LCL SNP - eQTL SNP comparison)

delete next two code cells, as they are in the new notebook

In [122]:
sig_bqtl_sig_lcl_snps = list(sig_lcl[sig_lcl['chr_snp_pos'].isin(sig_bqtls_in_sig_lcl['chr_snp_pos'])]['snp'])

In [None]:
with open("data/bed_files/sig_bqtl_sig_lcl_snps.txt","w") as f:
    for snp in sig_bqtl_sig_lcl_snps:
        f.write("{0} \n".format(snp))

Search in biomart based on rsid and synomymn rsid if not found initially

In [None]:
synonym_var_lcl_snps = pd.read_csv("C:/Users/annav/Downloads/martquery_0524115334_816.txt.gz", sep='\t')

In [None]:
var_lcl_snps = pd.read_csv("C:/Users/annav/Downloads/martquery_0524120923_962.txt.gz", sep='\t')
var_lcl_snps

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Minor allele (ALL),Variant alleles,Synonym name,Global minor allele frequency (all individuals)
0,rs1344,dbSNP,1,147647471,147647471,,G/A,rs698505,
1,rs1344,dbSNP,1,147647471,147647471,,G/A,rs3170854,
2,rs1344,dbSNP,1,147647471,147647471,,G/A,rs17850585,
3,rs1344,dbSNP,1,147647471,147647471,,G/A,rs60715787,
4,rs1344,dbSNP,1,147647471,147647471,,G/A,rs386528725,
...,...,...,...,...,...,...,...,...,...
50196,rs5996114,dbSNP,HSCHR22_2_CTG1,35204,35204,,C/G/T,,
50197,rs4995141,dbSNP,HSCHR14_3_CTG1,845060,845060,,C/T,,
50198,rs4995141,dbSNP,14,106353377,106353377,,C/T,,
50199,rs144711656,dbSNP,HSCHR14_3_CTG1,840975,840975,,T/C,,


In [None]:
var_lcl_snps = var_lcl_snps[var_lcl_snps['Chromosome/scaffold name'] != 'X']
var_lcl_snps = var_lcl_snps[var_lcl_snps['Chromosome/scaffold name'].str.contains(r'^\d+$')]
var_lcl_snps = var_lcl_snps.drop_duplicates(subset=['Variant name'], keep='first')


In [None]:
var_lcl_snps = var_lcl_snps[['Variant name', 'Chromosome/scaffold name', 'Chromosome/scaffold position start (bp)', 'Variant alleles']]
var_lcl_snps.columns = ['snp', 'chr', 'position', 'variant_allele']
var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele
0,rs1344,1,147647471,G/A
9,rs1496,1,169858717,G/A/T
10,rs4870,1,2556714,A/C/G
62,rs5065,1,11846011,A/G
76,rs5067,1,11845924,A/G/T
...,...,...,...,...
50187,rs5996087,22,41925587,A/C/G
50189,rs5996089,22,41936432,G/A
50190,rs5996114,22,42112860,C/G/T
50198,rs4995141,14,106353377,C/T


In [None]:
def create_snp_id(df):
    chromosome = list(df['chr'])
    position = list(df['position'])
    variant_allele = list(df['variant_allele'])
    snps_id = []
    for chr, pos, var in zip(chromosome, position, variant_allele):
        var = var.replace('/', ':')
        x = str(chr) + ':' + str(pos) +':' + var
        snps_id.append(x)
    df['snp_id'] = snps_id
    return df
    

In [None]:
var_lcl_snps = create_snp_id(var_lcl_snps)

In [None]:
var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
50187,rs5996087,22,41925587,A/C/G,22:41925587:A:C:G
50189,rs5996089,22,41936432,G/A,22:41936432:G:A
50190,rs5996114,22,42112860,C/G/T,22:42112860:C:G:T
50198,rs4995141,14,106353377,C/T,14:106353377:C:T


In [None]:
sig_bqtl[sig_bqtl['snp_id'].isin(var_lcl_snps['snp_id'])]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org,chr_snp_pos
13,SMDT1,22:42092341:A:G,5.301338e-20,1.061741,0.251468,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42092341,A,1,0.342105,1.000000,22:42092341:A:G-SMDT1,9.157704,211365.1,9.313558,chr22_42092341
49,SMDT1,22:42080750:A:C,5.975150e-20,1.068933,0.252964,6.857645e-19,22,42079691,42084284,ENSG00000183172,...,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,9.144780,211365.1,9.298145,chr22_42080750
132,GABPB1-AS1,15:50356743:C:T,1.167074e-17,-1.313119,0.240504,7.149596e-16,15,50354944,50372202,ENSG00000244879,...,50356743,C,1,0.289474,0.459454,15:50356743:C:T-GABPB1-AS1,-8.556142,211365.1,-8.510348,chr15_50356743
646,PILRB,7:100308061:A:G,5.495119e-12,-1.333647,0.266132,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100308061,A,1,0.171053,0.273867,7:100308061:A:G-PILRB,-6.892161,211365.1,-6.489465,chr7_100308061
648,PILRB,7:100315306:A:T,5.519019e-12,-1.334578,0.266380,6.188820e-11,7,100352176,100367831,ENSG00000121716,...,100315306,A,1,0.171053,0.273867,7:100315306:A:T-PILRB,-6.891544,211365.1,-6.486483,chr7_100315306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19240,NAGK,2:71066569:A:G,6.597289e-05,0.672986,0.299214,4.908836e-03,2,71064344,71079808,ENSG00000124357,...,71066569,A,1,0.131579,1.000000,2:71066569:A:G-NAGK,3.990361,211365.1,3.875926,chr2_71066569
19424,ATP5MC2,12:53664454:C:T,8.956944e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53664454,C,1,0.342105,1.000000,12:53664454:C:T-ATP5MC2,-3.917238,211365.1,-3.969926,chr12_53664454
19426,ATP5MC2,12:53672959:T:C,9.244949e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53672959,T,1,0.342105,1.000000,12:53672959:T:C-ATP5MC2,-3.909599,211365.1,-3.959482,chr12_53672959
19427,ATP5MC2,12:53676661:A:T,9.244949e-05,-0.431927,0.232683,5.085819e-03,12,53632726,53677408,ENSG00000135390,...,53676661,A,1,0.342105,1.000000,12:53676661:A:T-ATP5MC2,-3.909599,211365.1,-3.959482,chr12_53676661


In [None]:
var_lcl_snps[var_lcl_snps['Variant name'] == 'rs4995141']

Unnamed: 0,Variant name,Variant source,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Minor allele (ALL),Variant alleles,Synonym name,Global minor allele frequency (all individuals)
50198,rs4995141,dbSNP,14,106353377,106353377,,C/T,,


In [None]:
synonym_var_lcl_snps[synonym_var_lcl_snps['Synonym name'] =='rs9614690']

Unnamed: 0,Chromosome/scaffold name,Chromosome/scaffold position start (bp),Chromosome/scaffold position end (bp),Variant alleles,Minor allele (ALL),Strand,Synonym name,Synonym source
10864,X,151150353,151150353,G/A,,1,rs9614690,Former dbSNP
10865,X,117849962,117849962,C/A,,1,rs9614690,Former dbSNP
10866,X,17053874,17053874,T/G,,1,rs9614690,Former dbSNP


In [None]:
synonym_var_lcl_snps = synonym_var_lcl_snps[synonym_var_lcl_snps['Chromosome/scaffold name'] != 'X']
synonym_var_lcl_snps = synonym_var_lcl_snps[synonym_var_lcl_snps['Chromosome/scaffold name'].str.contains(r'^\d+$')]
synonym_var_lcl_snps = synonym_var_lcl_snps.drop_duplicates(subset=['Synonym name'], keep='first')


In [None]:
synonym_var_lcl_snps = synonym_var_lcl_snps[['Synonym name', 'Chromosome/scaffold name', 'Chromosome/scaffold position start (bp)', 'Variant alleles']]
synonym_var_lcl_snps.columns = ['snp', 'chr', 'position', 'variant_allele']
synonym_var_lcl_snps

Unnamed: 0,snp,chr,position,variant_allele
0,rs116024440,1,805036,A/G
1,rs140081212,1,155215184,G/A/T
3,rs150913279,1,2568371,A/G
5,rs55803744,1,149703225,T/A/C
6,rs79266459,1,26170849,A/C/G/T
...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T
10854,rs61634242,22,49601073,G/A/C
10855,rs80020284,22,23970400,G/A
10857,rs77196310,22,45413058,GGGGGG/GGGGG


In [None]:
synonym_var_lcl_snps = create_snp_id(synonym_var_lcl_snps)

rs10627369	chr22	50578781
12936	rs71707919	chr22	43120043
12974	rs66918515	chr22	21002604
13015	rs75892697	chr19	52782474
13016	rs77764310	chr19	53197554
13022	rs139074994	chr19	54632756
13084	rs73135170	

In [None]:
synonym_var_lcl_snps[synonym_var_lcl_snps['snp']== 'rs77764310']

Unnamed: 0,snp,chr,position,variant_allele,snp_id


In [None]:
synonym_var_lcl_snps =synonym_var_lcl_snps[~synonym_var_lcl_snps['snp'].isin(var_lcl_snps['snp'])]

In [None]:
lcl_snp_ids = pd.concat([var_lcl_snps, synonym_var_lcl_snps])

In [None]:
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T,22:49917234:C:G:T
10854,rs61634242,22,49601073,G/A/C,22:49601073:G:A:C
10855,rs80020284,22,23970400,G/A,22:23970400:G:A
10857,rs77196310,22,45413058,GGGGGG/GGGGG,22:45413058:GGGGGG:GGGGG


In [None]:
lcl_snp_ids['snp'].value_counts()

rs1344         1
rs4788099      1
rs4968013      1
rs4968011      1
rs4889679      1
              ..
rs28414073     1
rs28401739     1
rs28379833     1
rs28367131     1
rs116390392    1
Name: snp, Length: 27112, dtype: int64

In [None]:
lcl_snp_ids[lcl_snp_ids.groupby('snp')['snp'].transform('size') >= 2]

Unnamed: 0,snp,chr,position,variant_allele,snp_id
618,rs863850,1,146023711,G/A,1:146023711:G:A
1077,rs2864871,1,150794488,T/A/C,1:150794488:T:A:C
2687,rs79206743,1,146017421,T/C,1:146017421:T:C
5058,rs10399931,1,203186952,T/A/C,1:203186952:T:A:C
5592,rs826542,2,108717018,A/T,2:108717018:A:T
...,...,...,...,...,...
10808,rs62054803,19,3229769,C/T,19:3229769:C:T
10810,rs62054804,19,56062256,ACTACTA/ACTA,19:56062256:ACTACTA:ACTA
10812,rs62054805,19,14596367,C/A/T,19:14596367:C:A:T
10814,rs3760532,19,49138323,T/-,19:49138323:T:-


In [None]:
lcl_snp_ids[lcl_snp_ids['snp'] == 'rs3760532']

Unnamed: 0,snp,chr,position,variant_allele,snp_id
38715,rs3760532,17,81939573,G/A,17:81939573:G:A
10814,rs3760532,19,49138323,T/-,19:49138323:T:-


In [None]:
lcl_positions_rsid = pd.concat([unique_total_variants, syn_total_variants_unique])
lcl_positions_rsid

Unnamed: 0,snp,snp_chromosome,snp_position
0,rs11809905,chr1,227334540
1,rs114530232,chr1,42958380
4,rs114531441,chr1,37548222
5,rs11810220,chr1,163311300
6,rs11811181,chr1,206551409
...,...,...,...
13089,rs140493080,chr12,7924992
13090,rs113859809,chr7,100217868
13091,rs150347472,chr12,9965410
13092,rs141183894,chr9,31326631


In [None]:
missing_rsid_lcl = lcl_positions_rsid[~lcl_positions_rsid['snp'].isin(lcl_snp_ids['snp'])]
missing_rsid_lcl

Unnamed: 0,snp,snp_chromosome,snp_position
12908,rs10627369,chr22,50578781
12936,rs71707919,chr22,43120043
12974,rs66918515,chr22,21002604
13015,rs75892697,chr19,52782474
13016,rs77764310,chr19,53197554
13022,rs139074994,chr19,54632756
13084,rs73135170,chr7,72726370


In [None]:
missing_rsid_lcl =missing_rsid_lcl.merge(missing_rsid_mart_export, how='inner', on='snp_position')
missing_rsid_lcl

Unnamed: 0,snp,snp_chromosome,snp_position,variant_allele
0,rs10627369,chr22,50578781,CT/CTTCT
1,rs71707919,chr22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
2,rs66918515,chr22,21002604,AGACAG/AG
3,rs75892697,chr19,52782474,C/G
4,rs77764310,chr19,53197554,T/A/C
5,rs139074994,chr19,54632756,C/A/G/T
6,rs73135170,chr7,72726370,G/C/T


In [None]:
missing_rsid_lcl.columns = ['snp', 'chr', 'position', 'variant_allele']

In [None]:
chr_list = []
for chr in list(missing_rsid_lcl['chr']):
    chr = chr[3:]
    chr_list.append(chr)

In [None]:
missing_rsid_lcl['chr'] = chr_list

In [None]:
missing_rsid_lcl

Unnamed: 0,snp,chr,position,variant_allele
0,rs10627369,22,50578781,CT/CTTCT
1,rs71707919,22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
2,rs66918515,22,21002604,AGACAG/AG
3,rs75892697,19,52782474,C/G
4,rs77764310,19,53197554,T/A/C
5,rs139074994,19,54632756,C/A/G/T
6,rs73135170,7,72726370,G/C/T


In [None]:
missing_rsid_lcl = create_snp_id(missing_rsid_lcl)

In [None]:
missing_rsid_lcl

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs10627369,22,50578781,CT/CTTCT,22:50578781:CT:CTTCT
1,rs71707919,22,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG,22:43120043:CTGGTGAGCTCTG:CTG:CTGGTGAGCTCTGGTG...
2,rs66918515,22,21002604,AGACAG/AG,22:21002604:AGACAG:AG
3,rs75892697,19,52782474,C/G,19:52782474:C:G
4,rs77764310,19,53197554,T/A/C,19:53197554:T:A:C
5,rs139074994,19,54632756,C/A/G/T,19:54632756:C:A:G:T
6,rs73135170,7,72726370,G/C/T,7:72726370:G:C:T


In [None]:
lcl_snp_ids = pd.concat([lcl_snp_ids, missing_rsid_lcl])
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
2,rs66918515,22,21002604,AGACAG/AG,22:21002604:AGACAG:AG
3,rs75892697,19,52782474,C/G,19:52782474:C:G
4,rs77764310,19,53197554,T/A/C,19:53197554:T:A:C
5,rs139074994,19,54632756,C/A/G/T,19:54632756:C:A:G:T


In [None]:
missing_rsid_mart_export

Unnamed: 0,snp_position,variant_allele
0,52782474,C/G
1,43120043,CTGGTGAGCTCTG/CTG/CTGGTGAGCTCTGGTGAGCTCTG
4,53197554,T/A/C
13,72726370,G/C/T
14,50578781,CT/CTTCT
15,21002604,AGACAG/AG
16,54632756,C/A/G/T


In [None]:
missing_rsid_lcl = list(lcl_positions_rsid[~lcl_positions_rsid['snp'].isin(lcl_snp_ids['snp'])]['snp'])

In [None]:
missing_rsid_lcl

['rs10627369',
 'rs71707919',
 'rs66918515',
 'rs75892697',
 'rs77764310',
 'rs139074994',
 'rs73135170']

In [None]:
len(lcl_snp_ids)

27112

In [None]:
lcl_snp_ids

Unnamed: 0,snp,chr,position,variant_allele,snp_id
0,rs1344,1,147647471,G/A,1:147647471:G:A
9,rs1496,1,169858717,G/A/T,1:169858717:G:A:T
10,rs4870,1,2556714,A/C/G,1:2556714:A:C:G
62,rs5065,1,11846011,A/G,1:11846011:A:G
76,rs5067,1,11845924,A/G/T,1:11845924:A:G:T
...,...,...,...,...,...
10853,rs79393060,22,49917234,C/G/T,22:49917234:C:G:T
10854,rs61634242,22,49601073,G/A/C,22:49601073:G:A:C
10855,rs80020284,22,23970400,G/A,22:23970400:G:A
10857,rs77196310,22,45413058,GGGGGG/GGGGG,22:45413058:GGGGGG:GGGGG


In [None]:
lcl_snp_ids[~lcl_snp_ids['snp'].isin(final_lcl_positions['snp'])]

Unnamed: 0,snp,chr,position,variant_allele,snp_id


In [None]:
final_lcl_positions

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
0,rs11809905,chr1,227334540,227334465,227334615,chr1_227334540
1,rs114530232,chr1,42958380,42958305,42958455,chr1_42958380
2,rs114531441,chr1,37548222,37548147,37548297,chr1_37548222
3,rs11810220,chr1,163311300,163311225,163311375,chr1_163311300
4,rs11811181,chr1,206551409,206551334,206551484,chr1_206551409
...,...,...,...,...,...,...
29116,chr22:46687859:D,chr22,46291962,46291887,46292037,chr22_46291962
29117,chr22:32803042:D,chr22,32407055,32406980,32407130,chr22_32407055
29118,chr22:24311587:D,chr22,23969398,23969323,23969473,chr22_23969398
29119,chr22:50310878:I,chr22,49917230,49917155,49917305,chr22_49917230


In [None]:
final_lcl_positions[final_lcl_positions['snp'] == 'rs9614690']

Unnamed: 0,snp,snp_chromosome,snp_position,start_coord,end_coord,chr_snp_pos
27033,rs116390392,chr22,42126310,42126235,42126385,chr22_42126310


END VARIANT ALLELE
***