# Data inspection & exploration: STARR-seq CD4 T cells

Paper: Cis-regulatory atlas of primary human CD4+ T cells - Kurtis Stefan & Artem Barski 2023

### Import libraries

In [180]:
import pandas as pd
import numpy as np
from read_config_file import get_config

In [181]:
# Switch off pandas' chained-assignment warning for the whole notebook
pd.options.mode.chained_assignment = None

- tpm: transcripts per million

Enhancers are called with MACS2 [-log10(P) > 75], comparing STARR-Seq–obtained RNA reads to input plasmid reads.<br>
Negative regulatory elements (NREs) are called using MACS2 [-log10(P) > 30] comparing input plasmid to STARR-Seq-obtained RNA reads

### Load data

In [182]:
# Look up the locations of the enhancer and NRE peak files in the project config
config = get_config()
starr_cd4_enh, starr_cd4_nre = config['starr_cd4_enhancer'], config['starr_cd4_nre']

In [183]:
# Load the enhancer peak table (semicolon-separated) from the configured path
enh = pd.read_csv(starr_cd4_enh, sep=';')

In [184]:
# Load the negative regulatory element (NRE) peak table (semicolon-separated) from the configured path
nre = pd.read_csv(starr_cd4_nre, sep=';')

In [185]:
# Load the LCL variant positions (comma-separated, pandas default).
# NOTE(review): this path is hard-coded while the other inputs come from the config file,
# and the variable name looks like a typo for 'lcl_analysis' — left unchanged in case later cells use it.
lcl_analyis = pd.read_csv('positions_sig_lcl_variants.csv')

- the final analysis data are not strand specific.
- Genome build: hg19 - grch37

### Data inspection, exploration and cleaning

In [186]:
enh

Unnamed: 0,chromosome,start_coord,end_coord,name,score,strand,signalValue,pvalue(-log10),qvalue(-log10),peak
0,chr1,780062,780170,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2990,.,6.76315,302.37964,299.05209,42
1,chr1,1051480,1051842,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1773,.,1.81547,180.31976,177.37407,121
2,chr1,1136882,1137106,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1835,.,3.38474,186.49478,183.52959,97
3,chr1,1143458,1143553,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2820,.,2.61547,285.31268,282.04059,72
4,chr1,1167340,1167536,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,4380,.,2.54028,441.82684,438.00470,138
...,...,...,...,...,...,...,...,...,...,...
8409,chrY,21906762,21906867,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1514,.,3.45702,154.32509,151.46123,76
8410,chrY,22737445,22737756,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1109,.,1.79933,113.70029,110.97779,202
8411,chrY,23167461,23167667,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1066,.,10.88384,109.34882,106.64301,168
8412,chrY,23402278,23402444,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,3543,.,3.65531,357.82581,354.30984,92


In [187]:
enh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8414 entries, 0 to 8413
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   chromosome      8414 non-null   object 
 1   start_coord     8414 non-null   int64  
 2   end_coord       8414 non-null   int64  
 3   name            8414 non-null   object 
 4   score           8414 non-null   int64  
 5   strand          8414 non-null   object 
 6   signalValue     8414 non-null   float64
 7   pvalue(-log10)  8414 non-null   float64
 8   qvalue(-log10)  8414 non-null   float64
 9   peak            8414 non-null   int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 657.5+ KB


In [188]:
enh.isna().sum()

chromosome        0
start_coord       0
end_coord         0
name              0
score             0
strand            0
signalValue       0
pvalue(-log10)    0
qvalue(-log10)    0
peak              0
dtype: int64

In [189]:
enh[['score', 'signalValue', 'peak']].describe()

Unnamed: 0,score,signalValue,peak
count,8414.0,8414.0,8414.0
mean,1914.187307,3.223809,112.933801
std,1286.327638,2.521876,77.818372
min,724.0,1.26523,0.0
25%,1046.0,1.950638,59.0
50%,1514.0,2.535415,92.0
75%,2341.0,3.608805,149.0
max,16172.0,75.35466,771.0


In [190]:
enh['name'].value_counts()

MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_127916    2
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_134214    2
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_157104    2
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_148240    2
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_128980    1
                                                                  ..
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_66654     1
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_66645     1
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_66644     1
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_66641     1
MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_211382    1
Name: name, Length: 8410, dtype: int64

In [191]:
# Show every row whose peak name occurs more than once in the table
enh[enh['name'].duplicated(keep=False)]

Unnamed: 0,chromosome,start_coord,end_coord,name,score,strand,signalValue,pvalue(-log10),qvalue(-log10),peak
5570,chr3,98282557,98282750,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1021,.,5.66464,104.83451,102.14629,215
5571,chr3,98283019,98283148,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1021,.,5.66464,104.83451,102.14629,215
5769,chr3,179370385,179370561,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2277,.,4.53904,230.88206,227.77855,144
5770,chr3,179370992,179371042,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2277,.,4.53904,230.88206,227.77855,144
6133,chr4,183838469,183838571,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1282,.,2.70363,131.08398,128.29875,63
6134,chr4,183838737,183838875,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,1282,.,2.70363,131.08398,128.29875,63
6367,chr5,122181094,122181177,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2280,.,3.00476,231.12044,228.0161,33
6368,chr5,122181336,122181362,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,2280,.,3.00476,231.12044,228.0161,33


For some peak names there are multiple genomic positions. However, the rows that share a name have identical values in all other columns.

In [192]:
enh.strand.value_counts()

.    8414
Name: strand, dtype: int64

In [193]:
# Convert -log10(q) back to a plain q-value: q = 10 ** -x.
# Very large -log10(q) values underflow to 0.0 in float64.
enh['qvalue'] = enh['qvalue(-log10)'].apply(lambda neg_log10_q: pow(10, -neg_log10_q))

In [194]:
# Keep every column except 'strand' and put 'name' first
enh_column_order = [
    'name', 'chromosome', 'start_coord', 'end_coord', 'score',
    'signalValue', 'pvalue(-log10)', 'qvalue(-log10)', 'qvalue', 'peak',
]
enh = enh.loc[:, enh_column_order]

In [195]:
# Rows with a q-value of at most 0.05
enh.loc[enh['qvalue'].le(0.05)]

Unnamed: 0,name,chromosome,start_coord,end_coord,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
0,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,780062,780170,2990,6.76315,302.37964,299.05209,8.869722e-300,42
1,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1051480,1051842,1773,1.81547,180.31976,177.37407,4.226005e-178,121
2,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1136882,1137106,1835,3.38474,186.49478,183.52959,2.953997e-184,97
3,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1143458,1143553,2820,2.61547,285.31268,282.04059,9.107727e-283,72
4,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1167340,1167536,4380,2.54028,441.82684,438.00470,0.000000e+00,138
...,...,...,...,...,...,...,...,...,...,...
8409,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chrY,21906762,21906867,1514,3.45702,154.32509,151.46123,3.457562e-152,76
8410,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chrY,22737445,22737756,1109,1.79933,113.70029,110.97779,1.052471e-111,202
8411,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chrY,23167461,23167667,1066,10.88384,109.34882,106.64301,2.275045e-107,168
8412,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chrY,23402278,23402444,3543,3.65531,357.82581,354.30984,0.000000e+00,92


All rows are already below a qvalue of 0.05. Everything can be used for analysis.

In [196]:
#mpra_enh = enh[['chromosome', 'start_coord', 'end_coord', 'name']]

In [197]:
#mpra_enh.to_csv('data/mpra_enhancer_coords.txt', index=False, header=None, sep='\t')

In [198]:
# Count the rows whose 'peak' value is 0
zero_peak_rows = int((enh.peak == 0).sum())
print('There are ', zero_peak_rows, 'rows/peak that have a peaknr of 0')

There are  5 rows/peak that have a peaknr of 0


In [199]:
# No eQTL data are available for the X and Y chromosomes in the other dataset,
# so the rows on chrX and chrY are left out.
sex_chromosomes = ['chrX', 'chrY']
on_sex_chromosome = enh['chromosome'].isin(sex_chromosomes)
enh = enh.loc[~on_sex_chromosome]
enh

Unnamed: 0,name,chromosome,start_coord,end_coord,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
0,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,780062,780170,2990,6.76315,302.37964,299.05209,8.869722e-300,42
1,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1051480,1051842,1773,1.81547,180.31976,177.37407,4.226005e-178,121
2,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1136882,1137106,1835,3.38474,186.49478,183.52959,2.953997e-184,97
3,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1143458,1143553,2820,2.61547,285.31268,282.04059,9.107727e-283,72
4,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,1167340,1167536,4380,2.54028,441.82684,438.00470,0.000000e+00,138
...,...,...,...,...,...,...,...,...,...,...
8150,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr9,140473478,140473662,2493,3.11855,252.54643,249.37758,4.191988e-250,6
8151,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr9,140485008,140485172,1632,2.20556,166.12692,163.22536,5.951686e-164,42
8152,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr9,140499611,140499764,1355,1.55885,138.31764,135.50819,3.103202e-136,131
8153,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr9,140523078,140523281,2098,5.49343,212.87029,209.82153,1.508238e-210,91


In [200]:
#enh.to_csv('data/cd4_enhancer_sequenties.csv', index=False, sep='\t')

### from build 37 to 38

Create a BED file for conversion from build 37 to build 38

In [201]:
# BED-style subset: coordinates plus the peak name
bed_source_columns = ['chromosome', 'start_coord', 'end_coord', 'name']
enh_bed = enh.loc[:, bed_source_columns]

In [202]:
enh_bed

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,780062,780170,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
1,chr1,1051480,1051842,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
2,chr1,1136882,1137106,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
3,chr1,1143458,1143553,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
4,chr1,1167340,1167536,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
...,...,...,...,...
8150,chr9,140473478,140473662,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
8151,chr9,140485008,140485172,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
8152,chr9,140499611,140499764,MACS2STARRENH_indivuallyrmDupat75thpercentile0...
8153,chr9,140523078,140523281,MACS2STARRENH_indivuallyrmDupat75thpercentile0...


There are some duplicate names, but they have different positions. Prepending the position to the original name creates a unique name per row.

In [203]:
# Cast every column to string so that the coordinate columns can be joined
# with the name by string concatenation when the unique name is built.
enh_bed = enh_bed.astype(str)

In [204]:
# Build a unique identifier per row: <chromosome>_<start>_<end>_<original name>
enh_bed['unique_name'] = enh_bed['chromosome'].str.cat(
    enh_bed[['start_coord', 'end_coord', 'name']], sep='_'
)


In [205]:
# Keep the coordinates and use the unique name in place of the original one
enh_bed = enh_bed.loc[:, ['chromosome', 'start_coord', 'end_coord', 'unique_name']]

In [206]:
# Write a header-less, tab-separated 4-column file for the build conversion.
# NOTE(review): this file goes to the working directory, whereas the NRE file is
# written under 'data/' — confirm which location the conversion step expects.
enh_bed.to_csv('enhancer_coordinates.txt', sep='\t', index=False, header=False)

***

In [207]:
# All columns of `enh` except the first one ('name' after the reordering above).
# NOTE(review): `test` is not used anywhere in the visible part of the notebook —
# confirm whether this scratch cell is still needed.
test = enh.iloc[:, 1:]

***
### Negative regulatory elements

Negative regulatory elements (NREs) are called using MACS2 [-log10(P) > 30] comparing input plasmid to STARR-Seq-obtained RNA reads <br> (Model-based Analysis of ChIP-Seq, zhang et al 2008)

 to characterize the effect of enhancers and NREs on expression of their target genes. First, we attempted to assign target genes to CREs on the basis of distance. Each CRE was assigned to the nearest TSS if within 10 kb

In [208]:
nre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6267 entries, 0 to 6266
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   chromosome      6267 non-null   object 
 1   start_coord     6267 non-null   int64  
 2   end_coord       6267 non-null   int64  
 3   name            6267 non-null   object 
 4   score           6267 non-null   int64  
 5   strand          6267 non-null   object 
 6   signalValue     6267 non-null   float64
 7   pvalue(-log10)  6267 non-null   float64
 8   qvalue(-log10)  6267 non-null   float64
 9   peak            6267 non-null   int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 489.7+ KB


In [209]:
# Most significant NRE peaks first (largest -log10 q-value at the top)
nre.sort_values(by='qvalue(-log10)', ascending=False)

Unnamed: 0,chromosome,start_coord,end_coord,name,score,strand,signalValue,pvalue(-log10),qvalue(-log10),peak
5615,chr8,12051697,12052602,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,39712,.,9.69917,3979.92358,3971.27808,204
5045,chr6,37786734,37787385,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,11355,.,7.55992,1142.65662,1135.56580,514
5887,chr9,41954133,41955150,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,9074,.,24.62481,913.88104,907.41705,679
5602,chr8,7468847,7469203,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,8736,.,42.94287,880.01074,873.62402,149
5472,chr7,100887623,100888457,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,8231,.,4.20734,829.48114,823.15228,960
...,...,...,...,...,...,...,...,...,...,...
4005,chr22,44444086,44444330,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,277,.,7.60395,30.04187,27.70291,58
1491,chr12,112856426,112856640,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,.,1.70155,30.02005,27.68157,241
1037,chr11,65625658,65625865,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,.,1.31368,30.01295,27.67459,123
2018,chr15,73088930,73089178,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,.,4.70075,30.01036,27.67202,728


In [210]:
# Convert -log10(q) back to a plain q-value: q = 10 ** -x.
# Very large -log10(q) values underflow to 0.0 in float64.
nre['qvalue'] = nre['qvalue(-log10)'].apply(lambda neg_log10_q: pow(10, -neg_log10_q))

In [211]:
# Keep every column except 'strand'; 'qvalue' is placed right after 'qvalue(-log10)'
nre_column_order = [
    'chromosome', 'start_coord', 'end_coord', 'name', 'score',
    'signalValue', 'pvalue(-log10)', 'qvalue(-log10)', 'qvalue', 'peak',
]
nre = nre.loc[:, nre_column_order]

In [212]:
# Largest q-values (least significant peaks) first; q-values that underflowed to 0.0 end up last
nre.sort_values(by='qvalue', ascending=False)

Unnamed: 0,chromosome,start_coord,end_coord,name,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
1900,chr15,26095865,26096064,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,2.10375,30.00355,27.66535,2.160976e-28,13
2018,chr15,73088930,73089178,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,4.70075,30.01036,27.67202,2.128041e-28,728
1037,chr11,65625658,65625865,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,1.31368,30.01295,27.67459,2.115485e-28,123
1491,chr12,112856426,112856640,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,276,1.70155,30.02005,27.68157,2.081757e-28,241
4005,chr22,44444086,44444330,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,277,7.60395,30.04187,27.70291,1.981938e-28,58
...,...,...,...,...,...,...,...,...,...,...
300,chr1,119682919,119683347,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,8025,3.31381,808.86694,802.57648,0.000000e+00,316
2883,chr19,5720138,5720732,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,5997,3.00632,605.46771,599.77106,0.000000e+00,266
5615,chr8,12051697,12052602,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,39712,9.69917,3979.92358,3971.27808,0.000000e+00,204
2355,chr17,899749,900400,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,3258,2.58787,330.78641,325.87552,0.000000e+00,354


In [213]:
nre['name'].value_counts()

CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_286479    3
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_292055    2
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_381898    2
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_71365     2
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_138928    2
                                                                                         ..
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_123480    1
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_123412    1
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_123368    1
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_123279    1
CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_

In [214]:
# Show every NRE row whose peak name occurs more than once in the table
nre[nre['name'].duplicated(keep=False)]

Unnamed: 0,chromosome,start_coord,end_coord,name,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
352,chr1,150533611,150533824,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,282,1.35615,30.60234,28.25764,5.525353e-29,1099
353,chr1,150534234,150534826,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,282,1.35615,30.60234,28.25764,5.525353e-29,1099
435,chr1,167486799,167487107,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,477,1.99339,50.56792,47.70538,1.970698e-48,334
436,chr1,167487691,167487716,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,477,1.99339,50.56792,47.70538,1.970698e-48,334
1103,chr11,84431242,84431446,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,1139,15.50637,117.94125,113.97683,1.0548e-114,243
1104,chr11,84432108,84432343,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,1139,15.50637,117.94125,113.97683,1.0548e-114,243
1188,chr11,128352963,128353020,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,791,5.08873,82.68213,79.16224,6.882718e-80,527
1189,chr11,128353388,128353593,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,791,5.08873,82.68213,79.16224,6.882718e-80,527
1260,chr12,14409101,14409276,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,489,2.74288,51.85849,48.96692,1.079145e-49,168
1261,chr12,14409467,14409476,CALLNREreversecontrolTxMACS2STARRENH_indivuall...,489,2.74288,51.85849,48.96692,1.079145e-49,168


In [215]:
# Number of rows whose name is shared with at least one other row.
# NOTE(review): this counts rows, not distinct names, although the message says "duplicate names".
n_rows_shared_name = nre['name'].duplicated(keep=False).sum()
print('There are', n_rows_shared_name, 'duplicate names for nre')

There are 41 duplicate names for nre


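The smallest positive float64 value is about 5e-324, so q-values with a -log10(q) above roughly 323 underflow to 0.0 (see the `0.000000e+00` entries above).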

### build 37 to 38

In [216]:
# Leave out the peaks on the X and Y chromosomes
nre = nre.loc[~nre['chromosome'].isin(['chrX', 'chrY'])]
# Cast the whole frame to string, then build <chromosome>_<start>_<end>_<original name> per row
nre = nre.astype(str)
nre['unique_name'] = nre['chromosome'].str.cat(
    nre[['start_coord', 'end_coord', 'name']], sep='_'
)


__Check variable and file name__ (consistent)

In [217]:
# Coordinates plus unique name, written header-less and tab-separated for the conversion
nre_bed_columns = ['chromosome', 'start_coord', 'end_coord', 'unique_name']
mpra_nre = nre.loc[:, nre_bed_columns]
mpra_nre.to_csv('data/mpra_nre_coords.txt', sep='\t', header=False, index=False)

In [218]:
mpra_nre

Unnamed: 0,chromosome,start_coord,end_coord,unique_name
0,chr1,762641,763192,chr1_762641_763192_CALLNREreversecontrolTxMACS...
1,chr1,911493,911786,chr1_911493_911786_CALLNREreversecontrolTxMACS...
2,chr1,948929,949124,chr1_948929_949124_CALLNREreversecontrolTxMACS...
3,chr1,989233,989396,chr1_989233_989396_CALLNREreversecontrolTxMACS...
4,chr1,1152392,1152620,chr1_1152392_1152620_CALLNREreversecontrolTxMA...
...,...,...,...,...
6063,chr9,139981074,139981542,chr9_139981074_139981542_CALLNREreversecontrol...
6064,chr9,140506301,140506495,chr9_140506301_140506495_CALLNREreversecontrol...
6065,chr9,140512927,140513302,chr9_140512927_140513302_CALLNREreversecontrol...
6066,chr9,140577072,140577228,chr9_140577072_140577228_CALLNREreversecontrol...


***

### Build 38 conversion inspections

In [219]:
# Column names assigned to the header-less 4-column BED files
grch38_bed_columns = ['chromosome', 'start_coord', 'end_coord', 'name']

# Converted enhancer coordinates (an earlier version read 'data/enh_grch38.bed')
enh_grch_38 = pd.read_csv('data/grch38_enh_coords.bed', sep='\t', header=None, names=grch38_bed_columns)

# Converted NRE coordinates (an earlier version read 'data/nre_grch38.bed')
nre_grch_38 = pd.read_csv('data/grch38_nre_coords.bed', sep='\t', header=None, names=grch38_bed_columns)

In [220]:
enh_grch_38

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,844682,844790,chr1_780062_780170_MACS2STARRENH_indivuallyrmD...
1,chr1,1116100,1116462,chr1_1051480_1051842_MACS2STARRENH_indivuallyr...
2,chr1,1201502,1201726,chr1_1136882_1137106_MACS2STARRENH_indivuallyr...
3,chr1,1208078,1208173,chr1_1143458_1143553_MACS2STARRENH_indivuallyr...
4,chr1,1231960,1232156,chr1_1167340_1167536_MACS2STARRENH_indivuallyr...
...,...,...,...,...
8129,chr9,137579026,137579210,chr9_140473478_140473662_MACS2STARRENH_indivua...
8130,chr9,137590556,137590720,chr9_140485008_140485172_MACS2STARRENH_indivua...
8131,chr9,137605159,137605312,chr9_140499611_140499764_MACS2STARRENH_indivua...
8132,chr9,137628626,137628829,chr9_140523078_140523281_MACS2STARRENH_indivua...


In [221]:
nre_grch_38

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,827261,827812,chr1_762641_763192_CALLNREreversecontrolTxMACS...
1,chr1,976113,976406,chr1_911493_911786_CALLNREreversecontrolTxMACS...
2,chr1,1013549,1013744,chr1_948929_949124_CALLNREreversecontrolTxMACS...
3,chr1,1053853,1054016,chr1_989233_989396_CALLNREreversecontrolTxMACS...
4,chr1,1217012,1217240,chr1_1152392_1152620_CALLNREreversecontrolTxMA...
...,...,...,...,...
6042,chr9,137086622,137087090,chr9_139981074_139981542_CALLNREreversecontrol...
6043,chr9,137611849,137612043,chr9_140506301_140506495_CALLNREreversecontrol...
6044,chr9,137618475,137618850,chr9_140512927_140513302_CALLNREreversecontrol...
6045,chr9,137682620,137682776,chr9_140577072_140577228_CALLNREreversecontrol...


In [222]:
# Net change in the number of NRE rows caused by the conversion (before minus after)
len(mpra_nre) - len(nre_grch_38)

21

In [223]:
# Net change in the number of enhancer rows caused by the conversion (before minus after)
len(enh_bed) - len(enh_grch_38)

21

Check which names are missing for the nre dataset after converting

In [224]:
# NRE regions whose unique name is absent from the converted file
nre_was_converted = mpra_nre['unique_name'].isin(nre_grch_38['name'])
missing_nre = mpra_nre.loc[~nre_was_converted]
missing_nre

Unnamed: 0,chromosome,start_coord,end_coord,unique_name
306,chr1,142890760,142891321,chr1_142890760_142891321_CALLNREreversecontrol...
307,chr1,143283666,143283895,chr1_143283666_143283895_CALLNREreversecontrol...
332,chr1,147624310,147624672,chr1_147624310_147624672_CALLNREreversecontrol...
335,chr1,147931299,147931653,chr1_147931299_147931653_CALLNREreversecontrol...
338,chr1,148598378,148598910,chr1_148598378_148598910_CALLNREreversecontrol...
342,chr1,149514459,149514637,chr1_149514459_149514637_CALLNREreversecontrol...
343,chr1,149575248,149575961,chr1_149575248_149575961_CALLNREreversecontrol...
345,chr1,149605660,149605874,chr1_149605660_149605874_CALLNREreversecontrol...
346,chr1,149672222,149672696,chr1_149672222_149672696_CALLNREreversecontrol...
696,chr10,48828008,48828181,chr10_48828008_48828181_CALLNREreversecontrolT...


In [225]:
# Report how many NRE regions have no counterpart in the converted file
n_missing_nre = len(missing_nre)
print(f'There are {n_missing_nre} entries/names that are not converted')

There are 34 entries/names that are not converted


However, the row count only drops by 21 after the conversion, so the conversion must also have added rows (34 − 21 = 13 extra rows).

Example chromosome 20

In [226]:
# NRE regions on chromosome 20 before the conversion
nre_chr20 = mpra_nre.loc[mpra_nre['chromosome'].eq('chr20')]
nre_chr20

Unnamed: 0,chromosome,start_coord,end_coord,unique_name
3649,chr20,311259,311448,chr20_311259_311448_CALLNREreversecontrolTxMAC...
3650,chr20,442443,442721,chr20_442443_442721_CALLNREreversecontrolTxMAC...
3651,chr20,524388,524786,chr20_524388_524786_CALLNREreversecontrolTxMAC...
3652,chr20,1305786,1306425,chr20_1305786_1306425_CALLNREreversecontrolTxM...
3653,chr20,1878619,1878804,chr20_1878619_1878804_CALLNREreversecontrolTxM...
...,...,...,...,...
3800,chr20,62289569,62289973,chr20_62289569_62289973_CALLNREreversecontrolT...
3801,chr20,62368729,62368950,chr20_62368729_62368950_CALLNREreversecontrolT...
3802,chr20,62435417,62435584,chr20_62435417_62435584_CALLNREreversecontrolT...
3803,chr20,62612178,62612680,chr20_62612178_62612680_CALLNREreversecontrolT...


In [227]:
# NRE regions on chromosome 20 after the conversion
nre_chr20_38 = nre_grch_38.loc[nre_grch_38['chromosome'].eq('chr20')]
nre_chr20_38

Unnamed: 0,chromosome,start_coord,end_coord,name
3628,chr20,330615,330804,chr20_311259_311448_CALLNREreversecontrolTxMAC...
3629,chr20,461799,462077,chr20_442443_442721_CALLNREreversecontrolTxMAC...
3630,chr20,543744,544142,chr20_524388_524786_CALLNREreversecontrolTxMAC...
3631,chr20,1325142,1325781,chr20_1305786_1306425_CALLNREreversecontrolTxM...
3632,chr20,1897973,1898158,chr20_1878619_1878804_CALLNREreversecontrolTxM...
...,...,...,...,...
3780,chr20,63658216,63658620,chr20_62289569_62289973_CALLNREreversecontrolT...
3781,chr20,63737376,63737597,chr20_62368729_62368950_CALLNREreversecontrolT...
3782,chr20,63804064,63804231,chr20_62435417_62435584_CALLNREreversecontrolT...
3783,chr20,63980825,63981327,chr20_62612178_62612680_CALLNREreversecontrolT...


In [228]:
# Check for duplicate names
nre_chr20_38['name'].value_counts()

chr20_34042543_34043186_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_203530    2
chr20_44002291_44002529_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_205089    1
chr20_44452811_44453123_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_205143    1
chr20_44600177_44601025_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_205176    1
chr20_45318028_45318410_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_205286    1
                                                                                                                 ..
chr20_30606195_30606375_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_202916    1
chr20_30619231_30619455_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_202918    1
chr20_30697273_30697794_CALLNREreversecontrolTxMACS2STARRENH_indivuallyr

Inspect the duplicate name

In [229]:
# Look up the duplicated chr20 name in the table from before the conversion
duplicated_chr20_name = 'chr20_34042543_34043186_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_203530'
mpra_nre.loc[mpra_nre['unique_name'].eq(duplicated_chr20_name)]

Unnamed: 0,chromosome,start_coord,end_coord,unique_name
3717,chr20,34042543,34043186,chr20_34042543_34043186_CALLNREreversecontrolT...


In [230]:
# Look up the same name in the converted table
duplicated_chr20_name = 'chr20_34042543_34043186_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_203530'
nre_grch_38.loc[nre_grch_38['name'].eq(duplicated_chr20_name)]

Unnamed: 0,chromosome,start_coord,end_coord,name
3696,chr20,35454721,35455117,chr20_34042543_34043186_CALLNREreversecontrolT...
3697,chr20,35455118,35455365,chr20_34042543_34043186_CALLNREreversecontrolT...


One coordinate from build 37 can give multiple coordinates for build 38 <br>
Check for all duplicate entries:

In [231]:
# All converted NRE rows whose name occurs more than once
nre_name_counts = nre_grch_38['name'].value_counts()
nre_grch_38.loc[nre_grch_38['name'].map(nre_name_counts.gt(1))]

Unnamed: 0,chromosome,start_coord,end_coord,name
311,chr1,149176821,149177115,chr1_144519839_144520373_CALLNREreversecontrol...
312,chr1,149176650,149176820,chr1_144519839_144520373_CALLNREreversecontrol...
313,chr1,149102832,149102980,chr1_144594255_144594452_CALLNREreversecontrol...
314,chr1,149102759,149102808,chr1_144594255_144594452_CALLNREreversecontrol...
333,chr1,148303201,148303374,chr1_147775312_147775737_CALLNREreversecontrol...
334,chr1,148303374,148303376,chr1_147775312_147775737_CALLNREreversecontrol...
335,chr1,148303386,148303630,chr1_147775312_147775737_CALLNREreversecontrol...
2436,chr17,22377474,22377500,chr17_21904441_21904600_CALLNREreversecontrolT...
2437,chr17,22377518,22377575,chr17_21904441_21904600_CALLNREreversecontrolT...
2438,chr17,22377575,22377650,chr17_21904441_21904600_CALLNREreversecontrolT...


In [233]:
# Names that occur more than once after the conversion, and the number of rows they cover
repeated_nre_rows = nre_grch_38.loc[nre_grch_38['name'].duplicated()]
duplicates_nre = repeated_nre_rows['name'].unique().tolist()
n_duplicate_nre_rows = nre_grch_38['name'].isin(duplicates_nre).sum()
print(f'There are {len(duplicates_nre)} unique duplicate entrys/names')
print(f'Corresponding to {n_duplicate_nre_rows} rows')

There are 10 unique duplicate entrys/names
Corresponding to 23 rows


In [234]:
duplicates_nre

['chr1_144519839_144520373_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_17689',
 'chr1_144594255_144594452_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_17701',
 'chr1_147775312_147775737_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_18075',
 'chr17_21904441_21904600_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_138276',
 'chr17_72858268_72858576_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_145627',
 'chr20_34042543_34043186_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_203530',
 'chr7_72738345_72738588_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_324915',
 'chr7_74306439_74307121_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_325207',
 'chr7_74988361_74988865_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75t

In [235]:
# Remove every row whose name occurs more than once (no copy of such a name is kept)
nre_grch_38 = nre_grch_38.loc[~nre_grch_38['name'].duplicated(keep=False)]

In [238]:
nre_grch_38 # NOTE(review): the earlier note said 6047; 6046 rows were read and 23 duplicate-name rows were dropped above, so 6023 rows are expected (index labels are not reset) — confirm with len(nre_grch_38)

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,827261,827812,chr1_762641_763192_CALLNREreversecontrolTxMACS...
1,chr1,976113,976406,chr1_911493_911786_CALLNREreversecontrolTxMACS...
2,chr1,1013549,1013744,chr1_948929_949124_CALLNREreversecontrolTxMACS...
3,chr1,1053853,1054016,chr1_989233_989396_CALLNREreversecontrolTxMACS...
4,chr1,1217012,1217240,chr1_1152392_1152620_CALLNREreversecontrolTxMA...
...,...,...,...,...
6042,chr9,137086622,137087090,chr9_139981074_139981542_CALLNREreversecontrol...
6043,chr9,137611849,137612043,chr9_140506301_140506495_CALLNREreversecontrol...
6044,chr9,137618475,137618850,chr9_140512927_140513302_CALLNREreversecontrol...
6045,chr9,137682620,137682776,chr9_140577072_140577228_CALLNREreversecontrol...


__enhancer grch38 inspection__

In [239]:
# Enhancer regions whose unique name is absent from the converted file
enh_was_converted = enh_bed['unique_name'].isin(enh_grch_38['name'])
missing_enh = enh_bed.loc[~enh_was_converted]
missing_enh

Unnamed: 0,chromosome,start_coord,end_coord,unique_name
460,chr1,145096358,145096495,chr1_145096358_145096495_MACS2STARRENH_indivua...
484,chr1,149513997,149514141,chr1_149513997_149514141_MACS2STARRENH_indivua...
485,chr1,149605477,149605586,chr1_149605477_149605586_MACS2STARRENH_indivua...
1012,chr10,48477923,48478051,chr10_48477923_48478051_MACS2STARRENH_indivual...
1013,chr10,48481322,48481507,chr10_48481322_48481507_MACS2STARRENH_indivual...
1015,chr10,51565073,51565230,chr10_51565073_51565230_MACS2STARRENH_indivual...
2620,chr14,106551152,106551289,chr14_106551152_106551289_MACS2STARRENH_indivu...
2630,chr15,23115604,23115656,chr15_23115604_23115656_MACS2STARRENH_indivual...
3046,chr16,46395403,46395438,chr16_46395403_46395438_MACS2STARRENH_indivual...
3047,chr16,46406769,46406867,chr16_46406769_46406867_MACS2STARRENH_indivual...


In [240]:
# Report how many enhancer regions have no counterpart in the converted file
n_missing_enh = len(missing_enh)
print(f'There are {n_missing_enh} entries/names that are not converted')

There are 23 entries/names that are not converted


In [247]:
enh_grch_38['name'].value_counts()

chr1_145454616_145454878_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_9891      2
chr14_106995875_106996150_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_62780    2
chr1_780062_780170_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_7               1
chr3_45883605_45883739_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_124297      1
chr3_45730624_45730793_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_124285      1
                                                                                           ..
chr15_52774217_52774391_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_64981      1
chr15_52338124_52338362_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_64933      1
chr15_52029846_52030175_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_64901      1
chr15_51057627_51057705_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_64830      1
chr9_140617195_140617347_MACS2STARRENH_indivuallyrmDupat75th

In [250]:
#Check all duplicate entries after converting
enh_grch_38[enh_grch_38['name'].map(enh_grch_38['name'].value_counts() >1)]

Unnamed: 0,chromosome,start_coord,end_coord,name
462,chr1,145980374,145980466,chr1_145454616_145454878_MACS2STARRENH_indivua...
463,chr1,145980205,145980374,chr1_145454616_145454878_MACS2STARRENH_indivua...
2618,chr14,106539879,106539950,chr14_106995875_106996150_MACS2STARRENH_indivu...
2619,chr14,106539951,106540155,chr14_106995875_106996150_MACS2STARRENH_indivu...


In [251]:
# Distinct names that occur more than once after the coordinate conversion:
# `duplicated()` marks every repeat except the first occurrence, `unique()` collapses them.
duplicates_enh = enh_grch_38.loc[enh_grch_38['name'].duplicated(), 'name'].unique().tolist()
print(f'There are {len(duplicates_enh)} unique duplicate entrys/names')
# Count every row (first occurrences included) that carries one of those names.
print(f'Corresponding to {enh_grch_38["name"].isin(duplicates_enh).sum()} rows')

There are 2 unique duplicate entrys/names
Corresponding to 4 rows


In [252]:
enh_grch_38 = enh_grch_38.drop_duplicates(subset='name', keep=False)

In [253]:
enh_grch_38

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,844682,844790,chr1_780062_780170_MACS2STARRENH_indivuallyrmD...
1,chr1,1116100,1116462,chr1_1051480_1051842_MACS2STARRENH_indivuallyr...
2,chr1,1201502,1201726,chr1_1136882_1137106_MACS2STARRENH_indivuallyr...
3,chr1,1208078,1208173,chr1_1143458_1143553_MACS2STARRENH_indivuallyr...
4,chr1,1231960,1232156,chr1_1167340_1167536_MACS2STARRENH_indivuallyr...
...,...,...,...,...
8129,chr9,137579026,137579210,chr9_140473478_140473662_MACS2STARRENH_indivua...
8130,chr9,137590556,137590720,chr9_140485008_140485172_MACS2STARRENH_indivua...
8131,chr9,137605159,137605312,chr9_140499611_140499764_MACS2STARRENH_indivua...
8132,chr9,137628626,137628829,chr9_140523078_140523281_MACS2STARRENH_indivua...


Write the GRCh38 enhancer and NRE coordinate dataframes to tab-separated text files (no header, no index).

In [76]:
enh_grch_38.to_csv('data/enh_cd4_coords.txt', sep='\t', index=False, header=None)
nre_grch_38.to_csv('data/nre_cd4_coords.txt', sep='\t', index=False, header=None)

***

Overlap analysis

In [None]:
# Column names for the headerless overlap tables read in the next cell.
# Going by the names: the first four columns describe the STARR-seq region
# (enhancer or NRE), the last four the LCL MPRA entry and its SNP id.
header_enh = ['chromosome_enh', 'start_enh', 'end_enh', 'enh_name','chromosome_lcl', 'start_lcl', 'end_lcl', 'snp']
header_nre = ['chromosome_nre', 'start_nre', 'end_nre', 'nre_name','chromosome_lcl', 'start_lcl', 'end_lcl', 'snp']

In [None]:
enh_overlap_nre = pd.read_csv('data/enh_overlap_with_nre.txt', sep='\t', header=None)
enh_overlap_sig_lcl = pd.read_csv('data/enh_overlap_sig_lcl.txt', sep='\t', header=None, names=header_enh)
enh_overlap_total_lcl = pd.read_csv('data/enh_overlap_total_lcl.txt', sep='\t', header=None, names=header_enh)
nre_overlap_sig_lcl = pd.read_csv('data/nre_overlap_sig_lcl.txt', sep='\t', header=None, names=header_nre)
nre_overlap_total_lcl = pd.read_csv('data/nre_overlap_total_lcl.txt', sep='\t', header=None, names=header_nre)

In [None]:
enh_overlap_sig_lcl

Unnamed: 0,chromosome_enh,start_enh,end_enh,enh_name,chromosome_lcl,start_lcl,end_lcl,snp
0,chr1,2546978,2547299,chr1_2478417_2478738_MACS2STARRENH_indivuallyr...,chr1,2546997,2547147,rs2477678
1,chr1,22025961,22026209,chr1_22352454_22352702_MACS2STARRENH_indivuall...,chr1,22025827,22025977,chr1:22352395:D
2,chr1,26169775,26169929,chr1_26496266_26496420_MACS2STARRENH_indivuall...,chr1,26169804,26169954,rs2232646
3,chr1,27914537,27914833,chr1_28241048_28241344_MACS2STARRENH_indivuall...,chr1,27914497,27914647,rs17185052
4,chr1,43389750,43389967,chr1_43855421_43855638_MACS2STARRENH_indivuall...,chr1,43389832,43389982,rs839753
...,...,...,...,...,...,...,...,...
108,chr9,113401339,113401554,chr9_116163619_116163834_MACS2STARRENH_indivua...,chr9,113401290,113401440,rs1771220
109,chr9,128275996,128276234,chr9_131038275_131038513_MACS2STARRENH_indivua...,chr9,128276221,128276371,rs115231472
110,chr9,136118927,136119153,chr9_139010773_139010999_MACS2STARRENH_indivua...,chr9,136118794,136118944,rs79362369
111,chr9,136403654,136403775,chr9_139298106_139298227_MACS2STARRENH_indivua...,chr9,136403686,136403836,rs80029103


In [None]:
enh_overlap_total_lcl

Unnamed: 0,chromosome_enh,start_enh,end_enh,enh_name,chromosome_lcl,start_lcl,end_lcl,snp
0,chr1,2546978,2547299,chr1_2478417_2478738_MACS2STARRENH_indivuallyr...,chr1,2546997,2547147,rs2477678
1,chr1,22025961,22026209,chr1_22352454_22352702_MACS2STARRENH_indivuall...,chr1,22025827,22025977,chr1:22352395:D
2,chr1,26169775,26169929,chr1_26496266_26496420_MACS2STARRENH_indivuall...,chr1,26169804,26169954,rs2232646
3,chr1,27914537,27914833,chr1_28241048_28241344_MACS2STARRENH_indivuall...,chr1,27914497,27914647,rs17185052
4,chr1,43389750,43389967,chr1_43855421_43855638_MACS2STARRENH_indivuall...,chr1,43389832,43389982,rs839753
...,...,...,...,...,...,...,...,...
161,chr9,113401339,113401554,chr9_116163619_116163834_MACS2STARRENH_indivua...,chr9,113401290,113401440,rs1771220
162,chr9,128275996,128276234,chr9_131038275_131038513_MACS2STARRENH_indivua...,chr9,128276221,128276371,rs115231472
163,chr9,136118927,136119153,chr9_139010773_139010999_MACS2STARRENH_indivua...,chr9,136118794,136118944,rs79362369
164,chr9,136403654,136403775,chr9_139298106_139298227_MACS2STARRENH_indivua...,chr9,136403686,136403836,rs80029103


In [None]:
f'Unique enhancer (starr) coordinates that overlap with significant lcl mpra coordinates: {len(enh_overlap_sig_lcl.enh_name.value_counts())}'

'Unique enhancer (starr) coordinates that overlap with significant lcl mpra coordinates: 99'

In [None]:
f'Unique enhancer(starr) coordinates that overlap with all lcl mpra coordinates: {len(enh_overlap_total_lcl.enh_name.value_counts())}'

'Unique enhancer (starr) coordinates that overlap with lcl mpra: 138'

In [None]:
#Check the names of enhancer coordinates that are not in the overlap with significant lcl mpra's dataframe
not_overlapping = enh_overlap_total_lcl[~enh_overlap_total_lcl.enh_name.isin(enh_overlap_sig_lcl.enh_name)]
f'Unique enhancer coordinates that overlap with total and not significant lcl mpra coordinates: {len(not_overlapping.enh_name.value_counts())}'

'Unique enhancer coordinates that overlap with total and not significant lcl mpra coordinates: 39'

In [None]:
# Select the rows of the total enhancer/LCL overlap whose SNP id does not occur
# in the significant enhancer/LCL overlap, then count the distinct SNP ids.
# Note: this rebinds `not_overlapping`, replacing the enhancer-name based
# selection stored under the same name by the previous cell.
not_overlapping = enh_overlap_total_lcl[~enh_overlap_total_lcl.snp.isin(enh_overlap_sig_lcl.snp)]
f'Unique snps that are in total and not significant lcl snps: {len(not_overlapping.snp.value_counts())}'

'Unique snps that are in total and not significant lcl snps: 53'

- enhancer - sig lcl mpra: 99 coordinates overlap
- enhancer - total lcl mpra: 138 coordinates overlap
- difference in overlap coordinates: 39

In [None]:
f'Unique nre (starr) coordinates that overlap with significant lcl mpra coordinates: {len(nre_overlap_sig_lcl.nre_name.value_counts())}'


'Unique enhancer (starr) coordinates that overlap with significant lcl mpra coordinates: 79'

In [None]:
# Number of distinct NRE regions in the NRE / total-LCL overlap table.
# The label says "nre" because this cell counts `nre_name`, not enhancer names.
f'Unique nre (starr) coordinates that overlap with all lcl mpra coordinates: {nre_overlap_total_lcl.nre_name.nunique()}'

'Unique enhancer(starr) coordinates that overlap with all lcl mpra coordinates: 137'

In [None]:
# Select the NRE regions that appear in the NRE / total-LCL overlap table but
# not in the NRE / significant-LCL overlap table, and count the distinct names.
not_overlapping_nre = nre_overlap_total_lcl[~nre_overlap_total_lcl.nre_name.isin(nre_overlap_sig_lcl.nre_name)]
f'Unique nre coordinates that overlap with total and not significant lcl mpra coordinates: {not_overlapping_nre.nre_name.nunique()}'

'Unique enhancer coordinates that overlap with total and not significant lcl mpra coordinates: 58'

In [None]:
#enh_overlap_sig_lcl[enh_overlap_sig_lcl['enh_name'] == 'chr6_26474036_26474615_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_163788']

In [None]:
#enh_overlap_total_lcl[enh_overlap_total_lcl['enh_name'] == 'chr6_26474036_26474615_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_163788']

Enhancers were called using MACS2 in which the plasmid reads were used as a control, [`macs2 callpeak -f BEDPE --keep-dup all -g 2.7E9`], and with -log10(P-value) greater than 75. NRE were also detected with MACS2, using the plasmid reads as treatment and the STARR RNA as control [`macs2 callpeak -f BEDPE --keep-dup all -g 2.7E9`], with -log10(P-value) greater than 30. Largely similar results can be obtained using FAST-NR [77]. Both enhancers and NRE are retained only if intersecting an ATAC-Seq narrowPeak called using MACS2 from the plasmid reads alone.

All final values and CRE locations are listed in Supplemental Table 2. Detailed data processing script can be found at https://github.com/Barski-lab/Lenti-STARR

In [114]:
sig_cd4qtls = pd.read_csv('data/cd4_qtl.csv', sep='\t')

In [130]:
sig_cd4qtls

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_chromosome,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org
0,RPS26,12:56007301:G:A,5.750549e-61,1.525292,0.277239,3.875870e-58,12,56041351,56044697,ENSG00000197728,...,12,56007301,G,1,0.447368,0.509556,12:56007301:G:A-RPS26,16.472841,217721,16.886385
1,SMDT1,22:42074313:T:C,3.718128e-59,-1.446559,0.271777,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,22,42074313,T,1,0.342105,1.000000,22:42074313:T:C-SMDT1,-16.218728,217721,-16.704826
2,SMDT1,22:42080766:A:T,4.780513e-59,1.449479,0.272295,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,22,42080766,A,1,0.342105,1.000000,22:42080766:A:T-SMDT1,16.203283,217721,16.682376
3,SMDT1,22:42080750:A:C,4.832899e-59,1.447290,0.271892,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,22,42080750,A,1,0.342105,1.000000,22:42080750:A:C-SMDT1,16.202613,217721,16.681792
4,SMDT1,22:42078134:C:G,4.860768e-59,1.445682,0.271640,1.774930e-57,22,42079691,42084284,ENSG00000183172,...,22,42078134,C,1,0.342105,1.000000,22:42078134:C:G-SMDT1,16.202260,217721,16.680695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199622,PEA15,1:160013969:C:T,1.194720e-03,0.595101,0.254633,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,1,160013969,C,1,0.171053,1.000000,1:160013969:C:T-PEA15,3.240138,217721,2.913457
199623,PEA15,1:160015506:GA:G,1.196031e-03,0.557679,0.255824,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,1,160015506,GA,1,0.184211,1.000000,1:160015506:GA:G-PEA15,3.239825,217721,2.927943
199624,PEA15,1:160027419:C:G,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,1,160027419,C,1,0.184211,1.000000,1:160027419:C:G-PEA15,3.235851,217721,2.932173
199625,PEA15,1:160022992:T:C,1.212807e-03,0.582930,0.252949,2.580576e-02,1,160205380,160215376,ENSG00000162734,...,1,160022992,T,1,0.184211,1.000000,1:160022992:T:C-PEA15,3.235851,217721,2.932173


In [131]:
# Convert the SNP chromosome labels to the 'chr<N>' string form used in the
# `chromosome` column of the STARR-seq coordinate tables, so that
# `snp_position_in_mpra` can compare the two columns directly.
# An existing 'chr' prefix is stripped first, so re-running this cell does not
# produce 'chrchr<N>'.
sig_cd4qtls = sig_cd4qtls.astype({'snp_chromosome':'str'})
sig_cd4qtls['snp_chromosome'] = 'chr' + sig_cd4qtls['snp_chromosome'].str.replace(r'^chr', '', regex=True)

In [133]:
sig_cd4qtls.dtypes

feature_id                    object
snp_id                        object
p_value                      float64
beta                         float64
beta_se                      float64
empirical_feature_p_value    float64
feature_chromosome             int64
feature_start                  int64
feature_end                    int64
ENSG                          object
biotype                       object
n_samples                      int64
n_e_samples                    int64
alpha_param                  float64
beta_param                   float64
snp_chromosome                object
snp_position                   int64
assessed_allele               object
call_rate                      int64
maf                          float64
hwe_p                        float64
QTL                           object
z_score                      float64
weight                         int64
z_score_org                  float64
dtype: object

In [134]:
enh_grch_38

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,844682,844790,chr1_780062_780170_MACS2STARRENH_indivuallyrmD...
1,chr1,1116100,1116462,chr1_1051480_1051842_MACS2STARRENH_indivuallyr...
2,chr1,1201502,1201726,chr1_1136882_1137106_MACS2STARRENH_indivuallyr...
3,chr1,1208078,1208173,chr1_1143458_1143553_MACS2STARRENH_indivuallyr...
4,chr1,1231960,1232156,chr1_1167340_1167536_MACS2STARRENH_indivuallyr...
...,...,...,...,...
8129,chr9,137579026,137579210,chr9_140473478_140473662_MACS2STARRENH_indivua...
8130,chr9,137590556,137590720,chr9_140485008_140485172_MACS2STARRENH_indivua...
8131,chr9,137605159,137605312,chr9_140499611_140499764_MACS2STARRENH_indivua...
8132,chr9,137628626,137628829,chr9_140523078_140523281_MACS2STARRENH_indivua...


In [144]:
def get_all_snp_ids_from_overlapdict(overlap_dict):
    """Flatten the values of an overlap dictionary into a single list.

    Parameters
    ----------
    overlap_dict : dict
        Mapping whose values are iterables of SNP ids (for example the result
        of `snp_position_in_mpra`).

    Returns
    -------
    list
        Every SNP id from every value, in dictionary order. Ids that occur
        under several keys (or several times under one key) are repeated.
    """
    flattened_ids = []
    for ids_for_region in overlap_dict.values():
        flattened_ids.extend(ids_for_region)
    return flattened_ids

In [None]:
def snp_position_in_mpra(mpraseq_dataframe, snp_dataframe, colname_seq_name, snp_id_colname):
    """Map each region to the ids of the SNPs that lie inside it.

    A SNP lies inside a region when its `snp_chromosome` equals the region's
    `chromosome` and `start_coord <= snp_position <= end_coord` (both bounds
    inclusive).

    Parameters
    ----------
    mpraseq_dataframe : pandas.DataFrame
        Regions; must contain the columns `chromosome`, `start_coord`,
        `end_coord` and `colname_seq_name`.
    snp_dataframe : pandas.DataFrame
        SNPs; must contain the columns `snp_chromosome`, `snp_position` and
        `snp_id_colname`. Chromosome labels must use the same notation as
        `mpraseq_dataframe['chromosome']`, otherwise nothing matches.
    colname_seq_name : str
        Column of `mpraseq_dataframe` whose value becomes the dictionary key.
    snp_id_colname : str
        Column of `snp_dataframe` whose values are collected.

    Returns
    -------
    dict
        `{region name: [snp ids]}` for regions with at least one SNP inside.
        Ids keep the row order of `snp_dataframe` and are not de-duplicated.
        If a region name occurs more than once, the last matching region wins.
    """
    # Index the SNP table once: per chromosome keep the positions sorted, the
    # permutation that sorted them, and the ids in original row order. Each
    # region is then answered with two binary searches instead of a scan of
    # the whole SNP table.
    snps_by_chrom = {}
    for chrom, chrom_snps in snp_dataframe.groupby("snp_chromosome", sort=False):
        positions = chrom_snps["snp_position"].to_numpy()
        order = np.argsort(positions, kind="stable")
        snps_by_chrom[chrom] = (positions[order], order, chrom_snps[snp_id_colname].to_numpy())

    dict_mpraname_snp = {}
    regions = zip(
        mpraseq_dataframe[colname_seq_name],
        mpraseq_dataframe["chromosome"],
        mpraseq_dataframe["start_coord"],
        mpraseq_dataframe["end_coord"],
    )
    for name, chrom, start, end in regions:
        if chrom not in snps_by_chrom:
            continue
        sorted_positions, order, snp_ids = snps_by_chrom[chrom]

        # side="left"/"right" make both interval bounds inclusive.
        first = np.searchsorted(sorted_positions, start, side="left")
        last = np.searchsorted(sorted_positions, end, side="right")

        if first < last:
            # Re-sort the hit indices so the ids come out in SNP-table row order.
            hits = np.sort(order[first:last])
            dict_mpraname_snp[name] = snp_ids[hits].tolist()
    return dict_mpraname_snp

In [135]:
sig_cd4qtls_enhancer = snp_position_in_mpra(enh_grch_38, sig_cd4qtls, 'name', 'snp_id')

In [165]:
# Flatten the enhancer -> SNP-id mapping; the flattened list can contain the
# same id several times.
snp_ids_sig_cd4_enh = get_all_snp_ids_from_overlapdict(sig_cd4qtls_enhancer)
# De-duplicate (keeping first-seen order) so the variable holds what its name says.
snp_ids_sig_cd4_enh_unique = list(dict.fromkeys(snp_ids_sig_cd4_enh))
print('amount of rows', len(snp_ids_sig_cd4_enh))
print('unique ids', len(snp_ids_sig_cd4_enh_unique))

amount of rows 323
unique ids 231


In [166]:
sig_cd4qtls[sig_cd4qtls['snp_id'].isin(snp_ids_sig_cd4_enh_unique)]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_chromosome,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org
2942,XRRA1,11:74949098:G:C,1.882793e-50,-1.422397,0.279969,2.530878e-48,11,74807739,74949200,ENSG00000166435,...,chr11,74949098,G,1,0.263158,0.235435,11:74949098:G:C-XRRA1,-14.937363,217721,-15.276445
5130,ELP5,17:7205187:T:G,3.190423e-32,1.192322,0.298287,1.632024e-30,17,7251416,7259940,ENSG00000170291,...,chr17,7205187,T,1,0.315789,0.717019,17:7205187:T:G-ELP5,11.816954,217721,12.091769
6431,CBR3,21:36114244:A:C,5.010141e-30,0.958961,0.294934,7.732457e-27,21,36135079,36146562,ENSG00000159231,...,chr21,36114244,A,1,0.434211,0.526619,21:36114244:A:C-CBR3,11.384243,217721,11.110350
6556,RBM43,2:151261757:G:A,2.361397e-28,0.912972,0.202870,3.576534e-26,2,151247940,151261863,ENSG00000184898,...,chr2,151261757,G,1,0.473684,0.342222,2:151261757:G:A-RBM43,11.043321,217721,10.899083
6557,RBM43,2:151261761:A:G,2.398367e-28,0.912097,0.202591,3.576534e-26,2,151247940,151261863,ENSG00000184898,...,chr2,151261761,A,1,0.473684,0.342222,2:151261761:A:G-RBM43,11.041926,217721,10.896518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192250,DBT,1:99808794:A:C,2.957756e-03,0.237903,0.177475,2.267389e-02,1,100186919,100249834,ENSG00000137992,...,chr1,99808794,A,1,0.276316,0.418573,1:99808794:A:C-DBT,2.972095,217721,3.030142
194763,TSPYL1,6:116254181:T:G,3.068215e-03,0.161211,0.210881,2.368815e-02,6,116267760,116279930,ENSG00000189241,...,chr6,116254181,T,1,0.421053,0.331485,6:116254181:T:G-TSPYL1,2.960820,217721,2.877099
197800,CUL3,2:224585444:G:C,8.810405e-04,0.010443,0.159599,2.494977e-02,2,224470150,224585397,ENSG00000036257,...,chr2,224585444,G,1,0.434211,1.000000,2:224585444:G:C-CUL3,-3.325993,217721,-3.194663
198774,ZNF593,1:26233990:C:A,7.045794e-03,0.088995,0.248069,2.543969e-02,1,26169908,26170873,ENSG00000142684,...,chr1,26233990,C,1,0.381579,0.490409,1:26233990:C:A-ZNF593,2.694672,217721,2.748443


In [161]:
len(sig_cd4qtls_enhancer)

171

In [105]:
# Use a forward slash in the path: a backslash here forms the invalid escape
# sequence '\W' (a warning in recent Python versions) and only resolves on Windows.
cd4_qtl = pd.read_csv('eQTL/WMA_meta_CD4_T_qtl_results_fastApprox.txt', sep='\t')


In [106]:
total_cd4qtls_enhancer = snp_position_in_mpra(enh_grch_38, cd4_qtl, 'name', 'snp_id')

4625.6875 seconds


In [150]:
len(total_cd4qtls_enhancer)

734

In [145]:
snp_ids_total_enh = get_all_snp_ids_from_overlapdict(total_cd4qtls_enhancer)

In [158]:
unique_snp_ids_total_cd4_enh = list(set(snp_ids_total_enh))
print('amount of rows containing the snp ids',len(snp_ids_total_enh))
print('unique ids',len(unique_snp_ids_total_cd4_enh))

amount of rows containing the snp ids 7649
unique ids 956


In [156]:
cd4_qtl[cd4_qtl['snp_id'].isin(unique_snp_ids_total_cd4_enh)]

Unnamed: 0,V1,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,...,snp_chromosome,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org
2942,1046109,XRRA1,11:74949098:G:C,1.882793e-50,-1.422397,0.279969,2.530878e-48,11,74807739,74949200,...,11,74949098,G,1,0.263158,0.235435,11:74949098:G:C-XRRA1,-14.937363,217721,-15.276445
5130,2980678,ELP5,17:7205187:T:G,3.190423e-32,1.192322,0.298287,1.632024e-30,17,7251416,7259940,...,17,7205187,T,1,0.315789,0.717019,17:7205187:T:G-ELP5,11.816954,217721,12.091769
6431,5592525,CBR3,21:36114244:A:C,5.010141e-30,0.958961,0.294934,7.732457e-27,21,36135079,36146562,...,21,36114244,A,1,0.434211,0.526619,21:36114244:A:C-CBR3,11.384243,217721,11.110350
6556,5916853,RBM43,2:151261757:G:A,2.361397e-28,0.912972,0.202870,3.576534e-26,2,151247940,151261863,...,2,151261757,G,1,0.473684,0.342222,2:151261757:G:A-RBM43,11.043321,217721,10.899083
6557,5916852,RBM43,2:151261761:A:G,2.398367e-28,0.912097,0.202591,3.576534e-26,2,151247940,151261863,...,2,151261761,A,1,0.473684,0.342222,2:151261761:A:G-RBM43,11.041926,217721,10.896518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7773377,2102882,MAP4K5,14:50321495:C:T,4.219381e-01,-0.086227,0.190976,9.999950e-01,14,50418501,50561126,...,14,50321495,C,1,0.381579,0.738671,14:50321495:C:T-MAP4K5,0.803063,217721,0.762832
7774668,2069192,MAP4K5,14:49868395:C:G,9.280497e-01,0.428517,0.255656,9.999950e-01,14,50418501,50561126,...,14,49868395,C,1,0.157895,0.205598,14:49868395:C:G-MAP4K5,0.090299,217721,0.093199
7775441,8219253,HNRNPH1,5:179858752:G:A,3.089729e-02,0.408058,0.201010,9.999964e-01,5,179614178,179634784,...,5,179858752,G,1,0.302632,1.000000,5:179858752:G:A-HNRNPH1,2.158393,217721,1.850589
7776606,8290944,HNRNPH1,5:178730808:C:A,5.053853e-01,0.111531,0.279828,9.999964e-01,5,179614178,179634784,...,5,178730808,C,1,0.236842,1.000000,5:178730808:C:A-HNRNPH1,-0.666040,217721,-1.157159


In [169]:
# Show the enhancer regions that are keys of `total_cd4qtls_enhancer`, i.e.
# regions for which `snp_position_in_mpra` found at least one SNP of `cd4_qtl`.
# NOTE(review): the variable is named `..._sig_cd4` but is built from
# `total_cd4qtls_enhancer`, not `sig_cd4qtls_enhancer` — confirm which was intended.
enh_peaks_sig_cd4 =list(total_cd4qtls_enhancer.keys())
enh_grch_38[enh_grch_38['name'].isin(enh_peaks_sig_cd4)]

Unnamed: 0,chromosome,start_coord,end_coord,name
16,chr1,2391528,2391628,chr1_2322967_2323067_MACS2STARRENH_indivuallyr...
19,chr1,2546978,2547299,chr1_2478417_2478738_MACS2STARRENH_indivuallyr...
27,chr1,6602858,6603087,chr1_6662918_6663147_MACS2STARRENH_indivuallyr...
31,chr1,7961479,7961734,chr1_8021539_8021794_MACS2STARRENH_indivuallyr...
35,chr1,8409137,8409369,chr1_8469197_8469429_MACS2STARRENH_indivuallyr...
...,...,...,...,...
8044,chr9,126914546,126914792,chr9_129676825_129677071_MACS2STARRENH_indivua...
8075,chr9,129235986,129236124,chr9_131998265_131998403_MACS2STARRENH_indivua...
8113,chr9,135961356,135961555,chr9_138853202_138853401_MACS2STARRENH_indivua...
8117,chr9,136533360,136533513,chr9_139427812_139427965_MACS2STARRENH_indivua...


In [171]:
len(enh_grch_38)

8130

In [172]:
import re

In [None]:
#re.match('(?:[A-Z].+)', i)

In [173]:
# Recover the peak name by removing the leading '<chrom>_<start>_<end>_' part of
# each unique name. Anchoring on that prefix (rather than "everything from the
# first uppercase letter") keeps names on chrX / chrY / chrM intact, because the
# chromosome label itself contains an uppercase letter there.
names = [re.sub(r'^chr.+?_\d+_\d+_', '', unique_name, count=1) for unique_name in enh_grch_38['name']]

***

In [178]:
nre_grch_38

Unnamed: 0,chromosome,start_coord,end_coord,name
0,chr1,827261,827812,chr1_762641_763192_CALLNREreversecontrolTxMACS...
1,chr1,976113,976406,chr1_911493_911786_CALLNREreversecontrolTxMACS...
2,chr1,1013549,1013744,chr1_948929_949124_CALLNREreversecontrolTxMACS...
3,chr1,1053853,1054016,chr1_989233_989396_CALLNREreversecontrolTxMACS...
4,chr1,1217012,1217240,chr1_1152392_1152620_CALLNREreversecontrolTxMA...
...,...,...,...,...
6042,chr9,137086622,137087090,chr9_139981074_139981542_CALLNREreversecontrol...
6043,chr9,137611849,137612043,chr9_140506301_140506495_CALLNREreversecontrol...
6044,chr9,137618475,137618850,chr9_140512927_140513302_CALLNREreversecontrol...
6045,chr9,137682620,137682776,chr9_140577072_140577228_CALLNREreversecontrol...


In [177]:
sig_cd4qtls_nre = snp_position_in_mpra(nre_grch_38, sig_cd4qtls, 'name', 'snp_id')

69.03125 seconds


In [179]:
len(sig_cd4qtls_nre)

197

In [180]:
snp_ids_sig_cd4_nre = get_all_snp_ids_from_overlapdict(sig_cd4qtls_nre)

In [183]:
unique_snp_ids_sig_cd4_nre =list(set(snp_ids_sig_cd4_nre))
print(len(unique_snp_ids_sig_cd4_nre))

294


In [184]:
sig_cd4qtls[sig_cd4qtls['snp_id'].isin(unique_snp_ids_sig_cd4_nre)]

Unnamed: 0,feature_id,snp_id,p_value,beta,beta_se,empirical_feature_p_value,feature_chromosome,feature_start,feature_end,ENSG,...,snp_chromosome,snp_position,assessed_allele,call_rate,maf,hwe_p,QTL,z_score,weight,z_score_org
3240,CENPK,5:65624976:C:T,2.431329e-47,1.064431,0.218891,4.102135e-46,5,65517766,65563168,ENSG00000123219,...,chr5,65624976,C,1,0.421053,0.746864,5:65624976:C:T-CENPK,14.452107,217721,14.762291
3682,DNAJC15,13:43023729:A:G,1.631142e-46,1.582635,0.293102,8.562601e-44,13,43023203,43114213,ENSG00000120675,...,chr13,43023729,A,1,0.223684,0.651845,13:43023729:A:G-DNAJC15,14.320429,217721,14.452438
3705,DNAJC15,13:43023844:G:A,1.256548e-45,1.539551,0.298101,1.298433e-43,13,43023203,43114213,ENSG00000120675,...,chr13,43023844,G,1,0.210526,1.000000,13:43023844:G:A-DNAJC15,14.177840,217721,14.340994
3734,DNAJC15,13:43023796:C:G,2.232979e-45,1.573227,0.303667,2.018985e-43,13,43023203,43114213,ENSG00000120675,...,chr13,43023796,C,1,0.210526,1.000000,13:43023796:C:G-DNAJC15,14.137428,217721,14.305534
3792,DNAJC15,13:43023859:G:T,3.361593e-45,1.572161,0.303484,2.917863e-43,13,43023203,43114213,ENSG00000120675,...,chr13,43023859,G,1,0.210526,1.000000,13:43023859:G:T-DNAJC15,14.108606,217721,14.282614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192985,RPAIN,17:5419886:T:G,1.341848e-03,0.001005,0.328373,2.296052e-02,17,5419641,5432877,ENSG00000129197,...,chr17,5419886,T,1,0.355263,0.298242,17:5419886:T:G-RPAIN,-3.206876,217721,-3.474947
194554,SCRN2,17:48107564:C:T,3.926050e-03,0.255550,0.252840,2.361915e-02,17,47837692,47841289,ENSG00000141295,...,chr17,48107564,C,1,0.223684,0.651845,17:48107564:C:T-SCRN2,2.884043,217721,2.856411
194666,ATP5PO,21:33931469:A:G,1.229574e-03,0.389017,0.154086,2.366931e-02,21,33903453,33915814,ENSG00000241837,...,chr21,33931469,A,1,0.250000,0.664641,21:33931469:A:G-ATP5PO,3.231929,217721,2.956788
195408,GIMAP2,7:150676964:T:C,1.607268e-03,-0.288272,0.185411,2.389630e-02,7,150685697,150693641,ENSG00000106560,...,chr7,150676964,T,1,0.289474,1.000000,7:150676964:T:C-GIMAP2,3.154585,217721,3.207018


In [185]:
all_cd4qtls_nre = snp_position_in_mpra(nre_grch_38, cd4_qtl, 'name', 'snp_id')

***
overlap analysis

In [None]:
lcl_analyis.head()


Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
0,rs11810220,chr1,163311300,163311225,163311375
1,rs11585048,chr1,2602648,2602573,2602723
2,rs11585844,chr1,37563668,37563593,37563743
3,rs11587500,chr1,24190390,24190315,24190465
4,rs11588318,chr1,200669534,200669459,200669609


In [None]:
lcl_analyis[lcl_analyis['snp_position'] == 15552942]

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
161,rs113612815,chr1,15552942,15552867,15553017


In [None]:
#dict_mpraname_snp

{'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_92371': ['rs73153267'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_148824': ['rs35188965'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_151006': ['chr5:34008206:D'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_78610': ['rs117140391'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_42758': ['rs3782235'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_33270': ['rs3884627']}

In [None]:
# Use a forward slash in the path: a backslash here forms the invalid escape
# sequence '\c' (a warning in recent Python versions) and only resolves on Windows.
significant_cd4_qtls = pd.read_csv('data/cd4_qtl.csv', sep='\t')


In [None]:
significant_cd4_qtls['snp_chromosome'].dtype

dtype('int64')

In [None]:
#significant_cd4_qtls

In [None]:
#print(len(dict_mpraname_snp.keys()))

6


In [None]:
# i1l = []
# for idx, row in enh.iterrows():
#     chrom = row['chromosome']
#     start1 = row['start_coord']
#     end1 = row['end_coord']
#     i1 = pd.Interval(start1, end1)
#     i1l.append(i1)

    # for idx, row in lcl_analyis.iterrows():
    #     snp_chrom = row['snp_chromosome']
    #     start2 = row['start_coord']
    #     end2 = row['end_coord']
    #     i2 = pd.Interval(start2, end2)
        

In [None]:
enh

Unnamed: 0,name,chromosome,start_coord,end_coord,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
5832,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr4,51652,51860,2106,2.65842,213.66156,210.61075,2.450473e-211,86
2899,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr16,103666,103886,1051,1.49737,107.86304,105.16298,6.871001e-106,38
3641,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr18,108139,108488,6508,3.71818,655.22162,650.82867,0.000000e+00,251
5833,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr4,124232,124450,889,1.74005,91.54394,88.90956,1.231516e-89,45
3642,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr18,158293,158666,1030,1.71547,105.71258,103.02120,9.523575e-104,190
...,...,...,...,...,...,...,...,...,...,...
867,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,247553533,247553935,888,2.17502,91.48414,88.84994,1.412733e-89,109
868,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,247603599,247603768,1436,3.82628,146.43970,143.60269,2.496376e-144,104
869,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,249132303,249132632,2141,1.87178,217.24672,214.18452,6.538528e-215,66
870,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr1,249168320,249168681,2751,2.84091,278.38608,275.13571,7.316275e-276,159


In [None]:
#dict_mpraname_snp

{'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_92371': ['rs73153267'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_148824': ['rs35188965'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_151006': ['chr5:34008206:D'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_78610': ['rs117140391'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_42758': ['rs3782235'],
 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_33270': ['rs3884627']}

In [None]:
overlap_snp =['rs73153267', 'rs35188965', 'chr5:34008206:D', 'rs117140391', 'rs3782235','rs3884627' ]
overlap_names = [ 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_92371',
                 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_148824',
                 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_151006',
                 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_78610',
                 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_42758',
                 'MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_33270']

In [None]:
lcl_analyis[lcl_analyis['snp'].isin(overlap_snp)].sort_values(by='chromosome')

Unnamed: 0,snp,chromosome,snp_position,start_coord,end_coord
1014,rs3884627,chr11,69258332,69258257,69258407
2703,rs3782235,chr12,56521763,56521688,56521838
3123,rs117140391,chr17,45772421,45772346,45772496
2048,rs73153267,chr2,676782,676707,676857
1240,rs35188965,chr5,1104823,1104748,1104898
3423,chr5:34008206:D,chr5,34008101,34008026,34008176


In [None]:
enh[enh['name'].isin(overlap_names)].sort_values(by='chromosome')

Unnamed: 0,name,chromosome,start_coord,end_coord,score,signalValue,pvalue(-log10),qvalue(-log10),qvalue,peak
1496,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr11,69258237,69258346,2280,3.25933,231.1507,228.04617,8.991456e-229,52
1882,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr12,56521690,56522010,1969,1.53676,199.95511,196.94754,1.128392e-197,244
3425,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr17,45772365,45772577,1271,3.99661,129.9053,127.12438,7.509655000000001e-128,126
4248,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr2,676757,676916,1422,1.5924,145.04224,142.20995,6.16666e-143,106
6150,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr5,1104808,1105008,789,1.97361,81.57341,78.98267,1.0407109999999999e-79,251
6182,MACS2STARRENH_indivuallyrmDupat75thpercentile0...,chr5,34007813,34008139,790,1.73286,81.64395,79.05278,8.855641e-80,76
