# Determine background (non-significant regions) for enrichment analysis of Lenti-STARR MPRA
Further exploring the plasmid reads of the MPRA to determine the non-significant regions <br>
found out reads do not cover the whole annotated regions by Kurtis & Barski (2023) <br>
Final method decided on increasing the range of the reads to cover the whole annotated regions


In [None]:
def increase_range_plasmid(df, left_shift, right_shift, location):
    df['start_plasmid'] = [i - left_shift for i in df['start_plasmid']]
    df['end_plasmid'] = [i + right_shift for i in df['end_plasmid']]

    name = f'L{left_shift}R{right_shift}_enh_plasmid.txt'
    path = location + '/' + name
    df.to_csv(path, header=None, sep='\t', index=False)
    return df

In [12]:
import pandas as pd
import hvplot.pandas
import glob

In [14]:
plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid_bed.txt', sep='\t', header=None)

***
### grch38 plasmid coordinates
load in converted coordinates into pandas dataframes

In [57]:
path = r'C:/Users/annav/Documents/Stage/data/bed_files/plasmid/converted'
def load_files_into_df(path):
    files = glob.glob(path + '/*.bed')
    dfs = [pd.read_csv(f, header=None, sep='\t') for f in files]
    return dfs

In [59]:
chr_dataframes = load_files_into_df(path)

In [60]:
for i in range(0,len(chr_dataframes)):
    chr_dataframes[i] = chr_dataframes[i].sort_values(by=[0,1])

Combine seperate dataframes into one dataframe

In [61]:
converted_plasmid = pd.concat(chr_dataframes,ignore_index=True)

In [62]:
converted_plasmid

Unnamed: 0,0,1,2,3,4
0,chr1,10027,10036,0.013659,0
1,chr1,10039,10048,0.040976,1
2,chr1,10109,10118,0.040976,2
3,chr1,10128,10137,0.013659,3
4,chr1,10158,10165,0.013659,4
...,...,...,...,...,...
7281963,chr9,138217605,138217614,0.314150,7307251
7281964,chr9,138220317,138220326,0.013659,7307252
7281965,chr9,138220555,138220564,0.013659,7307253
7281966,chr9,138237176,138237185,0.150246,7307254


In [None]:
# See if merging the plasmid reads on the positions makes a difference
converted_plasmid_to_merge = converted_plasmid[[ 0, 1,2, 3]]
#converted_plasmid_to_merge = converted_plasmid_to_merge.sort_values(by=0)
converted_plasmid_to_merge.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/merge_plasmid_coverage.txt', sep='\t', header=None, index=False)

In [64]:
converted_plasmid[(converted_plasmid[0] == 'chr1') & (converted_plasmid[1] <= 494497)]

Unnamed: 0,0,1,2,3,4
0,chr1,10027,10036,0.013659,0
1,chr1,10039,10048,0.040976,1
2,chr1,10109,10118,0.040976,2
3,chr1,10128,10137,0.013659,3
4,chr1,10158,10165,0.013659,4
...,...,...,...,...,...
69,chr1,268968,268977,0.013659,69
70,chr1,269197,269206,0.013659,70
71,chr1,462916,462925,0.013659,74
72,chr1,463312,463321,0.013659,73


In [65]:
converted_plasmid[converted_plasmid.index <= 473993]

Unnamed: 0,0,1,2,3,4
0,chr1,10027,10036,0.013659,0
1,chr1,10039,10048,0.040976,1
2,chr1,10109,10118,0.040976,2
3,chr1,10128,10137,0.013659,3
4,chr1,10158,10165,0.013659,4
...,...,...,...,...,...
473989,chr1,161132793,161132795,2.199050,476938
473990,chr1,161132795,161132797,1.898560,476939
473991,chr1,161132797,161132799,1.447820,476940
473992,chr1,161132799,161132800,1.570750,476941


In [67]:
#converted_plasmid.to_csv(path+'/grch38_plasmid.bed', sep='\t', header=None, index=False)

In [68]:
print(len(plasmid))
print(len(converted_plasmid))

7307256
7281968


In [69]:
print(len(plasmid) - len(converted_plasmid))

25288


***
Test enhancer length with combined reads

In [72]:
enh_plasmid_overlap = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/bedtools/active_sequences_enh_grch38.txt',sep='\t', header=None)

In [110]:
enh_plasmid_overlap =enh_plasmid_overlap.rename(columns={0:'chr_enh', 1:'start_enh', 2:'end_enh',3:'name', 4:'chr_plasmid',5:'start_plasmid', 6:'end_plasmid', 7:'coverage', 8:'id'})

In [111]:
enh_plasmid_overlap['id'].value_counts()

4517487    1
4335567    1
3733162    1
3078156    1
6816155    1
          ..
6888244    1
1851729    1
4501609    1
6888282    1
2872932    1
Name: id, Length: 571970, dtype: int64

In [112]:
enh_plasmid_overlap['length_plasmid'] = (enh_plasmid_overlap['end_plasmid'] - enh_plasmid_overlap['start_plasmid']) +1

In [113]:
enh_plasmid_max = enh_plasmid_overlap.drop_duplicates(subset='name', keep='last')
enh_plasmid_min = enh_plasmid_overlap.drop_duplicates(subset='name', keep='first')

In [114]:
enh_plasmid_max = enh_plasmid_max.sort_values(by='coverage')
enh_plasmid_min = enh_plasmid_min.sort_values(by='coverage')

In [115]:
select_cov_min = [np.log(i +1) for i in enh_plasmid_min['coverage']]
enh_plasmid_min['log_coverage'] = select_cov_min
enh_plasmid_min

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage
374565,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,0.013566
256318,chr17,75589174,75589268,chr17_73585255_73585349_MACS2STARRENH_indivual...,chr17,75589213,75589215,0.013659,3078734,3,0.013566
33741,chr1,224330221,224330411,chr1_224517923_224518113_MACS2STARRENH_indivua...,chr1,224330374,224330379,0.013659,642857,6,0.013566
329704,chr2,201127641,201127801,chr2_201992364_201992524_MACS2STARRENH_indivua...,chr2,201127682,201127691,0.013659,4122641,10,0.013566
489520,chr6,31197090,31197350,chr6_31164867_31165127_MACS2STARRENH_indivuall...,chr6,31197126,31197135,0.013659,5975401,10,0.013566
...,...,...,...,...,...,...,...,...,...,...,...
289354,chr19,36054433,36054538,chr19_36545335_36545440_MACS2STARRENH_indivual...,chr19,36054460,36054462,1.475140,3483846,3,0.906297
389050,chr3,115147676,115147785,chr3_114866523_114866632_MACS2STARRENH_indivua...,chr3,115147708,115147709,1.584410,4926114,2,0.949497
324951,chr2,175168369,175168476,chr2_176033097_176033204_MACS2STARRENH_indivua...,chr2,175168444,175168445,1.802950,4062919,2,1.030672
256454,chr17,7572613,7572715,chr17_7475931_7476033_MACS2STARRENH_indivually...,chr17,7572637,7572638,1.802950,2818636,2,1.030672


In [116]:
select_cov_max = [np.log(i +1) for i in enh_plasmid_max['coverage']]
enh_plasmid_max['log_coverage'] = select_cov_max
enh_plasmid_max

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage
258430,chr17,77935251,77935394,chr17_75931333_75931476_MACS2STARRENH_indivual...,chr17,77935385,77935394,0.040976,3097052,10,0.040159
558132,chr9,137450483,137450632,chr9_140344935_140345084_MACS2STARRENH_indivua...,chr9,137450630,137450631,0.081952,7304125,2,0.078767
483060,chr6,25137653,25137667,chr6_25137881_25137895_MACS2STARRENH_indivuall...,chr6,25137663,25137667,0.081952,5936799,5,0.078767
23126,chr1,181718630,181718916,chr1_181687766_181688052_MACS2STARRENH_indivua...,chr1,181718630,181718635,0.095611,534528,6,0.091312
366274,chr20,60011979,60012083,chr20_58587034_58587138_MACS2STARRENH_indivual...,chr20,60012074,60012083,0.109270,4400602,10,0.103702
...,...,...,...,...,...,...,...,...,...,...,...
57756,chr1,7961479,7961734,chr1_8021539_8021794_MACS2STARRENH_indivuallyr...,chr1,7961561,7961563,40.129300,37375,3,3.716721
317819,chr2,117814445,117814700,chr2_118572021_118572276_MACS2STARRENH_indivua...,chr2,117814527,117814528,40.498000,3945415,2,3.725645
35206,chr1,228139888,228140117,chr1_228327589_228327818_MACS2STARRENH_indivua...,chr1,228140034,228140035,43.257100,659519,2,3.790016
8462,chr1,145095617,145095818,chr1_143913324_143913525_MACS2STARRENH_indivua...,chr1,145095644,145095645,43.994700,391041,2,3.806545


In [117]:
enh_plasmid_max['length_enh'] = (enh_plasmid_max['end_enh'] - enh_plasmid_max['start_enh']) +1
enh_plasmid_min['length_enh'] = (enh_plasmid_min['end_enh'] - enh_plasmid_min['start_enh']) +1

In [118]:
enh_plasmid_min

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage,length_enh
374565,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,0.013566,288
256318,chr17,75589174,75589268,chr17_73585255_73585349_MACS2STARRENH_indivual...,chr17,75589213,75589215,0.013659,3078734,3,0.013566,95
33741,chr1,224330221,224330411,chr1_224517923_224518113_MACS2STARRENH_indivua...,chr1,224330374,224330379,0.013659,642857,6,0.013566,191
329704,chr2,201127641,201127801,chr2_201992364_201992524_MACS2STARRENH_indivua...,chr2,201127682,201127691,0.013659,4122641,10,0.013566,161
489520,chr6,31197090,31197350,chr6_31164867_31165127_MACS2STARRENH_indivuall...,chr6,31197126,31197135,0.013659,5975401,10,0.013566,261
...,...,...,...,...,...,...,...,...,...,...,...,...
289354,chr19,36054433,36054538,chr19_36545335_36545440_MACS2STARRENH_indivual...,chr19,36054460,36054462,1.475140,3483846,3,0.906297,106
389050,chr3,115147676,115147785,chr3_114866523_114866632_MACS2STARRENH_indivua...,chr3,115147708,115147709,1.584410,4926114,2,0.949497,110
324951,chr2,175168369,175168476,chr2_176033097_176033204_MACS2STARRENH_indivua...,chr2,175168444,175168445,1.802950,4062919,2,1.030672,108
256454,chr17,7572613,7572715,chr17_7475931_7476033_MACS2STARRENH_indivually...,chr17,7572637,7572638,1.802950,2818636,2,1.030672,103


In [119]:
dict_enh_value_counts = dict(enh_plasmid_overlap['name'].value_counts())

In [120]:
enh_plasmid_min = pd.merge(enh_plasmid_min, pd.DataFrame(dict_enh_value_counts.items(), columns=['name', 'count']), on='name' )

In [121]:
enh_plasmid_min

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage,length_enh,count
0,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,0.013566,288,33
1,chr17,75589174,75589268,chr17_73585255_73585349_MACS2STARRENH_indivual...,chr17,75589213,75589215,0.013659,3078734,3,0.013566,95,34
2,chr1,224330221,224330411,chr1_224517923_224518113_MACS2STARRENH_indivua...,chr1,224330374,224330379,0.013659,642857,6,0.013566,191,23
3,chr2,201127641,201127801,chr2_201992364_201992524_MACS2STARRENH_indivua...,chr2,201127682,201127691,0.013659,4122641,10,0.013566,161,14
4,chr6,31197090,31197350,chr6_31164867_31165127_MACS2STARRENH_indivuall...,chr6,31197126,31197135,0.013659,5975401,10,0.013566,261,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8125,chr19,36054433,36054538,chr19_36545335_36545440_MACS2STARRENH_indivual...,chr19,36054460,36054462,1.475140,3483846,3,0.906297,106,81
8126,chr3,115147676,115147785,chr3_114866523_114866632_MACS2STARRENH_indivua...,chr3,115147708,115147709,1.584410,4926114,2,0.949497,110,85
8127,chr2,175168369,175168476,chr2_176033097_176033204_MACS2STARRENH_indivua...,chr2,175168444,175168445,1.802950,4062919,2,1.030672,108,81
8128,chr17,7572613,7572715,chr17_7475931_7476033_MACS2STARRENH_indivually...,chr17,7572637,7572638,1.802950,2818636,2,1.030672,103,84


In [122]:
enh_plasmid_overlap[enh_plasmid_overlap['coverage'] ==enh_plasmid_min.iloc[0,7]].sort_values(by='length_plasmid')

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid
247208,chr17,50169386,50169571,chr17_48246747_48246932_MACS2STARRENH_indivual...,chr17,50169397,50169398,0.013659,2994711,2
34481,chr1,226739278,226739598,chr1_226926979_226927299_MACS2STARRENH_indivua...,chr1,226739573,226739574,0.013659,654413,2
95874,chr11,14644160,14644403,chr11_14665706_14665949_MACS2STARRENH_indivual...,chr11,14644338,14644339,0.013659,1111871,2
238572,chr17,35521543,35521986,chr17_33848562_33849005_MACS2STARRENH_indivual...,chr17,35521833,35521834,0.013659,2909230,2
133827,chr12,31890676,31890858,chr12_32043610_32043792_MACS2STARRENH_indivual...,chr12,31890712,31890713,0.013659,1533796,2
...,...,...,...,...,...,...,...,...,...,...
177680,chr14,50542985,50543140,chr14_51009703_51009858_MACS2STARRENH_indivual...,chr14,50543076,50543090,0.013659,2096040,15
241179,chr17,40565264,40565714,chr17_38721516_38721966_MACS2STARRENH_indivual...,chr17,40565624,40565641,0.013659,2937497,18
400386,chr3,179322655,179322951,chr3_179040443_179040739_MACS2STARRENH_indivua...,chr3,179322825,179322843,0.013659,5067999,19
198220,chr15,57561210,57561441,chr15_57853408_57853639_MACS2STARRENH_indivual...,chr15,57561409,57561427,0.013659,2366628,19


In [123]:
enh_plasmid_overlap['length_plasmid'].sum()

1842973

In [124]:
enh_plasmid_overlap['length_enh'] = (enh_plasmid_overlap['end_enh'] - enh_plasmid_overlap['start_enh']) +1

In [125]:
d =enh_plasmid_overlap.drop_duplicates(subset='name', keep='first')

In [126]:
d['length_enh'].sum()

1657435

In [128]:
#enh_plasmid_max = enh_plasmid_max.merge(pd.DataFrame(dict_enh_value_counts.items(), columns=['name', 'count']), on='name', how='left')
enh_plasmid_max = pd.merge(enh_plasmid_max, pd.DataFrame(dict_enh_value_counts.items(), columns=['name', 'count']), on='name' )

In [129]:
enh_plasmid_min[enh_plasmid_min['name'] == 'chr22_18484355_18484642_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_118006']

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage,length_enh,count
0,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,0.013566,288,33


In [130]:
enh_plasmid_max[enh_plasmid_max['name'] == 'chr22_18484355_18484642_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_118006']

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,log_coverage,length_enh,count
1854,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001808,18001812,1.40685,4517490,5,0.878319,288,33


In [131]:
enh_plasmid_max = enh_plasmid_max.drop('id', axis=True)

In [132]:
enh_plasmid_max.sort_values(by='coverage')

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,length_plasmid,log_coverage,length_enh,count
0,chr17,77935251,77935394,chr17_75931333_75931476_MACS2STARRENH_indivual...,chr17,77935385,77935394,0.040976,10,0.040159,144,2
1,chr9,137450483,137450632,chr9_140344935_140345084_MACS2STARRENH_indivua...,chr9,137450630,137450631,0.081952,2,0.078767,150,19
2,chr6,25137653,25137667,chr6_25137881_25137895_MACS2STARRENH_indivuall...,chr6,25137663,25137667,0.081952,5,0.078767,15,4
3,chr1,181718630,181718916,chr1_181687766_181688052_MACS2STARRENH_indivua...,chr1,181718630,181718635,0.095611,6,0.091312,287,4
4,chr20,60011979,60012083,chr20_58587034_58587138_MACS2STARRENH_indivual...,chr20,60012074,60012083,0.109270,10,0.103702,105,2
...,...,...,...,...,...,...,...,...,...,...,...,...
8125,chr1,7961479,7961734,chr1_8021539_8021794_MACS2STARRENH_indivuallyr...,chr1,7961561,7961563,40.129300,3,3.716721,256,152
8126,chr2,117814445,117814700,chr2_118572021_118572276_MACS2STARRENH_indivua...,chr2,117814527,117814528,40.498000,2,3.725645,256,138
8127,chr1,228139888,228140117,chr1_228327589_228327818_MACS2STARRENH_indivua...,chr1,228140034,228140035,43.257100,2,3.790016,230,179
8128,chr1,145095617,145095818,chr1_143913324_143913525_MACS2STARRENH_indivua...,chr1,145095644,145095645,43.994700,2,3.806545,202,149


In [133]:
enh_plasmid_min.iloc[0,3]

'chr22_18484355_18484642_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_118006'

In [134]:
enh_plasmid_overlap[enh_plasmid_overlap['name'] == enh_plasmid_min.iloc[0,3]].sort_values(by='start_plasmid')

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,length_enh
374550,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001582,18001591,0.396102,4517472,10,288
374551,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001591,18001600,0.068293,4517473,10,288
374552,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001629,18001638,0.505372,4517474,10,288
374553,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001659,18001668,0.505372,4517475,10,288
374554,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001679,18001688,0.163904,4517476,10,288
374555,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001711,18001720,0.669276,4517477,10,288
374556,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001736,18001745,0.163904,4517478,10,288
374557,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001745,18001754,0.218539,4517479,10,288
374558,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001777,18001779,0.532689,4517480,3,288
374559,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001779,18001784,0.682935,4517481,6,288


In [135]:
enh_plasmid_max.iloc[-1,3]

'chr17_22020698_22020877_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_76697'

In [136]:
chr12_117175903 =enh_plasmid_overlap[enh_plasmid_overlap['name'] == 'chr12_117175903_117176026_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_47520'
].sort_values(by='start_plasmid')
chr12_117175903

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,length_enh
123366,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738098,116738099,2.69076,1765542,2,124
123367,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738099,116738100,2.37661,1765543,2,124
123368,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738100,116738101,7.97668,1765544,2,124
123369,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738101,116738102,8.63230,1765545,2,124
123370,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738102,116738104,8.57766,1765546,3,124
...,...,...,...,...,...,...,...,...,...,...,...
123462,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738215,116738216,14.62850,1765638,2,124
123463,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738216,116738217,11.41870,1765639,2,124
123464,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738217,116738218,8.63230,1765640,2,124
123465,chr12,116738098,116738221,chr12_117175903_117176026_MACS2STARRENH_indivu...,chr12,116738218,116738219,5.80495,1765641,2,124


In [137]:
chr12_117175903['length_plasmid'].sum()

224

In [138]:
enh_plasmid_min.hvplot.hist('log_coverage')

In [139]:
enh_plasmid_max.hvplot.hist('log_coverage')

In [151]:
converted_plasmid = converted_plasmid.sort_values(by='log_coverage')

In [152]:
converted_plasmid = converted_plasmid.rename(columns={3:'coverage', 5:'log_coverage'})

In [153]:
overlap_coverage = [ i +1 for i in enh_plasmid_overlap['coverage']]
log_coverage_overlap = list(np.log(overlap_coverage))
enh_plasmid_overlap['log_coverage'] = log_coverage_overlap

In [158]:
len(converted_plasmid)- len(enh_plasmid_overlap)

6709998

In [169]:
converted_plasmid_to_merge = converted_plasmid[[ 0, 1,2]]

In [170]:
converted_plasmid_to_merge = converted_plasmid_to_merge.sort_values(by=0)

In [171]:
converted_plasmid_to_merge

Unnamed: 0,0,1,2
0,chr1,10027,10036
103066,chr1,24536691,24536692
704586,chr1,246724379,246724380
710923,chr1,248922260,248922269
133626,chr1,28329050,28329055
...,...,...,...
7133975,chr9,93915721,93915730
7133979,chr9,93916039,93916048
7097477,chr9,77433833,77433835
7141830,chr9,96654452,96654453


In [172]:
converted_plasmid_to_merge.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/_merge_plasmid.txt', sep='\t', header=None, index=False)

In [173]:
converted_plasmid

Unnamed: 0,0,1,2,coverage,4,log_coverage
0,chr1,10027,10036,0.013659,0,0.013566
5018223,chr4,130714640,130714649,0.013659,5377381,0.013566
5018261,chr4,130756497,130756506,0.013659,5377419,0.013566
5018262,chr4,130756735,130756744,0.013659,5377420,0.013566
5018269,chr4,130769583,130769592,0.013659,5377427,0.013566
...,...,...,...,...,...,...
1084,chr1,634590,634591,405.704000,1084,6.008086
1082,chr1,634588,634589,417.451000,1082,6.036560
1085,chr1,634591,634592,421.835000,1085,6.046982
1083,chr1,634589,634590,434.019000,1083,6.075390


In [174]:
merged_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/bedtools/merged_plasmid.txt', sep='\t', header=None)

In [175]:
merged_plasmid

Unnamed: 0,0,1,2
0,chr1,10027,10036
1,chr1,10039,10048
2,chr1,10109,10118
3,chr1,10128,10137
4,chr1,10158,10174
...,...,...,...
4054424,chr9,138217605,138217614
4054425,chr9,138220317,138220326
4054426,chr9,138220555,138220564
4054427,chr9,138237176,138237185


In [176]:
enh_merged_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/bedtools/enh_overlap_merged_plasmid.txt', sep='\t', header=None)

In [177]:
enh_merged_plasmid

Unnamed: 0,0,1,2,3,4,5,6
0,chr1,100046979,100047187,chr1_100512535_100512743_MACS2STARRENH_indivua...,chr1,100046975,100046988
1,chr1,100046979,100047187,chr1_100512535_100512743_MACS2STARRENH_indivua...,chr1,100046990,100047005
2,chr1,100046979,100047187,chr1_100512535_100512743_MACS2STARRENH_indivua...,chr1,100047026,100047035
3,chr1,100046979,100047187,chr1_100512535_100512743_MACS2STARRENH_indivua...,chr1,100047048,100047057
4,chr1,100046979,100047187,chr1_100512535_100512743_MACS2STARRENH_indivua...,chr1,100047061,100047072
...,...,...,...,...,...,...,...
43062,chr9,99236033,99236236,chr9_101998315_101998518_MACS2STARRENH_indivua...,chr9,99236104,99236118
43063,chr9,99236033,99236236,chr9_101998315_101998518_MACS2STARRENH_indivua...,chr9,99236122,99236138
43064,chr9,99236033,99236236,chr9_101998315_101998518_MACS2STARRENH_indivua...,chr9,99236154,99236163
43065,chr9,99236033,99236236,chr9_101998315_101998518_MACS2STARRENH_indivua...,chr9,99236227,99236244


In [178]:
enh_merged_plasmid['lenght_plasmid'] = (enh_merged_plasmid[6] - enh_merged_plasmid[5]) +1

In [179]:
enh_merged_plasmid[3].value_counts()

chr12_498076_498895_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_38319          31
chr2_8684967_8685545_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_92919         23
chr3_52016812_52017286_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_124873      22
chr21_37661293_37661980_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_116982     21
chr7_3340402_3340826_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_175135        21
                                                                                            ..
chr6_33385855_33386164_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_164418       1
chr6_33359408_33359588_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_164416       1
chr3_49507608_49507716_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_124646       1
chr16_15188144_15188312_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_70178       1
chr9_102584008_102584114_MACS2STARRENH_indivuallyr

In [180]:
enh_merged_plasmid[enh_merged_plasmid[3] == 'chr2_8684967_8685545_MACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_92919']

Unnamed: 0,0,1,2,3,4,5,6,lenght_plasmid
24886,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544837,8544860,24
24887,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544900,8544909,10
24888,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544914,8544926,13
24889,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544951,8544960,10
24890,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544966,8544975,10
24891,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544978,8544989,12
24892,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8544990,8545012,23
24893,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8545014,8545023,10
24894,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8545029,8545038,10
24895,chr2,8544837,8545415,chr2_8684967_8685545_MACS2STARRENH_indivuallyr...,chr2,8545049,8545071,23


In [181]:
enh_merged_plasmid['lenght_plasmid'].value_counts()

10     12551
15      1795
12      1392
17      1214
18      1186
       ...  
620        1
457        1
377        1
511        1
495        1
Name: lenght_plasmid, Length: 496, dtype: int64

In [None]:
merged_enh_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/merged_plasmid_in_enh.txt', sep='\t', header=None)

In [None]:
merged_enh_plasmid[2] = [ i +10 for i in merged_enh_plasmid[2]]

In [None]:
merged_enh_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/10_merged_enh_plasmid.txt', header=None, index=False, sep= '\t')

In [182]:
enh_plasmid_overlap

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,length_enh,log_coverage
374565,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,288,0.013566
244694,chr17,47100104,47100375,chr17_45177470_45177741_MACS2STARRENH_indivual...,chr17,47100125,47100134,0.013659,2974649,10,272,0.013566
393462,chr3,132417265,132417548,chr3_132136109_132136392_MACS2STARRENH_indivua...,chr3,132417305,132417308,0.013659,4975191,4,284,0.013566
81072,chr10,7269350,7269676,chr10_7311312_7311638_MACS2STARRENH_indivually...,chr10,7269502,7269504,0.013659,737526,3,327,0.013566
393423,chr3,13110824,13111039,chr3_13152324_13152539_MACS2STARRENH_indivuall...,chr3,13110988,13110995,0.013659,4686674,8,216,0.013566
...,...,...,...,...,...,...,...,...,...,...,...,...
235449,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521422,22521423,63.772500,2872936,2,180,4.170881
235448,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521421,22521422,64.441700,2872935,2,180,4.181160
235447,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521420,22521421,66.654500,2872934,2,180,4.214414
235446,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521419,22521420,67.105200,2872933,2,180,4.221054


In [183]:
converted_plasmid[(converted_plasmid[0]== 'chr1') & (converted_plasmid[1] < 26321543)]

Unnamed: 0,0,1,2,coverage,4,log_coverage
0,chr1,10027,10036,0.013659,0,0.013566
68199,chr1,16012887,16012891,0.013659,68213,0.013566
54494,chr1,11730295,11730304,0.013659,54493,0.013566
18890,chr1,2546500,2546507,0.013659,18889,0.013566
18891,chr1,2546513,2546522,0.013659,18890,0.013566
...,...,...,...,...,...,...
1084,chr1,634590,634591,405.704000,1084,6.008086
1082,chr1,634588,634589,417.451000,1082,6.036560
1085,chr1,634591,634592,421.835000,1085,6.046982
1083,chr1,634589,634590,434.019000,1083,6.075390


In [184]:
converted_plasmid[converted_plasmid.index <= 116693]

Unnamed: 0,0,1,2,coverage,4,log_coverage
0,chr1,10027,10036,0.013659,0,0.013566
68199,chr1,16012887,16012891,0.013659,68213,0.013566
54494,chr1,11730295,11730304,0.013659,54493,0.013566
18890,chr1,2546500,2546507,0.013659,18889,0.013566
18891,chr1,2546513,2546522,0.013659,18890,0.013566
...,...,...,...,...,...,...
1084,chr1,634590,634591,405.704000,1084,6.008086
1082,chr1,634588,634589,417.451000,1082,6.036560
1085,chr1,634591,634592,421.835000,1085,6.046982
1083,chr1,634589,634590,434.019000,1083,6.075390


enhancer sequence grch37: chr1_26647987_26648255 <br>
- chr1	26321496	26321764	chr1_26647987_26648255_MACS2STARRENH_indivuall... |	chr1	26321543	26321544	1:26321543:C:T<br>
                                                                                    chr1	26648034	26648035

- grch38 enhancer = chr1	26321496	26321764

- cd4 qtl        =  chr1	26321543	26321544

- grch38 plasmid =  chr1	26321496	26321505
-                   chr1	26321577	26321578


In [185]:
plasmid[(plasmid.index <= 116708)]

Unnamed: 0,0,1,2,3,4
0,chr1,10027,10036,0.013659,0
1,chr1,10039,10048,0.040976,1
2,chr1,10109,10118,0.040976,2
3,chr1,10128,10137,0.013659,3
4,chr1,10158,10165,0.013659,4
...,...,...,...,...,...
116704,chr1,26647961,26647965,0.040976,116704
116705,chr1,26647965,26647974,0.259515,116705
116706,chr1,26647987,26647996,0.792205,116706
116707,chr1,26648068,26648069,0.081952,116707


In [186]:
converted_plasmid[converted_plasmid.index <= 116693]

Unnamed: 0,0,1,2,coverage,4,log_coverage
0,chr1,10027,10036,0.013659,0,0.013566
68199,chr1,16012887,16012891,0.013659,68213,0.013566
54494,chr1,11730295,11730304,0.013659,54493,0.013566
18890,chr1,2546500,2546507,0.013659,18889,0.013566
18891,chr1,2546513,2546522,0.013659,18890,0.013566
...,...,...,...,...,...,...
1084,chr1,634590,634591,405.704000,1084,6.008086
1082,chr1,634588,634589,417.451000,1082,6.036560
1085,chr1,634591,634592,421.835000,1085,6.046982
1083,chr1,634589,634590,434.019000,1083,6.075390


In [187]:
plasmid

Unnamed: 0,0,1,2,3,4
0,chr1,10027,10036,0.013659,0
1,chr1,10039,10048,0.040976,1
2,chr1,10109,10118,0.040976,2
3,chr1,10128,10137,0.013659,3
4,chr1,10158,10165,0.013659,4
...,...,...,...,...,...
7307251,chr9,141108055,141108064,0.314150,7307251
7307252,chr9,141110767,141110776,0.013659,7307252
7307253,chr9,141111005,141111014,0.013659,7307253
7307254,chr9,141127626,141127635,0.150246,7307254


In [188]:
converted_plasmid = converted_plasmid.rename(columns={0:'chr', 1:'start', 2:'end'})

In [189]:
converted_plasmid

Unnamed: 0,chr,start,end,coverage,4,log_coverage
0,chr1,10027,10036,0.013659,0,0.013566
5018223,chr4,130714640,130714649,0.013659,5377381,0.013566
5018261,chr4,130756497,130756506,0.013659,5377419,0.013566
5018262,chr4,130756735,130756744,0.013659,5377420,0.013566
5018269,chr4,130769583,130769592,0.013659,5377427,0.013566
...,...,...,...,...,...,...
1084,chr1,634590,634591,405.704000,1084,6.008086
1082,chr1,634588,634589,417.451000,1082,6.036560
1085,chr1,634591,634592,421.835000,1085,6.046982
1083,chr1,634589,634590,434.019000,1083,6.075390


In [190]:
plasmid = plasmid.rename(columns={0:'chr', 1:'start', 2:'end'})

In [None]:
plasmid.query("chr == 'chr1' and start >= 26647987 and end <= 26648255")

26321496	26321764

In [192]:
enh_in_plasmid_grch38 = converted_plasmid.query("chr == 'chr1' and start >= 26321496 and end <= 26321764")
enh_in_plasmid_grch38.head()

Unnamed: 0,chr,start,end,coverage,4,log_coverage
116698,chr1,26321619,26321620,0.054635,116712,0.053195
116693,chr1,26321577,26321578,0.081952,116707,0.078767
116708,chr1,26321641,26321642,0.095611,116722,0.091312
116738,chr1,26321698,26321699,0.095611,116752,0.091312
116694,chr1,26321578,26321583,0.232198,116708,0.2088


In [194]:
enh_in_plasmid_grch38.query("chr == 'chr1' and start >= 2632153 and end <= 2632153")

Unnamed: 0,chr,start,end,coverage,4,log_coverage


***

In [195]:
len(converted_plasmid)

7281968

In [198]:
len(enh_plasmid_overlap)

571970

In [199]:
# ratio 8130 enhancer regios / 571970 plasmide parts
len(enh_plasmid_overlap)/8130

70.3530135301353

In [201]:
non_sig_plasmid_enh = (len(converted_plasmid) - len(enh_plasmid_overlap))
non_sig_plasmid_enh

6709998

In [202]:

non_sig_enh = (non_sig_plasmid_enh / (len(enh_plasmid_overlap)/8130))
non_sig_enh

95376.12766403833

#### NREs

In [203]:
nre_plasmid_overlap = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/bedtools/active_sequences_nre_grch38.txt', sep='\t', header=None)

In [204]:
nre= pd.read_csv('C:/Users/annav/Documents/Stage/data/nre_cd4_coords.txt', sep='\t', header=None)
len(nre)

6024

In [205]:
len(nre_plasmid_overlap)

416076

In [206]:
non_sig_plamsid_nre = len(converted_plasmid) - len(nre_plasmid_overlap)
non_sig_plamsid_nre

6865892

In [207]:
# ratio: 6024 nre / 416076 plasmide parts
ratio_nre = len(nre_plasmid_overlap) / len(nre)
ratio_nre

69.06972111553785

In [208]:
non_sig_nre = non_sig_plamsid_nre / ratio_nre
non_sig_nre

99405.2370432325

- sig cd4 qtls: 199627
<br><br>
- sig cd4 qtls in enh: 230, 228 unique
- sig cd4 in non-sig plasmid: 199397 ( 199627 - 230 )
- 571970
- 6709998


***

In [210]:
plasmid_in_enh = enh_plasmid_overlap.iloc[:, 4:9]
plasmid_in_enh


Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
374565,chr22,18001799,18001803,0.013659,4517487
244694,chr17,47100125,47100134,0.013659,2974649
393462,chr3,132417305,132417308,0.013659,4975191
81072,chr10,7269502,7269504,0.013659,737526
393423,chr3,13110988,13110995,0.013659,4686674
...,...,...,...,...,...
235449,chr17,22521422,22521423,63.772500,2872936
235448,chr17,22521421,22521422,64.441700,2872935
235447,chr17,22521420,22521421,66.654500,2872934
235446,chr17,22521419,22521420,67.105200,2872933


In [211]:
plasmid_in_enh[plasmid_in_enh.chr_plasmid == 'chr4'].sort_values(by='start_plasmid')

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
433385,chr4,51758,51762,2.048800,5124625
433386,chr4,51762,51763,2.103440,5124626
433387,chr4,51763,51764,1.352210,5124627
433388,chr4,51764,51765,0.983426,5124628
433389,chr4,51765,51767,0.915133,5124629
...,...,...,...,...,...
428522,chr4,186191385,186191391,0.068293,5468181
428523,chr4,186191391,186191392,0.587324,5468182
428524,chr4,186191392,186191393,0.723911,5468183
428525,chr4,186191393,186191394,0.655618,5468184


In [212]:
sorted = plasmid_in_enh.sort_values(by=['chr_plasmid', 'start_plasmid'])


In [213]:
sorted[(sorted['id'] <= 473925) & (sorted['chr_plasmid']== 'chr1') & (sorted['id'] >= 473916)]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
16650,chr1,160796003,160796008,3.34638,473916
16651,chr1,160796008,160796009,2.96394,473917
16652,chr1,160796009,160796012,2.29466,473918
16653,chr1,160796012,160796013,0.081952,473919
16654,chr1,160796013,160796015,0.382444,473920
16655,chr1,160796015,160796017,0.42342,473921
16656,chr1,160796017,160796018,0.341467,473922
16657,chr1,160796018,160796020,1.43416,473923
16658,chr1,160796020,160796022,1.74831,473924
16659,chr1,160796022,160796024,2.32198,473925


In [214]:
sorted[(sorted['chr_plasmid'] == 'chr1') & (sorted['start_plasmid'] >= 26321496)]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
42608,chr1,26321496,26321505,0.792205,116706
42609,chr1,26321577,26321578,0.081952,116707
42610,chr1,26321578,26321583,0.232198,116708
42611,chr1,26321583,26321586,0.942450,116709
42612,chr1,26321586,26321587,0.860498,116710
...,...,...,...,...,...
41007,chr1,248906548,248906549,4.466390,713703
41008,chr1,248906549,248906553,3.810780,713704
41009,chr1,248906553,248906554,4.766890,713705
41010,chr1,248906554,248906555,4.097610,713706


Test range plasmid parts create files

In [215]:
sorted

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
58217,chr1,844682,844686,0.218539,1913
58218,chr1,844686,844691,0.587324,1914
58219,chr1,844691,844695,0.368785,1915
58220,chr1,844715,844724,0.218539,1916
58221,chr1,844736,844745,0.396102,1917
...,...,...,...,...,...
558388,chr9,137722877,137722884,0.013659,7306388
558389,chr9,137722884,137722886,0.068293,7306389
558390,chr9,137722886,137722888,0.833181,7306390
558391,chr9,137722888,137722893,0.874157,7306391


In [216]:
df_enh_plasmid = sorted.copy()

#### Increase range of the plasmid reads
Merging shows missing basepairs within determined significant enhancers and NREs <br>
eqtls counted in the whole region vs the reads differs, to account for those we decided to increase the reads range

In [217]:
def increase_range_plasmid(df, left_shift, right_shift, location):
    df['start_plasmid'] = [i - left_shift for i in df['start_plasmid']]
    df['end_plasmid'] = [i + right_shift for i in df['end_plasmid']]

    name = f'L{left_shift}R{right_shift}_enh_plasmid.txt'
    path = location + '/' + name
    df.to_csv(path, header=None, sep='\t', index=False)
    return df

In [218]:
df_enh_plasmid = increase_range_plasmid(df_enh_plasmid, 15, 15, 'C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer')

In [219]:
all_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/grch38_plasmid.bed', sep='\t',header=None, names=['chr_plasmid', 'start_plasmid', 'end_plasmid','coverage', 'id'] )

In [224]:
all_plasmid[all_plasmid['id'] == 4472654]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
3521854,chr21,36173889,36173898,0.355126,4472654


In [225]:
df_all_plasmid = increase_range_plasmid(all_plasmid, 15, 15, 'C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer')


In [226]:
df_all_plasmid

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
0,chr1,10012,10051,0.013659,0
1,chr1,10024,10063,0.040976,1
2,chr1,10094,10133,0.040976,2
3,chr1,10113,10152,0.013659,3
4,chr1,10143,10180,0.013659,4
...,...,...,...,...,...
7281963,chr9,138217590,138217629,0.314150,7307251
7281964,chr9,138220302,138220341,0.013659,7307252
7281965,chr9,138220540,138220579,0.013659,7307253
7281966,chr9,138237161,138237200,0.150246,7307254


In [261]:
df_all_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/L15R15_plasmid.txt', header=None, index=False, sep='\t')

In [268]:
df_enh_plasmid.sort_values(by='id')

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
58217,chr1,844667,844701,0.218539,1913
58218,chr1,844671,844706,0.587324,1914
58219,chr1,844676,844710,0.368785,1915
58220,chr1,844700,844739,0.218539,1916
58221,chr1,844721,844760,0.396102,1917
...,...,...,...,...,...
558388,chr9,137722862,137722899,0.013659,7306388
558389,chr9,137722869,137722901,0.068293,7306389
558390,chr9,137722871,137722903,0.833181,7306390
558391,chr9,137722873,137722908,0.874157,7306391


In [259]:
df_all_plasmid[~df_all_plasmid['id'].isin(df_enh_plasmid['id'])]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
0,chr1,10012,10051,0.013659,0
1,chr1,10024,10063,0.040976,1
2,chr1,10094,10133,0.040976,2
3,chr1,10113,10152,0.013659,3
4,chr1,10143,10180,0.013659,4
...,...,...,...,...,...
7281963,chr9,138217590,138217629,0.314150,7307251
7281964,chr9,138220302,138220341,0.013659,7307252
7281965,chr9,138220540,138220579,0.013659,7307253
7281966,chr9,138237161,138237200,0.150246,7307254


In [277]:
df_all_plasmid[(df_all_plasmid['chr_plasmid'] == 'chr1') & (df_all_plasmid['start_plasmid'] < 26321543)]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
0,chr1,10012,10051,0.013659,0
1,chr1,10024,10063,0.040976,1
2,chr1,10094,10133,0.040976,2
3,chr1,10113,10152,0.013659,3
4,chr1,10143,10180,0.013659,4
...,...,...,...,...,...
116688,chr1,26321446,26321480,0.450737,116702
116689,chr1,26321450,26321485,0.491713,116703
116690,chr1,26321455,26321489,0.040976,116704
116691,chr1,26321459,26321498,0.259515,116705


save sig enhancers

In [245]:
sig_enh_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/L15R15_plasmid_enh_overlap.txt', sep='\t', header=None)

In [246]:
sig_enh_plasmid = sig_enh_plasmid[[0 , 1, 2, 3, 4]]

In [271]:
sig_enh_plasmid =sig_enh_plasmid.sort_values(by=4)

In [273]:
sig_enh_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/sig_enh_plasmid.txt', sep='\t', header=None, index=False)

In [227]:
df_enh_plasmid

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
58217,chr1,844667,844701,0.218539,1913
58218,chr1,844671,844706,0.587324,1914
58219,chr1,844676,844710,0.368785,1915
58220,chr1,844700,844739,0.218539,1916
58221,chr1,844721,844760,0.396102,1917
...,...,...,...,...,...
558388,chr9,137722862,137722899,0.013659,7306388
558389,chr9,137722869,137722901,0.068293,7306389
558390,chr9,137722871,137722903,0.833181,7306390
558391,chr9,137722873,137722908,0.874157,7306391


Determine background

In [None]:
non_sig_enh_plasmid = df_all_plasmid[~df_all_plasmid['id'].isin(sig_enh_plasmid[4])]

In [296]:
non_sig_enh_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/non_sig_enh_plasmid.txt',
                           header=None, sep='\t', index=False)

In [6]:
nre_coords = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/nre_cd4_coords.txt', sep='\t', header=None)
nre_coords

Unnamed: 0,0,1,2,3
0,chr1,827261,827812,chr1_762641_763192_CALLNREreversecontrolTxMACS...
1,chr1,976113,976406,chr1_911493_911786_CALLNREreversecontrolTxMACS...
2,chr1,1013549,1013744,chr1_948929_949124_CALLNREreversecontrolTxMACS...
3,chr1,1053853,1054016,chr1_989233_989396_CALLNREreversecontrolTxMACS...
4,chr1,1217012,1217240,chr1_1152392_1152620_CALLNREreversecontrolTxMA...
...,...,...,...,...
6019,chr9,137086622,137087090,chr9_139981074_139981542_CALLNREreversecontrol...
6020,chr9,137611849,137612043,chr9_140506301_140506495_CALLNREreversecontrol...
6021,chr9,137618475,137618850,chr9_140512927_140513302_CALLNREreversecontrol...
6022,chr9,137682620,137682776,chr9_140577072_140577228_CALLNREreversecontrol...


In [257]:
sig_nre_plasmid = pd.read_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/nre/L15R15_plasmid_nre_overlap.txt', sep='\t', header=None)

In [258]:
sig_nre_plasmid[8].value_counts()

chr11_59383383_59383859_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_58302      371
chr4_103749019_103749398_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_258087    317
chr19_41769396_41770263_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_163318     285
chr12_50505583_50506018_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_75861      283
chr12_53773360_53773696_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_76436      264
                                                                                                                  ... 
chr4_148652465_148652824_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_263644      1
chr1_31867523_31867805_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_5114          1
chr1_167774490_167774633_CALLNREreversecontrolTx

In [243]:
non_sig_enh_plasmid[~non_sig_enh_plasmid['id'].isin(list(sig_nre_plasmid[4]))]

Unnamed: 0,chr_plasmid,start_plasmid,end_plasmid,coverage,id
0,chr1,10012,10051,0.013659,0
1,chr1,10024,10063,0.040976,1
2,chr1,10094,10133,0.040976,2
3,chr1,10113,10152,0.013659,3
4,chr1,10143,10180,0.013659,4
...,...,...,...,...,...
7281963,chr9,138217590,138217629,0.314150,7307251
7281964,chr9,138220302,138220341,0.013659,7307252
7281965,chr9,138220540,138220579,0.013659,7307253
7281966,chr9,138237161,138237200,0.150246,7307254


In [250]:
sig_enh_plasmid

Unnamed: 0,0,1,2,3,4
0,chr1,100046944,100046983,0.314150,327031
1,chr1,100046960,100046992,0.177563,327032
2,chr1,100046962,100046994,0.191222,327033
3,chr1,100046964,100046999,1.092700,327034
4,chr1,100046969,100047001,0.915133,327035
...,...,...,...,...,...
644306,chr9,99821821,99821853,0.901474,7179068
644307,chr9,99821823,99821855,0.915133,7179069
644308,chr9,99821825,99821859,0.846839,7179070
644309,chr9,99821829,99821860,0.368785,7179071


In [9]:
sig_nre_plasmid = sig_nre_plasmid[[0 , 1, 2, 3, 4]]

In [10]:
sig_nre_plasmid =sig_nre_plasmid.sort_values(by=4)

In [232]:
sig_nre_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/nre/sig_nre_plasmid.txt', sep='\t', header=None, index=False)

In [233]:
non_sig_nre_plasmid = df_all_plasmid[~df_all_plasmid['id'].isin(sig_nre_plasmid[4])]

In [238]:
non_sig_nre_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/nre/non_sig_nre_plasmid.txt',
                           header=None, sep='\t', index=False)

In [235]:
nre_plasmid_overlap

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,100351423,100351619,chr1_100816979_100817175_CALLNREreversecontrol...,chr1,100351423,100351425,2.513200,328215
1,chr1,100351423,100351619,chr1_100816979_100817175_CALLNREreversecontrol...,chr1,100351425,100351427,2.991260,328216
2,chr1,100351423,100351619,chr1_100816979_100817175_CALLNREreversecontrol...,chr1,100351427,100351428,1.707340,328217
3,chr1,100351423,100351619,chr1_100816979_100817175_CALLNREreversecontrol...,chr1,100351428,100351430,2.458570,328218
4,chr1,100351423,100351619,chr1_100816979_100817175_CALLNREreversecontrol...,chr1,100351430,100351432,2.663450,328219
...,...,...,...,...,...,...,...,...,...
416071,chr9,99433634,99433839,chr9_102195916_102196121_CALLNREreversecontrol...,chr9,99433785,99433792,0.027317,7177983
416072,chr9,99433634,99433839,chr9_102195916_102196121_CALLNREreversecontrol...,chr9,99433801,99433810,0.505372,7177984
416073,chr9,99433634,99433839,chr9_102195916_102196121_CALLNREreversecontrol...,chr9,99433827,99433830,0.054635,7177985
416074,chr9,99433634,99433839,chr9_102195916_102196121_CALLNREreversecontrol...,chr9,99433830,99433836,0.423420,7177986


In [None]:
non_sig_enh_plasmid.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/non_sig_enh_plasmid.txt',
                           header=None, sep='\t', index=False)

In [213]:

sorted.to_csv('C:/Users/annav/Documents/Stage/data/bed_files/plasmid/enhancer/plasmid_in_enh.txt',index=False, header=None, sep= '\t')

In [218]:
enh_plasmid_overlap[enh_plasmid_overlap.name.map(enh_plasmid_overlap.name.value_counts() >1)]

Unnamed: 0,chr_enh,start_enh,end_enh,name,chr_plasmid,start_plasmid,end_plasmid,coverage,id,length_plasmid,length_enh,log_coverage
374565,chr22,18001589,18001876,chr22_18484355_18484642_MACS2STARRENH_indivual...,chr22,18001799,18001803,0.013659,4517487,5,288,0.013566
244694,chr17,47100104,47100375,chr17_45177470_45177741_MACS2STARRENH_indivual...,chr17,47100125,47100134,0.013659,2974649,10,272,0.013566
393462,chr3,132417265,132417548,chr3_132136109_132136392_MACS2STARRENH_indivua...,chr3,132417305,132417308,0.013659,4975191,4,284,0.013566
81072,chr10,7269350,7269676,chr10_7311312_7311638_MACS2STARRENH_indivually...,chr10,7269502,7269504,0.013659,737526,3,327,0.013566
393423,chr3,13110824,13111039,chr3_13152324_13152539_MACS2STARRENH_indivuall...,chr3,13110988,13110995,0.013659,4686674,8,216,0.013566
...,...,...,...,...,...,...,...,...,...,...,...,...
235449,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521422,22521423,63.772500,2872936,2,180,4.170881
235448,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521421,22521422,64.441700,2872935,2,180,4.181160
235447,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521420,22521421,66.654500,2872934,2,180,4.214414
235446,chr17,22521372,22521551,chr17_22020698_22020877_MACS2STARRENH_indivual...,chr17,22521419,22521420,67.105200,2872933,2,180,4.221054
