# Data inspection of plasmid reads of CD4+ T-cell Lenti-STARR MPRA of Kurtis & Barski (Cis-regulatory atlas of primary human CD4+ T cells)
Paper: https://pmc.ncbi.nlm.nih.gov/articles/PMC10173520/#_ad93_<br>
Data available in GEO: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE217535

In [None]:
import pandas as pd
import hvplot.pandas

In [None]:
import numpy as np
from position_in_region import snp_position_in_mpra

In [None]:
# Column labels applied to the headerless plasmid bedGraph file (four fields per row).
# NOTE(review): the first two labels use an `snp_` prefix although the rows are read
# intervals — presumably to match the column names snp_position_in_mpra expects; confirm.
colnames = [
    'snp_chromosome',
    'snp_position',
    'end_coord',
    'coverage',
]

In [None]:
# Load every plasmid read that was tested with Lenti-STARR (significant and non-significant).
# The file is tab-separated and has no header row, so column labels come from `colnames`.
plasmid = pd.read_csv(
    'GSE217535_Plasmid_R1.fq.bedGraph',
    sep='\t',
    header=None,
    names=colnames,
)

In [None]:
# Preview the loaded plasmid coverage table (pandas truncates the display to head and tail rows).
plasmid

Unnamed: 0,snp_chromosome,snp_position,end_coord,coverage
0,chr1,10027,10036,0.013659
1,chr1,10039,10048,0.040976
2,chr1,10109,10118,0.040976
3,chr1,10128,10137,0.013659
4,chr1,10158,10165,0.013659
...,...,...,...,...
7499408,chrY,59029251,59029260,0.245857
7499409,chrY,59029514,59029523,0.409761
7499410,chrY,59029667,59029676,0.409761
7499411,chrY,59031814,59031823,0.218539


Determine whether there is a coverage threshold below which plasmid reads could possibly be discarded.<br>
Such a threshold would show up in the histogram as a single column followed by a normal distribution.

In [None]:
# Histogram of the coverage column over all plasmid rows, to look for a natural cut-off.
hist_plasmid = plasmid.hvplot.hist(y='coverage')
hist_plasmid



In [None]:
# Kernel density estimate of the same coverage column.
# NOTE(review): `densitiy` is a misspelling of `density`; the name is kept unchanged here
# because other cells may reference it.
densitiy = plasmid.hvplot.kde(y='coverage')
densitiy



In [None]:
# Keep only rows whose coverage is strictly greater than 30.
# NOTE(review): the notebook gives no justification for the value 30 — treat it as exploratory.
plasmid_cov = plasmid.loc[plasmid['coverage'].gt(30)]

In [None]:
# Show the high-coverage rows ordered from lowest to highest coverage.
plasmid_cov.sort_values('coverage', ascending=True)

Unnamed: 0,snp_chromosome,snp_position,end_coord,coverage
5734815,chr5,134260292,134260293,30.1448
5734814,chr5,134260291,134260292,30.2130
4492483,chr21,44394395,44394396,30.2404
6891777,chr8,100508141,100508142,30.2813
2872921,chr17,22020733,22020734,30.3906
...,...,...,...,...
1084,chr1,569970,569971,405.7040
1082,chr1,569968,569969,417.4510
1085,chr1,569971,569972,421.8350
1083,chr1,569969,569970,434.0190


In [None]:
# Histogram of coverage restricted to the rows kept in `plasmid_cov`.
histogram_plasmid = plasmid_cov.hvplot.hist(y='coverage')
histogram_plasmid



In [None]:
# Kernel density estimate of coverage for the rows kept in `plasmid_cov`.
plasmid_cov.hvplot.kde(y='coverage')



In [None]:
# Second, lower cut-off: keep rows whose coverage is strictly greater than 1.
# NOTE(review): `test` is a non-descriptive name; kept because the following cells use it.
test = plasmid.loc[plasmid['coverage'].gt(1)]

In [None]:
# Preview the rows with coverage above 1.
test

Unnamed: 0,snp_chromosome,snp_position,end_coord,coverage
301,chr1,565767,565771,1.05172
991,chr1,569876,569877,1.10635
992,chr1,569877,569878,5.05372
993,chr1,569878,569879,12.10160
994,chr1,569879,569880,13.60410
...,...,...,...,...
7499250,chrY,58993295,58993304,2.02149
7499256,chrY,58993363,58993368,1.76197
7499257,chrY,58993368,58993370,2.07612
7499258,chrY,58993370,58993372,1.80295


In [None]:
# Histogram of coverage for the rows with coverage above 1.
test.hvplot.hist(y='coverage')



In [None]:
# Kernel density estimate of coverage for the rows with coverage above 1.
test.hvplot.kde(y='coverage')



***
Compare the plasmid coverage to the sequences of the enhancer and NRE regions determined by Kurtis and Barski.

In [None]:
# Column labels for the headerless region/plasmid overlap tables read below:
# three region coordinates, the region name, then the four plasmid bedGraph fields.
# NOTE(review): the `enh_` labels are also applied to the NRE table.
names = [
    'enh_chr',
    'start_enh',
    'end_enh',
    'name',
    'plasmid_chr',
    'start_plasmid',
    'end_plasmid',
    'coverage',
]

In [None]:
# Load the enhancer-region table; tab-separated, no header row, labelled with `names`.
# NOTE(review): the path suggests this is bedtools output joining enhancer regions with
# plasmid intervals — confirm how the file was generated.
active_seq_enh = pd.read_csv(
    'data/bed_files/bedtools/active_sequences_cd4.txt',
    sep='\t',
    header=None,
    names=names,
)

In [None]:
# Preview the enhancer/plasmid table.
active_seq_enh

Unnamed: 0,enh_chr,start_enh,end_enh,name,plasmid_chr,start_plasmid,end_plasmid,coverage
0,chr1,100351635,100351837,chr1_100817191_100817393_MACS2STARRENH_indivua...,chr1,100351696,100351705,0.150246
1,chr1,100452713,100453106,chr1_100918269_100918662_MACS2STARRENH_indivua...,chr1,100452706,100452715,0.177563
2,chr1,100452713,100453106,chr1_100918269_100918662_MACS2STARRENH_indivua...,chr1,100452811,100452820,0.177563
3,chr1,100896711,100896885,chr1_101362267_101362441_MACS2STARRENH_indivua...,chr1,100896881,100896890,0.327809
4,chr1,100927391,100927733,chr1_101392947_101393289_MACS2STARRENH_indivua...,chr1,100927457,100927466,0.191222
...,...,...,...,...,...,...,...,...
31055,chr9,97633564,97633797,chr9_100395846_100396079_MACS2STARRENH_indivua...,chr9,97633723,97633726,0.628300
31056,chr9,97633564,97633797,chr9_100395846_100396079_MACS2STARRENH_indivua...,chr9,97633726,97633732,0.464396
31057,chr9,97633564,97633797,chr9_100395846_100396079_MACS2STARRENH_indivua...,chr9,97633792,97633800,0.341467
31058,chr9,97938321,97938520,chr9_100700603_100700802_MACS2STARRENH_indivua...,chr9,97938415,97938424,0.163904


In [None]:
# Enhancer/plasmid rows ordered from lowest to highest plasmid coverage.
active_seq_enh.sort_values('coverage', ascending=True)

Unnamed: 0,enh_chr,start_enh,end_enh,name,plasmid_chr,start_plasmid,end_plasmid,coverage
8016,chr19,1568410,1568573,chr19_1568409_1568572_MACS2STARRENH_indivually...,chr19,1568425,1568430,0.013659
12094,chr19,57947479,57947873,chr19_58458847_58459241_MACS2STARRENH_indivual...,chr19,57947834,57947843,0.013659
12092,chr19,57947479,57947873,chr19_58458847_58459241_MACS2STARRENH_indivual...,chr19,57947719,57947728,0.013659
2913,chr13,102596254,102596354,chr13_103248604_103248704_MACS2STARRENH_indivu...,chr13,102596248,102596257,0.013659
16850,chr5,10441755,10442119,chr5_10441867_10442231_MACS2STARRENH_indivuall...,chr5,10442035,10442044,0.013659
...,...,...,...,...,...,...,...,...
12469,chr19,5904351,5904739,chr19_5904362_5904750_MACS2STARRENH_indivually...,chr19,5904682,5904683,37.397500
12470,chr19,5904351,5904739,chr19_5904362_5904750_MACS2STARRENH_indivually...,chr19,5904683,5904684,37.561400
12468,chr19,5904351,5904739,chr19_5904362_5904750_MACS2STARRENH_indivually...,chr19,5904681,5904682,37.698000
12467,chr19,5904351,5904739,chr19_5904362_5904750_MACS2STARRENH_indivually...,chr19,5904680,5904681,39.118500


In [None]:
# Load the NRE-region table; same layout as the enhancer table (tab-separated, no header).
active_seq_nre = pd.read_csv(
    'data/bed_files/bedtools/active_sequences_nre.txt',
    sep='\t',
    header=None,
    names=names,
)

In [None]:
# Number of table rows per NRE region name, most frequent first.
# Bracket access is used for the column because `name` is easy to confuse with an attribute.
active_seq_nre['name'].value_counts()

chr9_3526053_3527029_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_356356        342
chr19_5720138_5720732_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_158249       336
chr19_506838_507472_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_157199         305
chr5_43007580_43008894_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_274012      299
chr6_2989439_2990457_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_292548        298
                                                                                                                  ... 
chr19_35557954_35558240_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_162345       1
chr13_79768737_79769050_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_95841        1
chr13_100086729_100086966_CALLNREreversecontrolT

In [None]:
# Load the NRE region coordinates (tab-separated, no header row): chromosome, start, end, name.
nre = pd.read_csv(
    'data/nre_cd4_coords.txt',
    sep='\t',
    header=None,
    names=['snp_chromosome', 'start_coord', 'end_coord', 'name'],
)

In [None]:
# Count occurrences of each region name in the NRE coordinate table.
nre['name'].value_counts()

chr1_762641_763192_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_27              1
chr3_15468656_15469332_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_220723      1
chr3_18720812_18721017_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_221241      1
chr3_17816092_17816362_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_221089      1
chr3_17734938_17735118_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_221080      1
                                                                                                                  ..
chr15_74988152_74988640_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_119717     1
chr15_74913501_74913662_CALLNREreversecontrolTxMACS2STARRENH_indivuallyrmDupat75thpercentile021623_peak_119701     1
chr15_74890756_74891023_CALLNREreversecontrolTxMACS2STARRENH_ind

In [None]:
# NRE/plasmid rows ordered from lowest to highest plasmid coverage.
active_seq_nre.sort_values('coverage', ascending=True)

Unnamed: 0,enh_chr,start_enh,end_enh,name,plasmid_chr,start_plasmid,end_plasmid,coverage
29846,chr9,98255465,98256327,chr9_101017747_101018609_CALLNREreversecontrol...,chr9,98255642,98255651,0.013659
12647,chr19,5293492,5293739,chr19_5293503_5293750_CALLNREreversecontrolTxM...,chr19,5293529,5293538,0.013659
12575,chr19,510718,511504,chr19_510718_511504_CALLNREreversecontrolTxMAC...,chr19,511353,511354,0.013659
12371,chr19,510718,511504,chr19_510718_511504_CALLNREreversecontrolTxMAC...,chr19,510766,510775,0.013659
12239,chr19,506838,507472,chr19_506838_507472_CALLNREreversecontrolTxMAC...,chr19,507238,507243,0.013659
...,...,...,...,...,...,...,...,...
28458,chr9,37120168,37120624,chr9_37120165_37120621_CALLNREreversecontrolTx...,chr9,37120484,37120486,19.900700
26094,chr9,34048910,34049535,chr9_34048908_34049533_CALLNREreversecontrolTx...,chr9,34049217,34049218,20.324100
9701,chr19,2427604,2427916,chr19_2427602_2427914_CALLNREreversecontrolTxM...,chr19,2427910,2427914,20.474400
13259,chr19,5720127,5720721,chr19_5720138_5720732_CALLNREreversecontrolTxM...,chr19,5720620,5720621,20.856800


In [None]:
# Histogram of plasmid coverage for rows of the enhancer table.
active_seq_enh.hvplot.hist(y='coverage')



In [None]:
# Histogram of plasmid coverage for rows of the NRE table.
active_seq_nre.hvplot.hist(y='coverage')



In [None]:
 plasmid = plasmid.loc[
     # Drop every row located on a sex chromosome (chrX or chrY); all other rows are kept.
     ~plasmid['snp_chromosome'].isin(['chrX', 'chrY'])
 ].copy()  # explicit copy: later column assignments must not target a slice of the original frame

In [None]:
# Store each row's index label in an `id` column.
# `assign` returns a new frame, so this does not write into a filtered slice and
# does not raise SettingWithCopyWarning.
plasmid = plasmid.assign(id=plasmid.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  plasmid['id'] = list(plasmid.index)


In [None]:
# Convert the `id` column to strings; all other columns keep their dtypes.
plasmid = plasmid.astype(dtype={'id': 'str'})

In [None]:
# Run the project helper snp_position_in_mpra on the NRE regions and the plasmid intervals.
# NOTE(review): the helper's implementation is not visible here; presumably it reports which
# plasmid rows (identified by 'id') fall inside which NRE region (identified by 'name') —
# confirm against position_in_region.
# The recorded run of this cell printed "3538.296875 seconds" (~59 minutes), so consider
# caching the result instead of recomputing it on every run.
nre_plasmid = snp_position_in_mpra(nre, plasmid, 'name', 'id')

3538.296875 seconds


In [None]:
# Size of the helper's result (the recorded run shows 2018).
len(nre_plasmid)

2018