## Currents features (CNN wavenet algorithm) aggregated results filtering on well known SNPs and m5C positions

In [1]:
# import basic modules
import pandas as pd
from tqdm import tqdm
import pysam

In [2]:
# define input paths (results with pvalues for apobec1 motif computed in notebook "Consensus_analysis_wt_ko.currents_features")
ref_filepath = "/lustre/bio_running/C_to_U_editing/refs/GRCm39.genome.fa"
ko_bam_filepath = "/lustre/bio_running/C_to_U_editing_minimap2_spliced/ko.bam"
wt = pd.read_csv("/lustre/bio_running/C_to_U_editing_minimap2_spliced/src_jupyter_notebooks_multi_thr//WT.df_CT_predicted_aggregated_CNN_apobec1_pvalues.tsv", index_col=0)
ko = pd.read_csv("/lustre/bio_running/C_to_U_editing_minimap2_spliced/src_jupyter_notebooks_multi_thr//KO.df_CT_predicted_aggregated_CNN_apobec1_pvalues.tsv", index_col=0)

In [3]:
# show wt dataframe of aggregated results
wt

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
0,chr1,4846611,-,1,0,100,0.010000,0.000000,GTCTG,0,1.0
1,chr1,4846619,-,1,0,101,0.009901,0.000000,AGCTT,0,1.0
2,chr1,4846635,-,4,3,77,0.051948,0.038961,TTCTT,0,1.0
3,chr1,4846643,-,3,0,103,0.029126,0.000000,CACAT,0,1.0
4,chr1,4846645,-,3,0,103,0.029126,0.000000,TGCAC,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
337784,chrY,90804626,+,2,0,66,0.030303,0.000000,AGCGG,0,1.0
337785,chrY,90804632,+,1,0,67,0.014925,0.000000,CGCCG,0,1.0
337786,chrY,90804649,+,1,0,58,0.017241,0.000000,CTCTG,0,1.0
337787,chrY,90804680,+,2,0,60,0.033333,0.000000,CTCCA,0,1.0


In [4]:
# show ko dataframe of aggregated results
ko

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
0,chr1,4846619,-,2,0,121,0.016529,0.000000,AGCTT,0,1.0
1,chr1,4846635,-,7,3,98,0.071429,0.030612,TTCTT,0,1.0
2,chr1,4846643,-,1,0,125,0.008000,0.000000,CACAT,0,1.0
3,chr1,4846645,-,4,0,125,0.032000,0.000000,TGCAC,0,1.0
4,chr1,4846648,-,2,0,126,0.015873,0.000000,GGCTG,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
465902,chrY,90833612,+,3,0,54,0.055556,0.000000,CGCCA,0,1.0
465903,chrY,90833626,+,3,0,56,0.053571,0.000000,GACTG,0,1.0
465904,chrY,90833629,+,5,1,55,0.090909,0.018182,TGCGT,1,1.0
465905,chrY,90833633,+,2,0,56,0.035714,0.000000,TACAG,0,1.0


In [5]:
# focus on sites with called edited sites
wt.query("y_hat == 1")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
11,chr1,4846693,-,3,1,99,0.030303,0.010101,AGCTC,1,1.000000
14,chr1,4846710,-,8,3,83,0.096386,0.036145,ATCTT,1,0.761332
24,chr1,4846812,-,4,2,102,0.039216,0.019608,TGCTC,1,1.000000
25,chr1,4846819,-,2,1,94,0.021277,0.010638,TTCAG,1,1.000000
28,chr1,4846855,-,4,1,98,0.040816,0.010204,GGCTA,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
337734,chrY,90797058,+,1,1,61,0.016393,0.016393,CCCGC,1,1.000000
337758,chrY,90797198,+,1,1,65,0.015385,0.015385,GCCAC,1,1.000000
337760,chrY,90797201,+,2,1,65,0.030769,0.015385,ACCGA,1,1.000000
337781,chrY,90804604,+,2,2,66,0.030303,0.030303,CCCAC,1,1.000000


In [6]:
# focus on sites with called edited sites (CNN model training dataset) and filtered by APOBEC1
wt.query("y_hat == 1").query("p_value < 0.01")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
115,chr1,4915457,+,4,2,51,0.078431,0.039216,TACTA,1,0.000047
132,chr1,4915694,+,7,2,51,0.137255,0.039216,TACTA,1,0.000521
185,chr1,4916328,+,2,2,60,0.033333,0.033333,ATCTT,1,0.000028
187,chr1,4916336,+,20,18,67,0.298507,0.268657,GCCGA,1,0.000316
197,chr1,4916406,+,3,3,67,0.044776,0.044776,GACTT,1,0.000099
...,...,...,...,...,...,...,...,...,...,...,...
336612,chrX,141022514,+,4,1,54,0.074074,0.018519,AACAG,1,0.000286
336630,chrX,141022655,+,3,1,52,0.057692,0.019231,AACTT,1,0.000013
336865,chrX,153996147,-,8,1,62,0.129032,0.016129,TACTC,1,0.000004
337139,chrX,158165691,+,6,4,125,0.048000,0.032000,ATCAA,1,0.000734


In [7]:
ko.query("y_hat == 1")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
12,chr1,4846693,-,8,3,123,0.065041,0.024390,AGCTC,1,1.000000
25,chr1,4846807,-,15,4,126,0.119048,0.031746,AGCGC,1,1.000000
34,chr1,4846855,-,5,2,123,0.040650,0.016260,GGCTA,1,1.000000
99,chr1,4854322,-,6,3,131,0.045802,0.022901,GACGT,1,0.503792
118,chr1,4855906,-,57,14,70,0.814286,0.200000,CGCGG,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
465889,chrY,90827645,+,32,4,59,0.542373,0.067797,GACGG,1,1.000000
465893,chrY,90827692,+,5,2,59,0.084746,0.033898,TGCGT,1,1.000000
465898,chrY,90833568,+,1,1,53,0.018868,0.018868,GCCGA,1,1.000000
465899,chrY,90833582,+,32,3,56,0.571429,0.053571,GACGG,1,1.000000


In [8]:
ko.query("y_hat == 1").query("p_value < 0.01")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
279,chr1,4916252,+,3,1,91,0.032967,0.010989,AACAC,1,0.000367
289,chr1,4916336,+,24,20,104,0.230769,0.192308,GCCGA,1,0.000316
574,chr1,4967787,+,11,5,133,0.082707,0.037594,AACAT,1,0.004150
743,chr1,5206037,+,1,1,83,0.012048,0.012048,CTCAT,1,0.000248
807,chr1,5232494,+,24,3,68,0.352941,0.044118,TTCTG,1,0.000028
...,...,...,...,...,...,...,...,...,...,...,...
464338,chrX,139376011,+,1,1,60,0.016667,0.016667,AACAT,1,0.001633
464514,chrX,141022343,+,8,1,54,0.148148,0.018519,CTCAT,1,0.002212
465283,chrX,161555354,+,7,7,175,0.040000,0.040000,AACTT,1,0.005799
465350,chrX,161560774,+,8,2,176,0.045455,0.011364,TTCAT,1,0.008214


In [9]:
# print intersection sites among WT and KO without correction
pd.merge(wt, ko, how="inner", on=["region", "position", "strand"]).shape[0]

260991

In [10]:
# print intersection sites among WT and KO with correction CNN
pd.merge(wt.query("y_hat == 1"), ko.query("y_hat == 1"), how="inner", on=["region", "position", "strand"]).shape[0]

8294

In [11]:
# print intersection sites among WT and KO with correction
pd.merge(wt.query("y_hat == 1").query("p_value < 0.01"), 
         ko.query("y_hat == 1").query("p_value < 0.01"), 
         how="inner", 
         on=["region", "position", "strand"]).shape[0]

387

## SNPs filtering
Variations were downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/snp142.txt.gz and lifted-over to mm39 by CrossMap.py script (used by ensembl webapp).

In [12]:
vcf_filepath = "/lustre/bio_running/C_to_U_editing/refs/mm10/snp142.mm39.bed"
vcf = pd.read_table(vcf_filepath, error_bad_lines=False, warn_bad_lines=False, header=None)
vcf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,chr1,3070116,3070117,rs580370473,0,+,G,G,G/T,genomic
1,chr1,3070175,3070176,rs585444580,0,+,G,G,G/T,genomic
2,chr1,3070224,3070225,rs579469519,0,+,G,G,A/G,genomic
3,chr1,3070248,3070249,rs582985490,0,+,G,G,G/T,genomic
4,chr1,3070249,3070250,rs586234354,0,+,T,T,G/T,genomic
...,...,...,...,...,...,...,...,...,...,...
82219295,chrY_JH584303v1_random,114969,114970,rs108306402,0,+,A,A,A/C,genomic
82219296,chrY_JH584303v1_random,115198,115199,rs108041287,0,+,C,C,A/C,genomic
82219297,chrY_JH584303v1_random,123248,123249,rs108213532,0,-,T,T,A/G,genomic
82219298,chrY_JH584303v1_random,125055,125056,rs107955593,0,-,T,T,A/T,genomic


In [13]:
vcf_ed = vcf.iloc[:,[0,2,3,9]]

In [14]:
vcf_ed.shape

(82219300, 4)

In [15]:
vcf_ed

Unnamed: 0,0,2,3,9
0,chr1,3070117,rs580370473,genomic
1,chr1,3070176,rs585444580,genomic
2,chr1,3070225,rs579469519,genomic
3,chr1,3070249,rs582985490,genomic
4,chr1,3070250,rs586234354,genomic
...,...,...,...,...
82219295,chrY_JH584303v1_random,114970,rs108306402,genomic
82219296,chrY_JH584303v1_random,115199,rs108041287,genomic
82219297,chrY_JH584303v1_random,123249,rs108213532,genomic
82219298,chrY_JH584303v1_random,125056,rs107955593,genomic


In [16]:
vcf_ed.columns = ["region", "position", "name", "molType"]

In [17]:
vcf_ed.dtypes

region      object
position     int64
name        object
molType     object
dtype: object

In [18]:
vcf_ed

Unnamed: 0,region,position,name,molType
0,chr1,3070117,rs580370473,genomic
1,chr1,3070176,rs585444580,genomic
2,chr1,3070225,rs579469519,genomic
3,chr1,3070249,rs582985490,genomic
4,chr1,3070250,rs586234354,genomic
...,...,...,...,...
82219295,chrY_JH584303v1_random,114970,rs108306402,genomic
82219296,chrY_JH584303v1_random,115199,rs108041287,genomic
82219297,chrY_JH584303v1_random,123249,rs108213532,genomic
82219298,chrY_JH584303v1_random,125056,rs107955593,genomic


In [19]:
vcf_ed.molType.value_counts()

genomic    82210892
cDNA           8395
mito             13
Name: molType, dtype: int64

In [20]:
# selecting only genomic SNPs
vcf_ed = vcf_ed.query("molType == 'genomic'")
vcf_ed

Unnamed: 0,region,position,name,molType
0,chr1,3070117,rs580370473,genomic
1,chr1,3070176,rs585444580,genomic
2,chr1,3070225,rs579469519,genomic
3,chr1,3070249,rs582985490,genomic
4,chr1,3070250,rs586234354,genomic
...,...,...,...,...
82219295,chrY_JH584303v1_random,114970,rs108306402,genomic
82219296,chrY_JH584303v1_random,115199,rs108041287,genomic
82219297,chrY_JH584303v1_random,123249,rs108213532,genomic
82219298,chrY_JH584303v1_random,125056,rs107955593,genomic


In [21]:
# produce a set with vcf_ed positions as region:position format
with tqdm(total=vcf_ed.shape[0]) as pbar:
    vcf_ed_pos = set()
    for var in vcf_ed.itertuples():
        vcf_ed_pos.add(f"{var[1]}:{var[2]}")
        pbar.update(1)
vcf_ed_pos

100%|██████████| 82210892/82210892 [02:09<00:00, 633645.75it/s]


{'chr3:4297596',
 'chr5:81227342',
 'chr6:129525989',
 'chr12:12924833',
 'chr11:32737894',
 'chr3:37809385',
 'chr8:6798784',
 'chr7:119450931',
 'chr13:23003645',
 'chr10:123160896',
 'chr5:41992931',
 'chr13:52410091',
 'chr2:15623536',
 'chr5:8050868',
 'chr16:41541451',
 'chr3:115740959',
 'chr2:56182609',
 'chr5:18309517',
 'chr18:75149194',
 'chr19:37639679',
 'chr8:13405246',
 'chr18:68331478',
 'chr1:158574339',
 'chr18:34696828',
 'chr12:73881971',
 'chr10:111202375',
 'chr18:88176957',
 'chr2:114897814',
 'chr17:28216453',
 'chr13:90308668',
 'chr1:158026540',
 'chr3:24041510',
 'chr8:16203749',
 'chr1:25887537',
 'chr14:105496862',
 'chr18:46234720',
 'chr1:29277350',
 'chr1:102103376',
 'chr7:143397646',
 'chr1:13613812',
 'chr5:82514110',
 'chr18:46014985',
 'chr14:101412878',
 'chr16:93613773',
 'chr15:6161735',
 'chr17:22570562',
 'chr1:72369616',
 'chr5:32361601',
 'chr9:26295642',
 'chr7:109071432',
 'chr2:111394186',
 'chr13:78889290',
 'chr5:151102072',
 'chr8:53988

In [22]:
len(vcf_ed_pos)

80657750

In [23]:
# produce sets for wt and ko that after correction are called as edited (above 99th percentile CC data) and with pvalue of 
# abobec1 model lower than significance level 0.01
wt_ed_pos = set()
for var in wt.query("y_hat == 1").query("p_value < 0.01").itertuples():
    wt_ed_pos.add(f"{var[1]}:{var[2]}")

ko_ed_pos = set()
for var in ko.query("y_hat == 1").query("p_value < 0.01").itertuples():
    ko_ed_pos.add(f"{var[1]}:{var[2]}")

In [24]:
len(wt_ed_pos)

1636

In [25]:
len(ko_ed_pos)

1230

In [26]:
# intersection between ko and wt sites after SNPs filtering and APOBEC1 model selection
len(wt_ed_pos.intersection(ko_ed_pos))

387

In [27]:
len(wt_ed_pos.difference(vcf_ed_pos)) # number of edited sites for WT after SNPs filtering and APOBEC1 model selection

1591

In [28]:
len(ko_ed_pos.difference(vcf_ed_pos)) # number of edited sites for KO after SNPs filtering and APOBEC1 model selection

1189

In [29]:
# filter out position involved into genomic variation and load into pandas dataframes
wt_ed_pos = wt_ed_pos.difference(vcf_ed_pos)
ko_ed_pos = ko_ed_pos.difference(vcf_ed_pos)

## Let's filter out m5C known positions
The m5C known positions for mm10 Mus musculus assembly were downloaded from m5C-Atlas (url: https://www.xjtlu.edu.cn/biologicalsciences/m5c-atlas). These positions were thus lifted-over from mm10 to mm39 genome space coordinates.

In [30]:
# load m5C_mm39.bed file
m5C_mm39_bed = pd.read_table("/lustre/bio_running/C_to_U_editing/refs/mm10/m5C_mm39.bed", header=None)
m5C_mm39_bed

Unnamed: 0,0,1,2,3,4,5
0,chr1,13636583,13636584,.,0,-
1,chr1,13646667,13646668,.,0,-
2,chr1,20678627,20678628,.,0,-
3,chr1,20873518,20873519,.,0,-
4,chr1,20884976,20884977,.,0,-
...,...,...,...,...,...,...
16274,chr9,108378490,108378491,.,0,+
16275,chr14,19802519,19802520,.,0,+
16276,chr18,34901490,34901491,.,0,+
16277,chr2,25392997,25392998,.,0,+


In [31]:
# produce a set with vcf_ed positions as region:position format
with tqdm(total=m5C_mm39_bed.shape[0]) as pbar:
    m5C_pos = set()
    for var in m5C_mm39_bed.itertuples():
        m5C_pos.add(f"{var[1]}:{var[2]}")
        pbar.update(1)
m5C_pos

100%|██████████| 16279/16279 [00:00<00:00, 440777.47it/s]


{'chr1:180690068',
 'chr5:144192055',
 'chr13:67854574',
 'chrX:8021456',
 'chr15:85249546',
 'chr15:36771335',
 'chr8:87611571',
 'chr16:20520384',
 'chr15:79672940',
 'chr9:108172840',
 'chr17:24631446',
 'chr10:7069695',
 'chr2:34716200',
 'chr6:48001546',
 'chr7:59634791',
 'chr8:124130253',
 'chr5:129937483',
 'chr12:100173753',
 'chr15:98029835',
 'chr5:34154340',
 'chr13:32986609',
 'chr2:25254635',
 'chr19:7404870',
 'chr3:96080881',
 'chr8:71151835',
 'chr12:100173705',
 'chr3:132545189',
 'chr2:32679263',
 'chr5:93189243',
 'chr11:21507456',
 'chrX:135737559',
 'chr8:106927193',
 'chr10:80269222',
 'chr15:74620622',
 'chr4:122752089',
 'chr4:140793897',
 'chr16:10105998',
 'chr5:36555248',
 'chr16:16843951',
 'chr6:42338258',
 'chr10:95011529',
 'chr2:163860378',
 'chr17:24292229',
 'chr16:20520576',
 'chr1:171068649',
 'chr15:36771405',
 'chr10:62105177',
 'chr12:112566572',
 'chr11:116250122',
 'chr4:138184747',
 'chr11:100953481',
 'chr11:7120611',
 'chr18:37973708',
 'chr

In [32]:
len(m5C_pos)

16279

In [33]:
# filter out position involved into m5C known variations from dataframes
wt_ed_pos = wt_ed_pos.difference(m5C_pos)
ko_ed_pos = ko_ed_pos.difference(m5C_pos)

In [34]:
len(wt_ed_pos)

1591

In [35]:
len(ko_ed_pos)

1189

In [36]:
# produce pandas dataframe
wt_filtered = pd.merge(wt, pd.DataFrame([[i.split(":")[0],int(i.split(":")[1]) ] for i in list(wt_ed_pos)], columns=["region", "position"]), how="inner")
wt_filtered

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
0,chr1,4915457,+,4,2,51,0.078431,0.039216,TACTA,1,0.000047
1,chr1,4915694,+,7,2,51,0.137255,0.039216,TACTA,1,0.000521
2,chr1,4916328,+,2,2,60,0.033333,0.033333,ATCTT,1,0.000028
3,chr1,4916336,+,20,18,67,0.298507,0.268657,GCCGA,1,0.000316
4,chr1,4916406,+,3,3,67,0.044776,0.044776,GACTT,1,0.000099
...,...,...,...,...,...,...,...,...,...,...,...
1586,chrX,141022514,+,4,1,54,0.074074,0.018519,AACAG,1,0.000286
1587,chrX,141022655,+,3,1,52,0.057692,0.019231,AACTT,1,0.000013
1588,chrX,153996147,-,8,1,62,0.129032,0.016129,TACTC,1,0.000004
1589,chrX,158165691,+,6,4,125,0.048000,0.032000,ATCAA,1,0.000734


In [37]:
wt_filtered.describe()

Unnamed: 0,position,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,y_hat,p_value
count,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0,1591.0
mean,74270400.0,15.117536,5.456945,168.375236,0.082982,0.031778,1.0,0.002328918
std,44249860.0,68.775613,28.367343,704.419366,0.073526,0.031466,0.0,0.002680755
min,1648.0,1.0,1.0,51.0,0.010101,0.010101,1.0,7.332901e-08
25%,37509300.0,3.0,1.0,63.0,0.034153,0.014706,1.0,0.0002135126
50%,71262240.0,5.0,2.0,79.0,0.060976,0.019802,1.0,0.00113006
75%,108204400.0,11.0,4.0,111.0,0.108792,0.036854,1.0,0.003642209
max,194793400.0,1340.0,691.0,11837.0,0.771084,0.590361,1.0,0.009992778


In [38]:
# produce pandas dataframe
ko_filtered = pd.merge(ko, pd.DataFrame([[i.split(":")[0],int(i.split(":")[1]) ] for i in list(ko_ed_pos)], columns=["region", "position"]), how="inner")
ko_filtered

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
0,chr1,4916252,+,3,1,91,0.032967,0.010989,AACAC,1,0.000367
1,chr1,4916336,+,24,20,104,0.230769,0.192308,GCCGA,1,0.000316
2,chr1,4967787,+,11,5,133,0.082707,0.037594,AACAT,1,0.004150
3,chr1,5206037,+,1,1,83,0.012048,0.012048,CTCAT,1,0.000248
4,chr1,5232494,+,24,3,68,0.352941,0.044118,TTCTG,1,0.000028
...,...,...,...,...,...,...,...,...,...,...,...
1184,chrX,135633974,-,8,3,184,0.043478,0.016304,TGCAT,1,0.001077
1185,chrX,141022343,+,8,1,54,0.148148,0.018519,CTCAT,1,0.002212
1186,chrX,161555354,+,7,7,175,0.040000,0.040000,AACTT,1,0.005799
1187,chrX,161560774,+,8,2,176,0.045455,0.011364,TTCAT,1,0.008214


In [39]:
ko_filtered.describe()

Unnamed: 0,position,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,y_hat,p_value
count,1189.0,1189.0,1189.0,1189.0,1189.0,1189.0,1189.0,1189.0
mean,74683300.0,16.702271,5.305299,202.449958,0.087206,0.030084,1.0,0.002634941
std,45670920.0,64.275636,20.822243,938.08719,0.079584,0.032349,0.0,0.002863074
min,1648.0,1.0,1.0,51.0,0.010101,0.010101,1.0,2.566249e-07
25%,37199170.0,3.0,1.0,61.0,0.034483,0.014706,1.0,0.0003552543
50%,71385780.0,5.0,2.0,77.0,0.0625,0.018868,1.0,0.001326198
75%,108175400.0,12.0,4.0,116.0,0.115385,0.035714,1.0,0.004264825
max,194793400.0,973.0,428.0,13092.0,0.772152,0.588608,1.0,0.009987688


In [40]:
# save to disk
wt_filtered.to_csv("/lustre/bio_running/C_to_U_editing_minimap2_spliced/src_jupyter_notebooks_multi_thr/wt_ko/wt_ko.CNN_wavenet_08/wt_CNNwavenet_and_SNP_m5C_filtered_snp142.apobec1.tsv", sep="\t")
ko_filtered.to_csv("/lustre/bio_running/C_to_U_editing_minimap2_spliced/src_jupyter_notebooks_multi_thr/wt_ko/wt_ko.CNN_wavenet_08/ko_CNNwavenet_and_SNP_m5C_filtered_snp142.apobec1.tsv", sep="\t")

In [41]:
intersection = pd.merge(wt_filtered, ko_filtered, how="inner", on=["region", "position", "strand"])[["region", "position", "strand"]]
intersection

Unnamed: 0,region,position,strand
0,chr1,4916336,+
1,chr1,4967787,+
2,chr1,13635485,-
3,chr1,39590495,-
4,chr1,39590524,-
...,...,...,...
367,chrX,100491909,+
368,chrX,105065473,+
369,chrX,135633808,-
370,chrX,135633812,-


In [42]:
# lest's evaluate the presence of well known chr2:121983221/3 sites in wt and ko
wt_filtered.query("region == 'chr2'").query("position == 121983221")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
830,chr2,121983221,+,254,9,851,0.298472,0.010576,TACAC,1,0.004419


In [43]:
wt_filtered.query("region == 'chr2'").query("position == 121983223")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value


In [44]:
ko_filtered.query("region == 'chr2'").query("position == 121983221")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value


In [45]:
ko_filtered.query("region == 'chr2'").query("position == 121983223")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value


## Evaluate reliable CtoU editing sites
To eliminate RNA-DNA problems we focused out attention on sites that are modified in WT but that are not in KO sample

In [46]:
# we are interested in sites that on KO are not edited after correction
ko.query("Tfreq_corrected <= 0.01").describe()

Unnamed: 0,position,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,y_hat,p_value
count,408551.0,408551.0,408551.0,408551.0,408551.0,408551.0,408551.0,408551.0
mean,79540730.0,6.188809,0.219796,315.565521,0.029866,0.000525,0.0,0.8898624
std,45718700.0,25.499401,1.496227,839.428933,0.038349,0.001781,0.0,0.2980813
min,87.0,1.0,0.0,51.0,6.6e-05,0.0,0.0,9.667981e-10
25%,38466270.0,1.0,0.0,73.0,0.009804,0.0,0.0,1.0
50%,79597430.0,2.0,0.0,116.0,0.017544,0.0,0.0,1.0
75%,117252800.0,5.0,0.0,230.0,0.035714,0.0,0.0,1.0
max,194813600.0,3296.0,175.0,19536.0,0.954082,0.01,0.0,1.0


In [47]:
# produce a set KO sites that are below 0.01 threshold after correction and are not SNP and/or known m5C sites
ko_NoEd_sites = set()
for site in ko.query("Tfreq_corrected <= 0.01").itertuples():
    ko_NoEd_sites.add(f"{site[1]}:{site[2]}")
# filters SNPs 
ko_NoEd_sites = ko_NoEd_sites.difference(vcf_ed_pos)
# filters out m5C sites
ko_NoEd_sites = ko_NoEd_sites.difference(m5C_pos)
len(ko_NoEd_sites)

400335

In [48]:
# produce a set of not filtered ko CT sites
ko_no_filters = set()
for site in ko.itertuples():
    ko_no_filters.add(f"{site[1]}:{site[2]}")
# filters SNPs 
ko_no_filters = ko_no_filters.difference(vcf_ed_pos)
# filters out m5C sites
ko_no_filters = ko_no_filters.difference(m5C_pos)
len(ko_no_filters)

448159

In [49]:
# sites that results edited on WT after algorithm correction and SNPs-m5C filtering and that are not edited on KO 
# (if covered on both with a depth higher than 50).
reliable_ed_sites = set()

counter = 0
bam_file = pysam.AlignmentFile(ko_bam_filepath)
ref = pysam.FastaFile(ref_filepath)
with tqdm(total=len(wt_ed_pos)) as pbar:
    for site in wt_ed_pos:
        if site in ko_NoEd_sites:
            reliable_ed_sites.add(site)
            counter += 1
        else:
            #assess if it is not present at all among CT sites retrieved for KO
            if not site in ko_no_filters:
                # asses if position is covered in ko
                for pileupcolumn in bam_file.pileup(site.split(":")[0], 
                                                    int(site.split(":")[1])-1,
                                                    int(site.split(":")[1]), 
                                                    truncate=True, 
                                                    max_depth=1000000, 
                                                    min_base_quality=0):

                    column = pileupcolumn.get_query_sequences(mark_matches=True, add_indels=True)
                    depth_ = 0
                    for i in [i[0].upper() for i in column]:
                        if i.isupper():
                            depth_ += 1              
                    if depth_ > 50:
                        # asses if there is no editing on this position of KO sample
                        ref_base = ref.fetch(site.split(":")[0], int(site.split(":")[1])-1, int(site.split(":")[1]))
                        if ref_base == "C":
                            if column.count("T") == 0:
                                reliable_ed_sites.add(site)
                                counter += 1
                        elif ref_base == "G":
                            if column.count("a") == 0:
                                reliable_ed_sites.add(site)
                                counter += 1
        pbar.update(1)
# close input files and print total counter of reliable sites retrieved
bam_file.close()         
ref.close()
counter

100%|██████████| 1591/1591 [00:07<00:00, 206.70it/s]


946

In [50]:
len(reliable_ed_sites)

946

In [51]:
# produce pandas dataframe for reliable sites
reliable_ed_sites = pd.merge(wt, pd.DataFrame([[i.split(":")[0],int(i.split(":")[1]) ] for i in list(reliable_ed_sites)], columns=["region", "position"]), how="inner")
reliable_ed_sites

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
0,chr1,4915457,+,4,2,51,0.078431,0.039216,TACTA,1,4.720720e-05
1,chr1,4915694,+,7,2,51,0.137255,0.039216,TACTA,1,5.212852e-04
2,chr1,4916406,+,3,3,67,0.044776,0.044776,GACTT,1,9.861242e-05
3,chr1,4916432,+,3,1,69,0.043478,0.014493,AACAT,1,4.565154e-07
4,chr1,4916454,+,6,3,60,0.100000,0.050000,ATCTT,1,8.767610e-05
...,...,...,...,...,...,...,...,...,...,...,...
941,chrX,135147776,+,15,9,56,0.267857,0.160714,AACTT,1,2.345928e-05
942,chrX,135633733,-,12,3,122,0.098361,0.024590,ATCTT,1,9.604205e-06
943,chrX,135633941,-,16,2,156,0.102564,0.012821,GACGT,1,2.943394e-03
944,chrX,141022655,+,3,1,52,0.057692,0.019231,AACTT,1,1.256081e-05


In [52]:
reliable_ed_sites.describe()

Unnamed: 0,position,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,y_hat,p_value
count,946.0,946.0,946.0,946.0,946.0,946.0,946.0,946.0
mean,76026280.0,10.737844,3.792812,139.448203,0.064109,0.023839,1.0,0.002235417
std,44829570.0,61.898492,25.311047,548.767159,0.058456,0.017547,0.0,0.002633749
min,1692.0,1.0,1.0,51.0,0.010101,0.010101,1.0,7.332901e-08
25%,37510250.0,2.0,1.0,65.0,0.028986,0.013514,1.0,0.0001696791
50%,72932540.0,4.0,2.0,81.0,0.04902,0.017534,1.0,0.001031615
75%,109338300.0,8.0,3.0,107.0,0.079444,0.028777,1.0,0.003415117
max,194793400.0,1340.0,691.0,10281.0,0.637405,0.16092,1.0,0.009992778


In [53]:
# save to disk reliable sites
reliable_ed_sites.to_csv("/lustre/bio_running/C_to_U_editing_minimap2_spliced/src_jupyter_notebooks_multi_thr/wt_ko/wt_ko.CNN_wavenet_08/reliable_ed_sites_CNNwavenet_SNP_m5C_WT_noKO.apobec1.tsv", sep="\t")

In [54]:
# evaluate if among these sites there is the chr2:121983221 
reliable_ed_sites.query("region == 'chr2'").query("position == 121983221")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
485,chr2,121983221,+,254,9,851,0.298472,0.010576,TACAC,1,0.004419


In [55]:
# evaluate if among these sites there is the chr2:121983223
reliable_ed_sites.query("region == 'chr2'").query("position == 121983223")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value


In [56]:
# print overall statistics
print("WT sample - results of CtoU editing events called:")
print(f'\t- ONT native: {wt.shape[0]}')
print(f'\t- ONT CNN corrected: {wt.query("y_hat == 1").shape[0]}')
print(f'\t- ONT CNN corrected and SNP-m5C filtered and APOBEC1 Model selected: {wt_filtered.shape[0]}')
print()
print("KO sample - results of CtoU editing events called:")
print(f'\t- ONT native: {ko.shape[0]}')
print(f'\t- ONT CNN corrected: {ko.query("y_hat == 1").shape[0]}')
print(f'\t- ONT CNN corrected and SNP-m5C filtered and APOBEC1 Model selected: {ko_filtered.shape[0]}')
print()
print(f"Common called editing positions among WT and KO after CNN and SNPs-m5C filtering: {intersection.shape[0]}")
print()
print(f"WT edited sites (algorithm+SNPs+m5C filtered) that are not still edited in KO (on well covered positions): {reliable_ed_sites.shape[0]}")

WT sample - results of CtoU editing events called:
	- ONT native: 331762
	- ONT CNN corrected: 26536
	- ONT CNN corrected and SNP-m5C filtered and APOBEC1 Model selected: 1591

KO sample - results of CtoU editing events called:
	- ONT native: 458321
	- ONT CNN corrected: 28196
	- ONT CNN corrected and SNP-m5C filtered and APOBEC1 Model selected: 1189

Common called editing positions among WT and KO after CNN and SNPs-m5C filtering: 372

WT edited sites (algorithm+SNPs+m5C filtered) that are not still edited in KO (on well covered positions): 946
