## Analysis of merged Editing Sites candidates (after correction) for WT and OXP

In [13]:
# import basic modules
import numpy as np
import pandas as pd
import pysam
from tqdm import tqdm

### define input files and filepaths
# load dbSNP tabix file (hg38)
dbsnp_filepath = "/lustre/bio_running/refs/snp151.hg38.sorted.gtf.gz"
dbsnp = pysam.TabixFile(dbsnp_filepath)

In [3]:
# load merged wt (wt1+wt4 outer join) sites with evidence of editing after correction (cov > 50 and Tfreq_corrected > 0.01)
dfCT_wt_aggregated_candidates_merged = pd.read_table("/lustre/bio_running/CtoUclassifier_new_model_training23052022/src_jupyter_notebooks/iForest_cc1_wt_ko_no_indels_mismatches___NEW_TRAINING/dfCT_wt_aggregated_candidates_merged_human.tsv", index_col=0)
dfCT_wt_aggregated_candidates_merged

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,depth_stranded_wt4,Tfreq_native_wt4,Tfreq_corrected_wt4
0,chr1,186428,-,8.0,3.0,51.0,0.156863,0.058824,,,,,
1,chr1,630832,+,163.0,76.0,225.0,0.724444,0.337778,114.0,49.0,166.0,0.686747,0.295181
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,126.0,0.150794,0.015873
3,chr1,946247,-,57.0,11.0,213.0,0.267606,0.051643,,,,,
4,chr1,952544,-,9.0,2.0,159.0,0.056604,0.012579,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6878,chrX,154436104,+,,,,,,11.0,2.0,177.0,0.062147,0.011299
6879,chrX,154436416,+,,,,,,14.0,2.0,194.0,0.072165,0.010309
6880,chrX,155054642,+,,,,,,14.0,2.0,122.0,0.114754,0.016393
6881,chrX,155054918,+,,,,,,6.0,2.0,160.0,0.037500,0.012500


In [10]:
# now let's try to assess if a random wt site is a SNP or not using the indexed dbSNP file
pd.DataFrame([i.split("\t") for i in dbsnp.fetch("chr1", 186428-1, 186428)])

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,ucsc_snp153_hg38,snp,186428,186428,.,+,.,"gene_id ""rs1428030847""; transcript_id ""rs14280..."


In [41]:
# open all 3 WT illumina reditools indexed tables wih pysam as tabix files
wt1 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/wt1/DnaRna_505821894/outTable_505821894.gz")
wt2 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/wt2/DnaRna_83292749/outTable_83292749.gz")
wt3 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/wt3/DnaRna_296402424/outTable_296402424.gz")
wts = [wt1, wt2, wt3]

In [83]:
covs = []
vars_ = []
issnp = []

# assess each site
with tqdm(total=dfCT_wt_aggregated_candidates_merged.shape[0]) as pbar:
    for wt_site in dfCT_wt_aggregated_candidates_merged.itertuples():
        region = wt_site.region
        pos1based = wt_site.position
        pos0based = pos1based-1
        #print(region, pos1based)
        # assess if the site is covered at least in one wt reditools sample. 
        # if covered verify 
        redicovs = []
        redivars = []
        for rediout in wts:
            try:
                rediquery = ([i.split("\t") for i in rediout.fetch(region, pos0based, pos0based+1)])[0]
                redicovs.append(int(rediquery[4]))
                # it is covered thus assess if it is a variant or not
                if "CT" in rediquery[7]:
                    redivars.append(1)
                else:
                    redivars.append(0)
            except IndexError as e: # not covered
                redicovs.append(0)
                redivars.append(0)
        covs.append(redicovs)
        vars_.append(redivars)
        # assess if known snp or not
        query = pd.DataFrame([i.split("\t") for i in dbsnp.fetch(region, pos0based, pos0based+1)]) # pysam works in a python manner with 0indexing and right opned intervals
        if query.empty:
            issnp.append(0)
        else:
            issnp.append(1)
        pbar.update(1)

dfCT_wt_aggregated_candidates_merged_anno = dfCT_wt_aggregated_candidates_merged.copy()
dfCT_wt_aggregated_candidates_merged_anno[["cov_wt1", "cov_wt2", "cov_wt3"]] = covs
dfCT_wt_aggregated_candidates_merged_anno[["isvar_wt1", "isvar_wt2", "isvar_wt3"]] = vars_
dfCT_wt_aggregated_candidates_merged_anno["issnp"] = issnp
dfCT_wt_aggregated_candidates_merged_anno

100%|██████████| 6883/6883 [00:47<00:00, 145.19it/s]


Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,depth_stranded_wt4,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp
0,chr1,186428,-,8.0,3.0,51.0,0.156863,0.058824,,,,,,49,65,42,1,1,1,1
1,chr1,630832,+,163.0,76.0,225.0,0.724444,0.337778,114.0,49.0,166.0,0.686747,0.295181,4,8,2,1,1,1,1
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,126.0,0.150794,0.015873,558,807,477,0,0,0,0
3,chr1,946247,-,57.0,11.0,213.0,0.267606,0.051643,,,,,,1043,1720,836,1,1,1,1
4,chr1,952544,-,9.0,2.0,159.0,0.056604,0.012579,,,,,,687,1189,654,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6878,chrX,154436104,+,,,,,,11.0,2.0,177.0,0.062147,0.011299,473,809,439,0,1,0,0
6879,chrX,154436416,+,,,,,,14.0,2.0,194.0,0.072165,0.010309,400,746,364,0,0,0,1
6880,chrX,155054642,+,,,,,,14.0,2.0,122.0,0.114754,0.016393,474,724,450,0,0,0,1
6881,chrX,155054918,+,,,,,,6.0,2.0,160.0,0.037500,0.012500,737,1170,645,0,0,0,1


In [85]:
# selecting sites not falling on knonw snp positions
wt_nosnp = dfCT_wt_aggregated_candidates_merged_anno.query("issnp != 1")
wt_nosnp

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,depth_stranded_wt4,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,126.0,0.150794,0.015873,558,807,477,0,0,0,0
5,chr1,999124,-,139.0,13.0,478.0,0.290795,0.027197,52.0,5.0,212.0,0.245283,0.023585,7,5,0,0,0,0,0
10,chr1,1233600,+,47.0,7.0,109.0,0.431193,0.064220,37.0,6.0,70.0,0.528571,0.085714,99,191,86,0,0,1,0
12,chr1,1255428,-,11.0,2.0,94.0,0.117021,0.021277,,,,,,257,352,190,0,0,0,0
13,chr1,1373819,-,139.0,20.0,590.0,0.235593,0.033898,57.0,8.0,248.0,0.229839,0.032258,756,1334,686,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6874,chrX,154428837,+,,,,,,4.0,2.0,82.0,0.048780,0.024390,67,88,37,0,0,0,0
6875,chrX,154432414,+,,,,,,5.0,3.0,109.0,0.045872,0.027523,435,706,425,0,1,0,0
6877,chrX,154435306,+,,,,,,22.0,2.0,136.0,0.161765,0.014706,494,892,471,0,0,0,0
6878,chrX,154436104,+,,,,,,11.0,2.0,177.0,0.062147,0.011299,473,809,439,0,1,0,0


In [98]:
wt_nosnp_noCT = wt_nosnp[(wt_nosnp["isvar_wt1"] == 0)&(wt_nosnp["isvar_wt2"] == 0)&(wt_nosnp["isvar_wt3"] == 0)].copy()
wt_nosnp_noCT

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,depth_stranded_wt4,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,126.0,0.150794,0.015873,558,807,477,0,0,0,0
5,chr1,999124,-,139.0,13.0,478.0,0.290795,0.027197,52.0,5.0,212.0,0.245283,0.023585,7,5,0,0,0,0,0
12,chr1,1255428,-,11.0,2.0,94.0,0.117021,0.021277,,,,,,257,352,190,0,0,0,0
13,chr1,1373819,-,139.0,20.0,590.0,0.235593,0.033898,57.0,8.0,248.0,0.229839,0.032258,756,1334,686,0,0,0,0
19,chr1,1598750,-,6.0,2.0,57.0,0.105263,0.035088,,,,,,53,100,44,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6871,chrX,136869239,-,,,,,,10.0,2.0,97.0,0.103093,0.020619,688,963,824,0,0,0,0
6872,chrX,152827591,-,,,,,,17.0,2.0,72.0,0.236111,0.027778,395,595,429,0,0,0,0
6874,chrX,154428837,+,,,,,,4.0,2.0,82.0,0.048780,0.024390,67,88,37,0,0,0,0
6877,chrX,154435306,+,,,,,,22.0,2.0,136.0,0.161765,0.014706,494,892,471,0,0,0,0


In [107]:
wt_nosnp_noCT.describe()[["Tfreq_corrected_wt1", "Tfreq_corrected_wt4"]]

Unnamed: 0,Tfreq_corrected_wt1,Tfreq_corrected_wt4
count,2586.0,1325.0
mean,0.029763,0.028443
std,0.023471,0.02071
min,0.010004,0.01005
25%,0.015038,0.015385
50%,0.022989,0.022989
75%,0.035294,0.034483
max,0.22093,0.196078


In [112]:
wt_nosnp_noCT["Tfreq_mean"] = wt_nosnp_noCT[["Tfreq_corrected_wt1", "Tfreq_corrected_wt4"]].mean(axis=1)
wt_nosnp_noCT

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp,Tfreq_mean
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,...,0.150794,0.015873,558,807,477,0,0,0,0,0.014246
5,chr1,999124,-,139.0,13.0,478.0,0.290795,0.027197,52.0,5.0,...,0.245283,0.023585,7,5,0,0,0,0,0,0.025391
12,chr1,1255428,-,11.0,2.0,94.0,0.117021,0.021277,,,...,,,257,352,190,0,0,0,0,0.021277
13,chr1,1373819,-,139.0,20.0,590.0,0.235593,0.033898,57.0,8.0,...,0.229839,0.032258,756,1334,686,0,0,0,0,0.033078
19,chr1,1598750,-,6.0,2.0,57.0,0.105263,0.035088,,,...,,,53,100,44,0,0,0,0,0.035088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6871,chrX,136869239,-,,,,,,10.0,2.0,...,0.103093,0.020619,688,963,824,0,0,0,0,0.020619
6872,chrX,152827591,-,,,,,,17.0,2.0,...,0.236111,0.027778,395,595,429,0,0,0,0,0.027778
6874,chrX,154428837,+,,,,,,4.0,2.0,...,0.048780,0.024390,67,88,37,0,0,0,0,0.024390
6877,chrX,154435306,+,,,,,,22.0,2.0,...,0.161765,0.014706,494,892,471,0,0,0,0,0.014706


In [118]:
wt_nosnp_noCT.sort_values(by="Tfreq_mean", ascending=False).iloc[:10,:]

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp,Tfreq_mean
5478,chrX,154508491,-,41.0,22.0,106.0,0.386792,0.207547,,,...,,,126,220,107,0,0,0,0,0.207547
4327,chr5,163442389,+,90.0,38.0,172.0,0.523256,0.22093,57.0,24.0,...,0.445312,0.1875,1053,1352,1210,0,0,0,0,0.204215
1169,chr11,74247102,+,24.0,14.0,69.0,0.347826,0.202899,,,...,,,129,219,149,0,0,0,0,0.202899
6396,chr21,8435823,+,,,,,,81.0,20.0,...,0.794118,0.196078,0,0,0,0,0,0,0,0.196078
2767,chr19,6440891,-,24.0,11.0,58.0,0.413793,0.189655,,,...,,,49,116,61,0,0,0,0,0.189655
5495,chrY,305038,-,32.0,10.0,56.0,0.571429,0.178571,,,...,,,0,0,0,0,0,0,0,0.178571
3545,chr21,6452184,-,29.0,17.0,97.0,0.298969,0.175258,,,...,,,2,8,1,0,0,0,0,0.175258
6385,chr21,8211805,+,,,,,,42.0,18.0,...,0.396226,0.169811,0,0,0,0,0,0,0,0.169811
1397,chr12,75506820,-,42.0,11.0,65.0,0.646154,0.169231,,,...,,,382,492,383,0,0,0,0,0.169231
2858,chr19,16646930,-,45.0,13.0,78.0,0.576923,0.166667,,,...,,,170,314,175,0,0,0,0,0.166667


In [117]:
wt_nosnp_noCT.sort_values(by="Tfreq_mean", ascending=False).iloc[:10,:].to_csv("wt_merged_first_site_for_ernesto.tsv", sep="\t")

### Richiesta Ernesto: adding OXP runs related info

In [120]:
dfCT_wt_aggregated_candidates_merged_anno

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,depth_stranded_wt4,Tfreq_native_wt4,Tfreq_corrected_wt4,cov_wt1,cov_wt2,cov_wt3,isvar_wt1,isvar_wt2,isvar_wt3,issnp
0,chr1,186428,-,8.0,3.0,51.0,0.156863,0.058824,,,,,,49,65,42,1,1,1,1
1,chr1,630832,+,163.0,76.0,225.0,0.724444,0.337778,114.0,49.0,166.0,0.686747,0.295181,4,8,2,1,1,1,1
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,126.0,0.150794,0.015873,558,807,477,0,0,0,0
3,chr1,946247,-,57.0,11.0,213.0,0.267606,0.051643,,,,,,1043,1720,836,1,1,1,1
4,chr1,952544,-,9.0,2.0,159.0,0.056604,0.012579,,,,,,687,1189,654,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6878,chrX,154436104,+,,,,,,11.0,2.0,177.0,0.062147,0.011299,473,809,439,0,1,0,0
6879,chrX,154436416,+,,,,,,14.0,2.0,194.0,0.072165,0.010309,400,746,364,0,0,0,1
6880,chrX,155054642,+,,,,,,14.0,2.0,122.0,0.114754,0.016393,474,724,450,0,0,0,1
6881,chrX,155054918,+,,,,,,6.0,2.0,160.0,0.037500,0.012500,737,1170,645,0,0,0,1


In [121]:
# open all 3 OXP illumina reditools indexed tables wih pysam as tabix files
oxp1 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/oxp1/DnaRna_470872555/outTable_470872555.gz")
oxp2 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/oxp2/DnaRna_73346045/outTable_73346045.gz")
oxp3 = pysam.TabixFile("/lustre/bio_running/conticello/illumina/oxp3/DnaRna_808842865/outTable_808842865.gz")
oxps = [oxp1, oxp2, oxp3]

In [129]:
covs = []
vars_ = []
isvar = []
issnp = []

# assess each site
with tqdm(total=dfCT_wt_aggregated_candidates_merged.shape[0]) as pbar:
    for wt_site in dfCT_wt_aggregated_candidates_merged.itertuples():
        region = wt_site.region
        pos1based = wt_site.position
        pos0based = pos1based-1
        #print(region, pos1based)
        # assess if the site is covered at least in one wt reditools sample. 
        # if covered verify 
        redicovs = []
        redivars = []
        for run_name,rediout in zip(["oxp1", "oxp2", "oxp3"], oxps):
            try:
                rediquery = ([i.split("\t") for i in rediout.fetch(region, pos0based, pos0based+1)])[0]
                #print(run_name, rediquery)
                redicovs.append(int(rediquery[4]))
                # it is covered thus assess if it is a variant or not
                if "CT" in rediquery[7]:
                    redivars.append(1)
                else:
                    redivars.append(0)
            except IndexError as e: # not covered
                redicovs.append(0)
                redivars.append(0)
        covs.append(redicovs)
        vars_.append(redivars)
        pbar.update(1)

dfCT_wt_aggregated_candidates_merged_anno_with_oxps = dfCT_wt_aggregated_candidates_merged_anno.copy()
dfCT_wt_aggregated_candidates_merged_anno_with_oxps[["cov_oxp1", "cov_oxp2", "cov_oxp3"]] = covs
dfCT_wt_aggregated_candidates_merged_anno_with_oxps[["isvar_oxp1", "isvar_oxp2", "isvar_oxp3"]] = vars_
dfCT_wt_aggregated_candidates_merged_anno_with_oxps

100%|██████████| 6883/6883 [00:40<00:00, 171.41it/s]


Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,isvar_wt1,isvar_wt2,isvar_wt3,issnp,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3
0,chr1,186428,-,8.0,3.0,51.0,0.156863,0.058824,,,...,1,1,1,1,60,114,52,1,1,1
1,chr1,630832,+,163.0,76.0,225.0,0.724444,0.337778,114.0,49.0,...,1,1,1,1,0,1,2,0,0,1
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,...,0,0,0,0,529,515,381,0,0,0
3,chr1,946247,-,57.0,11.0,213.0,0.267606,0.051643,,,...,1,1,1,1,963,634,751,1,1,1
4,chr1,952544,-,9.0,2.0,159.0,0.056604,0.012579,,,...,0,0,0,1,693,451,541,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6878,chrX,154436104,+,,,,,,11.0,2.0,...,0,1,0,0,476,408,404,0,1,1
6879,chrX,154436416,+,,,,,,14.0,2.0,...,0,0,0,1,385,385,386,0,0,0
6880,chrX,155054642,+,,,,,,14.0,2.0,...,0,0,0,1,544,400,345,0,0,0
6881,chrX,155054918,+,,,,,,6.0,2.0,...,0,0,0,1,791,540,545,0,0,0


In [131]:
# save table to tsv
dfCT_wt_aggregated_candidates_merged_anno_with_oxps.to_csv("/lustre/bio_running/CtoUclassifier_new_model_training23052022/src_jupyter_notebooks/iForest_cc1_wt_ko_no_indels_mismatches___NEW_TRAINING/dfCT_wt_aggregated_candidates_merged_anno_with_oxps.tsv", sep="\t")

In [134]:
# filter out known snps
WtOxp_nosnp =  dfCT_wt_aggregated_candidates_merged_anno_with_oxps.query("issnp == 0")
WtOxp_nosnp

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,isvar_wt1,isvar_wt2,isvar_wt3,issnp,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,...,0,0,0,0,529,515,381,0,0,0
5,chr1,999124,-,139.0,13.0,478.0,0.290795,0.027197,52.0,5.0,...,0,0,0,0,12,17,8,0,0,0
10,chr1,1233600,+,47.0,7.0,109.0,0.431193,0.064220,37.0,6.0,...,0,0,1,0,92,52,73,0,0,0
12,chr1,1255428,-,11.0,2.0,94.0,0.117021,0.021277,,,...,0,0,0,0,254,228,199,0,0,0
13,chr1,1373819,-,139.0,20.0,590.0,0.235593,0.033898,57.0,8.0,...,0,0,0,0,755,570,798,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6874,chrX,154428837,+,,,,,,4.0,2.0,...,0,0,0,0,61,75,55,0,0,0
6875,chrX,154432414,+,,,,,,5.0,3.0,...,0,1,0,0,450,404,338,0,0,0
6877,chrX,154435306,+,,,,,,22.0,2.0,...,0,0,0,0,500,500,431,0,0,0
6878,chrX,154436104,+,,,,,,11.0,2.0,...,0,1,0,0,476,408,404,0,1,1


In [139]:
# filter out also site with CT evidence in WT
WtOxp_nosnp_noCTwt = WtOxp_nosnp[(WtOxp_nosnp["isvar_wt1"]==0)&(WtOxp_nosnp["isvar_wt2"]==0)&(WtOxp_nosnp["isvar_wt3"]==0)]
WtOxp_nosnp_noCTwt

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,isvar_wt1,isvar_wt2,isvar_wt3,issnp,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3
2,chr1,944498,-,36.0,4.0,317.0,0.113565,0.012618,19.0,2.0,...,0,0,0,0,529,515,381,0,0,0
5,chr1,999124,-,139.0,13.0,478.0,0.290795,0.027197,52.0,5.0,...,0,0,0,0,12,17,8,0,0,0
12,chr1,1255428,-,11.0,2.0,94.0,0.117021,0.021277,,,...,0,0,0,0,254,228,199,0,0,0
13,chr1,1373819,-,139.0,20.0,590.0,0.235593,0.033898,57.0,8.0,...,0,0,0,0,755,570,798,0,0,0
19,chr1,1598750,-,6.0,2.0,57.0,0.105263,0.035088,,,...,0,0,0,0,25,15,42,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6871,chrX,136869239,-,,,,,,10.0,2.0,...,0,0,0,0,1002,882,652,1,1,1
6872,chrX,152827591,-,,,,,,17.0,2.0,...,0,0,0,0,336,290,373,0,0,0
6874,chrX,154428837,+,,,,,,4.0,2.0,...,0,0,0,0,61,75,55,0,0,0
6877,chrX,154435306,+,,,,,,22.0,2.0,...,0,0,0,0,500,500,431,0,0,0


In [142]:
# select sites that have at least one OXP run with CT evidence on illumina samples
WtOxp_nosnp_noCTwt[(WtOxp_nosnp_noCTwt["isvar_oxp1"]==1)|(WtOxp_nosnp_noCTwt["isvar_oxp2"]==1)|(WtOxp_nosnp_noCTwt["isvar_oxp3"]==1)]

Unnamed: 0,region,position,strand,T_native_wt1,T_corrected_wt1,depth_stranded_wt1,Tfreq_native_wt1,Tfreq_corrected_wt1,T_native_wt4,T_corrected_wt4,...,isvar_wt1,isvar_wt2,isvar_wt3,issnp,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3
24,chr1,1789099,-,49.0,4.0,285.0,0.171930,0.014035,17.0,2.0,...,0,0,0,0,945,1041,678,0,0,1
35,chr1,6185659,-,33.0,4.0,91.0,0.362637,0.043956,,,...,0,0,0,0,414,200,326,1,1,0
63,chr1,11023383,+,34.0,6.0,163.0,0.208589,0.036810,,,...,0,0,0,0,742,687,602,0,1,0
80,chr1,17028732,-,47.0,5.0,424.0,0.110849,0.011792,35.0,5.0,...,0,0,0,0,1039,769,767,0,0,1
100,chr1,22092747,+,45.0,6.0,203.0,0.221675,0.029557,24.0,2.0,...,0,0,0,0,1556,1352,1400,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6867,chrX,136210288,+,,,,,,16.0,2.0,...,0,0,0,0,375,258,392,0,0,1
6868,chrX,136211080,+,,,,,,30.0,3.0,...,0,0,0,0,596,463,510,1,1,1
6869,chrX,136211135,+,,,,,,36.0,3.0,...,0,0,0,0,589,440,538,0,1,0
6870,chrX,136211154,+,,,,,,34.0,2.0,...,0,0,0,0,538,405,503,0,0,1


Perform a similar analysis also for OXP editing sites candidates:

In [154]:
# open oxp sites candidates merged table
dfCT_oxp_aggregated_candidates_merged = pd.read_table("/lustre/bio_running/CtoUclassifier_new_model_training23052022/src_jupyter_notebooks/iForest_cc1_wt_ko_no_indels_mismatches___NEW_TRAINING/dfCT_oxp_aggregated_candidates_merged_human.tsv", index_col=0)
dfCT_oxp_aggregated_candidates_merged

Unnamed: 0,region,position,strand,T_native_oxp1,T_corrected_oxp1,depth_stranded_oxp1,Tfreq_native_oxp1,Tfreq_corrected_oxp1,T_native_oxp2,T_corrected_oxp2,...,T_native_oxp3,T_corrected_oxp3,depth_stranded_oxp3,Tfreq_native_oxp3,Tfreq_corrected_oxp3,T_native_oxp4,T_corrected_oxp4,depth_stranded_oxp4,Tfreq_native_oxp4,Tfreq_corrected_oxp4
0,chr1,1000018,-,54.0,15.0,201.0,0.268657,0.074627,61.0,13.0,...,,,,,,,,,,
1,chr1,1373819,-,35.0,4.0,101.0,0.346535,0.039604,73.0,8.0,...,,,,,,,,,,
2,chr1,6186681,-,67.0,20.0,559.0,0.119857,0.035778,,,...,7.0,3.0,149.0,0.046980,0.020134,13.0,3.0,220.0,0.059091,0.013636
3,chr1,6186699,-,71.0,12.0,556.0,0.127698,0.021583,,,...,,,,,,7.0,3.0,220.0,0.031818,0.013636
4,chr1,6197662,-,121.0,7.0,533.0,0.227017,0.013133,,,...,,,,,,36.0,3.0,211.0,0.170616,0.014218
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,chr16,30067538,+,,,,,,,,...,16.0,2.0,150.0,0.106667,0.013333,30.0,3.0,219.0,0.136986,0.013699
653,chr17,8377549,-,,,,,,,,...,21.0,5.0,311.0,0.067524,0.016077,24.0,5.0,433.0,0.055427,0.011547
654,chr17,39204567,+,,,,,,,,...,28.0,5.0,301.0,0.093023,0.016611,43.0,8.0,432.0,0.099537,0.018519
655,chr19,46838464,-,,,,,,,,...,10.0,4.0,61.0,0.163934,0.065574,9.0,2.0,80.0,0.112500,0.025000


In [155]:
covs = []
vars_ = []
issnp = []

# assess each site
with tqdm(total=dfCT_oxp_aggregated_candidates_merged.shape[0]) as pbar:
    for oxp_site in dfCT_oxp_aggregated_candidates_merged.itertuples():
        region = oxp_site.region
        pos1based = oxp_site.position
        pos0based = pos1based-1
        #print(region, pos1based)
        # assess if the site is covered at least in one oxp reditools sample. 
        # if covered verify 
        redicovs = []
        redivars = []
        for rediout in oxps:
            try:
                rediquery = ([i.split("\t") for i in rediout.fetch(region, pos0based, pos0based+1)])[0]
                redicovs.append(int(rediquery[4]))
                # it is covered thus assess if it is a variant or not
                if "CT" in rediquery[7]:
                    redivars.append(1)
                else:
                    redivars.append(0)
            except IndexError as e: # not covered
                redicovs.append(0)
                redivars.append(0)
        covs.append(redicovs)
        vars_.append(redivars)
        # assess if known snp or not
        query = pd.DataFrame([i.split("\t") for i in dbsnp.fetch(region, pos0based, pos0based+1)]) # pysam works in a python manner with 0indexing and right opned intervals
        if query.empty:
            issnp.append(0)
        else:
            issnp.append(1)
        pbar.update(1)

dfCT_oxp_aggregated_candidates_merged_anno = dfCT_oxp_aggregated_candidates_merged.copy()
dfCT_oxp_aggregated_candidates_merged_anno[["cov_oxp1", "cov_oxp2", "cov_oxp3"]] = covs
dfCT_oxp_aggregated_candidates_merged_anno[["isvar_oxp1", "isvar_oxp2", "isvar_oxp3"]] = vars_
dfCT_oxp_aggregated_candidates_merged_anno["issnp"] = issnp
dfCT_oxp_aggregated_candidates_merged_anno

100%|██████████| 657/657 [00:04<00:00, 157.33it/s]


Unnamed: 0,region,position,strand,T_native_oxp1,T_corrected_oxp1,depth_stranded_oxp1,Tfreq_native_oxp1,Tfreq_corrected_oxp1,T_native_oxp2,T_corrected_oxp2,...,depth_stranded_oxp4,Tfreq_native_oxp4,Tfreq_corrected_oxp4,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3,issnp
0,chr1,1000018,-,54.0,15.0,201.0,0.268657,0.074627,61.0,13.0,...,,,,0,3,1,0,1,1,1
1,chr1,1373819,-,35.0,4.0,101.0,0.346535,0.039604,73.0,8.0,...,,,,755,570,798,0,0,0,0
2,chr1,6186681,-,67.0,20.0,559.0,0.119857,0.035778,,,...,220.0,0.059091,0.013636,2720,1436,2708,1,1,1,1
3,chr1,6186699,-,71.0,12.0,556.0,0.127698,0.021583,,,...,220.0,0.031818,0.013636,3549,1771,3493,1,1,1,1
4,chr1,6197662,-,121.0,7.0,533.0,0.227017,0.013133,,,...,211.0,0.170616,0.014218,8011,3898,6627,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,chr16,30067538,+,,,,,,,,...,219.0,0.136986,0.013699,3810,2666,3915,0,0,0,0
653,chr17,8377549,-,,,,,,,,...,433.0,0.055427,0.011547,4805,2606,3393,1,1,1,0
654,chr17,39204567,+,,,,,,,,...,432.0,0.099537,0.018519,11796,6177,9302,1,1,1,0
655,chr19,46838464,-,,,,,,,,...,80.0,0.112500,0.025000,348,249,354,0,0,1,0


In [156]:
# save to disk
dfCT_oxp_aggregated_candidates_merged_anno.to_csv("dfCT_oxp_aggregated_candidates_merged_human_anno.tsv", sep="\t")

In [159]:
# check candidates editing sites supported also by illumina
dfCT_oxp_aggregated_candidates_merged_anno[(dfCT_oxp_aggregated_candidates_merged_anno["isvar_oxp1"]==1)|(dfCT_oxp_aggregated_candidates_merged_anno["isvar_oxp2"]==1)|(dfCT_oxp_aggregated_candidates_merged_anno["isvar_oxp3"]==1)]

Unnamed: 0,region,position,strand,T_native_oxp1,T_corrected_oxp1,depth_stranded_oxp1,Tfreq_native_oxp1,Tfreq_corrected_oxp1,T_native_oxp2,T_corrected_oxp2,...,depth_stranded_oxp4,Tfreq_native_oxp4,Tfreq_corrected_oxp4,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3,issnp
0,chr1,1000018,-,54.0,15.0,201.0,0.268657,0.074627,61.0,13.0,...,,,,0,3,1,0,1,1,1
2,chr1,6186681,-,67.0,20.0,559.0,0.119857,0.035778,,,...,220.0,0.059091,0.013636,2720,1436,2708,1,1,1,1
3,chr1,6186699,-,71.0,12.0,556.0,0.127698,0.021583,,,...,220.0,0.031818,0.013636,3549,1771,3493,1,1,1,1
4,chr1,6197662,-,121.0,7.0,533.0,0.227017,0.013133,,,...,211.0,0.170616,0.014218,8011,3898,6627,0,1,0,0
5,chr1,8862920,-,71.0,8.0,484.0,0.146694,0.016529,262.0,46.0,...,395.0,0.134177,0.027848,8290,5338,8100,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,chr12,112405999,-,,,,,,,,...,396.0,0.090909,0.012626,13799,7545,10718,1,1,1,1
653,chr17,8377549,-,,,,,,,,...,433.0,0.055427,0.011547,4805,2606,3393,1,1,1,0
654,chr17,39204567,+,,,,,,,,...,432.0,0.099537,0.018519,11796,6177,9302,1,1,1,0
655,chr19,46838464,-,,,,,,,,...,80.0,0.112500,0.025000,348,249,354,0,0,1,0


In [157]:
# filter out known snps positions
oxp_nosnps = dfCT_oxp_aggregated_candidates_merged_anno.query("issnp == 0")
oxp_nosnps

Unnamed: 0,region,position,strand,T_native_oxp1,T_corrected_oxp1,depth_stranded_oxp1,Tfreq_native_oxp1,Tfreq_corrected_oxp1,T_native_oxp2,T_corrected_oxp2,...,depth_stranded_oxp4,Tfreq_native_oxp4,Tfreq_corrected_oxp4,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3,issnp
1,chr1,1373819,-,35.0,4.0,101.0,0.346535,0.039604,73.0,8.0,...,,,,755,570,798,0,0,0,0
4,chr1,6197662,-,121.0,7.0,533.0,0.227017,0.013133,,,...,211.0,0.170616,0.014218,8011,3898,6627,0,1,0,0
5,chr1,8862920,-,71.0,8.0,484.0,0.146694,0.016529,262.0,46.0,...,395.0,0.134177,0.027848,8290,5338,8100,1,1,1,0
10,chr1,23694770,+,321.0,21.0,1218.0,0.263547,0.017241,370.0,29.0,...,383.0,0.133159,0.015666,13588,7078,9808,1,1,1,0
12,chr1,25900608,-,33.0,3.0,187.0,0.176471,0.016043,25.0,3.0,...,,,,1129,1148,1271,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,chr16,30067538,+,,,,,,,,...,219.0,0.136986,0.013699,3810,2666,3915,0,0,0,0
653,chr17,8377549,-,,,,,,,,...,433.0,0.055427,0.011547,4805,2606,3393,1,1,1,0
654,chr17,39204567,+,,,,,,,,...,432.0,0.099537,0.018519,11796,6177,9302,1,1,1,0
655,chr19,46838464,-,,,,,,,,...,80.0,0.112500,0.025000,348,249,354,0,0,1,0


In [158]:
oxp_nosnps_noCT = oxp_nosnps[(oxp_nosnps["isvar_oxp1"]==1)|(oxp_nosnps["isvar_oxp3"]==1)|(oxp_nosnps["isvar_oxp3"]==1)]
oxp_nosnps_noCT

Unnamed: 0,region,position,strand,T_native_oxp1,T_corrected_oxp1,depth_stranded_oxp1,Tfreq_native_oxp1,Tfreq_corrected_oxp1,T_native_oxp2,T_corrected_oxp2,...,depth_stranded_oxp4,Tfreq_native_oxp4,Tfreq_corrected_oxp4,cov_oxp1,cov_oxp2,cov_oxp3,isvar_oxp1,isvar_oxp2,isvar_oxp3,issnp
5,chr1,8862920,-,71.0,8.0,484.0,0.146694,0.016529,262.0,46.0,...,395.0,0.134177,0.027848,8290,5338,8100,1,1,1,0
10,chr1,23694770,+,321.0,21.0,1218.0,0.263547,0.017241,370.0,29.0,...,383.0,0.133159,0.015666,13588,7078,9808,1,1,1,0
12,chr1,25900608,-,33.0,3.0,187.0,0.176471,0.016043,25.0,3.0,...,,,,1129,1148,1271,1,1,1,0
16,chr1,28237967,+,26.0,4.0,240.0,0.108333,0.016667,,,...,,,,1799,1238,1653,1,1,1,0
20,chr1,42702093,+,57.0,6.0,366.0,0.155738,0.016393,,,...,149.0,0.107383,0.020134,4747,2862,4016,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
649,chr12,21644102,-,,,,,,,,...,139.0,0.115108,0.014388,5746,3172,5229,1,1,0,0
653,chr17,8377549,-,,,,,,,,...,433.0,0.055427,0.011547,4805,2606,3393,1,1,1,0
654,chr17,39204567,+,,,,,,,,...,432.0,0.099537,0.018519,11796,6177,9302,1,1,1,0
655,chr19,46838464,-,,,,,,,,...,80.0,0.112500,0.025000,348,249,354,0,0,1,0


In [175]:
oxp_nosnps_noCT[[f"Tfreq_corrected_oxp{i}" for i in range(1,5)]].mean(axis=1)

5      0.024319
10     0.016437
12     0.015164
16     0.022619
20     0.018264
         ...   
649    0.019242
653    0.013812
654    0.017565
655    0.045287
656    0.046843
Length: 226, dtype: float64