In [1]:
# import modules
import os, sys
import numpy as np
import pandas as pd

In [2]:
# load and select only putative edited sites WT
wt = pd.read_csv("WT.df_CT_predicted_aggregated_iforest_apobec1_pvalues.tsv", index_col=0)
wt = wt.query("y_hat == 1").query("p_value < 0.01").query("Tfreq_corrected > 0.05")
wt

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
117,chr1,4915443,+,5,3,54,0.092593,0.055556,TACAA,1,0.000049
120,chr1,4915467,+,5,4,55,0.090909,0.072727,TACAG,1,0.005247
136,chr1,4915694,+,7,4,51,0.137255,0.078431,TACTA,1,0.000521
192,chr1,4916336,+,20,4,67,0.298507,0.059701,GCCGA,1,0.000316
199,chr1,4916384,+,13,4,65,0.200000,0.061538,TACTG,1,0.001750
...,...,...,...,...,...,...,...,...,...,...,...
387801,chrX,135147776,+,15,12,56,0.267857,0.214286,AACTT,1,0.000023
387803,chrX,135147837,+,11,7,52,0.211538,0.134615,AACAC,1,0.001189
387804,chrX,135147839,+,8,7,53,0.150943,0.132075,CACGC,1,0.000272
387832,chrX,135633808,-,38,11,126,0.301587,0.087302,AACAT,1,0.008177


In [3]:
# load and select only putative edited sites KO
ko = pd.read_csv("KO.df_CT_predicted_aggregated_iforest_apobec1_pvalues.tsv", index_col=0)
ko = ko.query("y_hat == 1").query("p_value < 0.01").query("Tfreq_corrected > 0.05")
ko

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
309,chr1,4916336,+,25,7,104,0.240385,0.067308,GCCGA,1,0.000316
3269,chr1,30981134,-,17,4,51,0.333333,0.078431,GTCTG,1,0.004376
3472,chr1,34475857,-,11,4,69,0.159420,0.057971,TTCTT,1,0.000048
6227,chr1,43789117,-,16,3,51,0.313725,0.058824,TGCTG,1,0.002314
6575,chr1,45856512,+,25,10,66,0.378788,0.151515,TACTG,1,0.009936
...,...,...,...,...,...,...,...,...,...,...,...
518124,chrX,37508799,-,13,5,79,0.164557,0.063291,TTCTA,1,0.000814
521723,chrX,74152885,+,25,5,68,0.367647,0.073529,CTCTG,1,0.000898
525101,chrX,133507437,+,53,35,60,0.883333,0.583333,TACGT,1,0.000231
525218,chrX,135633808,-,51,9,149,0.342282,0.060403,AACAT,1,0.008177


In [4]:
wt.Tfreq_corrected.describe()

count    358.000000
mean       0.106514
std        0.120755
min        0.050473
25%        0.057692
50%        0.071429
75%        0.096619
max        0.878378
Name: Tfreq_corrected, dtype: float64

In [5]:
ko.Tfreq_corrected.describe()

count    241.000000
mean       0.116997
std        0.151742
min        0.050505
25%        0.057692
50%        0.068966
75%        0.092593
max        0.916667
Name: Tfreq_corrected, dtype: float64

In [6]:
common_sites = pd.merge(wt[["region", "position", "strand"]], ko[["region", "position", "strand"]], how="inner")
common_sites

Unnamed: 0,region,position,strand
0,chr1,4916336,+
1,chr1,34475857,-
2,chr1,63190868,-
3,chr1,134332313,+
4,chr1,162422387,+
...,...,...,...
83,chr9,104954570,+
84,chrM,3524,+
85,chrX,37508724,-
86,chrX,133507437,+


In [7]:
# filter out wt sites in common with ko to retrieve reliable editing sites
wt_reliable = wt.copy()
for s in wt.itertuples():
    if not common_sites[(common_sites["region"] == s.region)&(common_sites["position"] == s.position)&(common_sites["strand"] == s.strand)].empty:
         wt_reliable.drop(s.Index, inplace=True)
wt_reliable

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
117,chr1,4915443,+,5,3,54,0.092593,0.055556,TACAA,1,0.000049
120,chr1,4915467,+,5,4,55,0.090909,0.072727,TACAG,1,0.005247
136,chr1,4915694,+,7,4,51,0.137255,0.078431,TACTA,1,0.000521
199,chr1,4916384,+,13,4,65,0.200000,0.061538,TACTG,1,0.001750
401,chr1,4967044,+,10,8,106,0.094340,0.075472,AACTA,1,0.000271
...,...,...,...,...,...,...,...,...,...,...,...
387721,chrX,133507395,+,10,6,55,0.181818,0.109091,TACTT,1,0.000144
387801,chrX,135147776,+,15,12,56,0.267857,0.214286,AACTT,1,0.000023
387803,chrX,135147837,+,11,7,52,0.211538,0.134615,AACAC,1,0.001189
387804,chrX,135147839,+,8,7,53,0.150943,0.132075,CACGC,1,0.000272


In [8]:
# save to disk as vcf file
output_filepaht = "reliable_ed_pos_iforest.vcf"
with open(output_filepaht, "w") as out_vcf:
    for s in wt_reliable.itertuples():
        region = s.region.split("chr")[1]
        position = s.position
        site = f"{region}\t{position}\t{position}\tC\tT\n"
        out_vcf.write(site)

with open(output_filepaht) as f:
    print(f.read())

1	4915443	4915443	C	T
1	4915467	4915467	C	T
1	4915694	4915694	C	T
1	4916384	4916384	C	T
1	4967044	4967044	C	T
1	4967240	4967240	C	T
1	10105387	10105387	C	T
1	30981426	30981426	C	T
1	39590526	39590526	C	T
1	87647422	87647422	C	T
1	88146699	88146699	C	T
1	95261894	95261894	C	T
1	106699005	106699005	C	T
1	118261254	118261254	C	T
1	156257378	156257378	C	T
1	156257526	156257526	C	T
1	160030625	160030625	C	T
1	170956838	170956838	C	T
1	173281876	173281876	C	T
1	182103943	182103943	C	T
1	182103970	182103970	C	T
1	191086813	191086813	C	T
10	17887230	17887230	C	T
10	17887338	17887338	C	T
10	40163958	40163958	C	T
10	57392081	57392081	C	T
10	57392790	57392790	C	T
10	62292031	62292031	C	T
10	62292185	62292185	C	T
10	88579405	88579405	C	T
10	88579409	88579409	C	T
10	93696816	93696816	C	T
10	117185123	117185123	C	T
10	128197551	128197551	C	T
11	20175333	20175333	C	T
11	29692507	29692507	C	T
11	29692828	29692828	C	T
11	29692881	29692881	C	T
11	49995461	49995461	C	T
11	50186566	50186566	C	T
11	5027696

In [9]:
!/lustrehome/afonzino/annovar/table_annovar.pl reliable_ed_pos_iforest.vcf /lustrehome/afonzino/annovar/mousedb/ -buildver mm39 -out reliable_ed_pos_iforest.annotated -protocol refGene -operation g -nastring . --remove -polish

-----------------------------------------------------------------
NOTICE: Processing operation=g protocol=refGene

NOTICE: Running with system command <annotate_variation.pl -geneanno -buildver mm39 -dbtype refGene -outfile reliable_ed_pos_iforest.annotated.refGene -exonsort -nofirstcodondel reliable_ed_pos_iforest.vcf /lustrehome/afonzino/annovar/mousedb/>
NOTICE: Output files are written to reliable_ed_pos_iforest.annotated.refGene.variant_function, reliable_ed_pos_iforest.annotated.refGene.exonic_variant_function
NOTICE: Reading gene annotation from /lustrehome/afonzino/annovar/mousedb/mm39_refGene.txt ... Done with 47741 transcripts (including 8698 without coding sequence annotation) for 26264 unique genes
NOTICE: Processing next batch with 270 unique variants in 270 input lines
NOTICE: Reading FASTA sequences from /lustrehome/afonzino/annovar/mousedb/mm39_refGeneMrna.fa ... Done with 0 sequences
 (example: NM_011623#11#98883772 NM_013715#1#10094824 NM_011185#17#15695982)

NOTICE: 

In [10]:
# load annotations
reliable_iforest_annotated = pd.read_table("reliable_ed_pos_iforest.annotated.mm39_multianno.txt")
reliable_iforest_annotated

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene
0,1,4915443,4915443,C,T,UTR3,Lypla1,NM_001355712:c.*204C>T;NM_008866:c.*204C>T,.,.
1,1,4915467,4915467,C,T,UTR3,Lypla1,NM_001355712:c.*228C>T;NM_008866:c.*228C>T,.,.
2,1,4915694,4915694,C,T,UTR3,Lypla1,NM_001355712:c.*455C>T;NM_008866:c.*455C>T,.,.
3,1,4916384,4916384,C,T,UTR3,Lypla1,NM_001355712:c.*1145C>T;NM_008866:c.*1145C>T,.,.
4,1,4967044,4967044,C,T,UTR3,Tcea1,NM_001159750:c.*457C>T;NM_011541:c.*457C>T;NM_...,.,.
...,...,...,...,...,...,...,...,...,...,...
265,X,133507395,133507395,C,T,UTR3,Hnrnph2,NM_001313716:c.*389C>T;NM_001313717:c.*389C>T;...,.,.
266,X,135147776,135147776,C,T,UTR3,Tceal9,NM_011712:c.*347C>T,.,.
267,X,135147837,135147837,C,T,UTR3,Tceal9,NM_011712:c.*408C>T,.,.
268,X,135147839,135147839,C,T,UTR3,Tceal9,NM_011712:c.*410C>T,.,.


In [11]:
# evaluate region
reliable_iforest_annotated["Func.refGene"].value_counts()

UTR3              233
exonic             18
intronic            9
ncRNA_exonic        3
UTR5                2
downstream          2
intergenic          2
ncRNA_intronic      1
Name: Func.refGene, dtype: int64

In [12]:
# visualize most impacted genes
reliable_iforest_annotated["Gene.refGene"].value_counts()

Selenof    8
Lamp2      8
Sh3bgrl    6
Mbnl1      5
Atp6ap2    5
          ..
Psmb1      1
Ube2i      1
Ergic1     1
Wdr43      1
Rbbp7      1
Name: Gene.refGene, Length: 181, dtype: int64

In [13]:
# visualize B2m 1st site
reliable_iforest_annotated.query("Chr == '2'").query("Start == 121983221")

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene
111,2,121983221,121983221,C,T,UTR3,B2m,NM_009735:c.*105C>T,.,.


In [14]:
# visualize B2m 2nd site
reliable_iforest_annotated.query("Chr == '2'").query("Start == 121983223")

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene
112,2,121983223,121983223,C,T,UTR3,B2m,NM_009735:c.*107C>T,.,.


In [18]:
reliable_iforest_annotated[reliable_iforest_annotated['Func.refGene'] == 'exonic']

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene
6,1,10105387,10105387,C,T,exonic,Cops5,.,unknown,UNKNOWN
21,1,191086813,191086813,C,T,exonic,Ppp2r5a,.,unknown,UNKNOWN
43,11,98886196,98886196,C,T,exonic,Top2a,.,unknown,UNKNOWN
53,12,69232458,69232458,C,T,exonic,Mgat2,.,unknown,UNKNOWN
79,14,101903200,101903200,C,T,exonic,Uchl3,.,unknown,UNKNOWN
89,17,15714679,15714679,C,T,exonic,Psmb1,.,unknown,UNKNOWN
90,17,25483817,25483817,C,T,exonic,Ube2i,.,unknown,UNKNOWN
93,18,9386736,9386736,C,T,exonic,Ccny,.,unknown,UNKNOWN
103,2,71251023,71251023,C,T,exonic,Hat1,.,unknown,UNKNOWN
107,2,83492503,83492503,C,T,exonic,Zc3h15,.,unknown,UNKNOWN


In [19]:
wt_reliable.query("region == 'chr1'").query("position == 10105387")

Unnamed: 0,region,position,strand,T_native,T_corrected,depth_stranded,Tfreq_native,Tfreq_corrected,5mer,y_hat,p_value
889,chr1,10105387,-,9,8,108,0.083333,0.074074,GTCAC,1,0.004167
