In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################
import os
import pandas as pd
import numpy as np


# define util function for downstream analysis
def join_bonafides(bonafide_fp, bonafide_anno_filt):
    """Join a bona fide table (REDItools info) with its annotated/filtered table.

    Both inputs are tab-separated files loaded with ``pd.read_table``. The two
    tables are matched on genomic coordinates: ``wt_Region``/``wt_Position`` on
    the REDItools side and ``Region``/``Position`` on the annotated side.

    Parameters
    ----------
    bonafide_fp : str
        Path of the bona fide table carrying the REDItools columns. It is also
        used as prefix of the output file
        (``bonafide_fp + ".anno.filt_refseq.tsv"``).
    bonafide_anno_filt : str
        Path of the annotated and filtered bona fide table.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables (sites present in both). Columns with the
        same name in both tables get the default pandas ``_x``/``_y`` suffixes.

    Side effects
    ------------
    Prints/displays the two inputs, the joined table and the sites found only
    in ``bonafide_fp``; writes the joined table to disk as a TSV file.

    Notes
    -----
    ``display`` is not imported by this notebook: it is expected to be provided
    by the IPython/Jupyter session the function runs in.
    """
    join_keys = dict(left_on=["wt_Region", "wt_Position"],
                     right_on=["Region", "Position"])

    # load bonafide with reditools alignment profiles/data
    bonafide = pd.read_table(bonafide_fp)
    print("Bonafide not-filted with REDItools info:")
    display(bonafide)

    # load annotated table filtered for positives NON-REP being in Refseq.
    # Read from the *parameter*: the previous code read the notebook global
    # `bonafide_anno_filt_fp`, so the argument was silently ignored.
    annotated = pd.read_table(bonafide_anno_filt)
    print("Bonafide annotated and filtered (NON-REP only Refseq):")
    display(annotated)

    # inner join between bonafide with and without annotation/filtering
    joined = pd.merge(bonafide, annotated, how="inner", **join_keys)
    print("Common sites annotated with REDItools info:")
    display(joined)

    # Sites without a match in the annotated table. The merge indicator marks
    # them as "left_only", which does not depend on a suffixed column such as
    # "Class_y" existing nor on the annotation columns being free of NaN.
    left_join = pd.merge(bonafide, annotated, how="left",
                         indicator="_join_origin", **join_keys)
    discarded = left_join[left_join["_join_origin"] == "left_only"]
    discarded = discarded.drop(columns="_join_origin")
    print("Discared sites (via left join):")
    display(discarded)

    output_file = bonafide_fp + ".anno.filt_refseq.tsv"
    print("Save on disk joined with annotation and filtering to :", output_file)
    joined.to_csv(output_file, index=False, sep="\t")

    return joined

## HEK293T dataset (hek_pecori): wt vs KO sample couples

In [2]:
### Couple 1: HEK_ADAR1_p110_wt_1 (OvE) vs HEK293T_KO1 (KO)
# inputs:
#   bonafide_fp           -> ".bonafide_final.tsv" table of this wt/KO couple
#   bonafide_anno_filt_fp -> "bona_fide_sites_2.tsv" table of the same outTable
#                            pair (530905096 / 905657585)
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_530905096_outTable_905657585_bona_fide_sites_2.tsv"

# join the two tables; the DataFrame returned by join_bonafides is kept as couple1_join
couple1_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39484,chr8,144451628,A,0,299,51.34,"[244, 0, 55, 0]",AG,0.18,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39485,chr8,144454002,A,0,59,45.15,"[0, 0, 59, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39486,chr8,144460013,A,0,57,51.93,"[41, 0, 16, 0]",AG,0.28,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39487,chr8,144468337,A,1,420,48.59,"[0, 0, 420, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,16996,0,-,-,"transcript,exon",WASH7P
1,chr1,632061,1,-,-,"exon,transcript",MTCO1P12
2,chr1,926428,0,-,-,transcript,SAMD11
3,chr1,926744,0,-,-,transcript,SAMD11
4,chr1,953259,0,-,-,"transcript,CDS,exon",NOC2L
...,...,...,...,...,...,...,...
39392,chrX,154512401,1,SINE,AluSz6,"transcript,CDS,exon",FAM3A
39393,chrX,154777624,1,-,-,"transcript,3UTR,exon",DKC1
39394,chrX,155055080,1,-,-,"transcript,exon,3UTR",FUNDC2
39395,chrX,156023071,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299493,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299513,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299540,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299543,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39392,chr8,144451628,A,0,299,51.34,"[244, 0, 55, 0]",AG,0.18,chr8,...,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144451628,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
39393,chr8,144454002,A,0,59,45.15,"[0, 0, 59, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144454002,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
39394,chr8,144460013,A,0,57,51.93,"[41, 0, 16, 0]",AG,0.28,chr8,...,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144460013,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
39395,chr8,144468337,A,1,420,48.59,"[0, 0, 420, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
1020,chrX,100842937,A,1,373,47.42,"[369, 0, 4, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
1130,chrX,101343611,A,0,292,46.29,"[289, 0, 3, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
1750,chr13,52455105,A,0,217,42.74,"[214, 0, 3, 0]",AG,0.01,chr13,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
1847,chr12,3040871,A,1,160,52.64,"[157, 0, 3, 0]",AG,0.02,chr12,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
3716,chr11,33073863,A,1,205,53.91,"[171, 0, 34, 0]",AG,0.17,chr11,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35299,chr8,29061325,A,1,258,47.03,"[255, 0, 3, 0]",AG,0.01,chr8,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
35300,chr8,29063918,A,1,251,45.07,"[235, 0, 16, 0]",AG,0.06,chr8,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
35301,chr8,29063919,A,1,249,45.51,"[246, 0, 3, 0]",AG,0.01,chr8,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
35609,chr8,51899504,A,1,333,46.92,"[327, 0, 6, 0]",AG,0.02,chr8,...,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [3]:
### Couple 2: HEK_ADAR1_p110_wt_2 (OvE) vs HEK293T_KO2 (KO)
# inputs:
#   bonafide_fp           -> ".bonafide_final.tsv" table of this wt/KO couple
#   bonafide_anno_filt_fp -> "bona_fide_sites_2.tsv" table of the same outTable
#                            pair (814257267 / 364841872)
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_814257267_outTable_364841872_bona_fide_sites_2.tsv"

# join the two tables; the DataFrame returned by join_bonafides is kept as couple2_join
couple2_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,324,45.15,"[320, 0, 4, 0]",AG,0.01,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299510,A,1,318,45.16,"[236, 0, 82, 0]",AG,0.26,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299513,A,1,314,45.80,"[305, 0, 9, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299543,A,1,303,45.42,"[294, 0, 9, 0]",AG,0.03,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299551,A,1,288,47.07,"[283, 0, 5, 0]",AG,0.02,chrX,...,1,38,41.45,"[38, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37478,chr8,144451628,A,0,276,48.17,"[219, 0, 57, 0]",AG,0.21,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37479,chr8,144457131,A,0,90,46.32,"[11, 0, 79, 0]",AG,0.88,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37480,chr8,144458636,A,0,95,51.41,"[73, 0, 22, 0]",AG,0.23,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37481,chr8,144468337,A,1,396,49.43,"[0, 0, 396, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,14610,0,-,-,"exon,transcript",WASH7P
1,chr1,16961,1,-,-,"transcript,exon",WASH7P
2,chr1,16996,0,-,-,"transcript,exon",WASH7P
3,chr1,729312,1,-,-,"exon,transcript",LOC100288069
4,chr1,753804,1,SINE,AluJo,transcript,LOC100288069
...,...,...,...,...,...,...,...
37410,chrX,154512401,1,SINE,AluSz6,"transcript,CDS,exon",FAM3A
37411,chrX,155121871,1,SINE,AluJr,"transcript,3UTR,exon",BRCC3
37412,chrX,155121872,1,SINE,AluJr,"transcript,3UTR,exon",BRCC3
37413,chrX,156023071,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299493,A,1,324,45.15,"[320, 0, 4, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299493,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
1,chrX,299510,A,1,318,45.16,"[236, 0, 82, 0]",AG,0.26,chrX,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299513,A,1,314,45.80,"[305, 0, 9, 0]",AG,0.03,chrX,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299513,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299543,A,1,303,45.42,"[294, 0, 9, 0]",AG,0.03,chrX,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299543,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299551,A,1,288,47.07,"[283, 0, 5, 0]",AG,0.02,chrX,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299551,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37410,chr8,144451628,A,0,276,48.17,"[219, 0, 57, 0]",AG,0.21,chr8,...,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144451628,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
37411,chr8,144457131,A,0,90,46.32,"[11, 0, 79, 0]",AG,0.88,chr8,...,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144457131,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
37412,chr8,144458636,A,0,95,51.41,"[73, 0, 22, 0]",AG,0.23,chr8,...,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144458636,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
37413,chr8,144468337,A,1,396,49.43,"[0, 0, 396, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
2201,chr12,56731567,A,0,622,44.49,"[613, 0, 9, 0]",AG,0.01,chr12,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
2202,chr12,56731568,A,0,664,44.12,"[652, 0, 12, 0]",AG,0.02,chr12,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
3496,chr11,33073863,A,1,159,51.98,"[135, 0, 24, 0]",AG,0.15,chr11,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
3497,chr11,33073882,A,1,179,47.34,"[176, 0, 3, 0]",AG,0.02,chr11,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
3498,chr11,33073924,A,1,171,50.06,"[168, 0, 3, 0]",AG,0.02,chr11,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31696,chr1,161422242,A,1,179,47.34,"[175, 0, 4, 0]",AG,0.02,chr1,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
32473,chr1,243851376,A,1,239,46.29,"[235, 0, 4, 0]",AG,0.02,chr1,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
32475,chr1,244849504,A,0,132,43.25,"[129, 0, 3, 0]",AG,0.02,chr1,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
33186,chr8,29063918,A,1,218,47.64,"[209, 0, 9, 0]",AG,0.04,chr8,...,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [4]:
### Couple 3: HEK_ADAR1_p110_wt_3 (OvE) vs HEK293T_KO3 (KO)
# inputs:
#   bonafide_fp           -> ".bonafide_final.tsv" table of this wt/KO couple
#   bonafide_anno_filt_fp -> "bona_fide_sites_2.tsv" table of the same outTable
#                            pair (208420383 / 597789462)
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_208420383_outTable_597789462_bona_fide_sites_2.tsv"

# join the two tables; the DataFrame returned by join_bonafides is kept as couple3_join
couple3_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,297,46.47,"[222, 0, 75, 0]",AG,0.25,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299513,A,1,292,46.71,"[287, 0, 5, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299559,A,1,256,50.97,"[251, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299568,A,1,262,51.22,"[253, 0, 9, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299571,A,1,269,50.89,"[262, 0, 7, 0]",AG,0.03,chrX,...,1,37,42.32,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35279,chr8,144458020,A,0,51,49.10,"[0, 0, 51, 0]",AG,1.00,chr8,...,0,57,41.00,"[0, 0, 57, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35280,chr8,144458072,A,0,52,48.37,"[0, 0, 52, 0]",AG,1.00,chr8,...,0,48,41.33,"[0, 0, 48, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35281,chr8,144468337,A,1,377,48.61,"[0, 0, 377, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35282,chr8,144843461,A,1,122,50.05,"[24, 0, 98, 0]",AG,0.80,chr8,...,1,46,40.46,"[19, 0, 27, 0]",AG,0.59,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,16996,0,-,-,"transcript,exon",WASH7P
1,chr1,942472,1,-,-,"transcript,CDS,exon",SAMD11
2,chr1,953259,0,-,-,"transcript,CDS,exon",NOC2L
3,chr1,1009082,1,SINE,AluJb,-,-
4,chr1,1009083,1,SINE,AluJb,-,-
...,...,...,...,...,...,...,...
35218,chrX,154512392,1,SINE,AluSz6,"transcript,CDS,exon",FAM3A
35219,chrX,154512393,1,SINE,AluSz6,"transcript,CDS,exon",FAM3A
35220,chrX,154512401,1,SINE,AluSz6,"transcript,CDS,exon",FAM3A
35221,chrX,156023071,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299510,A,1,297,46.47,"[222, 0, 75, 0]",AG,0.25,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
1,chrX,299513,A,1,292,46.71,"[287, 0, 5, 0]",AG,0.02,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299513,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299559,A,1,256,50.97,"[251, 0, 5, 0]",AG,0.02,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299559,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299568,A,1,262,51.22,"[253, 0, 9, 0]",AG,0.03,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299568,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299571,A,1,269,50.89,"[262, 0, 7, 0]",AG,0.03,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299571,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35218,chr8,144458020,A,0,51,49.10,"[0, 0, 51, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144458020,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
35219,chr8,144458072,A,0,52,48.37,"[0, 0, 52, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144458072,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
35220,chr8,144468337,A,1,377,48.61,"[0, 0, 377, 0]",AG,1.00,chr8,...,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2
35221,chr8,144843461,A,1,122,50.05,"[24, 0, 98, 0]",AG,0.80,chr8,...,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144843461,0,SINE,AluY,"transcript,exon,3UTR&transcript",ZNF7&COMMD5


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
218,chrX,1383756,A,0,122,49.93,"[119, 0, 3, 0]",AG,0.02,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
1073,chrX,101343610,A,0,298,44.86,"[294, 0, 4, 0]",AG,0.01,chrX,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
2086,chr12,56731567,A,0,536,49.16,"[533, 0, 3, 0]",AG,0.01,chr12,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
2087,chr12,56731568,A,0,569,48.91,"[560, 0, 9, 0]",AG,0.02,chr12,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
2088,chr12,56731569,A,0,572,49.01,"[568, 0, 4, 0]",AG,0.01,chr12,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30483,chr1,247190161,A,0,145,47.55,"[142, 0, 3, 0]",AG,0.02,chr1,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
31115,chr8,29060517,A,1,211,45.70,"[208, 0, 3, 0]",AG,0.01,chr8,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
31116,chr8,29063918,A,1,172,49.68,"[163, 0, 9, 0]",AG,0.05,chr8,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
31117,chr8,29063962,A,1,201,48.54,"[198, 0, 3, 0]",AG,0.01,chr8,...,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [5]:
### Couple 4: HEK293T_WT1 (WT) vs HEK293T_KO1 (KO)
# inputs:
#   bonafide_fp           -> ".bonafide_final.tsv" table of this wt/KO couple
#   bonafide_anno_filt_fp -> "bona_fide_sites_2.tsv" table of the same outTable
#                            pair (599710609 / 905657585)
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT1.WT.outTable_599710609.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_599710609_outTable_905657585_bona_fide_sites_2.tsv"

# join the two tables; the DataFrame returned by join_bonafides is kept as couple4_join
couple4_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,371,44.71,"[365, 0, 6, 0]",AG,0.02,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,367,44.73,"[332, 0, 35, 0]",AG,0.10,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,367,44.63,"[356, 0, 11, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,361,46.97,"[356, 0, 5, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,367,46.54,"[364, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29602,chr8,144451628,A,0,550,47.21,"[430, 0, 120, 0]",AG,0.22,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29603,chr8,144454002,A,0,136,43.96,"[0, 0, 136, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29604,chr8,144460013,A,0,82,52.49,"[69, 0, 13, 0]",AG,0.16,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29605,chr8,144468337,A,1,937,47.23,"[3, 0, 934, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,14610,0,-,-,"exon,transcript",WASH7P
1,chr1,14818,1,-,-,"exon,transcript",WASH7P
2,chr1,16996,0,-,-,"transcript,exon",WASH7P
3,chr1,908025,0,-,-,"transcript,exon",LOC284600
4,chr1,918085,1,-,-,"transcript,exon&transcript,exon",LOC284600&LINC02593
...,...,...,...,...,...,...,...
29543,chrX,154531396,1,-,-,"3UTR,exon,transcript",G6PD
29544,chrX,154533043,1,-,-,"transcript,CDS,exon",G6PD
29545,chrX,156023071,0,-,-,"exon,transcript",WASH6P
29546,chrX,156023093,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299493,A,1,371,44.71,"[365, 0, 6, 0]",AG,0.02,chrX,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299493,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
1,chrX,299510,A,1,367,44.73,"[332, 0, 35, 0]",AG,0.10,chrX,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299513,A,1,367,44.63,"[356, 0, 11, 0]",AG,0.03,chrX,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299513,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299540,A,1,361,46.97,"[356, 0, 5, 0]",AG,0.01,chrX,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299540,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299543,A,1,367,46.54,"[364, 0, 3, 0]",AG,0.01,chrX,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chrX,299543,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29543,chr8,144451628,A,0,550,47.21,"[430, 0, 120, 0]",AG,0.22,chr8,...,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144451628,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
29544,chr8,144454002,A,0,136,43.96,"[0, 0, 136, 0]",AG,1.00,chr8,...,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144454002,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
29545,chr8,144460013,A,0,82,52.49,"[69, 0, 13, 0]",AG,0.16,chr8,...,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144460013,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
29546,chr8,144468337,A,1,937,47.23,"[3, 0, 934, 0]",AG,1.00,chr8,...,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
1412,chr12,911483,A,0,101,44.33,"[98, 0, 3, 0]",AG,0.03,chr12,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
1579,chr12,50924901,A,1,172,44.74,"[169, 0, 3, 0]",AG,0.02,chr12,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
1819,chr12,56731568,A,0,598,45.94,"[595, 0, 3, 0]",AG,0.01,chr12,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
2831,chr11,33073863,A,1,266,52.87,"[252, 0, 14, 0]",AG,0.05,chr11,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
2846,chr11,33075037,A,1,379,42.55,"[373, 0, 6, 0]",AG,0.02,chr11,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
3011,chr11,64778665,A,0,653,63.02,"[648, 0, 5, 0]",AG,0.01,chr11,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
3012,chr11,64778666,A,0,669,64.38,"[665, 0, 4, 0]",AG,0.01,chr11,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
3013,chr11,64778697,A,0,591,55.21,"[588, 0, 3, 0]",AG,0.01,chr11,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
8173,chr15,66335262,A,0,193,43.08,"[189, 0, 4, 0]",AG,0.02,chr15,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,
8174,chr15,66335395,A,0,316,51.51,"[306, 0, 10, 0]",AG,0.03,chr15,...,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT1.WT.outTable_599710609.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [6]:
### Couple 5: HEK293T_WT2 (WT) vs HEK293T_KO2 (KO)
# inputs:
#   bonafide_fp           -> ".bonafide_final.tsv" table of this wt/KO couple
#   bonafide_anno_filt_fp -> "bona_fide_sites_2.tsv" table of the same outTable
#                            pair (572868058 / 364841872)
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT2.WT.outTable_572868058.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_572868058_outTable_364841872_bona_fide_sites_2.tsv"

# join the two tables; the DataFrame returned by join_bonafides is kept as couple5_join
couple5_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,272,43.13,"[239, 0, 33, 0]",AG,0.12,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299513,A,1,263,43.09,"[257, 0, 6, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299540,A,1,291,44.30,"[288, 0, 3, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299559,A,1,272,46.03,"[267, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299568,A,1,271,46.60,"[263, 0, 8, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23815,chr8,144454002,A,0,86,45.73,"[0, 0, 86, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23816,chr8,144457131,A,0,78,45.86,"[20, 0, 58, 0]",AG,0.74,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23817,chr8,144458636,A,0,90,44.97,"[77, 0, 13, 0]",AG,0.14,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23818,chr8,144468337,A,1,665,48.87,"[0, 0, 665, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,14610,0,-,-,"exon,transcript",WASH7P
1,chr1,16996,0,-,-,"transcript,exon",WASH7P
2,chr1,753811,1,SINE,AluJo,transcript,LOC100288069
3,chr1,753830,1,SINE,AluJo,transcript,LOC100288069
4,chr1,926428,0,-,-,transcript,SAMD11
...,...,...,...,...,...,...,...
23782,chrX,154531460,1,-,-,"3UTR,exon,transcript",G6PD
23783,chrX,155919829,1,-,-,"transcript,CDS,exon",VAMP7
23784,chrX,156023071,0,-,-,"exon,transcript",WASH6P
23785,chrX,156023093,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299510,A,1,272,43.13,"[239, 0, 33, 0]",AG,0.12,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
1,chrX,299513,A,1,263,43.09,"[257, 0, 6, 0]",AG,0.02,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299513,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299540,A,1,291,44.30,"[288, 0, 3, 0]",AG,0.01,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299540,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299559,A,1,272,46.03,"[267, 0, 5, 0]",AG,0.02,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299559,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299568,A,1,271,46.60,"[263, 0, 8, 0]",AG,0.03,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chrX,299568,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23782,chr8,144454002,A,0,86,45.73,"[0, 0, 86, 0]",AG,1.00,chr8,...,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144454002,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
23783,chr8,144457131,A,0,78,45.86,"[20, 0, 58, 0]",AG,0.74,chr8,...,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144457131,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
23784,chr8,144458636,A,0,90,44.97,"[77, 0, 13, 0]",AG,0.14,chr8,...,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144458636,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
23785,chr8,144468337,A,1,665,48.87,"[0, 0, 665, 0]",AG,1.00,chr8,...,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
641,chrX,100842597,A,1,203,51.46,"[200, 0, 3, 0]",AG,0.01,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
757,chrX,119019378,A,1,175,44.83,"[172, 0, 3, 0]",AG,0.02,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
758,chrX,119022136,A,1,159,46.23,"[156, 0, 3, 0]",AG,0.02,chrX,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
2186,chr11,33073863,A,1,179,53.94,"[166, 0, 13, 0]",AG,0.07,chr11,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6184,chr15,66335262,A,0,133,41.83,"[128, 0, 5, 0]",AG,0.04,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6534,chr15,101769787,A,0,128,45.09,"[118, 0, 10, 0]",AG,0.08,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6535,chr15,101769799,A,0,129,50.77,"[125, 0, 4, 0]",AG,0.03,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6536,chr15,101769800,A,0,125,51.78,"[115, 0, 10, 0]",AG,0.08,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6537,chr15,101769801,A,0,126,52.05,"[122, 0, 4, 0]",AG,0.03,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,
6538,chr15,101769806,A,0,123,51.8,"[118, 0, 5, 0]",AG,0.04,chr15,...,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT2.WT.outTable_572868058.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [7]:
### Couple 6
# Pair analysed in this cell (per the file names below):
# HEK293T_WT3 (outTable_110067244) vs HEK293T_KO3 (outTable_597789462).
#
# inputs
# bonafide_fp: bonafide table carrying the REDItools alignment info (wt_*/ko_*/g* columns).
# bonafide_anno_filt_fp: annotated and filtered bonafide table (Region/Position/Class + RMSK/RefSeq columns).
#
# NOTE(review): join_bonafides() names its second parameter `bonafide_anno_filt`, but its body
# reads the notebook-level variable `bonafide_anno_filt_fp`. The call therefore only loads the
# right file because this global is (re)assigned immediately before it: do not rename the
# variable or reorder these statements unless the helper is fixed first.
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT3.WT.outTable_110067244.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv"
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_110067244_outTable_597789462_bona_fide_sites_2.tsv"

# Displays both input tables, the inner join and the sites dropped by it, writes the joined
# table to bonafide_fp + ".anno.filt_refseq.tsv", and returns the joined DataFrame.
couple6_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299431,A,1,229,49.54,"[226, 0, 3, 0]",AG,0.01,chrX,...,1,35,40.20,"[35, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299510,A,1,144,45.65,"[121, 0, 23, 0]",AG,0.16,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299568,A,1,137,52.31,"[132, 0, 5, 0]",AG,0.04,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299572,A,1,137,51.77,"[125, 0, 12, 0]",AG,0.09,chrX,...,1,40,42.10,"[40, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299574,A,1,133,51.55,"[130, 0, 3, 0]",AG,0.02,chrX,...,1,37,42.43,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20059,chr8,144425879,A,0,158,40.91,"[124, 0, 34, 0]",AG,0.22,chr8,...,0,39,41.62,"[28, 0, 11, 0]",AG,0.28,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20060,chr8,144440133,A,0,509,47.75,"[1, 0, 508, 0]",AG,1.00,chr8,...,0,59,40.75,"[0, 0, 59, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20061,chr8,144451628,A,0,220,49.34,"[169, 0, 51, 0]",AG,0.23,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20062,chr8,144468337,A,1,422,46.23,"[0, 0, 422, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,14610,0,-,-,"exon,transcript",WASH7P
1,chr1,926744,0,-,-,transcript,SAMD11
2,chr1,942909,1,-,-,"transcript,CDS,exon",SAMD11
3,chr1,953259,0,-,-,"transcript,CDS,exon",NOC2L
4,chr1,953279,0,-,-,"transcript,CDS,exon",NOC2L
...,...,...,...,...,...,...,...
20026,chrX,154531474,1,-,-,"3UTR,exon,transcript",G6PD
20027,chrX,154777540,1,-,-,"transcript,3UTR,exon",DKC1
20028,chrX,154790991,1,-,-,"transcript,CDS,exon",MPP1
20029,chrX,156023071,0,-,-,"exon,transcript",WASH6P


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,299431,A,1,229,49.54,"[226, 0, 3, 0]",AG,0.01,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299431,1,-,-,"transcript,exon,3UTR",PLCXD1
1,chrX,299510,A,1,144,45.65,"[121, 0, 23, 0]",AG,0.16,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299510,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
2,chrX,299568,A,1,137,52.31,"[132, 0, 5, 0]",AG,0.04,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299568,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
3,chrX,299572,A,1,137,51.77,"[125, 0, 12, 0]",AG,0.09,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299572,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
4,chrX,299574,A,1,133,51.55,"[130, 0, 3, 0]",AG,0.02,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chrX,299574,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20026,chr8,144425879,A,0,158,40.91,"[124, 0, 34, 0]",AG,0.22,chr8,...,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144425879,0,-,-,"transcript,5UTR,exon",VPS28
20027,chr8,144440133,A,0,509,47.75,"[1, 0, 508, 0]",AG,1.00,chr8,...,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144440133,0,-,-,"transcript,CDS,exon",TONSL
20028,chr8,144451628,A,0,220,49.34,"[169, 0, 51, 0]",AG,0.23,chr8,...,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144451628,0,-,-,transcript$transcript,CYHR1$LOC84773-CYHR1
20029,chr8,144468337,A,1,422,46.23,"[0, 0, 422, 0]",AG,1.00,chr8,...,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,chr8,144468337,0,-,-,"transcript,CDS,exon",KIFC2


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
562,chrX,101345647,A,0,126,46.9,"[123, 0, 3, 0]",AG,0.02,chrX,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
978,chr12,3045107,A,1,132,46.23,"[128, 0, 4, 0]",AG,0.03,chr12,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
984,chr12,6493736,A,0,619,52.09,"[613, 0, 6, 0]",AG,0.01,chr12,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
1236,chr12,56731213,A,0,535,46.75,"[532, 0, 3, 0]",AG,0.01,chr12,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
1973,chr11,33073863,A,1,64,51.23,"[59, 0, 5, 0]",AG,0.08,chr11,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
5595,chr15,101769773,A,0,75,41.95,"[69, 0, 6, 0]",AG,0.08,chr15,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
5596,chr15,101769778,A,0,80,42.4,"[75, 0, 5, 0]",AG,0.06,chr15,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
5597,chr15,101769787,A,0,84,44.49,"[74, 0, 10, 0]",AG,0.12,chr15,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
5598,chr15,101769809,A,0,89,44.63,"[77, 0, 12, 0]",AG,0.13,chr15,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,
5599,chr15,101772716,A,0,113,41.37,"[108, 0, 5, 0]",AG,0.04,chr15,...,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT3.WT.outTable_110067244.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv.anno.filt_refseq.tsv
