In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################
import os
import pandas as pd
import numpy as np


# define util function for downstream analysis
def join_bonafides(bonafide_fp, bonafide_anno_filt):
    """Join a bona fide sites table with its annotated/filtered counterpart.

    The two tab-separated tables are matched site by site: columns
    ("wt_Region", "wt_Position") of the first table against columns
    ("Region", "Position") of the second one. Both input tables, the common
    sites and the sites of the first table that have no match in the second
    one are shown along the way.

    Parameters
    ----------
    bonafide_fp : str
        Path of the bona fide table carrying the REDItools alignment
        profiles/data. It is also the prefix of the output file.
    bonafide_anno_filt : str
        Path of the annotated and filtered table (per the notebook's notes:
        positives NON-REP being in Refseq).

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables. Columns present in both tables with the
        same name get the default pandas suffixes ("_x" / "_y").

    Side effects
    ------------
    Writes the joined table, tab-separated and without the index, to
    ``bonafide_fp + ".anno.filt_refseq.tsv"``.
    """
    # `display` is only injected as a builtin inside IPython/Jupyter kernels:
    # import it explicitly and fall back to print when IPython is unavailable.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    left_keys = ["wt_Region", "wt_Position"]
    right_keys = ["Region", "Position"]

    # load bonafide with reditools alignment profiles/data
    bonafide = pd.read_table(bonafide_fp)
    print("Bonafide not-filted with REDItools info:")
    display(bonafide)

    # load the annotated/filtered table from the path received as ARGUMENT
    # (previously a global named `bonafide_anno_filt_fp` was read here, so the
    # parameter was ignored and the function failed without that global)
    annotated = pd.read_table(bonafide_anno_filt)
    print("Bonafide annotated and filtered (NON-REP only Refseq):")
    display(annotated)

    # inner join between bonafide with and without annotation/filtering
    joined = pd.merge(bonafide, annotated,
                      left_on=left_keys, right_on=right_keys, how="inner")
    print("Common sites annotated with REDItools info:")
    display(joined)

    # sites without a match in the annotated table: the merge indicator marks
    # them as "left_only", which does not depend on any annotation column
    # (e.g. "Class") being present or non-null
    left_join = pd.merge(bonafide, annotated,
                         left_on=left_keys, right_on=right_keys,
                         how="left", indicator=True)
    discarded = left_join[left_join["_merge"] == "left_only"].drop(columns="_merge")
    print("Discared sites (via left join):")
    display(discarded)

    output_file = bonafide_fp + ".anno.filt_refseq.tsv"
    print("Save on disk joined with annotation and filtering to :", output_file)
    joined.to_csv(output_file, index=False, sep="\t")

    return joined

## HEK public data

In [2]:
### Couple 1: wild-type SRR5564274 vs ADAR1 KO SRR5564272
# inputs
# bona fide sites table for this wild-type / ADAR1 KO pair
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564274.wildtype.outTable_724242056.gz_vs_SRR5564272.ADAR1_KO.outTable_816573740.gz.bonafide_final.tsv"
# annotated and filtered bona fide sites table for the same pair of outTables
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_724242056_outTable_816573740_bona_fide_sites_2.tsv"

# join the two tables on (region, position); the joined table is returned and
# also saved to bonafide_fp + ".anno.filt_refseq.tsv"
couple1_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,1,33,40.64,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
2,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
3,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,1,38,40.82,"[38, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
4,chrX,218542,A,1,32,36.38,"[30, 0, 2, 0]",AG,0.06,chrX,...,1,33,39.82,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,chr8,124027628,A,0,103,35.65,"[0, 0, 103, 0]",AG,1.00,chr8,...,0,53,40.45,"[0, 0, 53, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7020,chr8,124031541,A,0,130,37.04,"[53, 0, 77, 0]",AG,0.59,chr8,...,0,40,39.83,"[16, 0, 24, 0]",AG,0.6,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7021,chr8,132982824,A,1,33,35.03,"[0, 0, 33, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7022,chr8,145066886,A,1,83,36.94,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,16996,0,-,-,"transcript,exon",WASH7P
1,chr1,881906,1,-,-,"transcript,CDS,exon",NOC2L
2,chr1,888639,0,-,-,"transcript,CDS,exon",NOC2L
3,chr1,888659,0,-,-,"transcript,CDS,exon",NOC2L
4,chr1,1247494,0,-,-,"transcript,CDS,exon",INTS11
...,...,...,...,...,...,...,...
7007,chrX,153296575,1,-,-,"exon,transcript,CDS",MECP2
7008,chrX,153583369,1,-,-,"transcript,CDS,exon",FLNA
7009,chrX,153633953,0,-,-,"transcript,exon,5UTR",DNASE1L1
7010,chrX,153712377,1,-,-,"3UTR,exon,transcript",UBL4A


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chrX,217844,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chrX,217852,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
2,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chrX,217930,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
3,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chrX,217946,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
4,chrX,218542,A,1,32,36.38,"[30, 0, 2, 0]",AG,0.06,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chrX,218542,1,SINE,AluSx1,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7007,chr8,124027628,A,0,103,35.65,"[0, 0, 103, 0]",AG,1.00,chr8,...,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chr8,124027628,0,-,-,"3UTR,exon,transcript",DERL1
7008,chr8,124031541,A,0,130,37.04,"[53, 0, 77, 0]",AG,0.59,chr8,...,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chr8,124031541,0,-,-,"transcript,CDS,exon",DERL1
7009,chr8,132982824,A,1,33,35.03,"[0, 0, 33, 0]",AG,1.00,chr8,...,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chr8,132982824,0,-,-,"transcript,CDS,exon",EFR3A
7010,chr8,145066886,A,1,83,36.94,"[0, 0, 83, 0]",AG,1.00,chr8,...,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,chr8,145066886,0,-,-,"transcript,CDS,exon",GRINA


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
106,chrX,100098022,A,1,50,35.6,"[48, 0, 2, 0]",AG,0.04,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
114,chrX,100600245,A,0,31,34.1,"[29, 0, 2, 0]",AG,0.06,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
125,chrX,106364215,A,0,34,36.65,"[32, 0, 2, 0]",AG,0.06,chrX,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
235,chr13,45907280,A,0,37,36.0,"[35, 0, 2, 0]",AG,0.05,chr13,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
265,chr13,88337706,A,1,42,36.38,"[40, 0, 2, 0]",AG,0.05,chr13,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
1083,chr17,56070347,A,0,30,41.6,"[28, 0, 2, 0]",AG,0.07,chr17,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
1147,chr17,79844502,A,0,54,36.3,"[52, 0, 2, 0]",AG,0.04,chr17,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
1438,chr15,102310012,A,0,32,35.62,"[25, 0, 7, 0]",AG,0.22,chr15,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
2224,chr22,29899471,A,0,45,35.29,"[43, 0, 2, 0]",AG,0.04,chr22,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,
2467,chr7,50509149,A,0,39,35.69,"[37, 0, 2, 0]",AG,0.05,chr7,...,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564274.wildtype.outTable_724242056.gz_vs_SRR5564272.ADAR1_KO.outTable_816573740.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [3]:
### Couple 2: wild-type SRR5564275 vs ADAR1 KO SRR5564273
# inputs
# bona fide sites table for this wild-type / ADAR1 KO pair
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564275.wildtype.outTable_580067564.gz_vs_SRR5564273.ADAR1_KO.outTable_718392497.gz.bonafide_final.tsv"
# annotated and filtered bona fide sites table for the same pair of outTables
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_580067564_outTable_718392497_bona_fide_sites_2.tsv"

# join the two tables on (region, position); the joined table is returned and
# also saved to bonafide_fp + ".anno.filt_refseq.tsv"
couple2_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217741,A,1,62,38.73,"[58, 0, 4, 0]",AG,0.06,chrX,...,1,31,40.71,"[31, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
1,chrX,217778,A,1,55,39.98,"[53, 0, 2, 0]",AG,0.04,chrX,...,1,28,40.71,"[28, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
2,chrX,217844,A,1,85,38.59,"[81, 0, 4, 0]",AG,0.05,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
3,chrX,217875,A,1,79,36.59,"[75, 0, 4, 0]",AG,0.05,chrX,...,1,32,40.25,"[32, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
4,chrX,217930,A,1,81,37.02,"[79, 0, 2, 0]",AG,0.02,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,chr8,132982824,A,1,60,35.33,"[0, 0, 60, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8511,chr8,144732418,A,1,83,35.73,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,35,39.74,"[0, 0, 35, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8512,chr8,145066886,A,1,284,36.31,"[0, 0, 284, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8513,chr8,145150832,A,1,361,37.37,"[0, 0, 361, 0]",AG,1.00,chr8,...,1,33,40.00,"[0, 0, 33, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,20250,0,LINE,L3,transcript,WASH7P
1,chr1,879687,0,-,-,"transcript,exon,3UTR&3UTR,exon,transcript",SAMD11&NOC2L
2,chr1,888639,0,-,-,"transcript,CDS,exon",NOC2L
3,chr1,888659,0,-,-,"transcript,CDS,exon",NOC2L
4,chr1,892600,1,-,-,"transcript,CDS,exon",NOC2L
...,...,...,...,...,...,...,...
8500,chrX,153276787,1,SINE,AluJb,"3UTR,exon,transcript",IRAK1
8501,chrX,153279606,1,-,-,"transcript,CDS,exon",IRAK1
8502,chrX,153593040,1,-,-,"transcript,CDS,exon",FLNA
8503,chrX,153633953,0,-,-,"transcript,exon,5UTR",DNASE1L1


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,217741,A,1,62,38.73,"[58, 0, 4, 0]",AG,0.06,chrX,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chrX,217741,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
1,chrX,217778,A,1,55,39.98,"[53, 0, 2, 0]",AG,0.04,chrX,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chrX,217778,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
2,chrX,217844,A,1,85,38.59,"[81, 0, 4, 0]",AG,0.05,chrX,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chrX,217844,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
3,chrX,217875,A,1,79,36.59,"[75, 0, 4, 0]",AG,0.05,chrX,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chrX,217875,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
4,chrX,217930,A,1,81,37.02,"[79, 0, 2, 0]",AG,0.02,chrX,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chrX,217930,1,SINE,AluJb,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8500,chr8,132982824,A,1,60,35.33,"[0, 0, 60, 0]",AG,1.00,chr8,...,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chr8,132982824,0,-,-,"transcript,CDS,exon",EFR3A
8501,chr8,144732418,A,1,83,35.73,"[0, 0, 83, 0]",AG,1.00,chr8,...,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chr8,144732418,0,-,-,"transcript,exon,CDS",ZNF623
8502,chr8,145066886,A,1,284,36.31,"[0, 0, 284, 0]",AG,1.00,chr8,...,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chr8,145066886,0,-,-,"transcript,CDS,exon",GRINA
8503,chr8,145150832,A,1,361,37.37,"[0, 0, 361, 0]",AG,1.00,chr8,...,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,chr8,145150832,0,-,-,"transcript,CDS,exon",CYC1


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
284,chr13,45907280,A,0,46,36.78,"[44, 0, 2, 0]",AG,0.04,chr13,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
859,chr11,93915583,A,1,54,36.41,"[52, 0, 2, 0]",AG,0.04,chr11,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
1274,chr17,19015652,A,0,337,36.82,"[335, 0, 2, 0]",AG,0.01,chr17,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
1422,chr17,56076332,A,0,66,36.12,"[64, 0, 2, 0]",AG,0.03,chr17,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
1782,chr15,64653384,A,0,32,35.03,"[30, 0, 2, 0]",AG,0.06,chr15,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
1877,chr14,21675776,A,0,45,37.69,"[43, 0, 2, 0]",AG,0.04,chr14,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
4410,chr3,73026837,A,1,103,36.42,"[101, 0, 2, 0]",AG,0.02,chr3,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
5308,chr1,62918927,A,1,64,37.81,"[62, 0, 2, 0]",AG,0.03,chr1,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
5397,chr1,110899147,A,1,97,35.91,"[95, 0, 2, 0]",AG,0.02,chr1,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,
5423,chr1,111981770,A,0,36,35.11,"[34, 0, 2, 0]",AG,0.06,chr1,...,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564275.wildtype.outTable_580067564.gz_vs_SRR5564273.ADAR1_KO.outTable_718392497.gz.bonafide_final.tsv.anno.filt_refseq.tsv


In [4]:
### Couple 3: wild-type SRR5564276 vs ADAR1 KO SRR5564268
# inputs
# bona fide sites table for this wild-type / ADAR1 KO pair
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564276.wildtype.outTable_181728208.gz_vs_SRR5564268.ADAR1_KO.outTable_854894021.gz.bonafide_final.tsv"
# annotated and filtered bona fide sites table for the same pair of outTables
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_181728208_outTable_854894021_bona_fide_sites_2.tsv"

# join the two tables on (region, position); the joined table is returned and
# also saved to bonafide_fp + ".anno.filt_refseq.tsv"
couple3_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,205483,A,1,180,36.29,"[178, 0, 2, 0]",AG,0.01,chrX,...,1,34,40.85,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
1,chrX,215935,A,1,68,38.91,"[66, 0, 2, 0]",AG,0.03,chrX,...,1,30,39.40,"[30, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
2,chrX,216802,A,1,51,37.86,"[49, 0, 2, 0]",AG,0.04,chrX,...,1,34,39.32,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
3,chrX,216807,A,1,53,37.62,"[50, 0, 3, 0]",AG,0.06,chrX,...,1,31,39.45,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
4,chrX,217393,A,1,43,35.02,"[26, 0, 17, 0]",AG,0.40,chrX,...,1,31,39.61,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11064,chr21,47635176,A,0,134,36.46,"[74, 0, 60, 0]",AG,0.45,chr21,...,0,34,38.62,"[26, 0, 8, 0]",AG,0.24,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11065,chr21,47705237,A,0,139,36.01,"[31, 0, 108, 0]",AG,0.78,chr21,...,0,46,40.98,"[21, 0, 25, 0]",AG,0.54,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11066,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,1,39,40.38,"[29, 0, 10, 0]",AG,0.26,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11067,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,1,46,41.33,"[0, 0, 46, 0]",AG,1.00,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,16996,0,-,-,"transcript,exon",WASH7P
1,chr1,694459,0,-,-,"transcript,exon",LOC100288069
2,chr1,879874,1,-,-,"transcript,exon,3UTR&3UTR,exon,transcript",SAMD11&NOC2L
3,chr1,888639,0,-,-,"transcript,CDS,exon",NOC2L
4,chr1,888659,0,-,-,"transcript,CDS,exon",NOC2L
...,...,...,...,...,...,...,...
11034,chrX,153633953,0,-,-,"transcript,exon,5UTR",DNASE1L1
11035,chrX,153676847,1,-,-,"transcript,CDS,exon",FAM50A
11036,chrX,153676853,1,-,-,"transcript,CDS,exon",FAM50A
11037,chrX,153676854,1,-,-,"transcript,CDS,exon",FAM50A


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,205483,A,1,180,36.29,"[178, 0, 2, 0]",AG,0.01,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chrX,205483,1,-,-,"transcript,CDS,exon",PLCXD1
1,chrX,215935,A,1,68,38.91,"[66, 0, 2, 0]",AG,0.03,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chrX,215935,1,-,-,"transcript,CDS,exon",PLCXD1
2,chrX,216802,A,1,51,37.86,"[49, 0, 2, 0]",AG,0.04,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chrX,216802,1,SINE,AluJr,"transcript,exon,3UTR",PLCXD1
3,chrX,216807,A,1,53,37.62,"[50, 0, 3, 0]",AG,0.06,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chrX,216807,1,SINE,AluJr,"transcript,exon,3UTR",PLCXD1
4,chrX,217393,A,1,43,35.02,"[26, 0, 17, 0]",AG,0.40,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chrX,217393,1,SINE,AluSx,"transcript,exon,3UTR",PLCXD1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11034,chr21,47635176,A,0,134,36.46,"[74, 0, 60, 0]",AG,0.45,chr21,...,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chr21,47635176,0,-,-,"transcript,CDS,exon",LSS
11035,chr21,47705237,A,0,139,36.01,"[31, 0, 108, 0]",AG,0.78,chr21,...,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chr21,47705237,0,-,-,"transcript,exon,5UTR",MCM3AP
11036,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chr21,47851753,0,-,-,"transcript,CDS,exon",PCNT
11037,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,chr21,47862624,0,-,-,transcript,PCNT


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
56,chrX,24072978,A,1,162,38.59,"[160, 0, 2, 0]",AG,0.01,chrX,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
418,chr13,45907280,A,0,43,35.12,"[40, 0, 3, 0]",AG,0.07,chr13,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
553,chr12,49324564,A,0,47,37.02,"[45, 0, 2, 0]",AG,0.04,chr12,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
1199,chr11,93915353,A,1,39,34.97,"[36, 0, 3, 0]",AG,0.08,chr11,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
1393,chr10,60159455,A,1,45,34.93,"[43, 0, 2, 0]",AG,0.04,chr10,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
1955,chr17,56071470,A,0,46,34.35,"[44, 0, 2, 0]",AG,0.04,chr17,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
1956,chr17,56075924,A,0,87,36.76,"[85, 0, 2, 0]",AG,0.02,chr17,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
2044,chr17,79836854,A,0,62,35.9,"[60, 0, 2, 0]",AG,0.03,chr17,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
2511,chr15,64651474,A,0,38,36.95,"[36, 0, 2, 0]",AG,0.05,chr15,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,
2517,chr15,64652749,A,0,30,35.0,"[27, 0, 3, 0]",AG,0.1,chr15,...,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564276.wildtype.outTable_181728208.gz_vs_SRR5564268.ADAR1_KO.outTable_854894021.gz.bonafide_final.tsv.anno.filt_refseq.tsv
