In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################
import os
import pandas as pd
import numpy as np


# define util function for downstream analysis
def join_bonafides(bonafide_fp, bonafide_anno_filt):
    """Join the bona fide table carrying REDItools info with its annotated/filtered version.

    Both inputs are tab-separated files. The first is expected to expose the
    columns ``wt_Region`` and ``wt_Position``; the second (annotated table,
    filtered for NON-REP sites falling in RefSeq) is expected to expose
    ``Region`` and ``Position``. Columns sharing a name in both tables get
    the pandas default ``_x`` / ``_y`` suffixes in the result.

    Parameters
    ----------
    bonafide_fp : str
        Path of the bona fide table with REDItools alignment profiles/data.
        It is also used as the prefix of the output file name.
    bonafide_anno_filt : str
        Path of the annotated and filtered bona fide table.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables on (region, position).

    Side effects
    ------------
    Shows both inputs, the joined table and the discarded sites through the
    notebook's ``display``; writes the joined table to
    ``bonafide_fp + ".anno.filt_refseq.tsv"`` (tab-separated, no index).
    """
    left_keys = ["wt_Region", "wt_Position"]
    right_keys = ["Region", "Position"]

    # load bonafide with reditools alignment profiles/data
    bonafide = pd.read_table(bonafide_fp)
    print("Bonafide not-filted with REDItools info:")
    display(bonafide)

    # load annotated table filtered for positives NON-REP being in Refseq.
    # The path comes from the function argument: reading a notebook-level
    # variable here would make the result depend on hidden kernel state.
    bonafide_anno = pd.read_table(bonafide_anno_filt)
    print("Bonafide annotated and filtered (NON-REP only Refseq):")
    display(bonafide_anno)

    # inner join between bonafide with and without annotation/filtering
    joined = pd.merge(bonafide, bonafide_anno,
                      left_on=left_keys, right_on=right_keys, how="inner")
    print("Common sites annotated with REDItools info:")
    display(joined)

    # print not common sites (not Refseq). The merge indicator marks rows that
    # exist only in the left table, so detection does not depend on any
    # particular annotation column being present or non-null.
    left_join = pd.merge(bonafide, bonafide_anno,
                         left_on=left_keys, right_on=right_keys, how="left",
                         indicator="_join_side")
    discarded = left_join[left_join["_join_side"] == "left_only"]
    print("Discared sites (via left join):")
    display(discarded.drop(columns="_join_side"))

    output_file = bonafide_fp + ".anno.filt_refseq.tsv"
    print("Save on disk joined with annotation and filtering to :", output_file)
    joined.to_csv(output_file, index=False, sep="\t")

    return joined

## U87

In [2]:
### Couple 1
# Inputs for the first WT-vs-KO pair of the U87 dataset
# (WT: SRR388226_SRR388227, KO: SRR388228_SRR388229, as encoded in the file name).
# NOTE(review): both paths are absolute locations on the original cluster;
# they must be adapted to re-run this cell elsewhere.
# Bona fide table carrying the REDItools info for this pair.
bonafide_fp = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/u87/SRR388226_SRR388227.WT.outTable_853538513.gz_vs_SRR388228_SRR388229.KO.outTable_921089530.gz.bonafide_final.tsv"
# Annotated bona fide sites table for the same pair of outTables
# (presumably already filtered to NON-REP RefSeq sites -- confirm with the file's producer).
bonafide_anno_filt_fp = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/U87/outTable_853538513_outTable_921089530_bona_fide_sites_2.tsv"

# Join the two tables; join_bonafides also writes the joined table next to
# bonafide_fp with the ".anno.filt_refseq.tsv" suffix and returns it.
couple1_join = join_bonafides(bonafide_fp, bonafide_anno_filt_fp)

Bonafide not-filted with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,16754311,A,1,146,37.49,"[144, 0, 2, 0]",AG,0.01,chrX,...,1,18,39.17,"[18, 0, 0, 0]",-,0.00,Editing,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
1,chrX,19368150,A,1,144,37.78,"[142, 0, 2, 0]",AG,0.01,chrX,...,1,17,35.59,"[17, 0, 0, 0]",-,0.00,Editing,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
2,chrX,19555884,A,0,187,37.07,"[185, 0, 2, 0]",AG,0.01,chrX,...,0,15,37.47,"[15, 0, 0, 0]",-,0.00,Editing,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
3,chrX,19610250,A,0,294,37.65,"[292, 0, 2, 0]",AG,0.01,chrX,...,0,23,35.48,"[23, 0, 0, 0]",-,0.00,Editing,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
4,chrX,23740019,A,0,101,37.06,"[99, 0, 2, 0]",AG,0.02,chrX,...,0,20,36.75,"[20, 0, 0, 0]",-,0.00,Editing,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7302,chr8,134470631,A,0,33,43.21,"[17, 0, 16, 0]",AG,0.48,chr8,...,0,45,35.93,"[27, 0, 18, 0]",AG,0.40,Not-Editing,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
7303,chr8,144991176,A,0,325,38.59,"[141, 0, 184, 0]",AG,0.57,chr8,...,0,41,31.78,"[16, 0, 25, 0]",AG,0.61,Not-Editing,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
7304,chr8,144992103,A,0,352,38.34,"[149, 0, 203, 0]",AG,0.58,chr8,...,0,33,35.52,"[18, 0, 15, 0]",AG,0.45,Not-Editing,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz
7305,chr8,145150832,A,1,215,39.82,"[0, 0, 215, 0]",AG,1.00,chr8,...,1,30,34.30,"[0, 0, 30, 0]",AG,1.00,Not-Editing,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz


Bonafide annotated and filtered (NON-REP only Refseq):


Unnamed: 0,Region,Position,Class,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chr1,880170,1,-,-,"exon,transcript,CDS",NOC2L
1,chr1,880449,1,-,-,"transcript,CDS,exon",NOC2L
2,chr1,888639,0,-,-,"transcript,CDS,exon",NOC2L
3,chr1,888659,0,-,-,"transcript,CDS,exon",NOC2L
4,chr1,949654,0,-,-,"transcript,CDS,exon",ISG15
...,...,...,...,...,...,...,...
7298,chrX,153713859,1,-,-,"3UTR,exon,transcript",UBL4A
7299,chrX,153759848,1,-,-,"3UTR,exon,transcript",G6PD
7300,chrX,154456747,0,-,-,"transcript,CDS,exon",VBP1
7301,chrY,21152719,0,-,-,"transcript$exon,transcript",TTTY14$CD24P4


Common sites annotated with REDItools info:


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
0,chrX,16754311,A,1,146,37.49,"[144, 0, 2, 0]",AG,0.01,chrX,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chrX,16754311,1,-,-,"transcript,CDS,exon",SYAP1
1,chrX,19368150,A,1,144,37.78,"[142, 0, 2, 0]",AG,0.01,chrX,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chrX,19368150,1,-,-,"transcript,CDS,exon",PDHA1
2,chrX,19555884,A,0,187,37.07,"[185, 0, 2, 0]",AG,0.01,chrX,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chrX,19555884,1,-,-,"transcript,CDS,exon",SH3KBP1
3,chrX,19610250,A,0,294,37.65,"[292, 0, 2, 0]",AG,0.01,chrX,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chrX,19610250,1,-,-,"transcript,CDS,exon",SH3KBP1
4,chrX,23740019,A,0,101,37.06,"[99, 0, 2, 0]",AG,0.02,chrX,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chrX,23740019,1,-,-,"transcript,CDS,exon",ACOT9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7298,chr8,134470631,A,0,33,43.21,"[17, 0, 16, 0]",AG,0.48,chr8,...,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chr8,134470631,0,-,-,"3UTR,exon,transcript",ST3GAL1
7299,chr8,144991176,A,0,325,38.59,"[141, 0, 184, 0]",AG,0.57,chr8,...,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chr8,144991176,0,-,-,"exon,transcript,CDS",PLEC
7300,chr8,144992103,A,0,352,38.34,"[149, 0, 203, 0]",AG,0.58,chr8,...,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chr8,144992103,0,-,-,"exon,transcript,CDS",PLEC
7301,chr8,145150832,A,1,215,39.82,"[0, 0, 215, 0]",AG,1.00,chr8,...,0,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,chr8,145150832,0,-,-,"transcript,CDS,exon",CYC1


Discared sites (via left join):


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,Class_binary,wt_sample,ko_sample,Region,Position,Class_y,RMSK-Rep,RMSK-Reg,RefSeq-Rep,RefSeq-Reg
845,chr11,66036107,A,1,96,37.56,"[94, 0, 2, 0]",AG,0.02,chr11,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,,,,,,,
1920,chr14,23305769,A,1,86,39.7,"[81, 0, 5, 0]",AG,0.06,chr14,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,,,,,,,
1921,chr14,23305771,A,1,90,40.01,"[83, 0, 7, 0]",AG,0.08,chr14,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,,,,,,,
4841,chr3,98700171,A,1,105,40.55,"[103, 0, 2, 0]",AG,0.02,chr3,...,1,SRR388226_SRR388227.WT.outTable_853538513.gz,SRR388228_SRR388229.KO.outTable_921089530.gz,,,,,,,


Save on disk joined with annotation and filtering to : /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/u87/SRR388226_SRR388227.WT.outTable_853538513.gz_vs_SRR388228_SRR388229.KO.outTable_921089530.gz.bonafide_final.tsv.anno.filt_refseq.tsv
