In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################

import pandas as pd
import numpy as np
import pysam, gzip, os
from datetime import datetime

In [2]:
# define util function
def extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp):
    wt = pysam.TabixFile(wt_fp)
    ko = pysam.TabixFile(ko_fp)
    wgs = pysam.TabixFile(wgs_fp)
    bonafide = []
    start_time = datetime.now()
    with gzip.open(wt_fp) as wt_table:
        for c,s in enumerate(wt_table):
            site = s.decode("utf-8").rstrip().split("\t")
            if c % 50000000 == 0:
                print("Evaluated sites:", c)
            if site[0] == "Region":
                # store header
                header = site[:9]
            else:
                if site[0].startswith("chr"):
                    if site[0] != "chrM":
                        if site[2] == "A":
                            if site[4] != "-":
                                if int(site[4]) >= min_cov:
                                    if site[7] == "AG" and float(site[8]) >= 0.01:
                                        vector = eval(site[6])
                                        if vector[2] >= min_ag_subs:
                                            ko_query = [i for i in ko.fetch(site[0], int(site[1])-1, int(site[1]))]
                                            if len(ko_query) == 1:
                                                ko_query = ko_query[0].split("\t")
                                                if ko_query[4] != "-":
                                                    if int(ko_query[4]) >= min_cov:
                                                        wgs_query = [i for i in wgs.fetch(site[0], int(site[1])-1, int(site[1]))]
                                                        if len(wgs_query) == 1:
                                                            wgs_query = wgs_query[0].split("\t")
                                                            if wgs_query[9] != "-":
                                                                if int(wgs_query[9]) >= min_cov_wgs:
                                                                    whole_site = site[:9] + ko_query[:9] + wgs_query[0:4] + wgs_query[9:]
                                                                    bonafide.append(whole_site)
    print("Iteration on tabix outTables finished. Elapsed time: ", datetime.now()-start_time)
    columns=["wt_"+i for i in header]+["ko_"+i for i in header]+["g"+i for i in header]
    bonafide = pd.DataFrame(bonafide, columns=columns)
    
    # save to disk bonafide candidates
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_candidates.tsv")
    print("Save to disk bonafide candidates:", output_file)
    bonafide.to_csv(output_file, sep="\t", index=None)
    
    # load from disk to infer dtypes
    bonafide = pd.read_table(output_file)
    
    # drop unstranded
    bonafide = bonafide[(bonafide["wt_Strand"]!=2)&(bonafide["ko_Strand"]!=2)&(bonafide["gStrand"]!=2)].copy()
    
    # select concordand for strand for wt and ko
    # select only strand concordand sites
    bonafide = bonafide[bonafide["wt_Strand"] == bonafide["ko_Strand"]].copy()
    bonafide = bonafide[bonafide["ko_Strand"] == bonafide["gStrand"]].copy()
    bonafide.reset_index(inplace=True, drop=True)
    display(bonafide)
    pos = bonafide[(bonafide["wt_AllSubs"]=="AG")&(bonafide["ko_AllSubs"]=="-")&(bonafide["gAllSubs"]=="-")].copy()
    pos["Class"] = "Editing"
    pos["Class_binary"] = 1
    print("Pos:", pos.shape)
    neg = bonafide[(bonafide["wt_AllSubs"]=="AG")&(bonafide["ko_AllSubs"]=="AG")&(bonafide["gAllSubs"]=="AG")].copy()
    # select negs with 0.01 AG freq and min_ag_subs
    neg = neg.query("ko_Frequency >= 0.01")
    mask = []
    for cand in neg["ko_BaseCount[A,C,G,T]"]:
        vector = eval(cand)
        if vector[2] >= min_ag_subs:
            mask.append(True)
        else:
            mask.append(False)
    neg = neg[mask].copy()
    neg["Class"] = "Not-Editing"
    neg["Class_binary"] = 0
    print("Neg:", neg.shape)
    # merge pos and neg and add samples id
    bonafide_final = pd.concat([pos, neg])
    bonafide_final.reset_index(inplace=True, drop=True)
    bonafide_final["wt_sample"] = wt_name
    bonafide_final["ko_sample"] = ko_name
    
    display(bonafide_final)
    print(bonafide_final[["Class", "Class_binary"]].value_counts())
    
    # save to disk bonafide final
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_final.tsv")
    print("Save to disk bonafide final:", output_file)
    bonafide_final.to_csv(output_file, sep="\t", index=None)
    
    wt.close()
    ko.close()
    wgs.close()
    return bonafide_final

In [3]:
# define output folder
output_folder = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549"

In [4]:
# define wgs tabix outTable file path
wgs_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR8639173.wgs/DnaRna_355499054/outTable_355499054.gz"

# define common filters
min_cov = 50
min_cov_wgs = 10
min_ag_subs = 3

In [5]:
# couple 1
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492043.SRR12492044.control/DnaRna_773331943/outTable_773331943.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492025.SRR12492026.siADARs/DnaRna_302610513/outTable_302610513.gz"
wt_name = "SRR12492043.SRR12492044.control" + f".{os.path.basename(wt_fp)}"
ko_name = "SRR12492025.SRR12492026.siADARs" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple1 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple1

SRR12492043.SRR12492044.control.outTable_773331943.gz
SRR12492025.SRR12492026.siADARs.outTable_302610513.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Iteration on tabix outTables finished. Elapsed time:  0:06:06.399067
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492043.SRR12492044.control.outTable_773331943.gz_vs_SRR12492025.SRR12492026.siADARs.outTable_302610513.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrY,21152719,A,0,2348,54.76,"[1, 0, 2347, 0]",AG,1.00,chrY,...,1.00,chrY,21152719,A,0,20,37.55,"[0, 0, 20, 0]",AG,1.00
1,chrX,216177,A,1,193,51.64,"[189, 0, 4, 0]",AG,0.02,chrX,...,0.00,chrX,216177,A,1,47,36.77,"[47, 0, 0, 0]",-,0.00
2,chrX,216207,A,1,188,52.04,"[184, 0, 4, 0]",AG,0.02,chrX,...,0.00,chrX,216207,A,1,45,36.64,"[45, 0, 0, 0]",-,0.00
3,chrX,216215,A,1,189,52.53,"[118, 0, 71, 0]",AG,0.38,chrX,...,0.05,chrX,216215,A,1,47,36.57,"[47, 0, 0, 0]",-,0.00
4,chrX,216235,A,1,188,51.60,"[185, 0, 3, 0]",AG,0.02,chrX,...,0.00,chrX,216235,A,1,31,37.39,"[31, 0, 0, 0]",-,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13629,chr8,145740402,A,0,165,47.93,"[162, 0, 3, 0]",AG,0.02,chr8,...,0.00,chr8,145740402,A,0,100,35.81,"[100, 0, 0, 0]",-,0.00
13630,chr8,145740405,A,0,166,47.50,"[163, 0, 3, 0]",AG,0.02,chr8,...,0.00,chr8,145740405,A,0,84,35.70,"[84, 0, 0, 0]",-,0.00
13631,chr8,145742879,A,0,99,45.47,"[60, 0, 39, 0]",AG,0.39,chr8,...,0.51,chr8,145742879,A,0,51,35.49,"[29, 0, 22, 0]",AG,0.43
13632,chr8,145753130,A,0,468,48.96,"[464, 0, 4, 0]",AG,0.01,chr8,...,0.00,chr8,145753130,A,0,63,35.65,"[63, 0, 0, 0]",-,0.00


Pos: (6369, 29)
Neg: (2464, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216177,A,1,193,51.64,"[189, 0, 4, 0]",AG,0.02,chrX,...,1,47,36.77,"[47, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
1,chrX,216207,A,1,188,52.04,"[184, 0, 4, 0]",AG,0.02,chrX,...,1,45,36.64,"[45, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
2,chrX,216235,A,1,188,51.60,"[185, 0, 3, 0]",AG,0.02,chrX,...,1,31,37.39,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
3,chrX,216241,A,1,198,52.23,"[190, 0, 8, 0]",AG,0.04,chrX,...,1,35,37.29,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
4,chrX,216248,A,1,213,51.35,"[204, 0, 9, 0]",AG,0.04,chrX,...,1,41,36.59,"[41, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8828,chr8,145066886,A,1,2337,51.77,"[0, 0, 2337, 0]",AG,1.00,chr8,...,1,106,36.79,"[0, 0, 106, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8829,chr8,145665516,A,0,71,47.69,"[1, 0, 70, 0]",AG,0.99,chr8,...,0,116,36.78,"[0, 0, 116, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8830,chr8,145677011,A,0,126,47.74,"[0, 0, 126, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8831,chr8,145693720,A,1,160,54.27,"[0, 0, 160, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...


Class        Class_binary
Editing      1               6369
Not-Editing  0               2464
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492043.SRR12492044.control.outTable_773331943.gz_vs_SRR12492025.SRR12492026.siADARs.outTable_302610513.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216177,A,1,193,51.64,"[189, 0, 4, 0]",AG,0.02,chrX,...,1,47,36.77,"[47, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
1,chrX,216207,A,1,188,52.04,"[184, 0, 4, 0]",AG,0.02,chrX,...,1,45,36.64,"[45, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
2,chrX,216235,A,1,188,51.60,"[185, 0, 3, 0]",AG,0.02,chrX,...,1,31,37.39,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
3,chrX,216241,A,1,198,52.23,"[190, 0, 8, 0]",AG,0.04,chrX,...,1,35,37.29,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
4,chrX,216248,A,1,213,51.35,"[204, 0, 9, 0]",AG,0.04,chrX,...,1,41,36.59,"[41, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8828,chr8,145066886,A,1,2337,51.77,"[0, 0, 2337, 0]",AG,1.00,chr8,...,1,106,36.79,"[0, 0, 106, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8829,chr8,145665516,A,0,71,47.69,"[1, 0, 70, 0]",AG,0.99,chr8,...,0,116,36.78,"[0, 0, 116, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8830,chr8,145677011,A,0,126,47.74,"[0, 0, 126, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
8831,chr8,145693720,A,1,160,54.27,"[0, 0, 160, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...


In [6]:
# couple 2
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492045.SRR12492046.control/DnaRna_535670354/outTable_535670354.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492027.SRR12492028.siADARs/DnaRna_396704193/outTable_396704193.gz"
wt_name = "SRR12492045.SRR12492046.control" + f".{os.path.basename(wt_fp)}"
ko_name = "SRR12492027.SRR12492028.siADARs" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple2 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple2

SRR12492045.SRR12492046.control.outTable_535670354.gz
SRR12492027.SRR12492028.siADARs.outTable_396704193.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Iteration on tabix outTables finished. Elapsed time:  0:06:08.248317
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492045.SRR12492046.control.outTable_535670354.gz_vs_SRR12492027.SRR12492028.siADARs.outTable_396704193.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrY,21152719,A,0,2539,54.76,"[3, 0, 2536, 0]",AG,1.00,chrY,...,1.00,chrY,21152719,A,0,20,37.55,"[0, 0, 20, 0]",AG,1.00
1,chrX,216177,A,1,194,51.60,"[188, 0, 6, 0]",AG,0.03,chrX,...,0.00,chrX,216177,A,1,47,36.77,"[47, 0, 0, 0]",-,0.00
2,chrX,216215,A,1,212,51.56,"[137, 0, 75, 0]",AG,0.35,chrX,...,0.08,chrX,216215,A,1,47,36.57,"[47, 0, 0, 0]",-,0.00
3,chrX,216226,A,1,232,51.86,"[225, 0, 7, 0]",AG,0.03,chrX,...,0.00,chrX,216226,A,1,41,36.37,"[41, 0, 0, 0]",-,0.00
4,chrX,216238,A,1,231,52.09,"[228, 0, 3, 0]",AG,0.01,chrX,...,0.00,chrX,216238,A,1,32,38.22,"[32, 0, 0, 0]",-,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14308,chr8,145736752,A,0,117,50.01,"[114, 0, 3, 0]",AG,0.03,chr8,...,0.00,chr8,145736752,A,0,94,35.23,"[94, 0, 0, 0]",-,0.00
14309,chr8,145738255,A,0,263,50.96,"[258, 0, 5, 0]",AG,0.02,chr8,...,0.00,chr8,145738255,A,0,131,35.46,"[131, 0, 0, 0]",-,0.00
14310,chr8,145738265,A,0,257,50.17,"[254, 0, 3, 0]",AG,0.01,chr8,...,0.00,chr8,145738265,A,0,126,36.64,"[126, 0, 0, 0]",-,0.00
14311,chr8,145742879,A,0,108,47.81,"[58, 0, 50, 0]",AG,0.46,chr8,...,0.44,chr8,145742879,A,0,51,35.49,"[29, 0, 22, 0]",AG,0.43


Pos: (6840, 29)
Neg: (2498, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216177,A,1,194,51.60,"[188, 0, 6, 0]",AG,0.03,chrX,...,1,47,36.77,"[47, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
1,chrX,216226,A,1,232,51.86,"[225, 0, 7, 0]",AG,0.03,chrX,...,1,41,36.37,"[41, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
2,chrX,216238,A,1,231,52.09,"[228, 0, 3, 0]",AG,0.01,chrX,...,1,32,38.22,"[32, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
3,chrX,216239,A,1,237,51.42,"[228, 0, 9, 0]",AG,0.04,chrX,...,1,31,37.87,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
4,chrX,216241,A,1,229,51.96,"[221, 0, 8, 0]",AG,0.03,chrX,...,1,35,37.29,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9333,chr8,145150832,A,1,2357,49.45,"[1, 0, 2356, 0]",AG,1.00,chr8,...,1,101,34.99,"[0, 0, 101, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9334,chr8,145665516,A,0,96,47.65,"[3, 0, 93, 0]",AG,0.97,chr8,...,0,116,36.78,"[0, 0, 116, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9335,chr8,145677011,A,0,149,48.46,"[0, 0, 149, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9336,chr8,145693720,A,1,157,57.26,"[0, 0, 157, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...


Class        Class_binary
Editing      1               6840
Not-Editing  0               2498
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492045.SRR12492046.control.outTable_535670354.gz_vs_SRR12492027.SRR12492028.siADARs.outTable_396704193.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216177,A,1,194,51.60,"[188, 0, 6, 0]",AG,0.03,chrX,...,1,47,36.77,"[47, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
1,chrX,216226,A,1,232,51.86,"[225, 0, 7, 0]",AG,0.03,chrX,...,1,41,36.37,"[41, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
2,chrX,216238,A,1,231,52.09,"[228, 0, 3, 0]",AG,0.01,chrX,...,1,32,38.22,"[32, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
3,chrX,216239,A,1,237,51.42,"[228, 0, 9, 0]",AG,0.04,chrX,...,1,31,37.87,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
4,chrX,216241,A,1,229,51.96,"[221, 0, 8, 0]",AG,0.03,chrX,...,1,35,37.29,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9333,chr8,145150832,A,1,2357,49.45,"[1, 0, 2356, 0]",AG,1.00,chr8,...,1,101,34.99,"[0, 0, 101, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9334,chr8,145665516,A,0,96,47.65,"[3, 0, 93, 0]",AG,0.97,chr8,...,0,116,36.78,"[0, 0, 116, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9335,chr8,145677011,A,0,149,48.46,"[0, 0, 149, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...
9336,chr8,145693720,A,1,157,57.26,"[0, 0, 157, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492045.SRR12492046.control.outTable_53567...,SRR12492027.SRR12492028.siADARs.outTable_39670...


In [7]:
# couple 3
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492047.SRR12492048.control/DnaRna_192318299/outTable_192318299.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/a549/SRR12492029.SRR12492030.siADARs/DnaRna_436061877/outTable_436061877.gz"
wt_name = "SRR12492047.SRR12492048.control" + f".{os.path.basename(wt_fp)}"
ko_name = "SRR12492029.SRR12492030.siADARs" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple3 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple3

SRR12492047.SRR12492048.control.outTable_192318299.gz
SRR12492029.SRR12492030.siADARs.outTable_436061877.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Iteration on tabix outTables finished. Elapsed time:  0:06:16.068324
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492047.SRR12492048.control.outTable_192318299.gz_vs_SRR12492029.SRR12492030.siADARs.outTable_436061877.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrY,21153609,A,0,5083,54.07,"[0, 0, 5083, 0]",AG,1.00,chrY,...,1.00,chrY,21153609,A,0,38,37.66,"[0, 0, 38, 0]",AG,1.00
1,chrX,216254,A,1,273,48.92,"[261, 0, 12, 0]",AG,0.04,chrX,...,0.00,chrX,216254,A,1,37,37.54,"[37, 0, 0, 0]",-,0.00
2,chrX,216260,A,1,284,48.82,"[261, 0, 23, 0]",AG,0.08,chrX,...,0.04,chrX,216260,A,1,40,37.12,"[40, 0, 0, 0]",-,0.00
3,chrX,216269,A,1,313,50.32,"[304, 0, 9, 0]",AG,0.03,chrX,...,0.00,chrX,216269,A,1,35,36.34,"[35, 0, 0, 0]",-,0.00
4,chrX,216272,A,1,332,49.94,"[312, 0, 20, 0]",AG,0.06,chrX,...,0.00,chrX,216272,A,1,35,37.43,"[35, 0, 0, 0]",-,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12695,chr8,145734874,A,1,559,45.57,"[553, 0, 6, 0]",AG,0.01,chr8,...,0.00,chr8,145734874,A,1,98,35.52,"[98, 0, 0, 0]",-,0.00
12696,chr8,145734997,A,1,515,46.69,"[512, 0, 3, 0]",AG,0.01,chr8,...,0.00,chr8,145734997,A,1,52,36.35,"[52, 0, 0, 0]",-,0.00
12697,chr8,145742879,A,0,147,46.37,"[70, 0, 77, 0]",AG,0.52,chr8,...,0.42,chr8,145742879,A,0,51,35.49,"[29, 0, 22, 0]",AG,0.43
12698,chr8,146076443,A,0,484,48.97,"[481, 0, 3, 0]",AG,0.01,chr8,...,0.00,chr8,146076443,A,0,91,35.47,"[91, 0, 0, 0]",-,0.00


Pos: (6634, 29)
Neg: (2201, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216254,A,1,273,48.92,"[261, 0, 12, 0]",AG,0.04,chrX,...,1,37,37.54,"[37, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
1,chrX,216269,A,1,313,50.32,"[304, 0, 9, 0]",AG,0.03,chrX,...,1,35,36.34,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
2,chrX,216272,A,1,332,49.94,"[312, 0, 20, 0]",AG,0.06,chrX,...,1,35,37.43,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
3,chrX,216273,A,1,331,49.74,"[320, 0, 11, 0]",AG,0.03,chrX,...,1,34,37.21,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
4,chrX,216309,A,1,425,54.68,"[416, 0, 9, 0]",AG,0.02,chrX,...,1,35,35.94,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8830,chr8,144992361,A,0,1099,50.93,"[604, 0, 495, 0]",AG,0.45,chr8,...,0,79,34.58,"[33, 0, 46, 0]",AG,0.58,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8831,chr8,145001031,A,0,926,50.92,"[513, 0, 413, 0]",AG,0.45,chr8,...,0,75,35.93,"[43, 0, 32, 0]",AG,0.43,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8832,chr8,145677011,A,0,182,46.22,"[0, 0, 182, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8833,chr8,145693720,A,1,157,55.40,"[0, 0, 157, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...


Class        Class_binary
Editing      1               6634
Not-Editing  0               2201
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/SRR12492047.SRR12492048.control.outTable_192318299.gz_vs_SRR12492029.SRR12492030.siADARs.outTable_436061877.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216254,A,1,273,48.92,"[261, 0, 12, 0]",AG,0.04,chrX,...,1,37,37.54,"[37, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
1,chrX,216269,A,1,313,50.32,"[304, 0, 9, 0]",AG,0.03,chrX,...,1,35,36.34,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
2,chrX,216272,A,1,332,49.94,"[312, 0, 20, 0]",AG,0.06,chrX,...,1,35,37.43,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
3,chrX,216273,A,1,331,49.74,"[320, 0, 11, 0]",AG,0.03,chrX,...,1,34,37.21,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
4,chrX,216309,A,1,425,54.68,"[416, 0, 9, 0]",AG,0.02,chrX,...,1,35,35.94,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8830,chr8,144992361,A,0,1099,50.93,"[604, 0, 495, 0]",AG,0.45,chr8,...,0,79,34.58,"[33, 0, 46, 0]",AG,0.58,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8831,chr8,145001031,A,0,926,50.92,"[513, 0, 413, 0]",AG,0.45,chr8,...,0,75,35.93,"[43, 0, 32, 0]",AG,0.43,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8832,chr8,145677011,A,0,182,46.22,"[0, 0, 182, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8833,chr8,145693720,A,1,157,55.40,"[0, 0, 157, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...


In [8]:
# concat into a uniq dataset
bonafide_final_full = pd.concat([couple1, couple2, couple3])
bonafide_final_full

Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,216177,A,1,193,51.64,"[189, 0, 4, 0]",AG,0.02,chrX,...,1,47,36.77,"[47, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
1,chrX,216207,A,1,188,52.04,"[184, 0, 4, 0]",AG,0.02,chrX,...,1,45,36.64,"[45, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
2,chrX,216235,A,1,188,51.60,"[185, 0, 3, 0]",AG,0.02,chrX,...,1,31,37.39,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
3,chrX,216241,A,1,198,52.23,"[190, 0, 8, 0]",AG,0.04,chrX,...,1,35,37.29,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
4,chrX,216248,A,1,213,51.35,"[204, 0, 9, 0]",AG,0.04,chrX,...,1,41,36.59,"[41, 0, 0, 0]",-,0.00,Editing,1,SRR12492043.SRR12492044.control.outTable_77333...,SRR12492025.SRR12492026.siADARs.outTable_30261...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8830,chr8,144992361,A,0,1099,50.93,"[604, 0, 495, 0]",AG,0.45,chr8,...,0,79,34.58,"[33, 0, 46, 0]",AG,0.58,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8831,chr8,145001031,A,0,926,50.92,"[513, 0, 413, 0]",AG,0.45,chr8,...,0,75,35.93,"[43, 0, 32, 0]",AG,0.43,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8832,chr8,145677011,A,0,182,46.22,"[0, 0, 182, 0]",AG,1.00,chr8,...,0,98,36.48,"[0, 0, 98, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...
8833,chr8,145693720,A,1,157,55.40,"[0, 0, 157, 0]",AG,1.00,chr8,...,1,133,35.52,"[0, 0, 133, 0]",AG,1.00,Not-Editing,0,SRR12492047.SRR12492048.control.outTable_19231...,SRR12492029.SRR12492030.siADARs.outTable_43606...


In [9]:
# save to disk bonafide final
output_file = os.path.join(output_folder, "bonafide_final_MERGED.tsv")
print("Save to disk bonafide final MERGED (all couples):", output_file)
bonafide_final_full.to_csv(output_file, sep="\t", index=None)

Save to disk bonafide final MERGED (all couples): /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/a549/bonafide_final_MERGED.tsv


In [10]:
bonafide_final_full.Class.value_counts()

Editing        19843
Not-Editing     7163
Name: Class, dtype: int64

In [11]:
bonafide_final_full.groupby("Class")[["wt_Frequency", "ko_Frequency", "gFrequency"]].describe().T

Unnamed: 0,Class,Editing,Not-Editing
wt_Frequency,count,19843.0,7163.0
wt_Frequency,mean,0.037847,0.740363
wt_Frequency,std,0.048974,0.307439
wt_Frequency,min,0.01,0.01
wt_Frequency,25%,0.01,0.47
wt_Frequency,50%,0.02,1.0
wt_Frequency,75%,0.04,1.0
wt_Frequency,max,0.64,1.0
ko_Frequency,count,19843.0,7163.0
ko_Frequency,mean,0.0,0.740804
