In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################
import ast
import gzip
import os
from datetime import datetime

import numpy as np
import pandas as pd
import pysam

In [2]:
# define util functions
def _g_count(base_count):
    """Return the third element (G) of a serialized ``[A, C, G, T]`` count list.

    The field is parsed with ``ast.literal_eval`` rather than ``eval`` so that
    text read from the input tables can never be executed as code.
    """
    return ast.literal_eval(base_count)[2]


def _fetch_single_site(table, chrom, position):
    """Query ``table`` for the interval ``[position - 1, position)`` on ``chrom``.

    Returns the tab-split fields of the record when exactly one record is
    found, otherwise ``None`` (site missing or ambiguous).
    """
    records = list(table.fetch(chrom, position - 1, position))
    if len(records) != 1:
        return None
    return records[0].split("\t")


def _collect_candidates(wt_fp, ko, wgs):
    """Stream the WT table and collect the sites that pass every raw filter.

    Returns ``(header, rows)`` where ``header`` holds the first 9 column names
    of the WT table (``None`` if no line starting with "Region" was seen) and
    ``rows`` is a list of merged WT + KO + WGS fields, one list per kept site.
    """
    header = None
    rows = []
    with gzip.open(wt_fp) as wt_table:
        for c, raw_line in enumerate(wt_table):
            site = raw_line.decode("utf-8").rstrip().split("\t")
            if c % 50000000 == 0:
                print("Evaluated sites:", c)
            if site[0] == "Region":
                # store header
                header = site[:9]
                continue
            # keep only "chr"-prefixed contigs, excluding chrM
            if not site[0].startswith("chr") or site[0] == "chrM":
                continue
            # reference base must be A and the coverage field must be populated
            if site[2] != "A" or site[4] == "-":
                continue
            if int(site[4]) < min_cov:
                continue
            # WT must show only the AG substitution, with frequency >= 0.01
            if site[7] != "AG" or not float(site[8]) >= 0.01:
                continue
            if _g_count(site[6]) < min_ag_subs:
                continue

            chrom, position = site[0], int(site[1])

            # the same site must be present once, and covered, in the KO table
            ko_site = _fetch_single_site(ko, chrom, position)
            if ko_site is None or ko_site[4] == "-" or int(ko_site[4]) < min_cov:
                continue

            # ... and in the WGS table, whose coverage is read from column 9
            wgs_site = _fetch_single_site(wgs, chrom, position)
            if wgs_site is None or wgs_site[9] == "-" or int(wgs_site[9]) < min_cov_wgs:
                continue

            rows.append(site[:9] + ko_site[:9] + wgs_site[0:4] + wgs_site[9:])
    return header, rows


def extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp):
    """Build a labelled set of "Editing" / "Not-Editing" A-to-G sites for one WT/KO couple.

    The WT table is streamed line by line; for every WT site that passes the
    filters, the same position is looked up in the tabix-indexed KO and WGS
    tables. Surviving sites are written to
    ``<wt_name>_vs_<ko_name>.bonafide_candidates.tsv``, reloaded (so pandas
    infers dtypes), filtered on strand and split into two classes:

    * ``Editing`` (``Class_binary`` 1): AllSubs is "AG" in WT and "-" in both
      KO and WGS.
    * ``Not-Editing`` (``Class_binary`` 0): AllSubs is "AG" in WT, KO and WGS,
      with KO frequency >= 0.01 and at least ``min_ag_subs`` G reads in KO.

    The labelled table is saved as ``<wt_name>_vs_<ko_name>.bonafide_final.tsv``.

    Notebook-level settings read by this function (they must be defined before
    the call): ``min_cov``, ``min_cov_wgs``, ``min_ag_subs``, ``output_folder``.
    It also relies on IPython's ``display``.

    Parameters
    ----------
    wt_fp : str
        Path of the gzip-compressed, tab-separated WT table. It is read
        sequentially, so no tabix index is needed for it.
    ko_fp : str
        Path of the tabix-indexed KO table.
    wt_name, ko_name : str
        Sample labels, used in output file names and stored in the
        ``wt_sample`` / ``ko_sample`` columns.
    wgs_fp : str
        Path of the tabix-indexed WGS table.
        NOTE(review): columns 9+ are assumed to hold the genomic ("g") fields,
        as implied by the "g"-prefixed names they are given - confirm.

    Returns
    -------
    pandas.DataFrame
        The labelled sites (positives first, then negatives).

    Raises
    ------
    ValueError
        If the WT table has no header line (first field "Region"), or if a
        base-count field is not a plain Python literal.
    """
    # Close both tabix handles even when the scan fails halfway through.
    ko = pysam.TabixFile(ko_fp)
    try:
        wgs = pysam.TabixFile(wgs_fp)
        try:
            start_time = datetime.now()
            header, candidates = _collect_candidates(wt_fp, ko, wgs)
        finally:
            wgs.close()
    finally:
        ko.close()
    print("Iteration on tabix outTables finished. Elapsed time: ", datetime.now()-start_time)

    if header is None:
        raise ValueError("No header line (first field 'Region') found in WT table: " + str(wt_fp))

    columns = ["wt_"+i for i in header]+["ko_"+i for i in header]+["g"+i for i in header]
    bonafide = pd.DataFrame(candidates, columns=columns)

    # save to disk bonafide candidates
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_candidates.tsv")
    print("Save to disk bonafide candidates:", output_file)
    bonafide.to_csv(output_file, sep="\t", index=False)

    # load from disk to infer dtypes (every field was collected as a string)
    bonafide = pd.read_table(output_file)

    # drop unstranded sites (Strand == 2) and keep only sites whose strand is
    # concordant across WT, KO and WGS
    stranded = (bonafide["wt_Strand"] != 2) & (bonafide["ko_Strand"] != 2) & (bonafide["gStrand"] != 2)
    concordant = (bonafide["wt_Strand"] == bonafide["ko_Strand"]) & (bonafide["ko_Strand"] == bonafide["gStrand"])
    bonafide = bonafide[stranded & concordant].reset_index(drop=True)
    display(bonafide)

    wt_is_ag = bonafide["wt_AllSubs"] == "AG"

    # positives: AG seen in WT only
    pos = bonafide[wt_is_ag & (bonafide["ko_AllSubs"] == "-") & (bonafide["gAllSubs"] == "-")].copy()
    pos["Class"] = "Editing"
    pos["Class_binary"] = 1
    print("Pos:", pos.shape)

    # negatives: AG seen in WT, KO and WGS
    neg = bonafide[wt_is_ag & (bonafide["ko_AllSubs"] == "AG") & (bonafide["gAllSubs"] == "AG")]
    # select negs with 0.01 AG freq and min_ag_subs
    neg = neg[neg["ko_Frequency"] >= 0.01]
    # An explicit boolean array keeps the row filter valid when `neg` is empty
    # (an empty Python list would be interpreted as a column selection).
    enough_g = np.array(
        [_g_count(counts) >= min_ag_subs for counts in neg["ko_BaseCount[A,C,G,T]"]],
        dtype=bool,
    )
    neg = neg[enough_g].copy()
    neg["Class"] = "Not-Editing"
    neg["Class_binary"] = 0
    print("Neg:", neg.shape)

    # merge pos and neg and add samples id
    bonafide_final = pd.concat([pos, neg])
    bonafide_final.reset_index(inplace=True, drop=True)
    bonafide_final["wt_sample"] = wt_name
    bonafide_final["ko_sample"] = ko_name

    display(bonafide_final)
    print(bonafide_final[["Class", "Class_binary"]].value_counts())

    # save to disk bonafide final
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_final.tsv")
    print("Save to disk bonafide final:", output_file)
    bonafide_final.to_csv(output_file, sep="\t", index=False)

    return bonafide_final

In [3]:
# define output folder
# Directory that receives every TSV written by extract_bonafide() and by the
# final "MERGED" export cell.
# NOTE(review): absolute cluster path; no cell shown here creates the folder,
# so it presumably has to exist before running - confirm.
output_folder = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata"

In [4]:
# define wgs tabix outTable file path
# The same WGS table is passed to extract_bonafide() for all three couples.
wgs_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR10129631.WGS.wildtype/DnaRna_51058007/outTable_51058007.gz"

# define common filters
# These are read as globals inside extract_bonafide(), so this cell must run
# before any call to it.
# min_cov: minimum value of column 4 ("Coverage-q30") required in both WT and KO.
min_cov = 30
# min_cov_wgs: minimum value of column 9 of the WGS table (exported as "gCoverage-q30").
min_cov_wgs = 10
# min_ag_subs: minimum third element (G) of the "BaseCount[A,C,G,T]" list.
min_ag_subs = 2

In [5]:
# couple 1 (KO1 vs WT1)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564274.wildtype/DnaRna_724242056/outTable_724242056.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564272.ADAR1_KO/DnaRna_816573740/outTable_816573740.gz"
# Sample label = "<run>.<condition>" followed by the outTable file name.
wt_name = ".".join(("SRR5564274.wildtype", os.path.basename(wt_fp)))
ko_name = ".".join(("SRR5564272.ADAR1_KO", os.path.basename(ko_fp)))
print(wt_name, ko_name, sep="\n")

couple1 = extract_bonafide(
    wt_fp=wt_fp,
    ko_fp=ko_fp,
    wt_name=wt_name,
    ko_name=ko_name,
    wgs_fp=wgs_fp,
)
couple1

SRR5564274.wildtype.outTable_724242056.gz
SRR5564272.ADAR1_KO.outTable_816573740.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Evaluated sites: 300000000
Evaluated sites: 350000000
Evaluated sites: 400000000
Evaluated sites: 450000000
Iteration on tabix outTables finished. Elapsed time:  0:10:43.535257
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564274.wildtype.outTable_724242056.gz_vs_SRR5564272.ADAR1_KO.outTable_816573740.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,0.00,chrX,217844,A,1,35,40.11,"[35, 0, 0, 0]",-,0.0
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,0.00,chrX,217852,A,1,33,40.64,"[33, 0, 0, 0]",-,0.0
2,chrX,217898,A,1,38,36.18,"[33, 0, 5, 0]",AG,0.13,chrX,...,0.01,chrX,217898,A,1,32,42.38,"[32, 0, 0, 0]",-,0.0
3,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,0.00,chrX,217930,A,1,33,40.55,"[33, 0, 0, 0]",-,0.0
4,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,0.00,chrX,217946,A,1,38,40.82,"[38, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8764,chr8,145066886,A,1,83,36.94,"[0, 0, 83, 0]",AG,1.00,chr8,...,1.00,chr8,145066886,A,1,54,40.98,"[0, 0, 54, 0]",AG,1.0
8765,chr8,145150832,A,1,169,37.99,"[0, 0, 169, 0]",AG,1.00,chr8,...,1.00,chr8,145150832,A,1,33,40.00,"[0, 0, 33, 0]",AG,1.0
8766,chr8,146200271,A,0,38,34.95,"[36, 0, 2, 0]",AG,0.05,chr8,...,0.00,chr8,146200271,A,0,53,40.91,"[53, 0, 0, 0]",-,0.0
8767,chr8,146280005,A,1,153,36.48,"[151, 0, 2, 0]",AG,0.01,chr8,...,0.00,chr8,146280005,A,1,41,40.15,"[41, 0, 0, 0]",-,0.0


Pos: (4775, 29)
Neg: (2249, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,1,33,40.64,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
2,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
3,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,1,38,40.82,"[38, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
4,chrX,218542,A,1,32,36.38,"[30, 0, 2, 0]",AG,0.06,chrX,...,1,33,39.82,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,chr8,124027628,A,0,103,35.65,"[0, 0, 103, 0]",AG,1.00,chr8,...,0,53,40.45,"[0, 0, 53, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7020,chr8,124031541,A,0,130,37.04,"[53, 0, 77, 0]",AG,0.59,chr8,...,0,40,39.83,"[16, 0, 24, 0]",AG,0.6,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7021,chr8,132982824,A,1,33,35.03,"[0, 0, 33, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7022,chr8,145066886,A,1,83,36.94,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz


Class        Class_binary
Editing      1               4775
Not-Editing  0               2249
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564274.wildtype.outTable_724242056.gz_vs_SRR5564272.ADAR1_KO.outTable_816573740.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,1,33,40.64,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
2,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
3,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,1,38,40.82,"[38, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
4,chrX,218542,A,1,32,36.38,"[30, 0, 2, 0]",AG,0.06,chrX,...,1,33,39.82,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7019,chr8,124027628,A,0,103,35.65,"[0, 0, 103, 0]",AG,1.00,chr8,...,0,53,40.45,"[0, 0, 53, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7020,chr8,124031541,A,0,130,37.04,"[53, 0, 77, 0]",AG,0.59,chr8,...,0,40,39.83,"[16, 0, 24, 0]",AG,0.6,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7021,chr8,132982824,A,1,33,35.03,"[0, 0, 33, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
7022,chr8,145066886,A,1,83,36.94,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz


In [6]:
# couple 2 (KO2 vs WT2)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564275.wildtype/DnaRna_580067564/outTable_580067564.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564273.ADAR1_KO/DnaRna_718392497/outTable_718392497.gz"
# Sample label = "<run>.<condition>" followed by the outTable file name.
wt_name = ".".join(("SRR5564275.wildtype", os.path.basename(wt_fp)))
ko_name = ".".join(("SRR5564273.ADAR1_KO", os.path.basename(ko_fp)))
print(wt_name, ko_name, sep="\n")

couple2 = extract_bonafide(
    wt_fp=wt_fp,
    ko_fp=ko_fp,
    wt_name=wt_name,
    ko_name=ko_name,
    wgs_fp=wgs_fp,
)
couple2

SRR5564275.wildtype.outTable_580067564.gz
SRR5564273.ADAR1_KO.outTable_718392497.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Evaluated sites: 300000000
Evaluated sites: 350000000
Evaluated sites: 400000000
Evaluated sites: 450000000
Iteration on tabix outTables finished. Elapsed time:  0:11:30.641829
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564275.wildtype.outTable_580067564.gz_vs_SRR5564273.ADAR1_KO.outTable_718392497.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,217741,A,1,62,38.73,"[58, 0, 4, 0]",AG,0.06,chrX,...,0.00,chrX,217741,A,1,31,40.71,"[31, 0, 0, 0]",-,0.0
1,chrX,217757,A,1,61,38.70,"[41, 0, 20, 0]",AG,0.33,chrX,...,0.03,chrX,217757,A,1,32,39.97,"[32, 0, 0, 0]",-,0.0
2,chrX,217778,A,1,55,39.98,"[53, 0, 2, 0]",AG,0.04,chrX,...,0.00,chrX,217778,A,1,28,40.71,"[28, 0, 0, 0]",-,0.0
3,chrX,217844,A,1,85,38.59,"[81, 0, 4, 0]",AG,0.05,chrX,...,0.00,chrX,217844,A,1,35,40.11,"[35, 0, 0, 0]",-,0.0
4,chrX,217852,A,1,84,38.21,"[54, 0, 30, 0]",AG,0.36,chrX,...,0.02,chrX,217852,A,1,33,40.64,"[33, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10565,chr8,145150832,A,1,361,37.37,"[0, 0, 361, 0]",AG,1.00,chr8,...,1.00,chr8,145150832,A,1,33,40.00,"[0, 0, 33, 0]",AG,1.0
10566,chr8,145160591,A,1,189,38.28,"[187, 0, 2, 0]",AG,0.01,chr8,...,0.00,chr8,145160591,A,1,49,39.82,"[49, 0, 0, 0]",-,0.0
10567,chr8,145162430,A,1,189,35.37,"[187, 0, 2, 0]",AG,0.01,chr8,...,0.00,chr8,145162430,A,1,51,39.61,"[51, 0, 0, 0]",-,0.0
10568,chr8,145665516,A,0,89,35.35,"[3, 0, 86, 0]",AG,0.97,chr8,...,1.00,chr8,145665516,A,0,59,40.75,"[0, 0, 59, 0]",AG,1.0


Pos: (6220, 29)
Neg: (2295, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217741,A,1,62,38.73,"[58, 0, 4, 0]",AG,0.06,chrX,...,1,31,40.71,"[31, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
1,chrX,217778,A,1,55,39.98,"[53, 0, 2, 0]",AG,0.04,chrX,...,1,28,40.71,"[28, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
2,chrX,217844,A,1,85,38.59,"[81, 0, 4, 0]",AG,0.05,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
3,chrX,217875,A,1,79,36.59,"[75, 0, 4, 0]",AG,0.05,chrX,...,1,32,40.25,"[32, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
4,chrX,217930,A,1,81,37.02,"[79, 0, 2, 0]",AG,0.02,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,chr8,132982824,A,1,60,35.33,"[0, 0, 60, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8511,chr8,144732418,A,1,83,35.73,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,35,39.74,"[0, 0, 35, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8512,chr8,145066886,A,1,284,36.31,"[0, 0, 284, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8513,chr8,145150832,A,1,361,37.37,"[0, 0, 361, 0]",AG,1.00,chr8,...,1,33,40.00,"[0, 0, 33, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz


Class        Class_binary
Editing      1               6220
Not-Editing  0               2295
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564275.wildtype.outTable_580067564.gz_vs_SRR5564273.ADAR1_KO.outTable_718392497.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217741,A,1,62,38.73,"[58, 0, 4, 0]",AG,0.06,chrX,...,1,31,40.71,"[31, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
1,chrX,217778,A,1,55,39.98,"[53, 0, 2, 0]",AG,0.04,chrX,...,1,28,40.71,"[28, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
2,chrX,217844,A,1,85,38.59,"[81, 0, 4, 0]",AG,0.05,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
3,chrX,217875,A,1,79,36.59,"[75, 0, 4, 0]",AG,0.05,chrX,...,1,32,40.25,"[32, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
4,chrX,217930,A,1,81,37.02,"[79, 0, 2, 0]",AG,0.02,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.0,Editing,1,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,chr8,132982824,A,1,60,35.33,"[0, 0, 60, 0]",AG,1.00,chr8,...,1,40,41.92,"[0, 0, 40, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8511,chr8,144732418,A,1,83,35.73,"[0, 0, 83, 0]",AG,1.00,chr8,...,1,35,39.74,"[0, 0, 35, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8512,chr8,145066886,A,1,284,36.31,"[0, 0, 284, 0]",AG,1.00,chr8,...,1,54,40.98,"[0, 0, 54, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz
8513,chr8,145150832,A,1,361,37.37,"[0, 0, 361, 0]",AG,1.00,chr8,...,1,33,40.00,"[0, 0, 33, 0]",AG,1.0,Not-Editing,0,SRR5564275.wildtype.outTable_580067564.gz,SRR5564273.ADAR1_KO.outTable_718392497.gz


In [7]:
# couple 3 (KO3 vs WT3)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564276.wildtype/DnaRna_181728208/outTable_181728208.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR5564268.ADAR1_KO/DnaRna_854894021/outTable_854894021.gz"
# Sample label = "<run>.<condition>" followed by the outTable file name.
wt_name = ".".join(("SRR5564276.wildtype", os.path.basename(wt_fp)))
ko_name = ".".join(("SRR5564268.ADAR1_KO", os.path.basename(ko_fp)))
print(wt_name, ko_name, sep="\n")

couple3 = extract_bonafide(
    wt_fp=wt_fp,
    ko_fp=ko_fp,
    wt_name=wt_name,
    ko_name=ko_name,
    wgs_fp=wgs_fp,
)
couple3

SRR5564276.wildtype.outTable_181728208.gz
SRR5564268.ADAR1_KO.outTable_854894021.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Evaluated sites: 300000000
Evaluated sites: 350000000
Evaluated sites: 400000000
Iteration on tabix outTables finished. Elapsed time:  0:10:26.893669
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564276.wildtype.outTable_181728208.gz_vs_SRR5564268.ADAR1_KO.outTable_854894021.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,205483,A,1,180,36.29,"[178, 0, 2, 0]",AG,0.01,chrX,...,0.00,chrX,205483,A,1,34,40.85,"[34, 0, 0, 0]",-,0.00
1,chrX,215935,A,1,68,38.91,"[66, 0, 2, 0]",AG,0.03,chrX,...,0.00,chrX,215935,A,1,30,39.40,"[30, 0, 0, 0]",-,0.00
2,chrX,216802,A,1,51,37.86,"[49, 0, 2, 0]",AG,0.04,chrX,...,0.00,chrX,216802,A,1,34,39.32,"[34, 0, 0, 0]",-,0.00
3,chrX,216807,A,1,53,37.62,"[50, 0, 3, 0]",AG,0.06,chrX,...,0.00,chrX,216807,A,1,31,39.45,"[31, 0, 0, 0]",-,0.00
4,chrX,217393,A,1,43,35.02,"[26, 0, 17, 0]",AG,0.40,chrX,...,0.00,chrX,217393,A,1,31,39.61,"[31, 0, 0, 0]",-,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13630,chr21,47742133,A,0,76,35.95,"[74, 0, 2, 0]",AG,0.03,chr21,...,0.00,chr21,47742133,A,0,35,39.51,"[35, 0, 0, 0]",-,0.00
13631,chr21,47742139,A,0,65,35.32,"[60, 0, 5, 0]",AG,0.08,chr21,...,0.00,chr21,47742139,A,0,38,40.55,"[38, 0, 0, 0]",-,0.00
13632,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,0.39,chr21,47851753,A,1,39,40.38,"[29, 0, 10, 0]",AG,0.26
13633,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,1.00,chr21,47862624,A,1,46,41.33,"[0, 0, 46, 0]",AG,1.00


Pos: (8375, 29)
Neg: (2694, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,205483,A,1,180,36.29,"[178, 0, 2, 0]",AG,0.01,chrX,...,1,34,40.85,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
1,chrX,215935,A,1,68,38.91,"[66, 0, 2, 0]",AG,0.03,chrX,...,1,30,39.40,"[30, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
2,chrX,216802,A,1,51,37.86,"[49, 0, 2, 0]",AG,0.04,chrX,...,1,34,39.32,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
3,chrX,216807,A,1,53,37.62,"[50, 0, 3, 0]",AG,0.06,chrX,...,1,31,39.45,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
4,chrX,217393,A,1,43,35.02,"[26, 0, 17, 0]",AG,0.40,chrX,...,1,31,39.61,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11064,chr21,47635176,A,0,134,36.46,"[74, 0, 60, 0]",AG,0.45,chr21,...,0,34,38.62,"[26, 0, 8, 0]",AG,0.24,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11065,chr21,47705237,A,0,139,36.01,"[31, 0, 108, 0]",AG,0.78,chr21,...,0,46,40.98,"[21, 0, 25, 0]",AG,0.54,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11066,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,1,39,40.38,"[29, 0, 10, 0]",AG,0.26,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11067,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,1,46,41.33,"[0, 0, 46, 0]",AG,1.00,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz


Class        Class_binary
Editing      1               8375
Not-Editing  0               2694
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/SRR5564276.wildtype.outTable_181728208.gz_vs_SRR5564268.ADAR1_KO.outTable_854894021.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,205483,A,1,180,36.29,"[178, 0, 2, 0]",AG,0.01,chrX,...,1,34,40.85,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
1,chrX,215935,A,1,68,38.91,"[66, 0, 2, 0]",AG,0.03,chrX,...,1,30,39.40,"[30, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
2,chrX,216802,A,1,51,37.86,"[49, 0, 2, 0]",AG,0.04,chrX,...,1,34,39.32,"[34, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
3,chrX,216807,A,1,53,37.62,"[50, 0, 3, 0]",AG,0.06,chrX,...,1,31,39.45,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
4,chrX,217393,A,1,43,35.02,"[26, 0, 17, 0]",AG,0.40,chrX,...,1,31,39.61,"[31, 0, 0, 0]",-,0.00,Editing,1,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11064,chr21,47635176,A,0,134,36.46,"[74, 0, 60, 0]",AG,0.45,chr21,...,0,34,38.62,"[26, 0, 8, 0]",AG,0.24,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11065,chr21,47705237,A,0,139,36.01,"[31, 0, 108, 0]",AG,0.78,chr21,...,0,46,40.98,"[21, 0, 25, 0]",AG,0.54,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11066,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,1,39,40.38,"[29, 0, 10, 0]",AG,0.26,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11067,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,1,46,41.33,"[0, 0, 46, 0]",AG,1.00,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz


In [8]:
# concat the three couples into a single dataset
# ignore_index=True: every couple frame carries its own 0..n-1 index, so a plain
# concat would leave duplicated row labels in the merged frame (ambiguous .loc
# lookups). The rows and their order are unchanged.
bonafide_final_full = pd.concat([couple1, couple2, couple3], ignore_index=True)
bonafide_final_full

Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,217844,A,1,32,36.72,"[29, 0, 3, 0]",AG,0.09,chrX,...,1,35,40.11,"[35, 0, 0, 0]",-,0.00,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
1,chrX,217852,A,1,36,35.89,"[25, 0, 11, 0]",AG,0.31,chrX,...,1,33,40.64,"[33, 0, 0, 0]",-,0.00,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
2,chrX,217930,A,1,44,35.32,"[42, 0, 2, 0]",AG,0.05,chrX,...,1,33,40.55,"[33, 0, 0, 0]",-,0.00,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
3,chrX,217946,A,1,42,35.50,"[39, 0, 3, 0]",AG,0.07,chrX,...,1,38,40.82,"[38, 0, 0, 0]",-,0.00,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
4,chrX,218542,A,1,32,36.38,"[30, 0, 2, 0]",AG,0.06,chrX,...,1,33,39.82,"[33, 0, 0, 0]",-,0.00,Editing,1,SRR5564274.wildtype.outTable_724242056.gz,SRR5564272.ADAR1_KO.outTable_816573740.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11064,chr21,47635176,A,0,134,36.46,"[74, 0, 60, 0]",AG,0.45,chr21,...,0,34,38.62,"[26, 0, 8, 0]",AG,0.24,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11065,chr21,47705237,A,0,139,36.01,"[31, 0, 108, 0]",AG,0.78,chr21,...,0,46,40.98,"[21, 0, 25, 0]",AG,0.54,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11066,chr21,47851753,A,1,62,35.55,"[44, 0, 18, 0]",AG,0.29,chr21,...,1,39,40.38,"[29, 0, 10, 0]",AG,0.26,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz
11067,chr21,47862624,A,1,43,37.19,"[0, 0, 43, 0]",AG,1.00,chr21,...,1,46,41.33,"[0, 0, 46, 0]",AG,1.00,Not-Editing,0,SRR5564276.wildtype.outTable_181728208.gz,SRR5564268.ADAR1_KO.outTable_854894021.gz


In [9]:
# Export the merged (all couples) table as a tab-separated file without the row index
output_file = os.path.join(output_folder, "bonafide_final_MERGED.tsv")
print("Save to disk bonafide final MERGED (all couples):", output_file)
bonafide_final_full.to_csv(path_or_buf=output_file, sep="\t", index=None)

Save to disk bonafide final MERGED (all couples): /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_publicdata/bonafide_final_MERGED.tsv


In [10]:
# Number of sites per class in the merged dataset
bonafide_final_full["Class"].value_counts()

Editing        19370
Not-Editing     7238
Name: Class, dtype: int64

In [11]:
# Per-class summary statistics of the AG frequency in WT, KO and WGS
(
    bonafide_final_full
    .groupby("Class")[["wt_Frequency", "ko_Frequency", "gFrequency"]]
    .describe()
    .transpose()
)

Unnamed: 0,Class,Editing,Not-Editing
wt_Frequency,count,19370.0,7238.0
wt_Frequency,mean,0.065661,0.727962
wt_Frequency,std,0.091351,0.31053
wt_Frequency,min,0.01,0.01
wt_Frequency,25%,0.01,0.45
wt_Frequency,50%,0.04,0.99
wt_Frequency,75%,0.07,1.0
wt_Frequency,max,1.0,1.0
ko_Frequency,count,19370.0,7238.0
ko_Frequency,mean,0.0,0.727358
