In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it
##################
import pandas as pd
import numpy as np
import pysam, gzip, os
from datetime import datetime

In [2]:
# define util function
def extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp):
    wt = pysam.TabixFile(wt_fp)
    ko = pysam.TabixFile(ko_fp)
    wgs = pysam.TabixFile(wgs_fp)
    bonafide = []
    start_time = datetime.now()
    with gzip.open(wt_fp) as wt_table:
        for c,s in enumerate(wt_table):
            site = s.decode("utf-8").rstrip().split("\t")
            if c % 50000000 == 0:
                print("Evaluated sites:", c)
            if site[0] == "Region":
                # store header
                header = site[:9]
            else:
                if site[0].startswith("chr"):
                    if site[0] != "chrM":
                        if site[2] == "A":
                            if site[4] != "-":
                                if int(site[4]) >= min_cov:
                                    if site[7] == "AG" and float(site[8]) >= 0.01:
                                        vector = eval(site[6])
                                        if vector[2] >= min_ag_subs:
                                            ko_query = [i for i in ko.fetch(site[0], int(site[1])-1, int(site[1]))]
                                            if len(ko_query) == 1:
                                                ko_query = ko_query[0].split("\t")
                                                if ko_query[4] != "-":
                                                    if int(ko_query[4]) >= min_cov:
                                                        wgs_query = [i for i in wgs.fetch(site[0], int(site[1])-1, int(site[1]))]
                                                        if len(wgs_query) == 1:
                                                            wgs_query = wgs_query[0].split("\t")
                                                            if wgs_query[9] != "-":
                                                                if int(wgs_query[9]) >= min_cov_wgs:
                                                                    whole_site = site[:9] + ko_query[:9] + wgs_query[0:4] + wgs_query[9:]
                                                                    bonafide.append(whole_site)
    print("Iteration on tabix outTables finished. Elapsed time: ", datetime.now()-start_time)
    columns=["wt_"+i for i in header]+["ko_"+i for i in header]+["g"+i for i in header]
    bonafide = pd.DataFrame(bonafide, columns=columns)
    
    # save to disk bonafide candidates
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_candidates.tsv")
    print("Save to disk bonafide candidates:", output_file)
    bonafide.to_csv(output_file, sep="\t", index=None)
    
    # load from disk to infer dtypes
    bonafide = pd.read_table(output_file)
    
    # drop unstranded
    bonafide = bonafide[(bonafide["wt_Strand"]!=2)&(bonafide["ko_Strand"]!=2)&(bonafide["gStrand"]!=2)].copy()
    
    # select concordand for strand for wt and ko
    # select only strand concordand sites
    bonafide = bonafide[bonafide["wt_Strand"] == bonafide["ko_Strand"]].copy()
    bonafide = bonafide[bonafide["ko_Strand"] == bonafide["gStrand"]].copy()
    bonafide.reset_index(inplace=True, drop=True)
    display(bonafide)
    pos = bonafide[(bonafide["wt_AllSubs"]=="AG")&(bonafide["ko_AllSubs"]=="-")&(bonafide["gAllSubs"]=="-")].copy()
    pos["Class"] = "Editing"
    pos["Class_binary"] = 1
    print("Pos:", pos.shape)
    neg = bonafide[(bonafide["wt_AllSubs"]=="AG")&(bonafide["ko_AllSubs"]=="AG")&(bonafide["gAllSubs"]=="AG")].copy()
    # select negs with 0.01 AG freq and min_ag_subs
    neg = neg.query("ko_Frequency >= 0.01")
    mask = []
    for cand in neg["ko_BaseCount[A,C,G,T]"]:
        vector = eval(cand)
        if vector[2] >= min_ag_subs:
            mask.append(True)
        else:
            mask.append(False)
    neg = neg[mask].copy()
    neg["Class"] = "Not-Editing"
    neg["Class_binary"] = 0
    print("Neg:", neg.shape)
    # merge pos and neg and add samples id
    bonafide_final = pd.concat([pos, neg])
    bonafide_final.reset_index(inplace=True, drop=True)
    bonafide_final["wt_sample"] = wt_name
    bonafide_final["ko_sample"] = ko_name
    
    display(bonafide_final)
    print(bonafide_final[["Class", "Class_binary"]].value_counts())
    
    # save to disk bonafide final
    output_file = os.path.join(output_folder, wt_name + "_vs_" + ko_name + ".bonafide_final.tsv")
    print("Save to disk bonafide final:", output_file)
    bonafide_final.to_csv(output_file, sep="\t", index=None)
    
    wt.close()
    ko.close()
    wgs.close()
    return bonafide_final

In [3]:
# define output folder
output_folder = "/lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori"

In [4]:
# define wgs tabix outTable file path
wgs_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_adar_samples/SRR10129631.WGS.wildtype/DnaRna_59095329/outTable_59095329.gz"

# define common filters
min_cov = 50
min_cov_wgs = 10
min_ag_subs = 3

In [5]:
# couple 1 (KO1 vs OvE1)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK_ADAR1_p110_wt_1.OvE/DnaRna_530905096/outTable_530905096.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO1.KO/DnaRna_905657585/outTable_905657585.gz"
wt_name = "HEK_ADAR1_p110_wt_1.OvE" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO1.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple1 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple1

HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz
HEK293T_KO1.KO.outTable_905657585.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Iteration on tabix outTables finished. Elapsed time:  0:09:38.686462
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,0.00,chrX,299493,A,1,30,40.47,"[30, 0, 0, 0]",-,0.0
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,0.00,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,0.00,chrX,299513,A,1,34,41.35,"[34, 0, 0, 0]",-,0.0
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,0.00,chrX,299540,A,1,37,41.11,"[37, 0, 0, 0]",-,0.0
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,0.00,chrX,299543,A,1,39,42.33,"[39, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46885,chr8,144976278,A,0,490,48.16,"[487, 0, 3, 0]",AG,0.01,chr8,...,0.00,chr8,144976278,A,0,35,39.74,"[35, 0, 0, 0]",-,0.0
46886,chr8,144977014,A,0,556,45.21,"[548, 0, 8, 0]",AG,0.01,chr8,...,0.03,chr8,144977014,A,0,40,41.20,"[40, 0, 0, 0]",-,0.0
46887,chr8,144977019,A,0,548,45.91,"[538, 0, 10, 0]",AG,0.02,chr8,...,0.03,chr8,144977019,A,0,42,41.57,"[42, 0, 0, 0]",-,0.0
46888,chr8,144977023,A,0,545,46.35,"[535, 0, 10, 0]",AG,0.02,chr8,...,0.03,chr8,144977023,A,0,39,42.26,"[39, 0, 0, 0]",-,0.0


Pos: (36152, 29)
Neg: (3337, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39484,chr8,144451628,A,0,299,51.34,"[244, 0, 55, 0]",AG,0.18,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39485,chr8,144454002,A,0,59,45.15,"[0, 0, 59, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39486,chr8,144460013,A,0,57,51.93,"[41, 0, 16, 0]",AG,0.28,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39487,chr8,144468337,A,1,420,48.59,"[0, 0, 420, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz


Class        Class_binary
Editing      1               36152
Not-Editing  0                3337
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39484,chr8,144451628,A,0,299,51.34,"[244, 0, 55, 0]",AG,0.18,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39485,chr8,144454002,A,0,59,45.15,"[0, 0, 59, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39486,chr8,144460013,A,0,57,51.93,"[41, 0, 16, 0]",AG,0.28,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
39487,chr8,144468337,A,1,420,48.59,"[0, 0, 420, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz


In [6]:
# couple 2 (KO2 vs OvE2)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK_ADAR1_p110_wt_2.OvE/DnaRna_814257267/outTable_814257267.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO2.KO/DnaRna_364841872/outTable_364841872.gz"
wt_name = "HEK_ADAR1_p110_wt_2.OvE" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO2.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple2 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple2

HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz
HEK293T_KO2.KO.outTable_364841872.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Iteration on tabix outTables finished. Elapsed time:  0:09:40.769580
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299493,A,1,324,45.15,"[320, 0, 4, 0]",AG,0.01,chrX,...,0.0,chrX,299493,A,1,30,40.47,"[30, 0, 0, 0]",-,0.0
1,chrX,299510,A,1,318,45.16,"[236, 0, 82, 0]",AG,0.26,chrX,...,0.0,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
2,chrX,299513,A,1,314,45.80,"[305, 0, 9, 0]",AG,0.03,chrX,...,0.0,chrX,299513,A,1,34,41.35,"[34, 0, 0, 0]",-,0.0
3,chrX,299543,A,1,303,45.42,"[294, 0, 9, 0]",AG,0.03,chrX,...,0.0,chrX,299543,A,1,39,42.33,"[39, 0, 0, 0]",-,0.0
4,chrX,299551,A,1,288,47.07,"[283, 0, 5, 0]",AG,0.02,chrX,...,0.0,chrX,299551,A,1,38,41.45,"[38, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45864,chr8,144974956,A,0,715,47.78,"[701, 0, 14, 0]",AG,0.02,chr8,...,0.0,chr8,144974956,A,0,50,41.06,"[50, 0, 0, 0]",-,0.0
45865,chr8,144975042,A,0,785,43.57,"[781, 0, 4, 0]",AG,0.01,chr8,...,0.0,chr8,144975042,A,0,41,39.22,"[41, 0, 0, 0]",-,0.0
45866,chr8,144975093,A,0,761,47.66,"[753, 0, 8, 0]",AG,0.01,chr8,...,0.0,chr8,144975093,A,0,41,40.15,"[41, 0, 0, 0]",-,0.0
45867,chr8,144975890,A,0,328,40.41,"[324, 0, 4, 0]",AG,0.01,chr8,...,0.0,chr8,144975890,A,0,37,39.70,"[37, 0, 0, 0]",-,0.0


Pos: (34025, 29)
Neg: (3458, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,324,45.15,"[320, 0, 4, 0]",AG,0.01,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299510,A,1,318,45.16,"[236, 0, 82, 0]",AG,0.26,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299513,A,1,314,45.80,"[305, 0, 9, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299543,A,1,303,45.42,"[294, 0, 9, 0]",AG,0.03,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299551,A,1,288,47.07,"[283, 0, 5, 0]",AG,0.02,chrX,...,1,38,41.45,"[38, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37478,chr8,144451628,A,0,276,48.17,"[219, 0, 57, 0]",AG,0.21,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37479,chr8,144457131,A,0,90,46.32,"[11, 0, 79, 0]",AG,0.88,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37480,chr8,144458636,A,0,95,51.41,"[73, 0, 22, 0]",AG,0.23,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37481,chr8,144468337,A,1,396,49.43,"[0, 0, 396, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz


Class        Class_binary
Editing      1               34025
Not-Editing  0                3458
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,324,45.15,"[320, 0, 4, 0]",AG,0.01,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299510,A,1,318,45.16,"[236, 0, 82, 0]",AG,0.26,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299513,A,1,314,45.80,"[305, 0, 9, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299543,A,1,303,45.42,"[294, 0, 9, 0]",AG,0.03,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299551,A,1,288,47.07,"[283, 0, 5, 0]",AG,0.02,chrX,...,1,38,41.45,"[38, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37478,chr8,144451628,A,0,276,48.17,"[219, 0, 57, 0]",AG,0.21,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37479,chr8,144457131,A,0,90,46.32,"[11, 0, 79, 0]",AG,0.88,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37480,chr8,144458636,A,0,95,51.41,"[73, 0, 22, 0]",AG,0.23,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz
37481,chr8,144468337,A,1,396,49.43,"[0, 0, 396, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_2.OvE.outTable_814257267.gz,HEK293T_KO2.KO.outTable_364841872.gz


In [7]:
# couple 3 (KO3 vs OvE3)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK_ADAR1_p110_wt_3.OvE/DnaRna_208420383/outTable_208420383.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO3.KO/DnaRna_597789462/outTable_597789462.gz"
wt_name = "HEK_ADAR1_p110_wt_3.OvE" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO3.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple3 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple3

HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz
HEK293T_KO3.KO.outTable_597789462.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Iteration on tabix outTables finished. Elapsed time:  0:08:33.944579
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299510,A,1,297,46.47,"[222, 0, 75, 0]",AG,0.25,chrX,...,0.00,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
1,chrX,299513,A,1,292,46.71,"[287, 0, 5, 0]",AG,0.02,chrX,...,0.00,chrX,299513,A,1,34,41.35,"[34, 0, 0, 0]",-,0.0
2,chrX,299548,A,1,262,46.18,"[96, 0, 166, 0]",AG,0.63,chrX,...,0.06,chrX,299548,A,1,36,42.81,"[36, 0, 0, 0]",-,0.0
3,chrX,299559,A,1,256,50.97,"[251, 0, 5, 0]",AG,0.02,chrX,...,0.00,chrX,299559,A,1,36,42.69,"[36, 0, 0, 0]",-,0.0
4,chrX,299568,A,1,262,51.22,"[253, 0, 9, 0]",AG,0.03,chrX,...,0.00,chrX,299568,A,1,34,40.47,"[34, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43422,chr8,144977023,A,0,429,46.02,"[422, 0, 7, 0]",AG,0.02,chr8,...,0.00,chr8,144977023,A,0,39,42.26,"[39, 0, 0, 0]",-,0.0
43423,chr8,144977097,A,0,632,46.09,"[628, 0, 4, 0]",AG,0.01,chr8,...,0.00,chr8,144977097,A,0,35,40.74,"[35, 0, 0, 0]",-,0.0
43424,chr8,144977113,A,0,633,47.99,"[627, 0, 6, 0]",AG,0.01,chr8,...,0.00,chr8,144977113,A,0,35,39.77,"[35, 0, 0, 0]",-,0.0
43425,chr8,144977166,A,0,729,48.06,"[725, 0, 4, 0]",AG,0.01,chr8,...,0.00,chr8,144977166,A,0,42,39.95,"[42, 0, 0, 0]",-,0.0


Pos: (31893, 29)
Neg: (3391, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,297,46.47,"[222, 0, 75, 0]",AG,0.25,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299513,A,1,292,46.71,"[287, 0, 5, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299559,A,1,256,50.97,"[251, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299568,A,1,262,51.22,"[253, 0, 9, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299571,A,1,269,50.89,"[262, 0, 7, 0]",AG,0.03,chrX,...,1,37,42.32,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35279,chr8,144458020,A,0,51,49.10,"[0, 0, 51, 0]",AG,1.00,chr8,...,0,57,41.00,"[0, 0, 57, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35280,chr8,144458072,A,0,52,48.37,"[0, 0, 52, 0]",AG,1.00,chr8,...,0,48,41.33,"[0, 0, 48, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35281,chr8,144468337,A,1,377,48.61,"[0, 0, 377, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35282,chr8,144843461,A,1,122,50.05,"[24, 0, 98, 0]",AG,0.80,chr8,...,1,46,40.46,"[19, 0, 27, 0]",AG,0.59,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz


Class        Class_binary
Editing      1               31893
Not-Editing  0                3391
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,297,46.47,"[222, 0, 75, 0]",AG,0.25,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299513,A,1,292,46.71,"[287, 0, 5, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299559,A,1,256,50.97,"[251, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299568,A,1,262,51.22,"[253, 0, 9, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299571,A,1,269,50.89,"[262, 0, 7, 0]",AG,0.03,chrX,...,1,37,42.32,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35279,chr8,144458020,A,0,51,49.10,"[0, 0, 51, 0]",AG,1.00,chr8,...,0,57,41.00,"[0, 0, 57, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35280,chr8,144458072,A,0,52,48.37,"[0, 0, 52, 0]",AG,1.00,chr8,...,0,48,41.33,"[0, 0, 48, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35281,chr8,144468337,A,1,377,48.61,"[0, 0, 377, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz
35282,chr8,144843461,A,1,122,50.05,"[24, 0, 98, 0]",AG,0.80,chr8,...,1,46,40.46,"[19, 0, 27, 0]",AG,0.59,Not-Editing,0,HEK_ADAR1_p110_wt_3.OvE.outTable_208420383.gz,HEK293T_KO3.KO.outTable_597789462.gz


In [8]:
# couple 4 (KO1 vs WT1)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_WT1.WT/DnaRna_599710609/outTable_599710609.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO1.KO/DnaRna_905657585/outTable_905657585.gz"
wt_name = "HEK293T_WT1.WT" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO1.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple4 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple4

HEK293T_WT1.WT.outTable_599710609.gz
HEK293T_KO1.KO.outTable_905657585.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Evaluated sites: 350000000
Iteration on tabix outTables finished. Elapsed time:  0:10:35.784502
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT1.WT.outTable_599710609.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299493,A,1,371,44.71,"[365, 0, 6, 0]",AG,0.02,chrX,...,0.0,chrX,299493,A,1,30,40.47,"[30, 0, 0, 0]",-,0.0
1,chrX,299510,A,1,367,44.73,"[332, 0, 35, 0]",AG,0.10,chrX,...,0.0,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
2,chrX,299513,A,1,367,44.63,"[356, 0, 11, 0]",AG,0.03,chrX,...,0.0,chrX,299513,A,1,34,41.35,"[34, 0, 0, 0]",-,0.0
3,chrX,299540,A,1,361,46.97,"[356, 0, 5, 0]",AG,0.01,chrX,...,0.0,chrX,299540,A,1,37,41.11,"[37, 0, 0, 0]",-,0.0
4,chrX,299543,A,1,367,46.54,"[364, 0, 3, 0]",AG,0.01,chrX,...,0.0,chrX,299543,A,1,39,42.33,"[39, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36345,chr8,144975093,A,0,387,50.61,"[384, 0, 3, 0]",AG,0.01,chr8,...,0.0,chr8,144975093,A,0,41,40.15,"[41, 0, 0, 0]",-,0.0
36346,chr8,144975974,A,0,405,50.03,"[401, 0, 4, 0]",AG,0.01,chr8,...,0.0,chr8,144975974,A,0,39,40.72,"[39, 0, 0, 0]",-,0.0
36347,chr8,144976539,A,0,458,46.62,"[455, 0, 3, 0]",AG,0.01,chr8,...,0.0,chr8,144976539,A,0,30,39.90,"[30, 0, 0, 0]",-,0.0
36348,chr8,144977113,A,0,463,46.75,"[460, 0, 3, 0]",AG,0.01,chr8,...,0.0,chr8,144977113,A,0,35,39.77,"[35, 0, 0, 0]",-,0.0


Pos: (26275, 29)
Neg: (3332, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,371,44.71,"[365, 0, 6, 0]",AG,0.02,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,367,44.73,"[332, 0, 35, 0]",AG,0.10,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,367,44.63,"[356, 0, 11, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,361,46.97,"[356, 0, 5, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,367,46.54,"[364, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29602,chr8,144451628,A,0,550,47.21,"[430, 0, 120, 0]",AG,0.22,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29603,chr8,144454002,A,0,136,43.96,"[0, 0, 136, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29604,chr8,144460013,A,0,82,52.49,"[69, 0, 13, 0]",AG,0.16,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29605,chr8,144468337,A,1,937,47.23,"[3, 0, 934, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz


Class        Class_binary
Editing      1               26275
Not-Editing  0                3332
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT1.WT.outTable_599710609.gz_vs_HEK293T_KO1.KO.outTable_905657585.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,371,44.71,"[365, 0, 6, 0]",AG,0.02,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,367,44.73,"[332, 0, 35, 0]",AG,0.10,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,367,44.63,"[356, 0, 11, 0]",AG,0.03,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,361,46.97,"[356, 0, 5, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,367,46.54,"[364, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29602,chr8,144451628,A,0,550,47.21,"[430, 0, 120, 0]",AG,0.22,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29603,chr8,144454002,A,0,136,43.96,"[0, 0, 136, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29604,chr8,144460013,A,0,82,52.49,"[69, 0, 13, 0]",AG,0.16,chr8,...,0,33,39.91,"[20, 0, 13, 0]",AG,0.39,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz
29605,chr8,144468337,A,1,937,47.23,"[3, 0, 934, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT1.WT.outTable_599710609.gz,HEK293T_KO1.KO.outTable_905657585.gz


In [9]:
# couple 5 (KO2 vs WT2)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_WT2.WT/DnaRna_572868058/outTable_572868058.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO2.KO/DnaRna_364841872/outTable_364841872.gz"
wt_name = "HEK293T_WT2.WT" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO2.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple5 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple5

HEK293T_WT2.WT.outTable_572868058.gz
HEK293T_KO2.KO.outTable_364841872.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Evaluated sites: 200000000
Evaluated sites: 250000000
Iteration on tabix outTables finished. Elapsed time:  0:08:11.969366
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT2.WT.outTable_572868058.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299510,A,1,272,43.13,"[239, 0, 33, 0]",AG,0.12,chrX,...,0.00,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
1,chrX,299513,A,1,263,43.09,"[257, 0, 6, 0]",AG,0.02,chrX,...,0.00,chrX,299513,A,1,34,41.35,"[34, 0, 0, 0]",-,0.0
2,chrX,299540,A,1,291,44.30,"[288, 0, 3, 0]",AG,0.01,chrX,...,0.00,chrX,299540,A,1,37,41.11,"[37, 0, 0, 0]",-,0.0
3,chrX,299548,A,1,270,43.62,"[119, 0, 151, 0]",AG,0.56,chrX,...,0.07,chrX,299548,A,1,36,42.81,"[36, 0, 0, 0]",-,0.0
4,chrX,299559,A,1,272,46.03,"[267, 0, 5, 0]",AG,0.02,chrX,...,0.00,chrX,299559,A,1,36,42.69,"[36, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31205,chr8,144974950,A,0,361,46.19,"[354, 0, 7, 0]",AG,0.02,chr8,...,0.00,chr8,144974950,A,0,45,40.98,"[45, 0, 0, 0]",-,0.0
31206,chr8,144974956,A,0,349,48.98,"[342, 0, 7, 0]",AG,0.02,chr8,...,0.00,chr8,144974956,A,0,50,41.06,"[50, 0, 0, 0]",-,0.0
31207,chr8,144977014,A,0,309,48.34,"[299, 0, 10, 0]",AG,0.03,chr8,...,0.01,chr8,144977014,A,0,40,41.20,"[40, 0, 0, 0]",-,0.0
31208,chr8,144977019,A,0,325,48.14,"[310, 0, 15, 0]",AG,0.05,chr8,...,0.02,chr8,144977019,A,0,42,41.57,"[42, 0, 0, 0]",-,0.0


Pos: (20360, 29)
Neg: (3460, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,272,43.13,"[239, 0, 33, 0]",AG,0.12,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299513,A,1,263,43.09,"[257, 0, 6, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299540,A,1,291,44.30,"[288, 0, 3, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299559,A,1,272,46.03,"[267, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299568,A,1,271,46.60,"[263, 0, 8, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23815,chr8,144454002,A,0,86,45.73,"[0, 0, 86, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23816,chr8,144457131,A,0,78,45.86,"[20, 0, 58, 0]",AG,0.74,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23817,chr8,144458636,A,0,90,44.97,"[77, 0, 13, 0]",AG,0.14,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23818,chr8,144468337,A,1,665,48.87,"[0, 0, 665, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz


Class        Class_binary
Editing      1               20360
Not-Editing  0                3460
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT2.WT.outTable_572868058.gz_vs_HEK293T_KO2.KO.outTable_364841872.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299510,A,1,272,43.13,"[239, 0, 33, 0]",AG,0.12,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
1,chrX,299513,A,1,263,43.09,"[257, 0, 6, 0]",AG,0.02,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
2,chrX,299540,A,1,291,44.30,"[288, 0, 3, 0]",AG,0.01,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
3,chrX,299559,A,1,272,46.03,"[267, 0, 5, 0]",AG,0.02,chrX,...,1,36,42.69,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
4,chrX,299568,A,1,271,46.60,"[263, 0, 8, 0]",AG,0.03,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23815,chr8,144454002,A,0,86,45.73,"[0, 0, 86, 0]",AG,1.00,chr8,...,0,34,38.82,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23816,chr8,144457131,A,0,78,45.86,"[20, 0, 58, 0]",AG,0.74,chr8,...,0,35,39.86,"[14, 0, 21, 0]",AG,0.60,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23817,chr8,144458636,A,0,90,44.97,"[77, 0, 13, 0]",AG,0.14,chr8,...,0,36,42.17,"[19, 0, 17, 0]",AG,0.47,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz
23818,chr8,144468337,A,1,665,48.87,"[0, 0, 665, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT2.WT.outTable_572868058.gz,HEK293T_KO2.KO.outTable_364841872.gz


In [10]:
# couple 6 (KO3 vs WT3)
wt_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_WT3.WT/DnaRna_110067244/outTable_110067244.gz"
ko_fp = "/lustre/bio_running/A_to_I_Pietro/model_test_2023/hek_pecori/HEK293T_KO3.KO/DnaRna_597789462/outTable_597789462.gz"
wt_name = "HEK293T_WT3.WT" + f".{os.path.basename(wt_fp)}"
ko_name = "HEK293T_KO3.KO" + f".{os.path.basename(ko_fp)}"
print(wt_name)
print(ko_name)

couple6 = extract_bonafide(wt_fp, ko_fp, wt_name, ko_name, wgs_fp)
couple6

HEK293T_WT3.WT.outTable_110067244.gz
HEK293T_KO3.KO.outTable_597789462.gz
Evaluated sites: 0
Evaluated sites: 50000000
Evaluated sites: 100000000
Evaluated sites: 150000000
Iteration on tabix outTables finished. Elapsed time:  0:05:45.616095
Save to disk bonafide candidates: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT3.WT.outTable_110067244.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_candidates.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,ko_Frequency,gRegion,gPosition,gReference,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency
0,chrX,299431,A,1,229,49.54,"[226, 0, 3, 0]",AG,0.01,chrX,...,0.00,chrX,299431,A,1,35,40.20,"[35, 0, 0, 0]",-,0.0
1,chrX,299510,A,1,144,45.65,"[121, 0, 23, 0]",AG,0.16,chrX,...,0.00,chrX,299510,A,1,36,41.00,"[36, 0, 0, 0]",-,0.0
2,chrX,299548,A,1,133,47.57,"[67, 0, 66, 0]",AG,0.50,chrX,...,0.06,chrX,299548,A,1,36,42.81,"[36, 0, 0, 0]",-,0.0
3,chrX,299562,A,1,135,52.17,"[131, 0, 4, 0]",AG,0.03,chrX,...,0.01,chrX,299562,A,1,34,42.56,"[34, 0, 0, 0]",-,0.0
4,chrX,299568,A,1,137,52.31,"[132, 0, 5, 0]",AG,0.04,chrX,...,0.00,chrX,299568,A,1,34,40.47,"[34, 0, 0, 0]",-,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26412,chr8,144974950,A,0,230,46.54,"[223, 0, 7, 0]",AG,0.03,chr8,...,0.00,chr8,144974950,A,0,45,40.98,"[45, 0, 0, 0]",-,0.0
26413,chr8,144977014,A,0,185,48.07,"[180, 0, 5, 0]",AG,0.03,chr8,...,0.00,chr8,144977014,A,0,40,41.20,"[40, 0, 0, 0]",-,0.0
26414,chr8,144977019,A,0,179,48.44,"[173, 0, 6, 0]",AG,0.03,chr8,...,0.00,chr8,144977019,A,0,42,41.57,"[42, 0, 0, 0]",-,0.0
26415,chr8,144977023,A,0,175,48.76,"[169, 0, 6, 0]",AG,0.03,chr8,...,0.00,chr8,144977023,A,0,39,42.26,"[39, 0, 0, 0]",-,0.0


Pos: (16835, 29)
Neg: (3229, 29)


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299431,A,1,229,49.54,"[226, 0, 3, 0]",AG,0.01,chrX,...,1,35,40.20,"[35, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299510,A,1,144,45.65,"[121, 0, 23, 0]",AG,0.16,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299568,A,1,137,52.31,"[132, 0, 5, 0]",AG,0.04,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299572,A,1,137,51.77,"[125, 0, 12, 0]",AG,0.09,chrX,...,1,40,42.10,"[40, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299574,A,1,133,51.55,"[130, 0, 3, 0]",AG,0.02,chrX,...,1,37,42.43,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20059,chr8,144425879,A,0,158,40.91,"[124, 0, 34, 0]",AG,0.22,chr8,...,0,39,41.62,"[28, 0, 11, 0]",AG,0.28,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20060,chr8,144440133,A,0,509,47.75,"[1, 0, 508, 0]",AG,1.00,chr8,...,0,59,40.75,"[0, 0, 59, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20061,chr8,144451628,A,0,220,49.34,"[169, 0, 51, 0]",AG,0.23,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20062,chr8,144468337,A,1,422,46.23,"[0, 0, 422, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz


Class        Class_binary
Editing      1               16835
Not-Editing  0                3229
dtype: int64
Save to disk bonafide final: /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/HEK293T_WT3.WT.outTable_110067244.gz_vs_HEK293T_KO3.KO.outTable_597789462.gz.bonafide_final.tsv


Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299431,A,1,229,49.54,"[226, 0, 3, 0]",AG,0.01,chrX,...,1,35,40.20,"[35, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
1,chrX,299510,A,1,144,45.65,"[121, 0, 23, 0]",AG,0.16,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
2,chrX,299568,A,1,137,52.31,"[132, 0, 5, 0]",AG,0.04,chrX,...,1,34,40.47,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
3,chrX,299572,A,1,137,51.77,"[125, 0, 12, 0]",AG,0.09,chrX,...,1,40,42.10,"[40, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
4,chrX,299574,A,1,133,51.55,"[130, 0, 3, 0]",AG,0.02,chrX,...,1,37,42.43,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20059,chr8,144425879,A,0,158,40.91,"[124, 0, 34, 0]",AG,0.22,chr8,...,0,39,41.62,"[28, 0, 11, 0]",AG,0.28,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20060,chr8,144440133,A,0,509,47.75,"[1, 0, 508, 0]",AG,1.00,chr8,...,0,59,40.75,"[0, 0, 59, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20061,chr8,144451628,A,0,220,49.34,"[169, 0, 51, 0]",AG,0.23,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20062,chr8,144468337,A,1,422,46.23,"[0, 0, 422, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz


In [11]:
# concat into a uniq dataset
bonafide_final_full = pd.concat([couple1, couple2, couple3, couple4, couple5, couple6])
bonafide_final_full

Unnamed: 0,wt_Region,wt_Position,wt_Reference,wt_Strand,wt_Coverage-q30,wt_MeanQ,"wt_BaseCount[A,C,G,T]",wt_AllSubs,wt_Frequency,ko_Region,...,gStrand,gCoverage-q30,gMeanQ,"gBaseCount[A,C,G,T]",gAllSubs,gFrequency,Class,Class_binary,wt_sample,ko_sample
0,chrX,299493,A,1,383,46.11,"[372, 0, 11, 0]",AG,0.03,chrX,...,1,30,40.47,"[30, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
1,chrX,299510,A,1,388,45.23,"[281, 0, 107, 0]",AG,0.28,chrX,...,1,36,41.00,"[36, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
2,chrX,299513,A,1,385,45.16,"[381, 0, 4, 0]",AG,0.01,chrX,...,1,34,41.35,"[34, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
3,chrX,299540,A,1,365,47.04,"[359, 0, 6, 0]",AG,0.02,chrX,...,1,37,41.11,"[37, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
4,chrX,299543,A,1,368,47.25,"[365, 0, 3, 0]",AG,0.01,chrX,...,1,39,42.33,"[39, 0, 0, 0]",-,0.00,Editing,1,HEK_ADAR1_p110_wt_1.OvE.outTable_530905096.gz,HEK293T_KO1.KO.outTable_905657585.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20059,chr8,144425879,A,0,158,40.91,"[124, 0, 34, 0]",AG,0.22,chr8,...,0,39,41.62,"[28, 0, 11, 0]",AG,0.28,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20060,chr8,144440133,A,0,509,47.75,"[1, 0, 508, 0]",AG,1.00,chr8,...,0,59,40.75,"[0, 0, 59, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20061,chr8,144451628,A,0,220,49.34,"[169, 0, 51, 0]",AG,0.23,chr8,...,0,43,40.12,"[35, 0, 8, 0]",AG,0.19,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz
20062,chr8,144468337,A,1,422,46.23,"[0, 0, 422, 0]",AG,1.00,chr8,...,1,34,40.50,"[0, 0, 34, 0]",AG,1.00,Not-Editing,0,HEK293T_WT3.WT.outTable_110067244.gz,HEK293T_KO3.KO.outTable_597789462.gz


In [12]:
# save to disk bonafide final
output_file = os.path.join(output_folder, "bonafide_final_MERGED.tsv")
print("Save to disk bonafide final MERGED (all couples):", output_file)
bonafide_final_full.to_csv(output_file, sep="\t", index=None)

Save to disk bonafide final MERGED (all couples): /lustre/bio_running/new_basecaller/REDINET_TEST_30_07_2024/REDInet/Package/Results/hek_pecori/bonafide_final_MERGED.tsv


In [13]:
bonafide_final_full.Class.value_counts()

Editing        165540
Not-Editing     20207
Name: Class, dtype: int64

In [14]:
bonafide_final_full.groupby("Class")[["wt_Frequency", "ko_Frequency", "gFrequency"]].describe().T

Unnamed: 0,Class,Editing,Not-Editing
wt_Frequency,count,165540.0,20207.0
wt_Frequency,mean,0.068067,0.773307
wt_Frequency,std,0.105301,0.29368
wt_Frequency,min,0.01,0.01
wt_Frequency,25%,0.01,0.54
wt_Frequency,50%,0.03,1.0
wt_Frequency,75%,0.07,1.0
wt_Frequency,max,1.0,1.0
ko_Frequency,count,165540.0,20207.0
ko_Frequency,mean,0.0,0.778779
