In [1]:
import pandas as pd
import numpy as np
import os, sys, pysam, gzip, subprocess, shlex, time
from datetime import datetime
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder

def give_inputs(cell_line):
    """Return the input tables and annotation files configured for a cell line.

    Args:
        cell_line: Cell line label. "HEK293T" and "HEK" have dedicated
            configurations; every other value gets the fallback one.

    Returns:
        A tuple ``(samples, rmsk_file, refseq_file)`` where ``samples`` is a
        list of three groups of table names and the other two items are the
        annotation file names used for that cell line.
    """
    # Both non-HEK293T configurations share the same pair of annotation files.
    shared_rmsk = "rmsk.sorted.gtf.gz"
    shared_refseq = "hg19.ncbiRefSeq.sorted.gtf.gz"

    if cell_line == "HEK293T":
        hek293t_tables = [
            ["outTable_599710609", "outTable_905657585", "outTable_208420383"],
            ["outTable_572868058", "outTable_364841872", "outTable_814257267"],
            ["outTable_110067244", "outTable_597789462", "outTable_530905096"],
        ]
        return hek293t_tables, "rmsk_hg38.sorted.gtf.gz", "hg38.110.ncbiRefSeq.sorted.gtf.gz"

    if cell_line == "HEK":
        hek_tables = [
            ["outTable_724242056", "outTable_816573740"],
            ["outTable_580067564", "outTable_718392497"],
            ["outTable_181728208", "outTable_854894021"],
        ]
        return hek_tables, shared_rmsk, shared_refseq

    # Any other label falls through to this configuration.
    fallback_tables = [
        ["outTable_192318299", "outTable_436061877"],
        ["outTable_535670354", "outTable_396704193"],
        ["outTable_773331943", "outTable_302610513"],
    ]
    return fallback_tables, shared_rmsk, shared_refseq

def extraction(prefix):
    """Extract candidate A-to-G sites from ``<prefix>.gz`` and encode their windows.

    The table is read twice: once sequentially (gzip) to select candidate
    rows, then through its tabix index to fetch the rows surrounding each
    candidate.

    A row is a candidate when its region name contains "chr", its coverage
    column is not "-" and is at least 50, its reference base is "A", its
    substitution column is exactly "AG", and the third entry of its base-count
    list is at least 3 and at least 1% of the counts' sum (presumably the
    counts are ordered [A, C, G, T] - confirm against the table format).

    For every candidate outside chrM whose 101-row window is complete and has
    a single strand value, an 8 x 101 matrix (one-hot reference bases stacked
    on per-position base frequencies) is appended to
    ``<prefix>_feature_vectors.tsv`` and a summary row is collected for
    ``<prefix>_intervals.tsv``.

    Args:
        prefix: Path of the table without its ``.gz`` extension; also used as
            the prefix of both output files.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Local import: parses the count lists without evaluating arbitrary code.
    import ast

    cov_threshold = 50
    AGfreq_threshold = 0.01
    AG_min = 3
    interval = 101

    starttime = datetime.now()

    editing = []
    with gzip.open(prefix+".gz") as redi:
        for c, l in enumerate(redi):
            line = l.decode("utf-8").rstrip().split("\t")
            if (
                "chr" in line[0]
                and line[4] != "-"
                and int(line[4]) >= cov_threshold
                and line[2] == "A"
                and line[7] == "AG"
            ):
                # Parse the count list once instead of eval()-ing it per test.
                counts = ast.literal_eval(line[6])
                if counts[2] / sum(counts) >= AGfreq_threshold and counts[2] >= AG_min:
                    editing.append(line)

            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}")
    print("Total evaluated rows:", c)

    columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "Bases", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq"]
    # Naming the columns at construction keeps an empty candidate list valid
    # (assigning 14 names to a 0-column frame raises ValueError).
    editing = pd.DataFrame(editing, columns=columns)
    print("Total extracted Candidates Editing sites for current sample:", editing.shape[0])
    stoptime = datetime.now()
    print(f"[{datetime.now()}] Extraction of Editing Candidates finished for current sample. Elapsed time: {stoptime-starttime}.")
    print(f"[{datetime.now()}] Starting extraction of intervals.")
    ohe = OneHotEncoder()
    ohe.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

    # True division is kept on purpose: Start/Stop/Intlen are written as
    # floats in the intervals file, exactly as before.
    half_window = (interval - 1) / 2

    intervals = []
    starttime_preds = datetime.now()
    total_extracted = 0
    features_extracted_filepath = prefix + "_feature_vectors.tsv"

    df = editing.query("Region != 'chrM'")
    with open(features_extracted_filepath, "w") as features_extracted:
        print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", prefix)
        srr = pysam.TabixFile(prefix+".gz")
        try:
            for site in tqdm(df.itertuples(), total=df.shape[0], position=0, leave=True):
                start = int(site.Position) - half_window
                stop = int(site.Position) + half_window
                if start < 1:
                    # The window would begin before the first base, so it can
                    # never hold `interval` rows; skipping also avoids passing
                    # a negative start to fetch (presumably rejected by pysam).
                    continue

                site_counts = ast.literal_eval(site.Bases)
                AGrna = site_counts[2] / sum(site_counts)

                srr_interval = pd.DataFrame(
                    [s.split("\t") for s in srr.fetch(site.Region, start-1, stop)],
                    columns=columns,
                )
                # Keep only complete windows that lie on a single strand value.
                if srr_interval.shape[0] != interval or len(set(srr_interval["Strand"])) != 1:
                    continue

                intervals.append([site.Region, site.Position, site.Ref, site.Strand, AGrna, site.Bases, start, stop, stop-start + 1, srr_interval.shape[0]])
                total_extracted += 1
                strand = site.Strand

                # 4 x interval one-hot encoding of the reference bases.
                seq = srr_interval.Ref.values.reshape(-1, 1)
                seq_ohe = ohe.transform(seq).toarray().T

                # 4 x interval matrix of per-position base frequencies.
                vects_freqs = []
                for vect in srr_interval["Bases"]:
                    vect = np.array(ast.literal_eval(vect))
                    vects_freqs.append(vect / sum(vect))
                vects_freqs = np.array(vects_freqs).T

                encoded_site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                encoded_site.reset_index(drop=True, inplace=True)
                # NOTE(review): `strand` is a string read from the table, so
                # this comparison with the integer 0 is never true and the
                # flip never runs. Left unchanged because switching to
                # `strand == "0"` would alter the emitted feature vectors;
                # confirm the intended behaviour before changing it.
                if strand == 0:
                    encoded_site = pd.DataFrame(np.flip(encoded_site.values, axis=1))
                encoded_site.to_csv(features_extracted, mode="a", sep="\t", header=None, index=None)
        finally:
            srr.close()

    print(f"[{datetime.now()}] Total extracted Editing sites: {total_extracted}.")
    print(f"[{datetime.now()}] Features Extraction Finished. Elapsed time {datetime.now()-starttime_preds}.")

    intervals = pd.DataFrame(
        intervals,
        columns=["Region", "Position", "RefBase", "Strand", "FreqAGrna", "BasesCounts", "Start", "Stop", "Intlen", "TabixLen"],
    )
    intervals.to_csv(prefix + "_intervals.tsv", sep="\t", index=None)
    print(f"[{datetime.now()}] Computation Finished. Total Elapsed time: {datetime.now()-starttime}")
    
def _label_candidates_against_wgs(table_path, wgs, sites, rna_cov_threshold,
                                  cov_threshold, AGfreq_threshold, AG_min):
    """Scan one gzipped table and append ``[region, position, class]`` rows to ``sites``.

    Returns the index of the last row read, or None if the file has no rows.
    """
    # Local import: parses the count lists without evaluating arbitrary code.
    import ast

    last_index = None
    with gzip.open(table_path) as redi:
        for c, l in enumerate(redi):
            last_index = c
            line = l.decode("utf-8").rstrip().split("\t")
            # NOTE(review): the original test was the chained comparison
            # `"AG" in line[7] == "AG"`, which Python evaluates as
            # `("AG" in line[7]) and (line[7] == "AG")`, i.e. plain equality.
            # That effective behaviour is kept; if multi-substitution rows
            # were meant to qualify as class 0, this needs a deliberate change.
            if (
                "chr" in line[0]
                and line[0] != "chrM"
                and line[4] != "-"
                and int(line[4]) >= rna_cov_threshold
                and line[2] == "A"
                and line[7] == "AG"
            ):
                counts = ast.literal_eval(line[6])
                if counts[2] / sum(counts) >= AGfreq_threshold and counts[2] >= AG_min:
                    position = int(line[1])
                    for ROW_WGS in wgs.fetch(line[0], position - 1, position):
                        row_wgs = ROW_WGS.split("\t")
                        # Columns 9 and 12 of the WGS row are read as coverage
                        # and substitutions (presumably gCov / gAllSubs).
                        if row_wgs[9] != "-" and int(row_wgs[9]) >= cov_threshold:
                            if "AG" in row_wgs[12]:
                                # The WGS row also reports AG: class 0.
                                sites.append([line[0], line[1], 0])
                            elif row_wgs[12] == "-":
                                # The WGS row reports no substitution: class 1.
                                sites.append([line[0], line[1], 1])

            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}")
    return last_index


def candidates_bona_fide_extraction(names, path, cells):
    """Label candidate AG sites of two tables against the WGS table of a cell line.

    Both ``<path>/<names[0]>.gz`` and ``<path>/<names[1]>.gz`` are scanned.
    Rows outside chrM with coverage >= 50, reference "A", substitution "AG",
    at least 3 supporting counts and a frequency >= 0.01 are looked up in
    ``<path>/<cells>_WGS.gz``. When the WGS row has coverage >= 10, the site is
    stored with class 0 if the WGS substitutions contain "AG" and class 1 if
    the WGS row has no substitution ("-").

    The de-duplicated sites are written to
    ``<path>/<names[0]>_<names[1]>_candidates_bona_fide_sites.tsv``.

    Args:
        names: Two table names (without the ``.gz`` extension).
        path: Directory holding the tables and the WGS file.
        cells: Cell line label used to build the WGS file name.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    starttime = datetime.now()
    sites = []

    cov_threshold = 10
    rna_cov_threshold = 50
    AGfreq_threshold = 0.01
    AG_min = 3

    wgs = pysam.TabixFile(f"{path}/{cells}_WGS.gz")
    try:
        # The same scan runs on both tables; hits accumulate in `sites`.
        for table_name in (names[0], names[1]):
            c = _label_candidates_against_wgs(
                f"{path}/{table_name}.gz",
                wgs,
                sites,
                rna_cov_threshold=rna_cov_threshold,
                cov_threshold=cov_threshold,
                AGfreq_threshold=AGfreq_threshold,
                AG_min=AG_min,
            )
    finally:
        wgs.close()

    # As before, only the row counter of the second table is reported.
    print("Total evaluated rows:", c)
    columns = ["Region", "Position", "Class"]
    # Naming the columns at construction keeps an empty result valid.
    sites = pd.DataFrame(sites, columns=columns)
    sites.drop_duplicates(subset=columns, keep="first", inplace=True)
    print("Total extracted candidates bona fide Sites for current samples:", sites.shape[0])
    stoptime = datetime.now()
    print(f"[{datetime.now()}] Extraction of candidates sites finished for current sample. Elapsed time: {stoptime-starttime}.")

    sites.to_csv(f"{path}/{names[0]}_{names[1]}_candidates_bona_fide_sites.tsv", sep="\t", index=None)

# Directory that holds AnnotateTablePython3.py and the annotation GTF files
# passed to it with -a in the cells below.
u_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/scripts/utilities"


In [None]:
cells = "HEK"

# Table groups and annotation file names configured for this cell line.
tables, rmsk, refseq = give_inputs(cells)

path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/{}".format(cells)

# Step 1: candidate extraction and feature encoding, one worker per table.
inputs = [os.path.join(path, table) for group in tables for table in group]

with Pool(9) as pool:
    pool.map(extraction, inputs)

# Step 2: label candidates against the WGS table, one job per table pair.
inputs = []
for group in tables:
    inputs.append([[group[0], group[1]], path, cells])
    if cells == "HEK293T":
        # HEK293T groups hold a third table, paired with the same second one.
        inputs.append([[group[2], group[1]], path, cells])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)

# The annotation script runs with PATH replaced by this conda env's bin dir.
annotation_env = dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/.conda/envs/tf/bin")

# Step 3: annotate every candidate table with the rmsk GTF.
procs = []
for file_name in os.listdir(path):
    if file_name.endswith("_candidates_bona_fide_sites.tsv"):
        # Rewrite the table without its header row before annotating it.
        df = pd.read_csv(os.path.join(path, file_name), sep="\t")
        df.to_csv(os.path.join(path, file_name), sep="\t", index=None, header=False)
        name = file_name.replace(".tsv", "")
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -n rmsk -i {}/{}.tsv -o {}/{}.out.rmsk -u".format(u_path, u_path, rmsk, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

# Wait for the jobs instead of sleeping a fixed 60 s: the next step reads
# their output files, which a slow job would otherwise leave incomplete.
failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"rmsk annotation failed for: {failed}")

# Step 4: annotate the rmsk output with the RefSeq GTF. Matching on the
# suffix avoids re-annotating `.out.rmsk.refseq` files left by earlier runs.
procs = []
for name in os.listdir(path):
    if name.endswith(".out.rmsk"):
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -i {}/{} -o {}/{}.refseq -u".format(u_path, u_path, refseq, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"refseq annotation failed for: {failed}")

# Step 5: give the annotated tables a header and a .tsv name.
cols = ["Region", "Position", "Class", "RMSK-Rep", "RMSK-Reg", "RefSeq-Rep", "RefSeq-Reg"]
for file_name in os.listdir(path):
    if file_name.endswith(".out.rmsk.refseq"):
        df = pd.read_table(os.path.join(path, file_name), header=None)
        name = file_name.replace(".out.rmsk.refseq", "_annoted.tsv")
        df.columns = cols
        df.to_csv(os.path.join(path, name), sep="\t", index=None)

# Step 6: keep sites with both rmsk fields set, plus sites with both rmsk
# fields empty that are either class 0 or class 1 with both RefSeq fields set.
for file_name in os.listdir(path):
    if file_name.endswith("candidates_bona_fide_sites_annoted.tsv"):
        bona_fide = pd.read_csv(os.path.join(path, file_name), sep="\t")

        in_rmsk = (bona_fide["RMSK-Rep"] != "-") & (bona_fide["RMSK-Reg"] != "-")
        no_rmsk = (bona_fide["RMSK-Rep"] == "-") & (bona_fide["RMSK-Reg"] == "-")
        in_refseq = (bona_fide["RefSeq-Rep"] != "-") & (bona_fide["RefSeq-Reg"] != "-")

        rep = bona_fide[in_rmsk]
        non_rep_n = bona_fide[no_rmsk & (bona_fide["Class"] == 0)]
        non_rep_p = bona_fide[no_rmsk & (bona_fide["Class"] == 1) & in_refseq]

        bona_fide = pd.concat([rep, non_rep_n, non_rep_p])

        del rep, non_rep_n, non_rep_p

        name = file_name.replace("candidates_bona_fide_sites_annoted.tsv", "bona_fide_sites.tsv")
        bona_fide = bona_fide.sort_values(["Region", "Position"])
        bona_fide.to_csv(os.path.join(path, name), sep="\t", index=None)

In [None]:
cells = "a549"

# Table groups and annotation file names configured for this cell line.
tables, rmsk, refseq = give_inputs(cells)

path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/{}".format(cells)

# Step 1: candidate extraction and feature encoding, one worker per table.
inputs = [os.path.join(path, table) for group in tables for table in group]

with Pool(9) as pool:
    pool.map(extraction, inputs)

# Step 2: label candidates against the WGS table, one job per table pair.
inputs = []
for group in tables:
    inputs.append([[group[0], group[1]], path, cells])
    if cells == "HEK293T":
        # HEK293T groups hold a third table, paired with the same second one.
        inputs.append([[group[2], group[1]], path, cells])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)

# The annotation script runs with PATH replaced by this conda env's bin dir.
annotation_env = dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/.conda/envs/tf/bin")

# Step 3: annotate every candidate table with the rmsk GTF.
procs = []
for file_name in os.listdir(path):
    if file_name.endswith("_candidates_bona_fide_sites.tsv"):
        # Rewrite the table without its header row before annotating it.
        df = pd.read_csv(os.path.join(path, file_name), sep="\t")
        df.to_csv(os.path.join(path, file_name), sep="\t", index=None, header=False)
        name = file_name.replace(".tsv", "")
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -n rmsk -i {}/{}.tsv -o {}/{}.out.rmsk -u".format(u_path, u_path, rmsk, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

# Wait for the jobs instead of sleeping a fixed 60 s: the next step reads
# their output files, which a slow job would otherwise leave incomplete.
failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"rmsk annotation failed for: {failed}")

# Step 4: annotate the rmsk output with the RefSeq GTF. Matching on the
# suffix avoids re-annotating `.out.rmsk.refseq` files left by earlier runs.
procs = []
for name in os.listdir(path):
    if name.endswith(".out.rmsk"):
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -i {}/{} -o {}/{}.refseq -u".format(u_path, u_path, refseq, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"refseq annotation failed for: {failed}")

# Step 5: give the annotated tables a header and a .tsv name.
cols = ["Region", "Position", "Class", "RMSK-Rep", "RMSK-Reg", "RefSeq-Rep", "RefSeq-Reg"]
for file_name in os.listdir(path):
    if file_name.endswith(".out.rmsk.refseq"):
        df = pd.read_table(os.path.join(path, file_name), header=None)
        name = file_name.replace(".out.rmsk.refseq", "_annoted.tsv")
        df.columns = cols
        df.to_csv(os.path.join(path, name), sep="\t", index=None)

# Step 6: keep sites with both rmsk fields set, plus sites with both rmsk
# fields empty that are either class 0 or class 1 with both RefSeq fields set.
for file_name in os.listdir(path):
    if file_name.endswith("candidates_bona_fide_sites_annoted.tsv"):
        bona_fide = pd.read_csv(os.path.join(path, file_name), sep="\t")

        in_rmsk = (bona_fide["RMSK-Rep"] != "-") & (bona_fide["RMSK-Reg"] != "-")
        no_rmsk = (bona_fide["RMSK-Rep"] == "-") & (bona_fide["RMSK-Reg"] == "-")
        in_refseq = (bona_fide["RefSeq-Rep"] != "-") & (bona_fide["RefSeq-Reg"] != "-")

        rep = bona_fide[in_rmsk]
        non_rep_n = bona_fide[no_rmsk & (bona_fide["Class"] == 0)]
        non_rep_p = bona_fide[no_rmsk & (bona_fide["Class"] == 1) & in_refseq]

        bona_fide = pd.concat([rep, non_rep_n, non_rep_p])

        del rep, non_rep_n, non_rep_p

        name = file_name.replace("candidates_bona_fide_sites_annoted.tsv", "bona_fide_sites.tsv")
        bona_fide = bona_fide.sort_values(["Region", "Position"])
        bona_fide.to_csv(os.path.join(path, name), sep="\t", index=None)

In [2]:
cells = "HEK293T"

# Table groups and annotation file names configured for this cell line.
tables, rmsk, refseq = give_inputs(cells)

path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/{}".format(cells)

# Step 1: candidate extraction and feature encoding, one worker per table.
inputs = [os.path.join(path, table) for group in tables for table in group]

with Pool(9) as pool:
    pool.map(extraction, inputs)

# Step 2: label candidates against the WGS table, one job per table pair.
inputs = []
for group in tables:
    inputs.append([[group[0], group[1]], path, cells])
    if cells == "HEK293T":
        # HEK293T groups hold a third table, paired with the same second one.
        inputs.append([[group[2], group[1]], path, cells])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)

# The annotation script runs with PATH replaced by this conda env's bin dir.
annotation_env = dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/.conda/envs/tf/bin")

# Step 3: annotate every candidate table with the rmsk GTF.
procs = []
for file_name in os.listdir(path):
    if file_name.endswith("_candidates_bona_fide_sites.tsv"):
        # Rewrite the table without its header row before annotating it.
        df = pd.read_csv(os.path.join(path, file_name), sep="\t")
        df.to_csv(os.path.join(path, file_name), sep="\t", index=None, header=False)
        name = file_name.replace(".tsv", "")
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -n rmsk -i {}/{}.tsv -o {}/{}.out.rmsk -u".format(u_path, u_path, rmsk, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

# Wait for the jobs instead of sleeping a fixed 60 s: the next step reads
# their output files, which a slow job would otherwise leave incomplete.
failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"rmsk annotation failed for: {failed}")

# Step 4: annotate the rmsk output with the RefSeq GTF. Matching on the
# suffix avoids re-annotating `.out.rmsk.refseq` files left by earlier runs.
procs = []
for name in os.listdir(path):
    if name.endswith(".out.rmsk"):
        cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -i {}/{} -o {}/{}.refseq -u".format(u_path, u_path, refseq, path, name, path, name)
        procs.append(subprocess.Popen(shlex.split(cmd_sh), env=annotation_env))

failed = [proc.args for proc in procs if proc.wait() != 0]
if failed:
    raise RuntimeError(f"refseq annotation failed for: {failed}")

# Step 5: give the annotated tables a header and a .tsv name.
cols = ["Region", "Position", "Class", "RMSK-Rep", "RMSK-Reg", "RefSeq-Rep", "RefSeq-Reg"]
for file_name in os.listdir(path):
    if file_name.endswith(".out.rmsk.refseq"):
        df = pd.read_table(os.path.join(path, file_name), header=None)
        name = file_name.replace(".out.rmsk.refseq", "_annoted.tsv")
        df.columns = cols
        df.to_csv(os.path.join(path, name), sep="\t", index=None)

# Step 6 (first filtering): keep sites with both rmsk fields set, plus sites
# with both rmsk fields empty that are either class 0 or class 1 with both
# RefSeq fields set.
for file_name in os.listdir(path):
    if file_name.endswith("candidates_bona_fide_sites_annoted.tsv"):
        bona_fide = pd.read_csv(os.path.join(path, file_name), sep="\t")

        in_rmsk = (bona_fide["RMSK-Rep"] != "-") & (bona_fide["RMSK-Reg"] != "-")
        no_rmsk = (bona_fide["RMSK-Rep"] == "-") & (bona_fide["RMSK-Reg"] == "-")
        in_refseq = (bona_fide["RefSeq-Rep"] != "-") & (bona_fide["RefSeq-Reg"] != "-")

        rep = bona_fide[in_rmsk]
        non_rep_n = bona_fide[no_rmsk & (bona_fide["Class"] == 0)]
        non_rep_p = bona_fide[no_rmsk & (bona_fide["Class"] == 1) & in_refseq]

        bona_fide = pd.concat([rep, non_rep_n, non_rep_p])

        del rep, non_rep_n, non_rep_p

        # Written as `*_first_filtering.tsv` because that is the file the
        # merge step below reads; the previous `*_bona_fide_sites.tsv` name
        # left that step without an input produced by this cell.
        name = file_name.replace("candidates_bona_fide_sites_annoted.tsv", "bona_fide_sites_first_filtering.tsv")
        bona_fide = bona_fide.sort_values(["Region", "Position"])
        bona_fide.to_csv(os.path.join(path, name), sep="\t", index=None)

# Step 7: within each group, class-1 sites without rmsk annotation are kept
# only when the identical row is present in both the `wt` and `ove` tables
# (presumably wild-type and overexpression samples - confirm).
for group in tables:
    wt = pd.read_csv(os.path.join(path, f"{group[0]}_{group[1]}_bona_fide_sites_first_filtering.tsv"), sep="\t")
    ove = pd.read_csv(os.path.join(path, f"{group[2]}_{group[1]}_bona_fide_sites_first_filtering.tsv"), sep="\t")

    wt_in_rmsk = (wt["RMSK-Rep"] != "-") & (wt["RMSK-Reg"] != "-")
    wt_no_rmsk = (wt["RMSK-Rep"] == "-") & (wt["RMSK-Reg"] == "-")
    wt_rep = wt[wt_in_rmsk]
    wt_not_rep_p = wt[(wt["Class"] == 1) & wt_no_rmsk]
    wt_not_rep_n = wt[(wt["Class"] == 0) & wt_no_rmsk]

    ove_in_rmsk = (ove["RMSK-Rep"] != "-") & (ove["RMSK-Reg"] != "-")
    ove_no_rmsk = (ove["RMSK-Rep"] == "-") & (ove["RMSK-Reg"] == "-")
    ove_rep = ove[ove_in_rmsk]
    ove_not_rep_p = ove[(ove["Class"] == 1) & ove_no_rmsk]
    ove_not_rep_n = ove[(ove["Class"] == 0) & ove_no_rmsk]

    # Inner merge on every column keeps only rows identical in both tables.
    not_rep_p = wt_not_rep_p.merge(ove_not_rep_p, how="inner", on=cols)

    wt = pd.concat([wt_rep, not_rep_p, wt_not_rep_n], axis=0)
    ove = pd.concat([ove_rep, not_rep_p, ove_not_rep_n], axis=0)

    wt.to_csv(os.path.join(path, f"{group[0]}_{group[1]}_bona_fide_sites.tsv"), sep="\t", index=None)
    ove.to_csv(os.path.join(path, f"{group[2]}_{group[1]}_bona_fide_sites.tsv"), sep="\t", index=None)

	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0




	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
Total evaluated rows: 194677901
Total extracted Candidates Editing sites 

 19%|█▉        | 2195/11701 [00:08<00:27, 349.57it/s]

	Sites evaluated: 200000000
Total evaluated rows: 199831579
Total extracted Candidates Editing sites for current sample: 24397
[2024-03-13 15:20:58.671977] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:28.110396.
[2024-03-13 15:20:58.675555] Starting extraction of intervals.


 19%|█▉        | 2231/11701 [00:08<00:27, 347.30it/s]

[2024-03-13 15:20:58.697273] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_110067244


  0%|          | 41/24391 [00:00<01:00, 402.56it/s]s]

	Sites evaluated: 200000000


  0%|          | 120/24391 [00:00<01:12, 334.34it/s]]

	Sites evaluated: 200000000


  1%|          | 295/24391 [00:01<02:00, 199.92it/s]]

	Sites evaluated: 200000000


  3%|▎         | 830/24391 [00:02<01:01, 381.90it/s]]

	Sites evaluated: 200000000


  4%|▎         | 869/24391 [00:03<01:18, 300.26it/s]]

	Sites evaluated: 200000000


 32%|███▏      | 3744/11701 [00:12<00:22, 349.82it/s]

	Sites evaluated: 200000000


 60%|██████    | 7061/11701 [00:22<00:13, 337.61it/s]

Total evaluated rows: 205675117
Total extracted Candidates Editing sites for current sample: 11897
[2024-03-13 15:21:13.160204] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:42.599172.


 18%|█▊        | 4476/24391 [00:14<01:02, 319.39it/s]

[2024-03-13 15:21:13.165135] Starting extraction of intervals.


 61%|██████    | 7106/11701 [00:22<00:12, 367.50it/s]

[2024-03-13 15:21:13.189624] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_364841872


 22%|██▏       | 5300/24391 [00:16<00:55, 342.30it/s]

Total evaluated rows: 208788107


 22%|██▏       | 5342/24391 [00:16<00:52, 363.08it/s]

Total extracted Candidates Editing sites for current sample: 11086
[2024-03-13 15:21:15.618514] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:45.056736.
[2024-03-13 15:21:15.621970] Starting extraction of intervals.


 68%|██████▊   | 7916/11701 [00:25<00:11, 330.84it/s]

[2024-03-13 15:21:15.638323] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_597789462


100%|██████████| 11701/11701 [00:36<00:00, 324.17it/s]


[2024-03-13 15:21:26.599138] Total extracted Editing sites: 5094.
[2024-03-13 15:21:26.603694] Features Extraction Finished. Elapsed time 0:00:36.220906.


 32%|███▏      | 3497/11079 [00:10<00:26, 289.39it/s]

[2024-03-13 15:21:26.635652] Computation Finished. Total Elapsed time: 0:06:56.075431


100%|██████████| 11079/11079 [00:33<00:00, 331.62it/s]


[2024-03-13 15:21:49.201906] Total extracted Editing sites: 4770.
[2024-03-13 15:21:49.206691] Features Extraction Finished. Elapsed time 0:00:33.580917.
[2024-03-13 15:21:49.237716] Computation Finished. Total Elapsed time: 0:07:18.675954


100%|██████████| 11891/11891 [00:37<00:00, 317.86it/s]
 69%|██████▊   | 16741/24391 [00:51<00:21, 359.58it/s]

[2024-03-13 15:21:50.689859] Total extracted Editing sites: 5077.
[2024-03-13 15:21:50.693687] Features Extraction Finished. Elapsed time 0:00:37.523664.
[2024-03-13 15:21:50.725102] Computation Finished. Total Elapsed time: 0:07:20.164089


100%|██████████| 24391/24391 [01:14<00:00, 327.62it/s]


[2024-03-13 15:22:13.313147] Total extracted Editing sites: 9960.
[2024-03-13 15:22:13.318799] Features Extraction Finished. Elapsed time 0:01:14.639913.
[2024-03-13 15:22:13.372075] Computation Finished. Total Elapsed time: 0:07:42.810522
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
Total evaluated rows: 266283468
Total extracted Candidates Editing sites for current sample: 54617
[2024-03-13 15:23:11.794610] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:08:41.234186.
[2024-03-13 15:23:11.798365] Starting extraction of intervals.
[2024-03-13 15:23:11.840604] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_208420383


 22%|██▏       | 12113/54614 [00:39<02:12, 319.63it/s]

Total evaluated rows: 288775068


 22%|██▏       | 12147/54614 [00:39<02:11, 324.10it/s]

Total extracted Candidates Editing sites for current sample: 37723
[2024-03-13 15:23:51.209800] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:09:20.649137.
[2024-03-13 15:23:51.214471] Starting extraction of intervals.
[2024-03-13 15:23:51.246582] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_572868058


 15%|█▍        | 5650/37720 [00:19<02:17, 232.72it/s]]

Total evaluated rows: 298335470
Total extracted Candidates Editing sites for current sample: 66236
[2024-03-13 15:24:10.803807] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:09:40.242378.
[2024-03-13 15:24:10.808023] Starting extraction of intervals.


 33%|███▎      | 18212/54614 [00:58<02:02, 296.05it/s]

[2024-03-13 15:24:10.854412] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_814257267


 15%|█▌        | 5748/37720 [00:19<01:46, 300.91it/s]][W::hts_idx_load3] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_814257267.gz.tbi
  0%|          | 312/66233 [00:00<03:12, 342.90it/s]]]

Total evaluated rows: 298198400


 16%|█▌        | 5997/37720 [00:20<01:53, 278.42it/s]

Total extracted Candidates Editing sites for current sample: 67410
[2024-03-13 15:24:11.936566] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:09:41.374607.
[2024-03-13 15:24:11.941197] Starting extraction of intervals.

  1%|          | 347/66233 [00:01<03:28, 315.65it/s]




 34%|███▍      | 18540/54614 [01:00<02:10, 276.60it/s]

[2024-03-13 15:24:11.984454] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_530905096


  1%|          | 380/66233 [00:01<04:51, 225.64it/s]s][W::hts_idx_load3] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_530905096.gz.tbi
  3%|▎         | 2032/67408 [00:06<03:18, 329.63it/s]]

	Sites evaluated: 300000000

 38%|███▊      | 20591/54614 [01:06<01:37, 350.20it/s]




100%|██████████| 37720/37720 [02:03<00:00, 305.26it/s]


[2024-03-13 15:25:55.021996] Total extracted Editing sites: 15241.
[2024-03-13 15:25:55.027303] Features Extraction Finished. Elapsed time 0:02:03.809630.


 93%|█████████▎| 50818/54614 [02:43<00:12, 298.21it/s]

[2024-03-13 15:25:55.106218] Computation Finished. Total Elapsed time: 0:11:24.545583


 46%|████▌     | 31052/67408 [01:44<01:48, 333.63it/s]

	Sites evaluated: 350000000


100%|██████████| 54614/54614 [02:55<00:00, 311.85it/s]


[2024-03-13 15:26:07.103990] Total extracted Editing sites: 22910.
[2024-03-13 15:26:07.109484] Features Extraction Finished. Elapsed time 0:02:55.307312.


 52%|█████▏    | 34669/66233 [01:56<01:30, 348.46it/s]

[2024-03-13 15:26:07.220994] Computation Finished. Total Elapsed time: 0:11:36.660601


 63%|██████▎   | 42668/67408 [02:21<01:41, 243.20it/s]

Total evaluated rows: 369524874


 65%|██████▍   | 42837/66233 [02:22<01:11, 325.44it/s]

Total extracted Candidates Editing sites for current sample: 54971
[2024-03-13 15:26:33.754138] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:12:03.194117.
[2024-03-13 15:26:33.758422] Starting extraction of intervals.


 63%|██████▎   | 42695/67408 [02:21<01:39, 249.39it/s]

[2024-03-13 15:26:33.799813] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_599710609


100%|██████████| 66233/66233 [03:39<00:00, 301.32it/s]


[2024-03-13 15:27:50.739831] Total extracted Editing sites: 27907.
[2024-03-13 15:27:50.743865] Features Extraction Finished. Elapsed time 0:03:39.931942.


 98%|█████████▊| 66092/67408 [03:38<00:04, 278.90it/s]

[2024-03-13 15:27:50.871458] Computation Finished. Total Elapsed time: 0:13:20.310058


100%|██████████| 67408/67408 [03:42<00:00, 302.42it/s]
 43%|████▎     | 23384/54968 [01:20<03:02, 173.06it/s]

[2024-03-13 15:27:54.957750] Total extracted Editing sites: 28354.
[2024-03-13 15:27:54.962866] Features Extraction Finished. Elapsed time 0:03:43.018178.


 43%|████▎     | 23422/54968 [01:21<02:19, 226.34it/s]

[2024-03-13 15:27:55.096165] Computation Finished. Total Elapsed time: 0:13:24.534234


100%|██████████| 54968/54968 [03:07<00:00, 292.79it/s]


[2024-03-13 15:29:41.756411] Total extracted Editing sites: 22284.
[2024-03-13 15:29:41.760744] Features Extraction Finished. Elapsed time 0:03:07.998795.
[2024-03-13 15:29:41.867681] Computation Finished. Total Elapsed time: 0:15:11.307689
	Sites evaluated: 0	Sites evaluated: 0
	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0




	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 0
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites ev

Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:11
Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:12
Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:12
Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:12
Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:12
Pysam version used: 0.21.0
Script time --> START: 13/03/2024 15:50:12
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_110067244_outTable_597789462_candidates_bona_fide_sites.out.rmsk
Script time --> END: 13/03/2024 15:50:17
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_572868058_outTable_364841872_candidates_bona_fide_sites.out.rmsk
Script time --> END: 13/03/2024 15:50:19
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_599710609_outTable_905657585_candidates_b