In [1]:
# Copyright (c) 2023-2024 Pietro Luca Mazzacuva <pietroluca.mazzacuva@unicampus.it>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import pandas as pd
import numpy as np
import os, sys, pysam, gzip, subprocess, shlex, time
from datetime import datetime
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder

def give_inputs(cell_line):
    
    files_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/{}".format(cells)
    
    if cell_line == "HEK293T":  
        samples = [["outTable_599710609", "outTable_905657585", "outTable_208420383"],
                   ["outTable_572868058", "outTable_364841872", "outTable_814257267"],
                   ["outTable_110067244", "outTable_597789462", "outTable_530905096"]]

        rmsk_file = "rmsk_hg38.sorted.gtf.gz"
        refseq_file = "hg38.110.ncbiRefSeq.sorted.gtf.gz"

    elif cell_line == "HEK":
        samples = [["outTable_724242056", "outTable_816573740"],
                   ["outTable_580067564", "outTable_718392497"],
                   ["outTable_181728208", "outTable_854894021"]]
        rmsk_file = "rmsk.sorted.gtf.gz"
        refseq_file = "hg19.ncbiRefSeq.sorted.gtf.gz"

    else:
        samples = [["outTable_192318299", "outTable_436061877"],
                   ["outTable_535670354", "outTable_396704193"],
                   ["outTable_773331943", "outTable_302610513"]]
        rmsk_file = "rmsk.sorted.gtf.gz"
        refseq_file = "hg19.ncbiRefSeq.sorted.gtf.gz"
    
    return samples, rmsk_file, refseq_file, files_path

def extraction(prefix,  AG_min, AGfreq_threshold, cov_threshold, interval):    
    
    starttime = datetime.now()

    editing = []
                                         
    with gzip.open(prefix+".gz") as redi:
        for c,l in enumerate(redi):
            line = l.decode("utf-8").rstrip().split("\t")
            if line[2] == "A":
                if line[4] != "-":
                    if int(line[4]) >= cov_threshold:
                        if "AG" in line[7]:    
                            AG_rna = eval(line[6])[2]/sum(eval(line[6]))
                            if AG_rna >= AGfreq_threshold:
                                if eval(line[6])[2] >= AG_min:
                                    editing.append(line)

            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}")
    print("Total evaluated rows:", c)
    editing = pd.DataFrame(editing)
    print("Total extracted Candidates Editing sites for current sample:", editing.shape[0])
    stoptime = datetime.now()
    print(f"[{datetime.now()}] Extraction of Editing Candidates finished for current sample. Elapsed time: {stoptime-starttime}.")
    columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "Bases", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq"]
    editing.columns = columns
    print(f"[{datetime.now()}] Starting extraction of intervals.")
    ohe = OneHotEncoder()
    ohe.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

    intervals = []
    starttime_preds = datetime.now()
    total_extracted = 0
    features_extracted_filepath = prefix+ "_feature_vectors.tsv"
    features_extracted = open(features_extracted_filepath, "w")

    df = editing.query("Region != 'chrM'")
    print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", prefix)
    start_time = datetime.now()
    srr = pysam.TabixFile(prefix+".gz")
    with tqdm(total=df.shape[0], position=0, leave=True) as pbar:
        for site in df.itertuples():
            start = int(site.Position) - ((interval-1)/2)
            stop = int(site.Position) + ((interval-1)/2)
            AGrna = eval(site.Bases)[2]/sum(eval(site.Bases))
            srr_interval = []
            for s in srr.fetch(site.Region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns)
            if srr_interval.shape[0] == interval and len(set(srr_interval["Strand"])) == 1:
                intervals.append([site.Region, site.Position, site.Ref, site.Strand, AGrna, site.Bases, start, stop, stop-start + 1, srr_interval.shape[0]])
                total_extracted += 1
                strand = site.Strand
                seq = srr_interval.Ref.values.reshape(-1,1)
                seq_ohe = ohe.transform(seq).toarray().T
                vects_freqs = []
                strands = []
                vects = []
                for vect in srr_interval["Bases"]:
                    vect = np.array(eval(vect))
                    cov = sum(vect)
                    vect_freqs = vect / cov
                    vects_freqs.append(vect_freqs)
                    vects.append(vect)
                vects_freqs = np.array(vects_freqs).T
                vects = np.array(vects).T
                encoded_site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                encoded_site.reset_index(drop=True, inplace=True)
                if strand == 0: 
                    encoded_site = pd.DataFrame(np.flip(encoded_site.values, axis=1))
                encoded_site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
    intervals = pd.DataFrame(intervals)
    print(f"[{datetime.now()}] Total extracted Editing sites: {total_extracted}.")
    stop_time_global = datetime.now()
    print(f"[{datetime.now()}] Features Extraction Finished. Elapsed time {datetime.now()-starttime_preds}.")
    features_extracted.close()
    
    intervals.columns = ["Region", "Position", "RefBase", "Strand", "FreqAGrna", "BasesCounts", "Start", "Stop", "Intlen", "TabixLen"]
    intervals.to_csv(prefix + "_intervals.tsv", sep="\t", index=None)
    print(f"[{datetime.now()}] Computation Finished. Total Elapsed time: {datetime.now()-starttime}")

def candidates_bona_fide_extraction(name1, name2, path, cells, AG_min, AGfreq_threshold, cov_threshold, rna_cov_threshold):    

    starttime = datetime.now()
    sites = []
    
    wgs = pysam.TabixFile(f"{path}/{cells}_WGS.gz")   
    inactive = pysam.TabixFile(f"{path}/{name2}.gz")
    with gzip.open(f"{path}/{name1}.gz") as redi:

        for c,l in enumerate(redi):
            line = l.decode("utf-8").rstrip().split("\t")
            if line[0].find("chr") != -1:
                if line[2] == "A":
                    if line[4] != "-":
                        if int(line[4]) >= cov_threshold:
                            if "AG" in line[7]:    
                                AG_rna = eval(line[6])[2]/sum(eval(line[6]))
                                if AG_rna >= AGfreq_threshold:
                                    if eval(line[6])[2] >= AG_min:
                                        region = line[0]
                                        start = int(line[1])-1
                                        stop = int(line[1])
                                        for ROW_inactive in inactive.fetch(region, start, stop):
                                            row_inactive = ROW_inactive.split("\t")
                                            if row_inactive[4] != "-":
                                                if int(row_inactive[4]) >= rna_cov_threshold:
                                                    sub_inactive = eval(row_inactive[6])[2]
                                                    for ROW_WGS in wgs.fetch(region, start, stop):
                                                        row_wgs = ROW_WGS.split("\t")
                                                        if row_wgs[9] !=  "-":
                                                            if int(row_wgs[9])>=cov_threshold:
                                                                sub_wgs = eval(row_wgs[11])[2]
                                                                if sub_wgs == 0:
                                                                    if sub_inactive == 0:
                                                                        sites.append([line[0], line[1], 1])
                                                                else:
                                                                    if sub_inactive >= AG_min:
                                                                        sites.append([line[0], line[1], 0])
                                                                                                               
            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}")
     
    sites = pd.DataFrame(sites)
    sites.columns = ["Region", "Position", "Class"]
    positives = sites[sites.loc[:, "Class"]==1].shape[0]
    negatives = sites[sites.loc[:, "Class"]==0].shape[0]
    print("Total evaluated rows:", c)
    print(f"Total extracted candidates bona fide Sites for {name1} {name2} samples: {sites.shape[0]}")
    print(f"Total candidates positives bona fide Sites for {name1} {name2} samples: {positives}")
    print(f"Total candidates negatives bona fide Sites for {name1} {name2} samples: {negatives}")
    stoptime = datetime.now()
    
    print(f"[{datetime.now()}] Extraction of candidates bonafide sites finished for {name1} {name2} samples. Elapsed time: {stoptime-starttime}.")
    sites.to_csv(f"{path}/{name1}_{name2}_candidates_bona_fide_sites.tsv", sep="\t", index=None)
                                                                                 
def bonafide_identification(path, rmsk, refseq):
                           
    u_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/scripts/utilities"                                                                            
                                                                                 
    for file_name in os.listdir(path):
        if file_name.find("_candidates_bona_fide_sites.tsv") !=-1:
            df = pd.read_csv(os.path.join(path, file_name), sep="\t")
            df.to_csv(os.path.join(path, file_name), sep="\t", index=None, header=False)
            name = file_name.replace(".tsv", "")
            cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -n rmsk -i {}/{}.tsv -o {}/{}.out.rmsk -u".format(u_path, u_path, rmsk, path, name, path, name)
            args = shlex.split(cmd_sh)
            p = subprocess.Popen(args, env=dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/anaconda3/envs/tensorflow-gpu/bin"))

    time.sleep(300)

    for name in os.listdir(path):
        if name.find("rmsk") != -1:
            cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/{} -i {}/{} -o {}/{}.refseq -u".format(u_path, u_path, refseq, path, name, path, name)
            args = shlex.split(cmd_sh)
            p = subprocess.Popen(args, env=dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/anaconda3/envs/tensorflow-gpu/bin"))

    time.sleep(300)

    cols = ["Region", "Position", "Class", "RMSK-Rep", "RMSK-Reg", "RefSeq-Rep", "RefSeq-Reg"]
    for file_name in os.listdir(path):
        if file_name.find(".refseq") !=-1:
            df = pd.read_table(os.path.join(path, file_name), header=None)
            name = file_name.replace(".out.rmsk.refseq", "_annoted.tsv")
            df.columns = cols
            df.to_csv(os.path.join(path, name), sep="\t", index=None)

    for file_name in os.listdir(path):
        if file_name.find("candidates_bona_fide_sites_annoted.tsv") !=-1:
            bona_fide = pd.read_csv(os.path.join(path, file_name), sep="\t")
            rep = bona_fide[(bona_fide.iloc[:, 3] != "-") & (bona_fide.iloc[:, 4] != "-")]
            non_rep = bona_fide[(bona_fide.iloc[:, 3] == "-") & (bona_fide.iloc[:, 4] == "-")]

            del bona_fide

            non_rep_n = non_rep[non_rep.iloc[:, 2]==0]
            non_rep_p = non_rep[non_rep.iloc[:, 2]==1]

            del non_rep

            non_rep_p = non_rep_p[(non_rep_p.iloc[:, 5] != "-") & (non_rep_p.iloc[:, 6] != "-")]
            bona_fide = pd.concat([rep, non_rep_n, non_rep_p])

            del rep, non_rep_n, non_rep_p

            name = file_name.replace("candidates_bona_fide_sites_annoted.tsv", "bona_fide_sites.tsv")
            bona_fide = bona_fide.sort_values(["Region", "Position"])
            bona_fide.to_csv(os.path.join(path, name), sep="\t", index=None)

In [2]:
min_dna_cov = 10
min_rna_cov = 50
min_AG_rate = 0.01
min_G = 3
seq_lenght = 101

cells = "a549"

samples, rmsk_file_name, refseq_file_name, filespath = give_inputs(cells)
    
inputs = []
for i in range(len(samples)):
    for j in range(len(samples[i])):
        inputs.append([os.path.join(filespath, f"{samples[i][j]}"), min_G, min_AG_rate, min_rna_cov, seq_lenght])

with Pool(9) as pool:
    pool.starmap(extraction, inputs)
    
inputs = []
for i in range(3):
    inputs.append([samples[i][0], samples[i][1], filespath, cells, min_G, min_AG_rate, min_dna_cov, min_rna_cov])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)     

bonafide_identification(filespath, rmsk_file_name, refseq_file_name)

	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0


	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
Total evaluated rows: 216398797
Total extracted Candidates Editing sites for current sample: 6686
[2024-07-26 02:08:59.269623] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:05:51.648

  0%|          | 22/6673 [00:00<00:31, 213.35it/s][W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_436061877.gz.tbi
100%|██████████| 6673/6673 [00:18<00:00, 352.68it/s]


[2024-07-26 02:09:18.320402] Total extracted Editing sites: 2823.
[2024-07-26 02:09:18.321805] Features Extraction Finished. Elapsed time 0:00:19.046335.
[2024-07-26 02:09:18.340506] Computation Finished. Total Elapsed time: 0:06:10.719114
Total evaluated rows: 235536788
Total extracted Candidates Editing sites for current sample: 16915
[2024-07-26 02:09:32.739629] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:25.118002.
[2024-07-26 02:09:32.741382] Starting extraction of intervals.
[2024-07-26 02:09:32.754313] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_535670354


  0%|          | 20/16904 [00:00<01:34, 179.13it/s][W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_535670354.gz.tbi
  2%|▏         | 276/16904 [00:00<00:40, 414.41it/s]

Total evaluated rows: 236217106
Total extracted Candidates Editing sites for current sample: 15740
[2024-07-26 02:09:33.617210] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:25.995222.
[2024-07-26 02:09:33.619052] Starting extraction of intervals.
[2024-07-26 02:09:33.631254] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_773331943


  0%|          | 20/15728 [00:00<01:20, 194.04it/s]][W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_773331943.gz.tbi
 12%|█▏        | 1835/15728 [00:05<00:40, 344.62it/s]

Total evaluated rows: 239780038
Total extracted Candidates Editing sites for current sample: 8840
[2024-07-26 02:09:38.861141] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:31.239291.


 13%|█▎        | 2190/16904 [00:06<00:46, 319.23it/s]

[2024-07-26 02:09:38.862909] Starting extraction of intervals.
[2024-07-26 02:09:38.872090] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_396704193


 10%|▉         | 864/8828 [00:02<00:21, 374.06it/s]s]

Total evaluated rows: 241233789
Total extracted Candidates Editing sites for current sample: 

 17%|█▋        | 2674/15728 [00:07<00:34, 383.82it/s]

9315
[2024-07-26 02:09:41.287051] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:33.664895.

 18%|█▊        | 3059/16904 [00:08<00:40, 345.33it/s]


[2024-07-26 02:09:41.289096] Starting extraction of intervals.
[2024-07-26 02:09:41.299469] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_302610513


 18%|█▊        | 1602/8828 [00:04<00:19, 366.54it/s]]

Total evaluated rows: 241316303
Total extracted Candidates Editing sites for current sample: 19104
[2024-07-26 02:09:43.231616] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:06:35.610306.
[2024-07-26 02:09:43.233672] Starting extraction of intervals.


 22%|██▏       | 3733/16904 [00:10<00:40, 325.28it/s]

[2024-07-26 02:09:43.247178] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_192318299


100%|██████████| 8828/8828 [00:23<00:00, 378.64it/s]s]


[2024-07-26 02:10:02.225161] Total extracted Editing sites: 3734.
[2024-07-26 02:10:02.226623] Features Extraction Finished. Elapsed time 0:00:23.361816.
[2024-07-26 02:10:02.271931] Computation Finished. Total Elapsed time: 0:06:54.650107


100%|██████████| 9304/9304 [00:24<00:00, 374.98it/s]s]


[2024-07-26 02:10:06.177406] Total extracted Editing sites: 3958.
[2024-07-26 02:10:06.178861] Features Extraction Finished. Elapsed time 0:00:24.887606.
[2024-07-26 02:10:06.202314] Computation Finished. Total Elapsed time: 0:06:58.580180

 77%|███████▋  | 12100/15728 [00:32<00:09, 365.83it/s]




100%|██████████| 15728/15728 [00:42<00:00, 373.92it/s]


[2024-07-26 02:10:15.745586] Total extracted Editing sites: 6688.
[2024-07-26 02:10:15.747250] Features Extraction Finished. Elapsed time 0:00:42.125606.
[2024-07-26 02:10:15.783542] Computation Finished. Total Elapsed time: 0:07:08.161574


100%|██████████| 16904/16904 [00:44<00:00, 377.04it/s]


[2024-07-26 02:10:17.642503] Total extracted Editing sites: 7089.
[2024-07-26 02:10:17.643959] Features Extraction Finished. Elapsed time 0:00:44.900081.
[2024-07-26 02:10:17.678892] Computation Finished. Total Elapsed time: 0:07:10.057284


100%|██████████| 19093/19093 [00:50<00:00, 376.33it/s]


[2024-07-26 02:10:34.082861] Total extracted Editing sites: 7976.
[2024-07-26 02:10:34.084546] Features Extraction Finished. Elapsed time 0:00:50.848787.
[2024-07-26 02:10:34.125533] Computation Finished. Total Elapsed time: 0:07:26.504246
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0


[W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_436061877.gz.tbi


	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
Total evaluated rows: 235536788
Total extracted candidates bona fide Sites for outTable_535670354 outTable_396704193 samples: 9105
Total candidates positives bona fide Sites for outTable_535670354 outTable_396704193 samples: 6060Total evaluated rows:
Total candidates negatives bona fide Sites for outTable_535670354 outTable_396704193 samples: 3045
 [2024-07-26 02:18:44.158986] Extraction of candidates bonafide sites finished for outTable_535670354 outTable_396704193 samples. Elapsed time: 0:08:09.935500.236217106

Total extracted candidates bona fide Sites for outTable_773331943 outTable_302610513 samples: 8961
Total candidates positives bona fide Sites for o

Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:19:03
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_192318299_outTable_436061877_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:19:24
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:19:03
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_773331943_outTable_302610513_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:19:26
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:19:03
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/a549/outTable_535670354_outTable_396704193_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:19:27
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:24:02
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_d

In [3]:
min_dna_cov = 10
min_rna_cov = 50
min_AG_rate = 0.01
min_G = 3
seq_lenght = 101

cells = "HEK293T"

samples, rmsk_file_name, refseq_file_name, filespath = give_inputs(cells)

inputs = []
for i in range(len(samples)):
    for j in range(len(samples[i])):
        inputs.append([os.path.join(filespath, f"{samples[i][j]}"), min_G, min_AG_rate, min_rna_cov, seq_lenght])

with Pool(9) as pool:
    pool.starmap(extraction, inputs)
    
inputs = []
for i in range(3):
    inputs.append([samples[i][0], samples[i][1], filespath, cells, min_G, min_AG_rate, min_dna_cov, min_rna_cov])
    inputs.append([samples[i][2], samples[i][1], filespath, cells, min_G, min_AG_rate, min_dna_cov, min_rna_cov])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)     

bonafide_identification(filespath, rmsk_file_name, refseq_file_name)

	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
Total evaluated rows: 194677901
Total extracted Candidates Editing sites 

 22%|██▏       | 2626/12116 [00:08<00:25, 370.47it/s]

	Sites evaluated: 200000000

 22%|██▏       | 2671/12116 [00:08<00:24, 393.24it/s]




 30%|██▉       | 3585/12116 [00:11<00:21, 389.97it/s]

	Sites evaluated: 200000000


 30%|██▉       | 3625/12116 [00:11<00:23, 369.03it/s]

	Sites evaluated: 200000000
	Sites evaluated: 200000000


 34%|███▍      | 4095/12116 [00:12<00:20, 385.78it/s]

	Sites evaluated: 200000000


 34%|███▍      | 4147/12116 [00:12<00:18, 422.19it/s]

Total evaluated rows: 199831579
Total extracted Candidates Editing sites for current sample: 24950
[2024-07-26 02:34:31.030527] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:05:27.828328.
[2024-07-26 02:34:31.032403] Starting extraction of intervals.
[2024-07-26 02:34:31.047963] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_110067244


 38%|███▊      | 4641/12116 [00:13<00:20, 359.05it/s]

	Sites evaluated: 200000000


 44%|████▍     | 5323/12116 [00:15<00:16, 410.75it/s]

	Sites evaluated: 200000000


 61%|██████    | 7332/12116 [00:20<00:12, 380.73it/s]

Total evaluated rows: 

  7%|▋         | 1811/24936 [00:07<01:03, 363.37it/s]

205675117
Total extracted Candidates Editing sites for current sample: 12340
[2024-07-26 02:34:39.046937] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:05:35.845007.
[2024-07-26 02:34:39.048965] Starting extraction of intervals.
[2024-07-26 02:34:39.059941] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_364841872


  3%|▎         | 360/12324 [00:01<00:45, 262.64it/s]]

Total evaluated rows: 208788107
Total extracted Candidates Editing sites for current sample: 11497
[2024-07-26 02:34:41.168633] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:05:37.966320.
[2024-07-26 02:34:41.170384] Starting extraction of intervals.
[2024-07-26 02:34:41.180118] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_597789462


100%|██████████| 12116/12116 [00:32<00:00, 369.48it/s]


[2024-07-26 02:34:51.114058] Total extracted Editing sites: 5265.
[2024-07-26 02:34:51.115648] Features Extraction Finished. Elapsed time 0:00:32.893675.
[2024-07-26 02:34:51.145426] Computation Finished. Total Elapsed time: 0:05:47.943941


100%|██████████| 12324/12324 [00:33<00:00, 369.49it/s]


[2024-07-26 02:35:12.686625] Total extracted Editing sites: 5266.
[2024-07-26 02:35:12.688165] Features Extraction Finished. Elapsed time 0:00:33.636295.
[2024-07-26 02:35:12.717166] Computation Finished. Total Elapsed time: 0:06:09.515253


100%|██████████| 11482/11482 [00:32<00:00, 356.74it/s]


[2024-07-26 02:35:13.478918] Total extracted Editing sites: 4943.
[2024-07-26 02:35:13.480383] Features Extraction Finished. Elapsed time 0:00:32.307812.


 60%|█████▉    | 14864/24936 [00:42<00:26, 386.24it/s]

[2024-07-26 02:35:13.507009] Computation Finished. Total Elapsed time: 0:06:10.304713


100%|██████████| 24936/24936 [01:07<00:00, 367.18it/s]


[2024-07-26 02:35:39.405683] Total extracted Editing sites: 10159.
[2024-07-26 02:35:39.408142] Features Extraction Finished. Elapsed time 0:01:08.373746.
[2024-07-26 02:35:39.460398] Computation Finished. Total Elapsed time: 0:06:36.258221
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
Total evaluated rows: 266283468
Total extracted Candidates Editing sites for current sample: 56231
[2024-07-26 02:36:23.320695] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:07:20.119042.
[2024-07-26 02:36:23.323184] Starting extraction of intervals.
[2024-07-26 02:36:23.365294] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_208420383


 18%|█▊        | 10399/56216 [00:30<01:57, 388.88it/s]

Total evaluated rows: 288775068
Total extracted Candidates Editing sites for current sample: 39037
[2024-07-26 02:36:54.469171] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:07:51.267379.
[2024-07-26 02:36:54.470987] Starting extraction of intervals.
[2024-07-26 02:36:54.492693] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_572868058


 29%|██▉       | 16466/56216 [00:47<01:44, 380.32it/s]

Total evaluated rows: 298335470
Total extracted Candidates Editing sites for current sample: 68163
[2024-07-26 02:37:11.413010] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:08:08.210927.

 15%|█▌        | 5946/39024 [00:16<01:50, 298.55it/s]


[2024-07-26 02:37:11.414780] Starting extraction of intervals.
[2024-07-26 02:37:11.447786] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_814257267


 29%|██▉       | 16540/56216 [00:48<02:03, 322.26it/s][W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_814257267.gz.tbi
 30%|███       | 16866/56216 [00:48<01:40, 393.22it/s]

Total evaluated rows: 298198400
Total extracted Candidates Editing sites for current sample: 69467
[2024-07-26 02:37:12.543361] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:08:09.341047.
[2024-07-26 02:37:12.545044] Starting extraction of intervals.

  1%|          | 349/68149 [00:00<03:02, 371.03it/s]




 30%|███       | 16915/56216 [00:49<01:33, 420.30it/s]

[2024-07-26 02:37:12.578558] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_530905096


 30%|███       | 17001/56216 [00:49<01:34, 417.10it/s][W::hts_idx_load2] The index file is older than the data file: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_530905096.gz.tbi
 33%|███▎      | 18281/56216 [00:52<01:40, 376.78it/s]

	Sites evaluated: 300000000


 95%|█████████▌| 37090/39024 [01:43<00:05, 362.64it/s]

	Sites evaluated: 350000000


100%|██████████| 39024/39024 [01:49<00:00, 356.93it/s]


[2024-07-26 02:38:43.955937] Total extracted Editing sites: 15794.
[2024-07-26 02:38:43.957827] Features Extraction Finished. Elapsed time 0:01:49.484880.


 47%|████▋     | 32103/68149 [01:32<01:36, 371.85it/s]

[2024-07-26 02:38:44.035496] Computation Finished. Total Elapsed time: 0:09:40.833734


100%|██████████| 56216/56216 [02:38<00:00, 355.29it/s]


[2024-07-26 02:39:01.786862] Total extracted Editing sites: 23605.
[2024-07-26 02:39:01.788743] Features Extraction Finished. Elapsed time 0:02:38.462631.


 54%|█████▍    | 37790/69453 [01:49<01:15, 416.70it/s]

[2024-07-26 02:39:01.899408] Computation Finished. Total Elapsed time: 0:09:58.697789


 59%|█████▉    | 40903/69453 [01:57<01:15, 375.94it/s]

Total evaluated rows: 369524874
Total extracted Candidates Editing sites for current sample: 56798
[2024-07-26 02:39:10.444545] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:10:07.243209.

 61%|██████    | 41665/68149 [01:58<01:12, 363.80it/s]


[2024-07-26 02:39:10.446261] Starting extraction of intervals.
[2024-07-26 02:39:10.473980] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_599710609


100%|██████████| 68149/68149 [03:14<00:00, 351.13it/s]
 96%|█████████▋| 66946/69453 [03:13<00:07, 319.24it/s]

[2024-07-26 02:40:25.714782] Total extracted Editing sites: 28704.
[2024-07-26 02:40:25.716658] Features Extraction Finished. Elapsed time 0:03:14.299941.


 96%|█████████▋| 66979/69453 [03:13<00:08, 297.22it/s]

[2024-07-26 02:40:25.846722] Computation Finished. Total Elapsed time: 0:11:22.644663


100%|██████████| 69453/69453 [03:20<00:00, 346.46it/s]


[2024-07-26 02:40:33.149605] Total extracted Editing sites: 29211.
[2024-07-26 02:40:33.151731] Features Extraction Finished. Elapsed time 0:03:20.604811.


 48%|████▊     | 27155/56785 [01:22<01:16, 386.13it/s]

[2024-07-26 02:40:33.286016] Computation Finished. Total Elapsed time: 0:11:30.083733


100%|██████████| 56785/56785 [02:50<00:00, 332.95it/s]


[2024-07-26 02:42:01.251908] Total extracted Editing sites: 23062.
[2024-07-26 02:42:01.253946] Features Extraction Finished. Elapsed time 0:02:50.805704.
[2024-07-26 02:42:01.358765] Computation Finished. Total Elapsed time: 0:12:58.157452
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0	Sites evaluated: 0

	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
Total evaluated rows: 199831579
Total extracted candidates bona fide Sites for outTable_110067244 outTable_597789

Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:55:42
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_110067244_outTable_597789462_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:56:37
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:55:42
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_572868058_outTable_364841872_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:56:41
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:55:42
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK293T/outTable_599710609_outTable_905657585_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 02:56:51
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 02:55:42
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/inde

In [36]:
min_dna_cov = 10
min_rna_cov = 30
min_AG_rate = 0.01
min_G = 2
seq_lenght = 101

cells = "HEK"

samples, rmsk_file_name, refseq_file_name, filespath = give_inputs(cells)
    
inputs = []
for i in range(len(samples)):
    for j in range(len(samples[i])):
        inputs.append([os.path.join(filespath, f"{samples[i][j]}"), min_G, min_AG_rate, min_rna_cov, seq_lenght])

with Pool(9) as pool:
    pool.starmap(extraction, inputs)
    
inputs = []
for i in range(3):
    inputs.append([samples[i][0], samples[i][1], filespath, cells, min_G, min_AG_rate, min_dna_cov, min_rna_cov])

with Pool(6) as pool:
    pool.starmap(candidates_bona_fide_extraction, inputs)     

bonafide_identification(filespath, rmsk_file_name, refseq_file_name)

	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0	Sites evaluated: 0

	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 300000000
	Sites evaluated: 

 17%|█▋        | 884/5315 [00:07<00:17, 255.93it/s]

Total evaluated rows: 422584480
Total extracted Candidates Editing sites for current sample: 13236
[2024-07-26 01:13:16.339988] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:11:17.750366.
[2024-07-26 01:13:16.342021] Starting extraction of intervals.
[2024-07-26 01:13:16.356252] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_181728208


100%|██████████| 5315/5315 [00:26<00:00, 202.84it/s]]


[2024-07-26 01:13:35.303211] Total extracted Editing sites: 2275.
[2024-07-26 01:13:35.304847] Features Extraction Finished. Elapsed time 0:00:26.299238.
[2024-07-26 01:13:35.322283] Computation Finished. Total Elapsed time: 0:11:36.732979


 95%|█████████▌| 12596/13223 [00:40<00:01, 358.58it/s]

	Sites evaluated: 450000000


 97%|█████████▋| 12877/13223 [00:41<00:00, 362.13it/s]

	Sites evaluated: 450000000


100%|██████████| 13223/13223 [00:42<00:00, 308.80it/s]


[2024-07-26 01:13:59.317995] Total extracted Editing sites: 5727.
[2024-07-26 01:13:59.319477] Features Extraction Finished. Elapsed time 0:00:42.975225.
[2024-07-26 01:13:59.351890] Computation Finished. Total Elapsed time: 0:12:00.762286
	Sites evaluated: 450000000
	Sites evaluated: 450000000
Total evaluated rows: 452378572
Total extracted Candidates Editing sites for current sample: 8716
[2024-07-26 01:14:05.888374] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:12:07.299556.
[2024-07-26 01:14:05.889963] Starting extraction of intervals.
[2024-07-26 01:14:05.913438] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_724242056


 55%|█████▍    | 4758/8703 [00:21<00:13, 289.60it/s]

Total evaluated rows: 469580145
Total extracted Candidates Editing sites for current sample: 6749
[2024-07-26 01:14:28.567701] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:12:29.978706.
[2024-07-26 01:14:28.569167] Starting extraction of intervals.
[2024-07-26 01:14:28.580087] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_816573740


100%|██████████| 8703/8703 [00:36<00:00, 236.35it/s]


[2024-07-26 01:14:42.827701] Total extracted Editing sites: 3636.
[2024-07-26 01:14:42.829221] Features Extraction Finished. Elapsed time 0:00:36.937226.
[2024-07-26 01:14:42.851109] Computation Finished. Total Elapsed time: 0:12:44.262308


 91%|█████████ | 6106/6737 [00:23<00:01, 349.26it/s]

Total evaluated rows: 483808594
Total extracted Candidates Editing sites for current sample: 12924


 91%|█████████ | 6142/6737 [00:23<00:02, 287.50it/s]

[2024-07-26 01:14:52.321966] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:12:53.732797.
[2024-07-26 01:14:52.324085] Starting extraction of intervals.
[2024-07-26 01:14:52.345980] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_580067564


100%|██████████| 6737/6737 [00:26<00:00, 253.78it/s]


[2024-07-26 01:14:55.207704] Total extracted Editing sites: 2909.
[2024-07-26 01:14:55.209059] Features Extraction Finished. Elapsed time 0:00:26.638099.
[2024-07-26 01:14:55.227999] Computation Finished. Total Elapsed time: 0:12:56.639017


 63%|██████▎   | 8193/12912 [00:30<00:18, 250.23it/s]

	Sites evaluated: 500000000


100%|██████████| 12912/12912 [00:47<00:00, 273.37it/s]


[2024-07-26 01:15:39.661464] Total extracted Editing sites: 5601.
[2024-07-26 01:15:39.663265] Features Extraction Finished. Elapsed time 0:00:47.337177.
[2024-07-26 01:15:39.700134] Computation Finished. Total Elapsed time: 0:13:41.110998
Total evaluated rows: 519011879
Total extracted Candidates Editing sites for current sample: 8560
[2024-07-26 01:15:53.493626] Extraction of Editing Candidates finished for current sample. Elapsed time: 0:13:54.903883.
[2024-07-26 01:15:53.495171] Starting extraction of intervals.
[2024-07-26 01:15:53.505571] Loading reditable with tabix and pysam: /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_854894021


100%|██████████| 8542/8542 [00:41<00:00, 206.81it/s]


[2024-07-26 01:16:34.879787] Total extracted Editing sites: 3867.
[2024-07-26 01:16:34.881134] Features Extraction Finished. Elapsed time 0:00:41.384062.
[2024-07-26 01:16:34.905574] Computation Finished. Total Elapsed time: 0:14:36.315844
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 250000000
	Sites evaluated: 300000000
	Sites evaluated: 300000000
	Sites evaluated: 300000000
	Sites evaluated: 350000000
	Sites evaluated: 350000000
	Sites evaluated: 350000000
	Sites evaluated: 400000000
	Sites evaluated: 400000000
	Sites evaluated: 400000000
Total evaluated rows: 422584480

Pysam version used: 0.15.4
Script time --> START: 26/07/2024 01:31:51
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_580067564_outTable_718392497_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 01:32:13
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 01:31:51
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_724242056_outTable_816573740_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 01:32:15
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 01:31:51
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_datasets/HEK/outTable_181728208_outTable_854894021_candidates_bona_fide_sites.out.rmsk
Script time --> END: 26/07/2024 01:32:29
Pysam version used: 0.15.4
Script time --> START: 26/07/2024 01:36:50
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/independent_data