In [8]:
import os, sys, time, gzip 
import pandas as pd
import os.path
from multiprocessing import Pool

def strandness_evaluation(input_path, sample_id):
    TypeOfRun = "Unstranded"
    with gzip.open(os.path.join(input_path, sample_id, sample_id+".gz")) as redi:
        for c,l in enumerate(redi):
            line = l.decode("utf-8").rstrip().split("\t")
            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}", flush=True)
            if str(line[3]) != "2":
                TypeOfRun = "Stranded"
                break
    return TypeOfRun, sample_id

execution_start = time.time()

tissue = "fallopian_tube"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
files_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_dataset_files".format(tissue, tissue)
clusters_path =  "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_clusters".format(tissue, tissue)

samples_read = pd.read_csv(os.path.join(files_path, f"{tissue}_REDIportal.csv"))
print("N° candidates samples:", flush=True)
print(f"{len(samples_read)}", flush=True)

samples_read.dropna(inplace=True)
samples_read.reset_index(drop=True, inplace=True)
samples_read['Type of Run'] = "-"

print("N° samples with WGS:", flush=True)
print(f"{len(samples_read)}", flush=True)

inputs = []
for sample in samples_read.loc[:, "Sample"].tolist():
    inputs.append([tables_path, sample])

with Pool(32) as pool:
    for strandness, ID in pool.starmap(strandness_evaluation, inputs):
        samples_read.at[samples_read.loc[samples_read['Sample'] == ID].index[0], 'Type of Run'] = strandness

samples_read.to_csv (os.path.join(files_path, tissue+"_strandness_filtered.csv"), index=None)

samples_read = samples_read[samples_read.iloc[:,14] == 'Unstranded']
print("N° unstranded samples with WGS:", flush=True)
print(f"{len(samples_read)}", flush=True)


for sample in samples_read.loc[:, "Sample"].tolist():

    if os.path.exists(os.path.join(tables_path, sample, "EditingGood/editing.gz"))==False:
        print(f"Sample {sample} doesen't have the file editing.gz", flush=True)
        samples_read.drop(samples_read[samples_read.loc[:, "Sample"] ==  sample].index, axis=0, inplace=True)

    elif os.path.exists(os.path.join(tables_path, sample, sample+"_dna.txt.gz"))==False:
        print(f"Sample {sample} doesen't have the file {sample}_dna.txt.gz", flush=True)
        samples_read.drop(samples_read[samples_read.loc[:, "Sample"] ==  sample].index, axis=0, inplace=True)

samples_read.to_csv(os.path.join(files_path, tissue+"_complete_samples.csv"), index=None)

print(f"N° viable samples:", flush=True)
print(f"{len(samples_read)}", flush=True)

n_clusters = len(samples_read) // 40
clusters_dimensions = []
if n_clusters > 0:
    rest = len(samples_read) % 40
    additional = rest // n_clusters
    if additional > 0:
        if additional <= 10:
            rest_add = rest % n_clusters
            if rest_add > n_clusters:
                rest_add_add = rest_add // n_clusters
                rest_add_rest = rest_add % n_clusters
                lenght_cluster = 40 + additional + rest_add_add
                for n in range(n_clusters):
                    if rest_add > 0:
                        clusters_dimensions.append(lenght_cluster + 1)
                        rest_add_rest = rest_add_rest - 1
                    else:
                        clusters_dimensions.append(lenght_cluster)
            else:
                lenght_cluster = 40 + additional
                for n in range(n_clusters):
                    if rest_add > 0:
                        clusters_dimensions.append(lenght_cluster + 1)
                        rest_add = rest_add - 1
                    else:
                        clusters_dimensions.append(lenght_cluster)
        else:
            max_lenght_cluster = 40
            min_lenght_cluster = rest
            for n in range(n_clusters):
                clusters_dimensions.append(max_lenght_cluster)
            n_clusters = n_clusters +1
            clusters_dimensions.append(min_lenght_cluster)
    else:
        lenght_cluster = 40
        for n in range(n_clusters):
            if rest > 0:
                clusters_dimensions.append(lenght_cluster + 1)
                rest = rest - 1
            else:
                clusters_dimensions.append(lenght_cluster)
else:
    clusters_dimensions.append(len(samples_read))
    n_clusters = 1

samples_read = samples_read.reset_index(drop=True)
start = 0
end = 0
lenght = 0

for cluster in range(n_clusters):
    end = start + clusters_dimensions[cluster]
    new_cluster = samples_read.iloc[start:end]
    new_cluster.to_csv(os.path.join(clusters_path, f"{tissue}_cluster_{cluster}.csv"), index=None)
    start = end
    lenght = lenght + clusters_dimensions[cluster]

print(f"N° samples in clusters:", flush=True)
print(f"{lenght}", flush=True)
print("N° clusters:", flush=True)
print(f"{n_clusters}", flush=True)

execution_end = time.time()
print(f"Time elapsed: {(execution_end-execution_start)/60} minutes")


N° candidates samples:
7
N° samples with WGS:
7
	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0


	Sites evaluated: 0
	Sites evaluated: 0


	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 250000000
	Sites 

In [20]:
import os, gzip, sys, time
import pandas as pd
import numpy as np
from multiprocessing import Pool

def positives_finder(sample_id, input_path, output_path):

    df = pd.read_table(os.path.join(input_path, sample_id, "EditingGood/editing.gz"), sep="\t", 
                       compression="gzip", header=None, usecols=[0, 1, 7, 14, 15]) 
    print(f"Sample {sample_id} uploaded", flush=True)
    df.columns = ["Region", "Position", "Sub", "Rep", "Reg"]
    df = df[(df.loc[:, "Sub"]=="AG") | (df.loc[:, "Sub"]=="TC")]
    alu = df[df.loc[:,"Reg"].str.find("Alu")!=-1]
    not_rep = df[(df.loc[:,"Rep"].str.find("-")!=-1) & (df.loc[:, "Reg"].str.find("-")!=-1)]
    not_alu = df[(df.loc[:, "Rep"].str.find("-")==-1) & (df.loc[:, "Reg"].str.find("-")==-1) 
                 & (df.loc[:, "Reg"].str.find("Alu")==-1)]

    if alu.shape[0] > 0:
        alu.reset_index(drop=True, inplace=True)
        alu.loc[:, "Ref_Base"] = "T"
        alu.loc[:, "Annotation"] = "ALU"
        alu.loc[:, "Class"] = "ALU_POSITIVE"
        index = alu[alu.loc[:, "Sub"] == "AG"].index.tolist()
        alu.iloc[index, 5] = "A"

    if not_alu.shape[0] > 0:
        not_alu.reset_index(drop=True, inplace=True)   
        not_alu.loc[:, "Ref_Base"] = "T"
        not_alu.loc[:, "Annotation"] = "NOT_ALU"
        not_alu.loc[:, "Class"] = "NOT_ALU_POSITIVE" 
        index = not_alu[not_alu.loc[:, "Sub"] == "AG"].index.tolist()
        not_alu.iloc[index, 5] = "A"
        alu = pd.concat([alu, not_alu], axis=0)

    if not_rep.shape[0] > 0:
        not_rep.reset_index(drop=True, inplace=True)   
        not_rep.loc[:, "Ref_Base"] = "T"
        not_rep.loc[:, "Annotation"] = "NOT_REP"
        not_rep.loc[:, "Class"] = "NOT_REP_POSITIVE" 
        index = not_rep[not_rep.loc[:, "Sub"] == "AG"].index.tolist()
        not_rep.iloc[index, 5] = "A"
        alu = pd.concat([alu, not_rep], axis=0)
       
    del not_alu, not_rep

    alu.drop(["Sub", "Rep", "Reg"], axis=1, inplace=True)

    recoding = pd.read_csv("/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/additional_recoding_list.tsv", sep="\t")
   
    df = pd.read_table(os.path.join(input_path, sample_id, sample_id+"_dna.txt.gz"), sep="\t", 
                       compression="gzip", header=None, usecols=[0, 1, 7, 9, 12])
    df.columns = ["Region", "Position", "Sub_RNA", "G_Cov", "Sub_WGS"]
    
    not_rep = df[((df["Sub_RNA"]=="AG") | (df["Sub_RNA"]=="TC")) & (df["Sub_WGS"]=="-") & (df["G_Cov"]!="-")]
    del df

    not_rep = not_rep.merge(recoding, how="inner", on=["Region", "Position"])
    if not_rep.shape[0] > 0:
        not_rep.drop(["G_Cov", "Sub_WGS"], axis=1, inplace=True)
        not_rep.reset_index(drop=True, inplace=True)   
        not_rep.loc[:, "Ref_Base"] = "T"
        not_rep.loc[:, "Annotation"] = "NOT_ALU"
        not_rep.loc[:, "Class"] = "NOT_REP_POSITIVE" 
        index = not_rep[not_rep.loc[:, "Sub_RNA"] == "AG"].index.tolist()
        not_rep.iloc[index, 3] = "A"
        not_rep.drop("Sub_RNA", axis=1, inplace=True)
    alu = pd.concat([alu, not_rep], axis=0)

    print(f"N° positives canditates in sample {sample_id}: {alu.shape[0]}", flush=True)
    alu.to_csv(os.path.join(output_path, f"{sample_id}_positives_candidates_list.tsv"), sep="\t", index=None)
    return "Sample {} compleated".format(sample_id)
                
tissue, cluster_number = "fallopian_tube", "0"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()

start = time.time()

with Pool(10) as pool:
    for result in pool.starmap(positives_finder, [[sample, tables_path, samples_path] for sample in samples]):
        print(result, flush=True)
        
end = time.time()
print(f"Time elapsed: {round((end-start)/60, 1)} minutes", flush=True)


Sample SRR1101693 uploaded
Sample SRR1074140 uploaded
Sample SRR811938 uploaded
Sample SRR1082520 uploaded
Sample SRR1083776 uploaded
Sample SRR1071359 uploaded
Sample SRR1076584 uploaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

N° positives canditates in sample SRR1101693: 32041
N° positives canditates in sample SRR1082520: 49730


  return list(itertools.starmap(args[0], args[1]))


N° positives canditates in sample SRR811938: 64853


  return list(itertools.starmap(args[0], args[1]))


N° positives canditates in sample SRR1083776: 57361
N° positives canditates in sample SRR1074140: 72567
N° positives canditates in sample SRR1076584: 91555
N° positives canditates in sample SRR1071359: 126386
Sample SRR1071359 compleated
Sample SRR1074140 compleated
Sample SRR1076584 compleated
Sample SRR1082520 compleated
Sample SRR1083776 compleated
Sample SRR1101693 compleated
Sample SRR811938 compleated
Time elapsed: 5.4 minutes


In [21]:
import os, gzip, sys
import pandas as pd
import numpy as np
from multiprocessing import Pool

def finder(sample_id, input_path, output_path):

    regions_positions = np.empty((0,5))
    with gzip.open(os.path.join(input_path, sample_id, sample_id+"_dna.txt.gz")) as redi:
        for c,l in enumerate(redi):
            line = l.decode("utf-8").rstrip().split("\t")
            if line[2] == "A":
                if line[4] != "-" and line[9] != "-":
                    if int(line[4]) >= 50 and int(line[9]) >= 10:
                        if "AG" in line[7]:
                            if "AG" in line[12]:
                                AG_rna = eval(line[6])[2]/sum(eval(line[6]))
                                AG_wgs = eval(line[11])[2]/sum(eval(line[11]))
                                if AG_wgs >= 0.4:
                                    if (AG_rna / AG_wgs <= 1.05) and (AG_rna / AG_wgs >= 0.95):
                                        regions_positions = np.append(regions_positions, [[line[0], int(line[1]), "A", "NOT_ANNOTED", "SNP"]], axis=0)
            if c % 50000000 == 0:
                print(f"\tSites evaluated: {c}")

    print(f"N° SNPs candidates in sample {sample_id}: {regions_positions.shape[0]}", flush=True)
    coordinates_list = pd.DataFrame(data=regions_positions)
    coordinates_list.columns = ["Region", "Position", "Ref_Base", "Annotation", "Class"]
    coordinates_list.to_csv(os.path.join(output_path, f"{sample_id}_snps_candidates_list.tsv"), sep="\t", index=None)
    del coordinates_list, regions_positions

    return "Sample {} compleated".format(sample_id)

tissue, cluster_number = "fallopian_tube", "0"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()


with Pool(15) as pool:
    for result in pool.starmap(finder, [[sample, tables_path, samples_path] for sample in samples]):
        print(result, flush=True)


	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0	Sites evaluated: 0
	Sites evaluated: 0





	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
	Sites evaluated: 200000000
N° SNPs candidates in sample SRR1082520: 940
N° SNPs candidates in sample SRR110169

In [15]:
import os, sys, time
import pandas as pd
from multiprocessing import Pool

def zeros_finder(sample_id, input_path, output_path):
    zeros = pd.read_table(os.path.join(input_path, sample_id, sample_id+"_dna.txt.gz"), compression="gzip", 
                          header=None, sep="\t", usecols=[0,1,2,4,7,9,12], low_memory=False)
    print(f"Sample {sample_id} uploaded", flush=True)
    zeros.columns = [f"{i}" for i in range(zeros.shape[1])]
    zeros = zeros[(zeros["2"]=="A") & (zeros["3"]!="-") & (zeros["5"]!="-") & (zeros["4"]=="-") & (zeros["6"]=="-")]
    zeros.reset_index(drop=True, inplace=True)
    zeros["3"] = zeros["3"].astype("int32")
    zeros["5"] = zeros["5"].astype("int32")
    zeros = zeros[(zeros["3"]>=200) & (zeros["5"]>=30)]
    zeros = zeros.loc[:, ["0", "1"]]
    zeros.columns = ["Region", "Position"]
    zeros["Ref_Base"] = "A"
    zeros["Annotation"] = "NOT_ANNOTED"
    zeros["Class"] = "ZERO_NEGATIVE"
    
    print(f"N° zero editing candidates in sample {sample_id}: {zeros.shape[0]}", flush=True)
    zeros.to_csv(os.path.join(output_path, f"{sample_id}_zeros_candidates_list.tsv"), sep="\t", index=None)
     
    return "Sample {} compleated".format(sample_id)  

tissue, cluster_number = "fallopian_tube", "0"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()

start = time.time()
with Pool(7) as pool:
    for result in pool.starmap(zeros_finder, [[sample, tables_path, samples_path] for sample in samples]):
        print(result, flush=True)
end = time.time()
print(f"Time elapsed: {(end-start)/60} minutes")

Sample SRR1082520 uploaded
Sample SRR1101693 uploaded
Sample SRR811938 uploaded
Sample SRR1083776 uploaded
Sample SRR1074140 uploaded
N° zero editing candidates in sample SRR1082520: 381421
N° zero editing candidates in sample SRR1101693: 173800
Sample SRR1076584 uploaded
N° zero editing candidates in sample SRR811938: 566117
N° zero editing candidates in sample SRR1083776: 492454
Sample SRR1071359 uploaded
N° zero editing candidates in sample SRR1074140: 474455
N° zero editing candidates in sample SRR1076584: 349353
N° zero editing candidates in sample SRR1071359: 411256
Sample SRR1071359 compleated
Sample SRR1074140 compleated
Sample SRR1076584 compleated
Sample SRR1082520 compleated
Sample SRR1083776 compleated
Sample SRR1101693 compleated
Sample SRR811938 compleated
Time elapsed: 6.655000257492065 minutes


In [16]:
import pandas as pd
import os, subprocess, shlex, time
from multiprocessing import Pool

def reader(path, sample):            
    temp = pd.read_csv(os.path.join(path, f"{str(sample)}_zeros_candidates_list.tsv"), 
                                                sep="\t", usecols=["Region", "Position"])
    temp["Sample"] = str(sample)
    temp.Position = temp.Position.astype("int32")
    temp.Region = temp.Region.astype("str")
    return temp, "\tSample {} uploaded".format(sample)

start = time.time()

tissue = "fallopian_tube"
tissues_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues"
tissue_path = os.path.join(tissues_path, f"{tissue}/{tissue}_samples_lists")
cluster_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_clusters".format(tissue, tissue)
u_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/scripts/utilities"
        
samples_list = pd.DataFrame()
for name in os.listdir(cluster_path):
    temp = pd.read_csv(f"{cluster_path}/{name}")
    samples_list = pd.concat([samples_list, temp], axis=0)
samples_list.reset_index(drop=True, inplace=True)

body_sites = samples_list.loc[:, "Body Site"].value_counts().to_dict()

for body_site in list(body_sites.keys()):
    print(f"Analisys on :{body_site}")

    body_sites_samples = samples_list[samples_list["Body Site"]==body_site]
    body_sites_samples = body_sites_samples["Sample"].tolist()

    df = pd.DataFrame()
    inputs = []
    for sample in body_sites_samples:
        inputs.append([tissue_path, sample])
    with Pool(15) as pool:
        for data, _ in pool.starmap(reader, inputs):
            print(_, flush=True)
            df = pd.concat([df, data], axis=0)
    df.reset_index(drop=True, inplace=True)

    dup = df.duplicated(subset = ["Region", "Position"], keep=False)
    index = dup[dup.loc[:]==True].index.tolist()
    df = df.iloc[index, :]
    df.reset_index(drop=True, inplace=True)
    contigs = df.iloc[:, 0].value_counts().index.tolist()
    
    final = pd.DataFrame()

    for contig in contigs:
        index = df[df.loc[:, "Region"]==contig].index.tolist()
        counts = df.iloc[index, 1].value_counts()
        temp = pd.DataFrame(data=[[contig for i in range(counts.shape[0])], counts.index.tolist(), counts.loc[:].tolist()]).T
        temp.columns = ["Region", "Position", "Raw Count"]
        temp.loc[:, "Raw Count"] = temp["Raw Count"].astype("int32")
        temp = temp[temp.loc[:, "Raw Count"]>1]
        temp.reset_index(drop=True, inplace=True)
        temp.loc[:, "Number of Samples (%)"] = round((temp.loc[:, "Raw Count"]/body_sites[body_site])*100, 2)
        temp = temp[temp.loc[:, "Number of Samples (%)"]>=70.0]
        temp.drop(["Raw Count", "Number of Samples (%)"], axis=1, inplace=True)
        final = pd.concat([final, temp], axis=0)
    
    file_name = "{}_zeros_counts_filtered".format(tissue)
    final.to_csv(f"{tissue_path}/{file_name}.tsv", sep="\t", index=None)
    cmd_sh = "python3 {}/AnnotateTablePython3.py -a {}/rmsk.sorted.gtf.gz -n rmsk -i {}/{}.tsv -o {}/{}.out.rmsk -u".format(u_path, u_path, tissue_path, file_name, tissue_path, file_name)
    args = shlex.split(cmd_sh)
    p = subprocess.Popen(args, env=dict(os.environ, PATH="/lustrehome/pietrolucamazzacuva/.conda/envs/tf/bin"))

time.sleep(60)

zeros = pd.read_csv(os.path.join(tissue_path, f"{tissue}_zeros_counts_filtered.out.rmsk"), sep="\t")
zeros=zeros[(zeros["rmsk_feat"]=="-") & (zeros["rmsk_gid"]=="-")]
zeros = zeros.loc[:, ["Region", "Position"]]
zeros.reset_index(drop=True, inplace=True)

for name in os.listdir(tissue_path):
    if name.find("zeros_candidates_list") != -1:
        temp = pd.read_csv(os.path.join(tissue_path, name), sep="\t")
        temp = temp.merge(zeros, how="inner", on=["Region", "Position"])
        temp.loc[:, "Annotation"] = "NOT_REP_ZERO"
        file_name = name.replace("zeros", "zeros_filtered")
        temp.to_csv(os.path.join(tissue_path, file_name), sep="\t", index=None)
        
end = time.time()
print(f"Time elapsed: {(end-start)/60} minutes")

Analisys on :Fallopian Tube
	Sample SRR1071359 uploaded
	Sample SRR1074140 uploaded
	Sample SRR1076584 uploaded
	Sample SRR1082520 uploaded
	Sample SRR1083776 uploaded
	Sample SRR1101693 uploaded
	Sample SRR811938 uploaded


Pysam version used: 0.21.0
Script time --> START: 08/03/2024 15:52:36
Table saved on /lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/fallopian_tube/fallopian_tube_samples_lists/fallopian_tube_zeros_counts_filtered.out.rmsk
Script time --> END: 08/03/2024 15:52:57


Time elapsed: 1.399435245990753 minutes


In [18]:
import os, gzip, sys, time, pysam
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder

def extractor(sample_id, input_path_1, input_path_2, output_path, ohe, interval, suffix):
    
    intervals = []
    columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "[A,C,G,T]", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq", "type", "sample"]
    if suffix == "zeros":
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_filtered_candidates_list.tsv"), sep="\t")
    else:
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_candidates_list.tsv"), sep="\t") 
    df_sites_all.columns = ["region", "position", "ref_base", "annotation", "label"]
    df = df_sites_all.query("region != 'chrM'")
    srr_filepath = os.path.join(input_path_1, sample, sample+'_dna.txt.gz')
    print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", srr_filepath)
    start_time = datetime.now()
    srr = pysam.TabixFile(srr_filepath)

    features_extracted_filepath = "{}/{}_{}_features_{}_nucleotides.tsv".format(output_path, sample_id, suffix, interval)
    features_extracted = open(features_extracted_filepath, "w")

    with tqdm(total=df.shape[0], position=0, leave=True) as pbar:
        for site in df.itertuples():
            start = site.position - ((interval-1)/2)
            stop = site.position + ((interval-1)/2)
            srr_interval = []
            for s in srr.fetch(site.region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns[:-2])
            if srr_interval.shape[0] == interval:
                counts = srr_interval.AllSubs.value_counts()
                if "AG" in counts.index:
                    if counts.loc["AG"] >= 2:
                        intervals.append([site.region, site.position, site.ref_base, site.annotation, site.label, sample, start, stop, stop-start + 1, srr_interval.shape[0]])
                        seq = srr_interval.Ref.values.reshape(-1,1)
                        seq_ohe = ohe.transform(seq).toarray().T
                        vects_freqs = []
                        vects = []
                        for vect in srr_interval["[A,C,G,T]"]:
                            vect = np.array(eval(vect))
                            cov = sum(vect)
                            vect_freqs = vect / cov
                            vects_freqs.append(vect_freqs)
                            vects.append(vect)
                        vects_freqs = np.array(vects_freqs).T
                        vects = np.array(vects).T
                        site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                        site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
   
        intervals = pd.DataFrame(intervals, columns=["Region", "Position", "Ref_base", "Annotation","Type", "SRR", "Start", "Stop", "DeltaStartStop", "TabixIntervalLen"])
        intervals.to_csv(os.path.join(output_path, f"{sample_id}_{suffix}_feature_vectors_metadata.tsv"), sep="\t", index=None)
        lenght = len(intervals)
        print(f"[{datetime.now()}] Computation for sample {sample_id} finished. Elapsed time: {datetime.now()-start_time}")
        print(f"[{datetime.now()}] Total extracted sites: {lenght}. in Sample{sample}")

        return lenght

tissue, cluster_number, nucleotides_number, suffix = "fallopian_tube", 0, 101, "zeros"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)
features_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_datasets".format(tissue, tissue)
if not os.path.isdir(features_path):
    os.mkdir(features_path)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()
onehot = OneHotEncoder()
onehot.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

total_sites = 0
start_time_global = datetime.now()

with Pool(15) as pool:
    for sites in pool.starmap(extractor, [[sample, tables_path, samples_path, features_path, onehot, nucleotides_number, suffix] for sample in samples]):
        total_sites += sites
        
stop_time_global = datetime.now()
print(f"[{datetime.now()}] Computation Finished. Time elapsed {(stop_time_global-start_time_global)/60} minutes")
print(f"Cluster {cluster_number} Sites Extracted:{total_sites}", flush=True)

[2024-03-08 16:07:38.057434] Loading reditable with tabix and pysam:[2024-03-08 16:07:38.059824] Loading reditable with tabix and pysam:  /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz

[2024-03-08 16:07:38.075408] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 16:07:38.081661] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 16:07:38.096730] Loading reditable with tabix and pysam:

  0%|          | 0/99299 [00:00<?, ?it/s]

 /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 16:07:38.107697] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 16:07:38.110845] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz


 58%|█████▊    | 66946/115469 [03:44<03:03, 263.93it/s]

[2024-03-08 16:11:22.746155] Computation for sample SRR1101693 finished. Elapsed time: 0:03:44.678951
[2024-03-08 16:11:22.748209] Total extracted sites: 20300. in SampleSRR811938


100%|██████████| 66608/66608 [03:44<00:00, 296.53it/s]
 91%|█████████▏| 97435/106551 [05:27<00:22, 403.80it/s]

[2024-03-08 16:13:05.905478] Computation for sample SRR811938 finished. Elapsed time: 0:05:27.839343
[2024-03-08 16:13:05.907397] Total extracted sites: 27284. in SampleSRR811938


100%|██████████| 99299/99299 [05:27<00:00, 302.92it/s]
 82%|████████▏ | 100460/122524 [05:38<01:02, 353.73it/s]

[2024-03-08 16:13:16.565923] Computation for sample SRR1071359 finished. Elapsed time: 0:05:38.480109
[2024-03-08 16:13:16.567929] Total extracted sites: 29282. in SampleSRR811938


100%|██████████| 101957/101957 [05:38<00:00, 301.25it/s]
 87%|████████▋ | 106203/122524 [05:55<00:49, 327.48it/s]

[2024-03-08 16:13:34.095055] Computation for sample SRR1082520 finished. Elapsed time: 0:05:56.013626
[2024-03-08 16:13:34.097117] Total extracted sites: 30710. in SampleSRR811938


100%|██████████| 106551/106551 [05:55<00:00, 299.35it/s]
 94%|█████████▎| 114850/122524 [06:23<00:30, 247.61it/s]

[2024-03-08 16:14:01.283225] Computation for sample SRR1076584 finished. Elapsed time: 0:06:23.167744
[2024-03-08 16:14:01.285328] Total extracted sites: 32994. in SampleSRR811938


100%|██████████| 115469/115469 [06:23<00:00, 301.39it/s]
 99%|█████████▉| 121188/122524 [06:43<00:05, 236.88it/s]

[2024-03-08 16:14:21.589766] Computation for sample SRR1083776 finished. Elapsed time: 0:06:43.477010
[2024-03-08 16:14:21.591639] Total extracted sites: 34921. in SampleSRR811938


100%|██████████| 121432/121432 [06:43<00:00, 301.00it/s]
100%|█████████▉| 122523/122524 [06:47<00:00, 335.61it/s]

[2024-03-08 16:14:25.976642] Computation for sample SRR1074140 finished. Elapsed time: 0:06:47.874738
[2024-03-08 16:14:25.978509] Total extracted sites: 35336. in SampleSRR811938


100%|██████████| 122524/122524 [06:47<00:00, 300.43it/s]


[2024-03-08 16:14:26.131490] Computation Finished. Time elapsed 0:00:06.806897 minutes
Cluster 0 Sites Extracted:210827


In [22]:
import os, gzip, sys, time, pysam
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder

def extractor(sample_id, input_path_1, input_path_2, output_path, ohe, interval, suffix):
    
    intervals = []
    columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "[A,C,G,T]", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq", "type", "sample"]
    if suffix == "zeros":
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_filtered_candidates_list.tsv"), sep="\t")
    else:
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_candidates_list.tsv"), sep="\t") 
    df_sites_all.columns = ["region", "position", "ref_base", "annotation", "label"]
    df = df_sites_all.query("region != 'chrM'")
    srr_filepath = os.path.join(input_path_1, sample, sample+'_dna.txt.gz')
    print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", srr_filepath)
    start_time = datetime.now()
    srr = pysam.TabixFile(srr_filepath)

    features_extracted_filepath = "{}/{}_{}_features_{}_nucleotides.tsv".format(output_path, sample_id, suffix, interval)
    features_extracted = open(features_extracted_filepath, "w")

    with tqdm(total=df.shape[0], position=0, leave=True) as pbar:
        for site in df.itertuples():
            start = site.position - ((interval-1)/2)
            stop = site.position + ((interval-1)/2)
            srr_interval = []
            for s in srr.fetch(site.region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns[:-2])
            if srr_interval.shape[0] == interval:
                counts = srr_interval.AllSubs.value_counts()
                if "AG" in counts.index:
                    if counts.loc["AG"] >= 2:
                        intervals.append([site.region, site.position, site.ref_base, site.annotation, site.label, sample, start, stop, stop-start + 1, srr_interval.shape[0]])
                        seq = srr_interval.Ref.values.reshape(-1,1)
                        seq_ohe = ohe.transform(seq).toarray().T
                        vects_freqs = []
                        vects = []
                        for vect in srr_interval["[A,C,G,T]"]:
                            vect = np.array(eval(vect))
                            cov = sum(vect)
                            vect_freqs = vect / cov
                            vects_freqs.append(vect_freqs)
                            vects.append(vect)
                        vects_freqs = np.array(vects_freqs).T
                        vects = np.array(vects).T
                        site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                        site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
   
        intervals = pd.DataFrame(intervals, columns=["Region", "Position", "Ref_base", "Annotation","Type", "SRR", "Start", "Stop", "DeltaStartStop", "TabixIntervalLen"])
        intervals.to_csv(os.path.join(output_path, f"{sample_id}_{suffix}_feature_vectors_metadata.tsv"), sep="\t", index=None)
        lenght = len(intervals)
        print(f"[{datetime.now()}] Computation for sample {sample_id} finished. Elapsed time: {datetime.now()-start_time}")
        print(f"[{datetime.now()}] Total extracted sites: {lenght}. in Sample{sample}")

        return lenght

tissue, cluster_number, nucleotides_number, suffix = "fallopian_tube", 0, 101, "positives"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)
features_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_datasets".format(tissue, tissue)
if not os.path.isdir(features_path):
    os.mkdir(features_path)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()
onehot = OneHotEncoder()
onehot.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

total_sites = 0
start_time_global = datetime.now()

with Pool(15) as pool:
    for sites in pool.starmap(extractor, [[sample, tables_path, samples_path, features_path, onehot, nucleotides_number, suffix] for sample in samples]):
        total_sites += sites
        
stop_time_global = datetime.now()
print(f"[{datetime.now()}] Computation Finished. Time elapsed {(stop_time_global-start_time_global)/60} minutes")
print(f"Cluster {cluster_number} Sites Extracted:{total_sites}", flush=True)

[2024-03-08 18:29:19.715073] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:29:19.737275] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:29:19.747674] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:29:19.753690] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:29:19.767923] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:29:19.776849] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz


  0%|          | 0/49730 [00:00<?, ?it/s]

[2024-03-08 18:29:19.830418] Loading reditable with tabix and pysam:

  0%|          | 0/32041 [00:00<?, ?it/s]

 /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz


 72%|███████▏  | 35886/49730 [01:47<00:48, 286.47it/s]]

[2024-03-08 18:31:07.268368] Computation for sample SRR1101693 finished. Elapsed time: 0:01:47.545160
[2024-03-08 18:31:07.270876] Total extracted sites: 5608. in SampleSRR811938


100%|██████████| 32041/32041 [01:47<00:00, 298.22it/s]
 73%|███████▎  | 47442/64853 [02:28<01:02, 279.42it/s]]

[2024-03-08 18:31:48.720470] Computation for sample SRR1082520 finished. Elapsed time: 0:02:28.960054
[2024-03-08 18:31:48.722894] Total extracted sites: 7691. in SampleSRR811938


100%|██████████| 49730/49730 [02:28<00:00, 334.00it/s]
 84%|████████▍ | 54402/64853 [02:51<00:29, 353.41it/s]]

[2024-03-08 18:32:11.188118] Computation for sample SRR1083776 finished. Elapsed time: 0:02:51.444311
[2024-03-08 18:32:11.190653] Total extracted sites: 8927. in SampleSRR811938


100%|██████████| 57361/57361 [02:51<00:00, 334.72it/s]
 64%|██████▍   | 80882/126386 [03:22<02:32, 298.23it/s]

[2024-03-08 18:32:42.221021] Computation for sample SRR811938 finished. Elapsed time: 0:03:22.466757
[2024-03-08 18:32:42.222922] Total extracted sites: 13381. in SampleSRR811938


100%|██████████| 64853/64853 [03:22<00:00, 320.41it/s]
 68%|██████▊   | 85660/126386 [03:34<01:16, 529.03it/s]

[2024-03-08 18:32:54.098540] Computation for sample SRR1074140 finished. Elapsed time: 0:03:34.315401
[2024-03-08 18:32:54.100224] Total extracted sites: 10532. in SampleSRR811938


100%|██████████| 72567/72567 [03:34<00:00, 338.66it/s]
 77%|███████▋  | 97173/126386 [04:02<00:45, 640.34it/s]

[2024-03-08 18:33:22.450219] Computation for sample SRR1076584 finished. Elapsed time: 0:04:02.675756
[2024-03-08 18:33:22.452620] Total extracted sites: 10394. in SampleSRR811938


100%|██████████| 91555/91555 [04:02<00:00, 377.32it/s]
100%|█████████▉| 126357/126386 [05:09<00:00, 388.72it/s] 

[2024-03-08 18:34:29.757284] Computation for sample SRR1071359 finished. Elapsed time: 0:05:09.921659
[2024-03-08 18:34:29.759574] Total extracted sites: 11419. in SampleSRR811938


100%|██████████| 126386/126386 [05:09<00:00, 407.83it/s]


[2024-03-08 18:34:29.918898] Computation Finished. Time elapsed 0:00:05.174182 minutes
Cluster 0 Sites Extracted:67952


In [23]:
import os, gzip, sys, time, pysam
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime
from multiprocessing import Pool
from sklearn.preprocessing import OneHotEncoder

def extractor(sample_id, input_path_1, input_path_2, output_path, ohe, interval, suffix):
    
    intervals = []
    columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "[A,C,G,T]", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq", "type", "sample"]
    if suffix == "zeros":
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_filtered_candidates_list.tsv"), sep="\t")
    else:
        df_sites_all = pd.read_csv(os.path.join(input_path_2, f"{sample_id}_{suffix}_candidates_list.tsv"), sep="\t") 
    df_sites_all.columns = ["region", "position", "ref_base", "annotation", "label"]
    df = df_sites_all.query("region != 'chrM'")
    srr_filepath = os.path.join(input_path_1, sample, sample+'_dna.txt.gz')
    print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", srr_filepath)
    start_time = datetime.now()
    srr = pysam.TabixFile(srr_filepath)

    features_extracted_filepath = "{}/{}_{}_features_{}_nucleotides.tsv".format(output_path, sample_id, suffix, interval)
    features_extracted = open(features_extracted_filepath, "w")

    with tqdm(total=df.shape[0], position=0, leave=True) as pbar:
        for site in df.itertuples():
            start = site.position - ((interval-1)/2)
            stop = site.position + ((interval-1)/2)
            srr_interval = []
            for s in srr.fetch(site.region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns[:-2])
            if srr_interval.shape[0] == interval:
                counts = srr_interval.AllSubs.value_counts()
                if "AG" in counts.index:
                    if counts.loc["AG"] >= 2:
                        intervals.append([site.region, site.position, site.ref_base, site.annotation, site.label, sample, start, stop, stop-start + 1, srr_interval.shape[0]])
                        seq = srr_interval.Ref.values.reshape(-1,1)
                        seq_ohe = ohe.transform(seq).toarray().T
                        vects_freqs = []
                        vects = []
                        for vect in srr_interval["[A,C,G,T]"]:
                            vect = np.array(eval(vect))
                            cov = sum(vect)
                            vect_freqs = vect / cov
                            vects_freqs.append(vect_freqs)
                            vects.append(vect)
                        vects_freqs = np.array(vects_freqs).T
                        vects = np.array(vects).T
                        site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                        site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
   
        intervals = pd.DataFrame(intervals, columns=["Region", "Position", "Ref_base", "Annotation","Type", "SRR", "Start", "Stop", "DeltaStartStop", "TabixIntervalLen"])
        intervals.to_csv(os.path.join(output_path, f"{sample_id}_{suffix}_feature_vectors_metadata.tsv"), sep="\t", index=None)
        lenght = len(intervals)
        print(f"[{datetime.now()}] Computation for sample {sample_id} finished. Elapsed time: {datetime.now()-start_time}")
        print(f"[{datetime.now()}] Total extracted sites: {lenght}. in Sample{sample}")

        return lenght

tissue, cluster_number, nucleotides_number, suffix = "fallopian_tube", 0, 101, "snps"

tables_path = "/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra"
samples_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_samples_lists".format(tissue, tissue)
features_path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{}/{}_datasets".format(tissue, tissue)
if not os.path.isdir(features_path):
    os.mkdir(features_path)

samples = pd.read_csv(f"/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues/{tissue}/{tissue}_clusters/{tissue}_cluster_{cluster_number}.csv", usecols=["Sample"])
samples = samples["Sample"].tolist()
onehot = OneHotEncoder()
onehot.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

total_sites = 0
start_time_global = datetime.now()

with Pool(15) as pool:
    for sites in pool.starmap(extractor, [[sample, tables_path, samples_path, features_path, onehot, nucleotides_number, suffix] for sample in samples]):
        total_sites += sites
        
stop_time_global = datetime.now()
print(f"[{datetime.now()}] Computation Finished. Time elapsed {(stop_time_global-start_time_global)/60} minutes")
print(f"Cluster {cluster_number} Sites Extracted:{total_sites}", flush=True)

[2024-03-08 18:34:42.169800] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz[2024-03-08 18:34:42.174060] Loading reditable with tabix and pysam:
[2024-03-08 18:34:42.175892] Loading reditable with tabix and pysam: [2024-03-08 18:34:42.175771] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
 /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:34:42.181555] Loading reditable with tabix and pysam:
 /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz
[2024-03-08 18:34:42.189850] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz


  0%|          | 0/778 [00:00<?, ?it/s]]

[2024-03-08 18:34:42.277418] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR811938/SRR811938_dna.txt.gz


 88%|████████▊ | 805/918 [00:02<00:00, 312.88it/s]]

[2024-03-08 18:34:44.886599] Computation for sample SRR1101693 finished. Elapsed time: 0:00:02.690328
[2024-03-08 18:34:44.888329] Total extracted sites: 190. in SampleSRR811938


100%|██████████| 778/778 [00:02<00:00, 295.36it/s]
 97%|█████████▋| 915/940 [00:02<00:00, 307.93it/s]]

[2024-03-08 18:34:45.137566] Computation for sample SRR1076584 finished. Elapsed time: 0:00:02.853354
[2024-03-08 18:34:45.139343] Total extracted sites: 183. in SampleSRR811938


100%|██████████| 892/892 [00:02<00:00, 318.12it/s]
 99%|█████████▊| 906/918 [00:02<00:00, 314.43it/s]]

[2024-03-08 18:34:45.204307] Computation for sample SRR1082520 finished. Elapsed time: 0:00:03.016723[2024-03-08 18:34:45.205115] Computation for sample SRR1074140 finished. Elapsed time: 0:00:03.023164
[2024-03-08 18:34:45.205539] Computation for sample SRR1083776 finished. Elapsed time: 0:00:03.027317[2024-03-08 18:34:45.206525] Total extracted sites: 216. in SampleSRR811938


[2024-03-08 18:34:45.207994] Total extracted sites: 196. in SampleSRR811938[2024-03-08 18:34:45.207742] Total extracted sites: 199. in SampleSRR811938


100%|██████████| 940/940 [00:02<00:00, 314.51it/s]





100%|██████████| 940/940 [00:02<00:00, 316.81it/s]

100%|█████████▉| 1015/1018 [00:03<00:00, 324.27it/s]

[2024-03-08 18:34:45.344834] Computation for sample SRR1071359 finished. Elapsed time: 0:00:03.159492
[2024-03-08 18:34:45.346876] Total extracted sites: 189. in SampleSRR811938


100%|██████████| 1018/1018 [00:03<00:00, 327.89it/s]
 98%|█████████▊| 1404/1429 [00:04<00:00, 317.03it/s]

[2024-03-08 18:34:46.639906] Computation for sample SRR811938 finished. Elapsed time: 0:00:04.455947
[2024-03-08 18:34:46.641705] Total extracted sites: 310. in SampleSRR811938


100%|██████████| 1429/1429 [00:04<00:00, 324.25it/s]


[2024-03-08 18:34:46.718676] Computation Finished. Time elapsed 0:00:00.079709 minutes
Cluster 0 Sites Extracted:1483


In [41]:
import os, glob, time, random, sys
import pandas as pd
from multiprocessing import pool
from tqdm import tqdm

def counter(tissue_path, suffix):
    counter = 0
    for name in os.listdir(tissue_path):
        if name.find(f"{suffix}_feature_vectors_metadata.tsv") != -1:
            df = pd.read_csv(os.path.join(tissue_path, name), sep="\t", usecols=["Position", "Ref_base"])
            counter += df.shape[0]
    return counter

def converter(input_path, sample_id, input_indexes_1, input_indexes_2, input_indexes_3):
    
    cols = [i for i in range(101)]
    cols.sort(reverse=True)
    
    features = pd.read_csv(os.path.join(input_path, f"{sample_id}_positives_features_101_nucleotides.tsv"), 
                           sep="\t", header=None)
    features_a = features.iloc[input_indexes_1, :]
    features_a.reset_index(drop=True, inplace=True)
    features_t = features.iloc[input_indexes_2, :]
    features_t.reset_index(drop=True, inplace=True)
    del features
    features_t = features_t.iloc[:, cols]
    old_index = pd.Index(data=[f"{i}" for i in range(features_t.shape[0])])
    features_t.set_index(old_index, drop=True, inplace=True)
    new_index = []
    for i in range(0, features_t.shape[0]-7, 8):
        new_index += [f"{i+3}", f"{i+2}", f"{i+1}", f"{i}", f"{i+7}", f"{i+6}", f"{i+5}", f"{i+4}"] 
    features_t = features_t.reindex(new_index)
        
    features_snps = pd.read_csv(os.path.join(input_path, f"{sample_id}_snps_features_101_nucleotides.tsv"), sep="\t", header=None)
        
    if len(input_indexes_3) > 0:
        features_zero = pd.read_csv(os.path.join(input_path, f"{sample_id}_zeros_features_101_nucleotides.tsv"), 
                                    sep="\t", header=None)
        features_zero = features_zero.iloc[input_indexes_3, :]
        features_zero.reset_index(drop=True, inplace=True)
    else:
        features_zero = pd.DataFrame()
        
    cols = [f"{i}" for i in range(-50, 51, 1)]
    
    data_list = []
    if features_a.shape[0] > 0:
        features_a.columns = cols
        data_list.append(features_a)
    if features_t.shape[0] > 0:
        features_t.columns = cols
        data_list.append(features_t)
    if features_snps.shape[0] > 0:
        features_snps.columns = cols
        data_list.append(features_snps)
    if features_zero.shape[0] > 0:
        features_zero.columns = cols
        data_list.append(features_zero)
        
    data_list = pd.concat(data_list, axis=0)
    data_list.to_csv(os.path.join(input_path, f"{sample_id}_features_reduced_pos_zeros_snps.tsv"), sep="\t", header=False, index=None) 
    
    return data_list.shape[0]

zeros_tissue = "fallopian_tube"

start = time.time()

random.seed(42)

tissues = ["fallopian_tube"]

chrs = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", 
        "chr7", "chr8", "chr9", "chr10", "chr11", "chr12",
        "chr13", "chr14", "chr15", "chr16", "chr17", "chr18",
        "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"]

columns = ["Region", "Position", "Ref_base", "Annotation", "Type", "SRR", "Start", "Stop", "DeltaStartStop", "TabixIntervalLen"]

path = "/lustrehome/pietrolucamazzacuva/filezilla-recas/tissues"

positives = pd.DataFrame()
for tissue in tqdm(tissues):
    data_path = os.path.join(path, f"{tissue}/{tissue}_datasets")
    for filename in glob.iglob(f"{data_path}/*_positives_feature_vectors_metadata.tsv"):
        sample = filename.replace("_positives_feature_vectors_metadata.tsv", "")
        sample = sample.replace(data_path+"/", "")
        df = pd.read_csv(filename, sep="\t", usecols=["Region", "Position", "Annotation"])
        df["Sample"] = sample
        df["Position_In_Dataset"] = [i for i in range(df.shape[0])]
        positives=pd.concat([positives, df], axis=0)

del data_path, sample, df

positives.reset_index(drop=True, inplace=True)    
index = positives[positives.loc[:, "Annotation"] != "NOT_ALU_POSITIVE"].index.tolist()
positives.drop("Annotation", axis=1, inplace=True)
positives_rep_not_alu = positives.iloc[index, :]
positives_rep_not_alu.drop(["Region", "Position"], axis=1, inplace=True)
positives.drop(index, axis=0, inplace=True)
positives.reset_index(drop=True, inplace=True)    
        
region_position = positives.loc[:, ["Region", "Position"]]
sample_position = positives.loc[:, ["Sample", "Position_In_Dataset"]]

del positives

value_counts = region_position.value_counts()
upper_5 = value_counts[value_counts.loc[:]>5].index.tolist()

del value_counts

drops = []

with tqdm(total = 24) as pbar:
    for Chr in chrs:
        globals()[Chr] = region_position[region_position["Region"]==Chr]
        pbar.update(1)

del region_position

with tqdm(total = len(upper_5)) as pbar:
    start = time.time()
    for region, position in upper_5:
        temp = globals()[region][globals()[region]["Position"]==position].index.tolist()
        del globals()[region]
        temp = random.sample(temp, len(temp)-5)
        drops += temp
        pbar.update(1)

drops.sort()

sample_position.drop(drops, axis=0, inplace=True)
sample_position = pd.concat([sample_position, positives_rep_not_alu], axis=0)
sample_position.sort_values(["Sample", "Position_In_Dataset"], inplace=True)
sample_position.to_csv(os.path.join(path, "positives_filtered_list.tsv"), sep="\t", index=None)

n_positives = sample_position.shape[0]
del sample_position, drops

inputs =[]
for tissue in tissues:
    inputs.append([os.path.join(path, f"{tissue}/{tissue}_datasets"), "snps"])

n_snps = 0   
with tqdm(total = len(inputs)) as pbar:
    with Pool(32) as pool:
        for result in pool.starmap(counter, inputs):
            n_snps += result
            pbar.update(1)

n_zeros = counter(os.path.join(path, f"{zeros_tissue}/{zeros_tissue}_datasets"), "zeros")

filtered_positions = pd.read_csv(os.path.join(path, "positives_filtered_list.tsv"), sep="\t")
zero_frac = round((n_positives-n_snps)/n_zeros, 9)
total_features = 0
with tqdm(total = len(tissues)) as pbar:
    for tissue in tissues:

        data_path = "{}/{}/{}_datasets".format(path, tissue, tissue)
        clusters_path = "{}/{}/{}_clusters".format(path, tissue, tissue)
        
        n_clusters = len(os.listdir(clusters_path))
        for cluster_number in range(n_clusters):
            
            samples = pd.read_csv(os.path.join(clusters_path, f"{tissue}_cluster_{cluster_number}.csv"), usecols=["Sample"])
            samples = samples["Sample"].tolist()

            inputs = []
            for sample in samples:
                metadata = pd.DataFrame()
                indexes_list = filtered_positions[filtered_positions["Sample"]==sample].loc[:, "Position_In_Dataset"].tolist()
                meta = pd.read_csv(os.path.join(data_path, f"{sample}_positives_feature_vectors_metadata.tsv"), 
                                   sep="\t", usecols=columns)
                meta = meta.iloc[indexes_list, :]
                index_a = meta[meta["Ref_base"] ==  "A"].index.tolist()
                index_t = meta[meta["Ref_base"] ==  "T"].index.tolist()
                indexes_a = []
                indexes_t = []
                for index in index_a:
                    indexes_a += [int(index)*8+i for i in range(8)]
                for index in index_t:
                    indexes_t += [int(index)*8+i for i in range(8)] 
                meta_a = meta.loc[index_a, :] 
                meta_t = meta.loc[index_t, :] 
                metadata = pd.concat([metadata, meta_a, meta_t], axis=0)
                del meta
                                  
                meta =  pd.read_csv(os.path.join(data_path, f"{sample}_snps_feature_vectors_metadata.tsv"), 
                                   sep="\t", usecols=columns)   
                
                metadata = pd.concat([metadata, meta], axis=0)
                del meta
                
                indexes_zero = []
                if os.path.isfile(os.path.join(data_path, f"{sample}_zeros_feature_vectors_metadata.tsv")):
                    meta = pd.read_csv(os.path.join(data_path, f"{sample}_zeros_feature_vectors_metadata.tsv"), 
                                       sep="\t", usecols=columns)
                    meta = meta.sample(frac=zero_frac, random_state=42)
                    metadata = pd.concat([metadata, meta], axis=0)
                    zero_index = meta.index.tolist()
                    indexes_zero= []
                    for index in zero_index:
                        indexes_zero += [int(index)*8+i for i in range(8)]
                    del meta
                    
                inputs.append([data_path, sample, indexes_a, indexes_t, indexes_zero])
                
                
                metadata.to_csv(os.path.join(data_path, f"{sample}_metadata_reduced_pos_zeros_snps.tsv"), sep="\t", index=None)
                del metadata
                
            with Pool(32) as pool:
                for result in pool.starmap(converter, inputs):
                    total_features += int(result/8)

        pbar.update(1) 

print(f"Total features matrices in dataset: {total_features}")
end = time.time()
print(f"Time elapsed: {(end-start)/60} minutes")

100%|██████████| 1/1 [00:00<00:00, 13.32it/s]
100%|██████████| 24/24 [00:00<00:00, 4210.97it/s]
0it [00:00, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
100%|██████████| 1/1 [00:11<00:00, 11.02s/it]

Total features matrices in dataset: 135904
Time elapsed: 0.1972874124844869 minutes



