In [1]:
##################
### Author: Adriano Fonzino. email: adriano.fonzino@uniba.it Date: 11/2023
##################
import pandas as pd
import os
from glob import glob
import pysam
import gzip
from datetime import datetime

In [2]:
# Run identifiers handled by this notebook, one token per run, kept in their
# original order. Splitting on whitespace yields a plain list of strings.
kidney_runs = """
SRR1071807 SRR1080366 SRR1085759 SRR1089504
SRR1105272 SRR1314940 SRR1317086 SRR1325483
SRR1328447 SRR1329154 SRR1340662 SRR1362263
SRR1377578 SRR1380931 SRR1420649 SRR1432650
SRR1435730 SRR1437274 SRR1442708 SRR1445835
SRR1447631 SRR1452888 SRR1456711 SRR1465871
SRR1468426 SRR1469746 SRR1490658 SRR1500261
SRR809943 SRR810007 SRR821356
""".split()
# Sanity check: report how many runs will be processed.
print(len(kidney_runs))

31


In [3]:
# Scan each sample's table, collect rows classified as SNP or Editing, and
# checkpoint the cumulative result to disk after every sample.

# Root of the per-run directories and name of the cumulative output table.
SRA_DIR = '/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra'
OUTPUT_TSV = "kidney_extracted_pos_neg_sites_rediportal_13112023.tsv"

# Filtering thresholds (values unchanged from the original analysis).
# NOTE(review): columns 4 and 9 look like the RNA and WGS coverage columns of
# a REDItools table -- confirm against the table header.
MIN_RNA_COVERAGE = 50      # minimum integer value required in column 4
MIN_WGS_COVERAGE = 10      # minimum integer value required in column 9
MIN_WGS_AG_FREQ = 0.4      # minimum WGS fraction for a row to be an SNP candidate
SNP_RATIO_MIN = 0.95       # lower bound of the RNA/WGS fraction ratio for an SNP
SNP_RATIO_MAX = 1.05       # upper bound of the RNA/WGS fraction ratio for an SNP
MIN_RNA_AG_FREQ = 0.01     # minimum RNA fraction for a row to be kept as Editing
PROGRESS_EVERY = 50000000  # print a progress line every this many rows


def _ag_fraction(base_counts):
    """Return the third count divided by the sum of all counts.

    ``base_counts`` is the raw text of a count column, expected to be a
    bracketed, comma-separated list of integers such as ``[12, 0, 3, 0]``
    (presumably ordered A, C, G, T, so index 2 would be G -- confirm).
    The text is parsed with ``int`` rather than ``eval`` so that content read
    from a data file is never executed as Python code.

    Raises ``ValueError`` if an element is not an integer and
    ``ZeroDivisionError`` if all counts are zero.
    """
    counts = [int(n) for n in base_counts.strip("[]").split(",")]
    return counts[2] / sum(counts)


sites_all = []
for SRR in kidney_runs:
    # deduce reditable filepath; a run without exactly one matching table is
    # skipped silently, as in the original analysis
    matches = glob(os.path.join(SRA_DIR, SRR, SRR + '_dna.txt.gz'))
    if len(matches) != 1:
        continue
    reditable = matches[0]
    start = datetime.now()
    print(f"\n######################\n[{datetime.now()}] Extracting SNPs and Editing sites for reditable:", reditable)
    # search snps and editing sites
    snps = []
    editing = []
    # Reset per sample so the summary below never reports a stale value (or
    # fails with NameError) when a table turns out to be empty.
    c = 0
    with gzip.open(reditable, "rt", encoding="utf-8") as redi:
        for c, l in enumerate(redi):
            if c % PROGRESS_EVERY == 0:
                print(f"\tSites evaluated: {c}")
            line = l.rstrip().split("\t")
            # Only rows whose column 2 is "A" are considered.
            if line[2] != "A":
                continue
            # "-" marks a missing value in column 4 or 9; such rows are dropped.
            if line[4] == "-" or line[9] == "-":
                continue
            if int(line[4]) < MIN_RNA_COVERAGE or int(line[9]) < MIN_WGS_COVERAGE:
                continue
            # An "AG" entry must be present in column 7.
            if "AG" not in line[7]:
                continue
            AG_rna = _ag_fraction(line[6])
            if "AG" in line[12]:
                # "AG" also present in column 12: keep as SNP when the WGS
                # fraction is high and the RNA fraction is within 5% of it.
                AG_wgs = _ag_fraction(line[11])
                if AG_wgs >= MIN_WGS_AG_FREQ and SNP_RATIO_MIN <= AG_rna / AG_wgs <= SNP_RATIO_MAX:
                    snps.append(line)
            elif AG_rna >= MIN_RNA_AG_FREQ:
                # "AG" absent from column 12: keep as an Editing site.
                editing.append(line)
    # c is the index of the last line read (0 for an empty table).
    print("Total evaluated rows:", c)
    snps = pd.DataFrame(snps)
    editing = pd.DataFrame(editing)
    snps["type"] = "SNP"
    snps["sample"] = SRR
    editing["type"] = "Editing"
    editing["sample"] = SRR
    print("Total extracted SNPs for current sample:", snps.shape[0])
    print("Total extracted Editing sites for current sample:", editing.shape[0])
    stop = datetime.now()
    print(f"[{datetime.now()}] Extraction finished for current sample. Elapsed time: {stop-start}")
    # append to all
    sites_all.append(snps)
    sites_all.append(editing)
    # create an updated version of the global df
    df_sites_all = pd.concat(sites_all, ignore_index=True)
    # save an updated version of the global df to tsv on disk, so that the
    # samples processed so far are on disk before the next one starts
    df_sites_all.to_csv(OUTPUT_TSV, sep="\t")
print(f"[{datetime.now()}] Computation Finished")


######################
[2023-11-13 21:43:56.698037] Extracting SNPs and Editing sites for reditable: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1071807/SRR1071807_dna.txt.gz
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
Total evaluated rows: 179057199
Total extracted SNPs for current sample: 692
Total extracted Editing sites for current sample: 31808
[2023-11-13 21:48:12.682736] Extraction finished for current sample. Elapsed time: 0:04:15.984692

######################
[2023-11-13 21:48:12.820418] Extracting SNPs and Editing sites for reditable: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1080366/SRR1080366_dna.txt.gz
	Sites evaluated: 0
	Sites evaluated: 50000000
	Sites evaluated: 100000000
	Sites evaluated: 150000000
	Sites evaluated: 200000000
	Sites evaluated: 250000000
	Sites evaluated: 300000000
Total evaluated rows: 310399069
Total extracted SNPs for current sample: 1145
Total extracted Editing sites for current s