In [1]:
##################
#####################
import pysam, os
import pandas as pd
from glob import glob
from tqdm import tqdm
from datetime import datetime
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# load all sites extracted from kidney
columns = ["Region", "Position", "Ref", "Strand", "Cov", "Qual", "[A,C,G,T]", "AllSubs", "Freq", "gCov", "gQual", "g[A,C,G,T]", "gAllSubs", "gFreq", "type", "sample"]
print(len(columns))
df_sites_all = pd.read_table("kidney_extracted_pos_neg_sites_rediportal_13112023.tsv", index_col=0, names=columns, skiprows=1)
df_sites_all

16


Unnamed: 0,Region,Position,Ref,Strand,Cov,Qual,"[A,C,G,T]",AllSubs,Freq,gCov,gQual,"g[A,C,G,T]",gAllSubs,gFreq,type,sample
0,chr1,887801,A,2,58,38.74,"[0, 0, 58, 0]",AG,1.00,42,28.88,"[0, 0, 42, 0]",AG,1.0,SNP,SRR1071807
1,chr1,981931,A,2,56,38.34,"[0, 0, 56, 0]",AG,1.00,23,28.61,"[0, 0, 23, 0]",AG,1.0,SNP,SRR1071807
2,chr1,1271175,A,2,135,37.31,"[0, 0, 135, 0]",AG,1.00,28,27.36,"[0, 0, 28, 0]",AG,1.0,SNP,SRR1071807
3,chr1,1288345,A,2,246,38.83,"[0, 0, 246, 0]",AG,1.00,47,29.02,"[0, 0, 47, 0]",AG,1.0,SNP,SRR1071807
4,chr1,1288823,A,2,136,37.49,"[0, 0, 136, 0]",AG,1.00,34,28.09,"[0, 0, 34, 0]",AG,1.0,SNP,SRR1071807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306746,chrY,21869330,A,2,72,37.06,"[70, 0, 2, 0]",AG,0.03,21,35.76,"[21, 0, 0, 0]",-,0.0,Editing,SRR821356
1306747,chrY,21869415,A,2,76,37.67,"[73, 0, 3, 0]",AG,0.04,15,36.87,"[15, 0, 0, 0]",-,0.0,Editing,SRR821356
1306748,chrY,21869628,A,2,59,36.02,"[58, 0, 1, 0]",AG,0.02,16,36.75,"[16, 0, 0, 0]",-,0.0,Editing,SRR821356
1306749,chrY,21870242,A,2,72,36.71,"[70, 0, 2, 0]",AG,0.03,21,38.00,"[21, 0, 0, 0]",-,0.0,Editing,SRR821356


In [3]:
# detect number of samples
df_sites_all["sample"].value_counts().shape

(31,)

In [4]:
# create ohe instance
ohe = OneHotEncoder()
ohe.fit(np.array(["A", "C", "G", "T"]).reshape(-1, 1))

features_extracted_filepath = "kidney_extracted_pos_neg_sites_rediportal_13112023.feature_vectors.tsv"
features_extracted = open(features_extracted_filepath, "w")
intervals = []
total_snps = 0
total_editing = 0
interval = 101
start_time_global = datetime.now()
for n,sample in enumerate(df_sites_all["sample"].value_counts().index):
    print(f"\n########################\n[{datetime.now()}] Processing sample:", sample)
    df = df_sites_all.query(f"sample == '{sample}'")
    # mantain only non chrM regions
    df = df.query("Region != 'chrM'")
    srr_filepath = glob( os.path.join('/lustre/biomed/epicardi/ncbi/dbGaP-6698/sra',sample,sample+'_dna.txt.gz'))[0]
    print(f"[{datetime.now()}] Loading reditable with tabix and pysam:", srr_filepath)
    start_time = datetime.now()
    srr = pysam.TabixFile(srr_filepath)
    # extract pos examples
    with tqdm(total=df.shape[0], position=0, leave=True) as pbar:
        for site in df.query("type == 'Editing'").itertuples():
            start = site.Position - ((interval-1)/2)
            stop = site.Position + ((interval-1)/2)
            AGrna = eval(site._7)[2]/sum(eval(site._7))
            AGwgs = eval(site._12)[2]/sum(eval(site._12))
            srr_interval = []
            for s in srr.fetch(site.Region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns[:-2])
            # assess wheter interval is of the required length and contains at least 2 AG subs
            if srr_interval.shape[0] == interval and srr_interval.AllSubs.value_counts().reset_index()[srr_interval.AllSubs.value_counts().reset_index()["index"].str.contains("AG")].shape[0] >= 2:
                intervals.append([site.Region, site.Position, site.Strand, AGrna, AGwgs, site.type, site.sample, start, stop, stop-start + 1, srr_interval.shape[0]])
                total_editing += 1
                # encode features vector and write to disk
                seq = srr_interval.Ref.values.reshape(-1,1)
                seq_ohe = ohe.transform(seq).toarray().T
                vects_freqs = []
                vects = []
                for vect in srr_interval["[A,C,G,T]"]:
                    vect = np.array(eval(vect))
                    cov = sum(vect)
                    vect_freqs = vect / cov
                    vects_freqs.append(vect_freqs)
                    vects.append(vect)
                vects_freqs = np.array(vects_freqs).T
                vects = np.array(vects).T
                site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                # save to disk (append mode)
                site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
        # extract neg examples
        for site in df.query("type == 'SNP'").itertuples():
            start = site.Position - ((interval-1)/2)
            stop = site.Position + ((interval-1)/2)
            AGrna = eval(site._7)[2]/sum(eval(site._7))
            AGwgs = eval(site._12)[2]/sum(eval(site._12))
            srr_interval = []
            for s in srr.fetch(site.Region, start-1, stop):
                srr_interval.append(s.split("\t"))
            srr_interval = pd.DataFrame(srr_interval, columns=columns[:-2])
            # assess wheter interval is of the required length
            if srr_interval.shape[0] == interval:
                intervals.append([site.Region, site.Position, site.Strand, AGrna, AGwgs, site.type, site.sample, start, stop, stop-start + 1, srr_interval.shape[0]])
                total_snps += 1
                # encode features vector and write to disk
                seq = srr_interval.Ref.values.reshape(-1,1)
                seq_ohe = ohe.transform(seq).toarray().T
                vects_freqs = []
                vects = []
                for vect in srr_interval["[A,C,G,T]"]:
                    vect = np.array(eval(vect))
                    cov = sum(vect)
                    vect_freqs = vect / cov
                    vects_freqs.append(vect_freqs)
                    vects.append(vect)
                vects_freqs = np.array(vects_freqs).T
                vects = np.array(vects).T
                site = pd.concat([pd.DataFrame(seq_ohe), pd.DataFrame(vects_freqs)])
                # save to disk (append mode)
                site.to_csv(features_extracted, mode="a", sep="\t", header = None, index=None)
            pbar.update(1)
        print(f"[{datetime.now()}] Computation for sample {sample} finished. Elapsed time: {datetime.now()-start_time}")
        print(f"[{datetime.now()}] Total extracted Editing sites: {total_editing}. Total extracted SNPs: {total_snps}. Samples processed: {n+1}")
stop_time_global = datetime.now()
print(f"[{datetime.now()}] Computation Finished. Elapsed time {stop_time_global-start_time_global}")
features_extracted.close()


########################
[2023-11-14 02:56:52.008185] Processing sample: SRR1432650
[2023-11-14 02:56:52.046440] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1432650/SRR1432650_dna.txt.gz


100%|██████████| 64134/64134 [03:23<00:00, 314.88it/s]


[2023-11-14 03:00:15.749154] Computation for sample SRR1432650 finished. Elapsed time: 0:03:23.702628
[2023-11-14 03:00:15.749259] Total extracted Editing sites: 3654. Total extracted SNPs: 737. Samples processed: 1

########################
[2023-11-14 03:00:15.749795] Processing sample: SRR1317086
[2023-11-14 03:00:15.791549] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1317086/SRR1317086_dna.txt.gz


100%|██████████| 61595/61595 [03:16<00:00, 313.32it/s]


[2023-11-14 03:03:32.405986] Computation for sample SRR1317086 finished. Elapsed time: 0:03:16.614359
[2023-11-14 03:03:32.406025] Total extracted Editing sites: 5909. Total extracted SNPs: 1288. Samples processed: 2

########################
[2023-11-14 03:03:32.407223] Processing sample: SRR1089504
[2023-11-14 03:03:32.451430] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1089504/SRR1089504_dna.txt.gz


100%|██████████| 60651/60651 [03:29<00:00, 289.78it/s]


[2023-11-14 03:07:01.776810] Computation for sample SRR1089504 finished. Elapsed time: 0:03:29.325094
[2023-11-14 03:07:01.776852] Total extracted Editing sites: 7302. Total extracted SNPs: 1900. Samples processed: 3

########################
[2023-11-14 03:07:01.777821] Processing sample: SRR1314940
[2023-11-14 03:07:01.816643] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1314940/SRR1314940_dna.txt.gz


100%|██████████| 60648/60648 [03:20<00:00, 302.05it/s]


[2023-11-14 03:10:22.646913] Computation for sample SRR1314940 finished. Elapsed time: 0:03:20.830190
[2023-11-14 03:10:22.646954] Total extracted Editing sites: 9702. Total extracted SNPs: 2630. Samples processed: 4

########################
[2023-11-14 03:10:22.647646] Processing sample: SRR1465871
[2023-11-14 03:10:22.687762] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1465871/SRR1465871_dna.txt.gz


100%|██████████| 60349/60349 [03:05<00:00, 325.63it/s]


[2023-11-14 03:13:28.045220] Computation for sample SRR1465871 finished. Elapsed time: 0:03:05.357387
[2023-11-14 03:13:28.045255] Total extracted Editing sites: 11928. Total extracted SNPs: 3132. Samples processed: 5

########################
[2023-11-14 03:13:28.045906] Processing sample: SRR1080366
[2023-11-14 03:13:28.085113] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1080366/SRR1080366_dna.txt.gz


100%|██████████| 58968/58968 [03:16<00:00, 299.44it/s]


[2023-11-14 03:16:45.042755] Computation for sample SRR1080366 finished. Elapsed time: 0:03:16.957236
[2023-11-14 03:16:45.042798] Total extracted Editing sites: 13648. Total extracted SNPs: 3734. Samples processed: 6

########################
[2023-11-14 03:16:45.043642] Processing sample: SRR1469746
[2023-11-14 03:16:45.082080] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1469746/SRR1469746_dna.txt.gz


100%|██████████| 54668/54668 [02:58<00:00, 306.04it/s]


[2023-11-14 03:19:43.742502] Computation for sample SRR1469746 finished. Elapsed time: 0:02:58.660340
[2023-11-14 03:19:43.742545] Total extracted Editing sites: 15265. Total extracted SNPs: 4277. Samples processed: 7

########################
[2023-11-14 03:19:43.743693] Processing sample: SRR1380931
[2023-11-14 03:19:43.782567] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1380931/SRR1380931_dna.txt.gz


100%|██████████| 53805/53805 [02:39<00:00, 337.11it/s]


[2023-11-14 03:22:23.416195] Computation for sample SRR1380931 finished. Elapsed time: 0:02:39.633547
[2023-11-14 03:22:23.416240] Total extracted Editing sites: 17369. Total extracted SNPs: 4822. Samples processed: 8

########################
[2023-11-14 03:22:23.416831] Processing sample: SRR810007
[2023-11-14 03:22:23.452940] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR810007/SRR810007_dna.txt.gz


100%|██████████| 52127/52127 [03:25<00:00, 253.05it/s]


[2023-11-14 03:25:49.474565] Computation for sample SRR810007 finished. Elapsed time: 0:03:26.021185
[2023-11-14 03:25:49.474608] Total extracted Editing sites: 20992. Total extracted SNPs: 6006. Samples processed: 9

########################
[2023-11-14 03:25:49.475155] Processing sample: SRR1329154
[2023-11-14 03:25:49.514394] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1329154/SRR1329154_dna.txt.gz


100%|██████████| 50537/50537 [02:33<00:00, 328.48it/s]


[2023-11-14 03:28:23.394344] Computation for sample SRR1329154 finished. Elapsed time: 0:02:33.879519
[2023-11-14 03:28:23.394396] Total extracted Editing sites: 22688. Total extracted SNPs: 6441. Samples processed: 10

########################
[2023-11-14 03:28:23.395065] Processing sample: SRR1435730
[2023-11-14 03:28:23.433201] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1435730/SRR1435730_dna.txt.gz


100%|██████████| 49742/49742 [02:37<00:00, 315.00it/s]


[2023-11-14 03:31:01.369800] Computation for sample SRR1435730 finished. Elapsed time: 0:02:37.936527
[2023-11-14 03:31:01.370093] Total extracted Editing sites: 24302. Total extracted SNPs: 6965. Samples processed: 11

########################
[2023-11-14 03:31:01.370846] Processing sample: SRR1445835
[2023-11-14 03:31:01.408471] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1445835/SRR1445835_dna.txt.gz


100%|██████████| 47075/47075 [02:34<00:00, 305.46it/s]


[2023-11-14 03:33:35.548582] Computation for sample SRR1445835 finished. Elapsed time: 0:02:34.139835
[2023-11-14 03:33:35.548623] Total extracted Editing sites: 25843. Total extracted SNPs: 7388. Samples processed: 12

########################
[2023-11-14 03:33:35.549264] Processing sample: SRR1452888
[2023-11-14 03:33:35.587397] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1452888/SRR1452888_dna.txt.gz


100%|██████████| 46372/46372 [02:31<00:00, 305.68it/s]


[2023-11-14 03:36:07.337105] Computation for sample SRR1452888 finished. Elapsed time: 0:02:31.749355
[2023-11-14 03:36:07.337150] Total extracted Editing sites: 27523. Total extracted SNPs: 7903. Samples processed: 13

########################
[2023-11-14 03:36:07.337857] Processing sample: SRR1447631
[2023-11-14 03:36:07.376928] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1447631/SRR1447631_dna.txt.gz


100%|██████████| 46158/46158 [02:16<00:00, 337.43it/s]


[2023-11-14 03:38:24.217617] Computation for sample SRR1447631 finished. Elapsed time: 0:02:16.840614
[2023-11-14 03:38:24.217660] Total extracted Editing sites: 28916. Total extracted SNPs: 8391. Samples processed: 14

########################
[2023-11-14 03:38:24.218912] Processing sample: SRR1325483
[2023-11-14 03:38:24.255765] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1325483/SRR1325483_dna.txt.gz


100%|██████████| 42655/42655 [02:10<00:00, 327.48it/s]


[2023-11-14 03:40:34.534439] Computation for sample SRR1325483 finished. Elapsed time: 0:02:10.278596
[2023-11-14 03:40:34.534546] Total extracted Editing sites: 30369. Total extracted SNPs: 8723. Samples processed: 15

########################
[2023-11-14 03:40:34.535501] Processing sample: SRR1328447
[2023-11-14 03:40:34.572802] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1328447/SRR1328447_dna.txt.gz


100%|██████████| 41034/41034 [02:04<00:00, 329.81it/s]


[2023-11-14 03:42:39.018742] Computation for sample SRR1328447 finished. Elapsed time: 0:02:04.445766
[2023-11-14 03:42:39.018784] Total extracted Editing sites: 31418. Total extracted SNPs: 9143. Samples processed: 16

########################
[2023-11-14 03:42:39.019377] Processing sample: SRR1340662
[2023-11-14 03:42:39.056116] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1340662/SRR1340662_dna.txt.gz


100%|██████████| 40817/40817 [02:03<00:00, 330.14it/s]


[2023-11-14 03:44:42.719943] Computation for sample SRR1340662 finished. Elapsed time: 0:02:03.663755
[2023-11-14 03:44:42.719988] Total extracted Editing sites: 32585. Total extracted SNPs: 9588. Samples processed: 17

########################
[2023-11-14 03:44:42.720574] Processing sample: SRR1456711
[2023-11-14 03:44:42.757211] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1456711/SRR1456711_dna.txt.gz


100%|██████████| 40649/40649 [02:09<00:00, 312.92it/s]


[2023-11-14 03:46:52.707960] Computation for sample SRR1456711 finished. Elapsed time: 0:02:09.950680
[2023-11-14 03:46:52.708008] Total extracted Editing sites: 34112. Total extracted SNPs: 9957. Samples processed: 18

########################
[2023-11-14 03:46:52.709071] Processing sample: SRR1468426
[2023-11-14 03:46:52.745275] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1468426/SRR1468426_dna.txt.gz


100%|██████████| 39542/39542 [02:04<00:00, 316.46it/s]


[2023-11-14 03:48:57.739245] Computation for sample SRR1468426 finished. Elapsed time: 0:02:04.993568
[2023-11-14 03:48:57.739281] Total extracted Editing sites: 35439. Total extracted SNPs: 10345. Samples processed: 19

########################
[2023-11-14 03:48:57.740004] Processing sample: SRR1377578
[2023-11-14 03:48:57.775694] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1377578/SRR1377578_dna.txt.gz


100%|██████████| 39397/39397 [02:03<00:00, 318.50it/s]


[2023-11-14 03:51:01.498191] Computation for sample SRR1377578 finished. Elapsed time: 0:02:03.722219
[2023-11-14 03:51:01.498233] Total extracted Editing sites: 36778. Total extracted SNPs: 10709. Samples processed: 20

########################
[2023-11-14 03:51:01.498833] Processing sample: SRR1105272
[2023-11-14 03:51:01.534299] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1105272/SRR1105272_dna.txt.gz


100%|██████████| 36332/36332 [01:49<00:00, 332.87it/s]


[2023-11-14 03:52:50.707736] Computation for sample SRR1105272 finished. Elapsed time: 0:01:49.173357
[2023-11-14 03:52:50.707782] Total extracted Editing sites: 37677. Total extracted SNPs: 11135. Samples processed: 21

########################
[2023-11-14 03:52:50.708639] Processing sample: SRR1500261
[2023-11-14 03:52:50.745058] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1500261/SRR1500261_dna.txt.gz


100%|██████████| 36000/36000 [01:44<00:00, 345.48it/s]


[2023-11-14 03:54:34.974465] Computation for sample SRR1500261 finished. Elapsed time: 0:01:44.229118
[2023-11-14 03:54:34.974512] Total extracted Editing sites: 38508. Total extracted SNPs: 11588. Samples processed: 22

########################
[2023-11-14 03:54:34.975114] Processing sample: SRR1362263
[2023-11-14 03:54:35.011313] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1362263/SRR1362263_dna.txt.gz


100%|██████████| 34206/34206 [01:40<00:00, 340.08it/s]


[2023-11-14 03:56:15.621291] Computation for sample SRR1362263 finished. Elapsed time: 0:01:40.609901
[2023-11-14 03:56:15.621335] Total extracted Editing sites: 39422. Total extracted SNPs: 11917. Samples processed: 23

########################
[2023-11-14 03:56:15.622671] Processing sample: SRR1071807
[2023-11-14 03:56:15.658379] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1071807/SRR1071807_dna.txt.gz


100%|██████████| 32500/32500 [01:34<00:00, 342.53it/s]


[2023-11-14 03:57:50.567693] Computation for sample SRR1071807 finished. Elapsed time: 0:01:34.909048
[2023-11-14 03:57:50.567735] Total extracted Editing sites: 40102. Total extracted SNPs: 12277. Samples processed: 24

########################
[2023-11-14 03:57:50.568774] Processing sample: SRR1490658
[2023-11-14 03:57:50.603654] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1490658/SRR1490658_dna.txt.gz


100%|██████████| 29803/29803 [01:26<00:00, 344.29it/s]


[2023-11-14 03:59:17.195706] Computation for sample SRR1490658 finished. Elapsed time: 0:01:26.591682
[2023-11-14 03:59:17.195750] Total extracted Editing sites: 40910. Total extracted SNPs: 12594. Samples processed: 25

########################
[2023-11-14 03:59:17.196923] Processing sample: SRR1085759
[2023-11-14 03:59:17.231572] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1085759/SRR1085759_dna.txt.gz


100%|██████████| 26477/26477 [01:23<00:00, 315.71it/s]


[2023-11-14 04:00:41.125430] Computation for sample SRR1085759 finished. Elapsed time: 0:01:23.893782
[2023-11-14 04:00:41.125476] Total extracted Editing sites: 41178. Total extracted SNPs: 12927. Samples processed: 26

########################
[2023-11-14 04:00:41.125983] Processing sample: SRR821356
[2023-11-14 04:00:41.158707] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR821356/SRR821356_dna.txt.gz


100%|██████████| 25680/25680 [01:34<00:00, 270.93it/s]


[2023-11-14 04:02:15.971147] Computation for sample SRR821356 finished. Elapsed time: 0:01:34.812386
[2023-11-14 04:02:15.971188] Total extracted Editing sites: 42449. Total extracted SNPs: 13445. Samples processed: 27

########################
[2023-11-14 04:02:15.971825] Processing sample: SRR1420649
[2023-11-14 04:02:16.004663] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1420649/SRR1420649_dna.txt.gz


100%|██████████| 24372/24372 [01:10<00:00, 346.85it/s]


[2023-11-14 04:03:26.298374] Computation for sample SRR1420649 finished. Elapsed time: 0:01:10.293542
[2023-11-14 04:03:26.298420] Total extracted Editing sites: 42839. Total extracted SNPs: 13789. Samples processed: 28

########################
[2023-11-14 04:03:26.299080] Processing sample: SRR809943
[2023-11-14 04:03:26.331135] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR809943/SRR809943_dna.txt.gz


100%|██████████| 21992/21992 [00:51<00:00, 426.52it/s]


[2023-11-14 04:04:17.917208] Computation for sample SRR809943 finished. Elapsed time: 0:00:51.585864
[2023-11-14 04:04:17.917252] Total extracted Editing sites: 43225. Total extracted SNPs: 14052. Samples processed: 29

########################
[2023-11-14 04:04:17.917863] Processing sample: SRR1442708
[2023-11-14 04:04:17.951060] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1442708/SRR1442708_dna.txt.gz


100%|██████████| 20900/20900 [01:01<00:00, 338.92it/s]


[2023-11-14 04:05:19.641977] Computation for sample SRR1442708 finished. Elapsed time: 0:01:01.690868
[2023-11-14 04:05:19.642019] Total extracted Editing sites: 43741. Total extracted SNPs: 14244. Samples processed: 30

########################
[2023-11-14 04:05:19.642718] Processing sample: SRR1437274
[2023-11-14 04:05:19.673383] Loading reditable with tabix and pysam: /lustre/biomed/epicardi/ncbi/dbGaP-6698/sra/SRR1437274/SRR1437274_dna.txt.gz


100%|██████████| 7566/7566 [00:17<00:00, 442.96it/s]

[2023-11-14 04:05:36.776516] Computation for sample SRR1437274 finished. Elapsed time: 0:00:17.103101
[2023-11-14 04:05:36.776552] Total extracted Editing sites: 43898. Total extracted SNPs: 14321. Samples processed: 31
[2023-11-14 04:05:36.777352] Computation Finished. Elapsed time 1:08:44.829018





In [5]:
intervals = pd.DataFrame(intervals, columns=["Region", "Position", "Strand", "AGrna", "AGwgs", "Type", "SRR", "Start", "Stop", "DeltaStartStop", "TabixIntervalLen"])
intervals

Unnamed: 0,Region,Position,Strand,AGrna,AGwgs,Type,SRR,Start,Stop,DeltaStartStop,TabixIntervalLen
0,chr1,880977,2,0.010676,0.000000,Editing,SRR1432650,880927.0,881027.0,101.0,101
1,chr1,934686,2,0.018868,0.000000,Editing,SRR1432650,934636.0,934736.0,101.0,101
2,chr1,934696,2,0.017241,0.000000,Editing,SRR1432650,934646.0,934746.0,101.0,101
3,chr1,934723,2,0.010870,0.000000,Editing,SRR1432650,934673.0,934773.0,101.0,101
4,chr1,991238,2,0.010319,0.000000,Editing,SRR1432650,991188.0,991288.0,101.0,101
...,...,...,...,...,...,...,...,...,...,...,...
58214,chr22,24936970,2,0.477273,0.482143,SNP,SRR1437274,24936920.0,24937020.0,101.0,101
58215,chr22,24938908,2,0.476923,0.500000,SNP,SRR1437274,24938858.0,24938958.0,101.0,101
58216,chrX,43604841,2,1.000000,1.000000,SNP,SRR1437274,43604791.0,43604891.0,101.0,101
58217,chrX,153629155,2,1.000000,1.000000,SNP,SRR1437274,153629105.0,153629205.0,101.0,101


In [6]:
!cat kidney_extracted_pos_neg_sites_rediportal_13112023.feature_vectors.tsv | wc -l

465752


In [7]:
# save intervals as response variable and metadata
intervals.to_csv("kidney_extracted_pos_neg_sites_rediportal_13112023.feature_vectors.metadata.tsv", sep="\t", index=None)

In [8]:
!cat kidney_extracted_pos_neg_sites_rediportal_13112023.feature_vectors.metadata.tsv | head

Region	Position	Strand	AGrna	AGwgs	Type	SRR	Start	Stop	DeltaStartStop	TabixIntervalLen
chr1	880977	2	0.010676156583629894	0.0	Editing	SRR1432650	880927.0	881027.0	101.0	101
chr1	934686	2	0.018867924528301886	0.0	Editing	SRR1432650	934636.0	934736.0	101.0	101
chr1	934696	2	0.017241379310344827	0.0	Editing	SRR1432650	934646.0	934746.0	101.0	101
chr1	934723	2	0.010869565217391304	0.0	Editing	SRR1432650	934673.0	934773.0	101.0	101
chr1	991238	2	0.010318949343339587	0.0	Editing	SRR1432650	991188.0	991288.0	101.0	101
chr1	1191495	2	0.0205761316872428	0.0	Editing	SRR1432650	1191445.0	1191545.0	101.0	101
chr1	1248165	2	0.024691358024691357	0.0	Editing	SRR1432650	1248115.0	1248215.0	101.0	101
chr1	1288523	2	0.01171875	0.0	Editing	SRR1432650	1288473.0	1288573.0	101.0	101
chr1	1309144	2	0.035211267605633804	0.0	Editing	SRR1432650	1309094.0	1309194.0	101.0	101
cat: write error: Broken pipe


In [9]:
!cat kidney_extracted_pos_neg_sites_rediportal_13112023.feature_vectors.metadata.tsv | wc -l

58220


In [10]:
intervals["Type"].value_counts()

Editing    43898
SNP        14321
Name: Type, dtype: int64

In [15]:
intervals.groupby("Type").describe()[["AGrna", "AGwgs"]].T

Unnamed: 0,Type,Editing,SNP
AGrna,count,43898.0,14321.0
AGrna,mean,0.026781,0.91403
AGrna,std,0.056509,0.187875
AGrna,min,0.01,0.38
AGrna,25%,0.011858,0.994318
AGrna,50%,0.014749,1.0
AGrna,75%,0.020408,1.0
AGrna,max,1.0,1.0
AGwgs,count,43898.0,14321.0
AGwgs,mean,0.0,0.914845


In [16]:
43898 + 14321

58219.0

In [None]:
# load features
feat = pd.read_table()