In [1]:
import logging
from itertools import count
from pathlib import Path
from uBERTa.datasets_generator import DatasetGenerator

In [2]:
# Optionally, turn on INFO-level logging to see the generation progress
# (the messages are emitted under the `uBERTa.datasets_generator` logger)
logging.basicConfig(level=logging.INFO)

In [3]:
# Replace with relevant download paths.
# DS_PATH points to the dataset table (TSV) and REF_PATH to the reference
# FASTA file. Both are wrapped in `Path` to match the `pathlib.Path`
# annotations of DatasetGenerator's `ds_path` and `ref_path` parameters.
DS_PATH, REF_PATH = Path('DS_BASE.tsv'), Path('hg38.fa')

In [4]:
# IPython introspection: display the help (init signature) of DatasetGenerator
?DatasetGenerator

Init signature:
DatasetGenerator(
    ds_path: pathlib.Path,
    ref_path: pathlib.Path,
    neg_multiplier: int,
    neg_fractions: Tuple[float, float, float, float],
    pos_fractions: Tuple[float, float, float] = (1.0, 1.0, 1.0),
    level_ts: float = 0,
    flank_size

- Most of the sampling time is spent on scanning the genome, triplet-wise, starting from a random location, for a particular start codon
- The number of "expected" negative samples will be slightly higher than the number of actually sampled ones

In [5]:
# Configure the generator. The fraction tuples are positional: the inline
# comment next to each value names the sample category it controls.
dsg = DatasetGenerator(
    ds_path=DS_PATH,
    ref_path=REF_PATH,
    neg_multiplier=1,
    neg_fractions=(
        0.0,  # Completely random samples
        0.5,  # Random samples centered on start codons
        0.5,  # Valid uORFs without experimental support
        0.0,  # Valid uORFs with experimental support
    ),
    pos_fractions=(
        0.8,  # u samples  (uORF start codons)
        0.8,  # ma samples (alternative start codons)
        0.5,  # m samples  (CDS start codons)
    ),
    flank_size=200,
)

# Endless lazy stream of datasets: every `next(gen)` triggers a fresh `dsg()` call
gen = map(lambda _: dsg(), count())

In [6]:
%%time
# Draw one dataset from the infinite generator and preview its first rows;
# the cell magic above reports how long the generation takes
next(gen).head()

INFO:uBERTa.datasets_generator:Obtained a dataset with 830191 records
INFO:uBERTa.datasets_generator:Found 7426 positive and 822765 negative records
INFO:uBERTa.datasets_generator:Will sample 3680=4600 * 0.8 positive examples of group u
INFO:uBERTa.datasets_generator:Will sample 388=485 * 0.8 positive examples of group ma
INFO:uBERTa.datasets_generator:Will sample 1170=2341 * 0.5 positive examples of group m
INFO:uBERTa.datasets_generator:Sampled 5238 positive examples
INFO:uBERTa.datasets_generator:Expecting around 5238 total negative examples
INFO:uBERTa.datasets_generator:Num samples after the per-codon correction: 5232
INFO:uBERTa.datasets_generator:Found 1147 codons ATG
INFO:uBERTa.datasets_generator:Found 752 codons CTG
INFO:uBERTa.datasets_generator:Found 288 codons GTG
INFO:uBERTa.datasets_generator:Found 143 codons ACG
INFO:uBERTa.datasets_generator:Found 131 codons TTG
INFO:uBERTa.datasets_generator:Found 72 codons ATC
INFO:uBERTa.datasets_generator:Found 63 codons ATT
INFO:u

Fetching seqs:   0%|          | 0/10470 [00:00<?, ?it/s]

CPU times: user 10.4 s, sys: 3.36 s, total: 13.8 s
Wall time: 13.7 s


Unnamed: 0,Chrom,StartCodonStart,Strand,IsPositive,Seq
0,chr5,36877057,+,1,GACTCACCCGACACCACCAAGCCGCAGGGAGGGACGCCCCCGCCGA...
1,chr20,10420750,-,1,CTGCCTGCTCCCTTTGGGTCAATGACATGTCTCTTTTTGTTTATTA...
2,chr13,102846126,+,1,TGCACCCCGGTCTTCCATTAGCGGCGCAGACGTTTGGGCCTAAGCG...
3,chr4,139454129,+,1,CTAGGTAGACCGGCGCCAGCCCGAGTGACGCCTGGCGTGTGGCCGC...
4,chr19,7522635,+,1,CAGAGTCAGCCCATCCCCCGCCACCCAGAGCGCGTCGGCGCTAGGA...


In [7]:
# With drop_meta enabled, the generated frame keeps only the `Seq` and
# `IsPositive` columns (Chrom, StartCodonStart and Strand are no longer returned)
dsg.drop_meta = True
dsg().head()

INFO:uBERTa.datasets_generator:Obtained a dataset with 830191 records
INFO:uBERTa.datasets_generator:Found 7426 positive and 822765 negative records
INFO:uBERTa.datasets_generator:Will sample 3680=4600 * 0.8 positive examples of group u
INFO:uBERTa.datasets_generator:Will sample 388=485 * 0.8 positive examples of group ma
INFO:uBERTa.datasets_generator:Will sample 1170=2341 * 0.5 positive examples of group m
INFO:uBERTa.datasets_generator:Sampled 5238 positive examples
INFO:uBERTa.datasets_generator:Expecting around 5238 total negative examples
INFO:uBERTa.datasets_generator:Num samples after the per-codon correction: 5234
INFO:uBERTa.datasets_generator:Found 1158 codons ATG
INFO:uBERTa.datasets_generator:Found 736 codons CTG
INFO:uBERTa.datasets_generator:Found 281 codons GTG
INFO:uBERTa.datasets_generator:Found 146 codons ACG
INFO:uBERTa.datasets_generator:Found 141 codons TTG
INFO:uBERTa.datasets_generator:Found 73 codons ATC
INFO:uBERTa.datasets_generator:Found 63 codons ATT
INFO:u

Fetching seqs:   0%|          | 0/10472 [00:00<?, ?it/s]

Unnamed: 0,Seq,IsPositive
0,AGCTCGGAAGACCGCGAGGCGCGCTTTTCTGACGCATCGGCGCCTT...,1
1,GCGTCCTCCTGCCGGCCTGCAGGCCCGGGGCCTCCGCCTGCTTCCC...,1
2,GGCTAATTCCGAATTCCAAATCGGAAGCAAGAGGGCGGGGCCCCGT...,1
3,GCTCGTGGACTAGCTCCGCCTCCTAGCGTCTGACTACCCCCTCAGC...,1
4,CTGCAGGTACGCGCGGGCCGGGCGGGGCGGGCGGGCGGCGGGCGCG...,1


In [8]:
# With kmer_size set, `Seq` comes back as space-separated overlapping k-mers
# (here 6-mers shifted by one base); drop_meta is still True from the previous cell
dsg.kmer_size = 6
dsg().head()

INFO:uBERTa.datasets_generator:Obtained a dataset with 830191 records
INFO:uBERTa.datasets_generator:Found 7426 positive and 822765 negative records
INFO:uBERTa.datasets_generator:Will sample 3680=4600 * 0.8 positive examples of group u
INFO:uBERTa.datasets_generator:Will sample 388=485 * 0.8 positive examples of group ma
INFO:uBERTa.datasets_generator:Will sample 1170=2341 * 0.5 positive examples of group m
INFO:uBERTa.datasets_generator:Sampled 5238 positive examples
INFO:uBERTa.datasets_generator:Expecting around 5238 total negative examples
INFO:uBERTa.datasets_generator:Num samples after the per-codon correction: 5236
INFO:uBERTa.datasets_generator:Found 1141 codons ATG
INFO:uBERTa.datasets_generator:Found 752 codons CTG
INFO:uBERTa.datasets_generator:Found 289 codons GTG
INFO:uBERTa.datasets_generator:Found 140 codons TTG
INFO:uBERTa.datasets_generator:Found 137 codons ACG
INFO:uBERTa.datasets_generator:Found 71 codons ATC
INFO:uBERTa.datasets_generator:Found 68 codons ATT
INFO:u

Fetching seqs:   0%|          | 0/10474 [00:00<?, ?it/s]

Unnamed: 0,Seq,IsPositive
0,GAGCGT AGCGTT GCGTTG CGTTGC GTTGCG TTGCGG TGCG...,1
1,CCGGGG CGGGGC GGGGCT GGGCTA GGCTAC GCTACA CTAC...,1
2,AGTGTT GTGTTT TGTTTG GTTTGT TTTGTT TTGTTA TGTT...,1
3,ACTCCC CTCCCA TCCCAC CCCACC CCACCG CACCGA ACCG...,1
4,GGCTGA GCTGAA CTGAAT TGAATC GAATCG AATCGC ATCG...,1
