In [1]:
from custom_random_forest import RandomForestClassifierCustom

from bio_files_processor import OpenFasta

from bioinf_starter_pack import (run_genscan,
                                 RNASequence,
                                 AminoAcidSequence,
                                 DNASequence
                                 )

## Class RandomForestClassifierCustom
- `custom_random_forest.py`

Methods `fit` and `predict` are CPU-bound tasks and, in theory, can be sped up with parallel processes. In practice, however, `predict` was accelerated more by threads (called "flows" below) than by processes. To show the difference, an `is_process` argument was added to `predict`: passing `is_process=False` switches it from processes to threads.

In [2]:
from sklearn.datasets import make_classification

In [3]:
# Synthetic classification dataset used for all timings below.
# A fixed random_state makes the generated data identical on every
# Restart & Run All, so timings and predictions are comparable between runs.
X, y = make_classification(n_samples=1_000_000, random_state=42)
# Number of trees shared by both forests built in this notebook.
n_estimators = 50

#### Running the program with `n_jobs=1`

In [4]:
# Forest used for the single-job timings; hyperparameters match the
# forest built later for the two-job run.
random_forest = RandomForestClassifierCustom(
    max_depth=20,
    n_estimators=n_estimators,
    max_features=2,
    random_state=42,
)

In [6]:
%%time
# Fit with a single job (n_jobs=1): the baseline for the n_jobs=2 fit below.
random_forest.fit(X, y, n_jobs=1)

CPU times: user 862 ms, sys: 3.29 s, total: 4.15 s
Wall time: 4min 30s


- Parallel processes

In [7]:
%%time
# Predict with one job; is_process is not passed, so the default mode is used
# (the notebook labels this run "parallel processes").
result_1process = random_forest.predict(X, n_jobs=1)

CPU times: user 1.29 s, sys: 5.53 s, total: 6.82 s
Wall time: 26.1 s


- Parallel threads ("flows")

In [8]:
%%time
# Same prediction with is_process=False (the run labelled "parallel flows").
result_1flow = random_forest.predict(X, n_jobs=1, is_process=False)

CPU times: user 2.99 s, sys: 284 ms, total: 3.27 s
Wall time: 3.27 s


#### Running the program with `n_jobs=2`

In [9]:
# Fresh, unfitted forest for the two-job timings; rebinding `random_forest`
# discards the model fitted in the single-job section.
random_forest = RandomForestClassifierCustom(
    max_depth=20,
    n_estimators=n_estimators,
    max_features=2,
    random_state=42,
)

In [10]:
%%time
# Fit with two jobs (n_jobs=2) to compare against the single-job fit above.
random_forest.fit(X, y, n_jobs=2)

CPU times: user 648 ms, sys: 2.58 s, total: 3.23 s
Wall time: 2min 22s


- Parallel processes

In [11]:
%%time
# Predict with two jobs; is_process is not passed, so the default mode is used
# (the notebook labels this run "parallel processes").
result_2processes = random_forest.predict(X, n_jobs=2)

CPU times: user 1.03 s, sys: 4.35 s, total: 5.38 s
Wall time: 17.4 s


- Parallel threads ("flows")

In [12]:
%%time
# Same prediction with is_process=False (the run labelled "parallel flows").
result_2flows = random_forest.predict(X, n_jobs=2, is_process=False)

CPU times: user 3.16 s, sys: 298 ms, total: 3.46 s
Wall time: 1.82 s


#### Comparison of results

In [13]:
# Predictions should not depend on the number of jobs: compare the 1-job and
# 2-job results element-wise, separately for each execution mode.
flows_match = all(result_1flow == result_2flows)
flows_match and all(result_1process == result_2processes)

True

## Class OpenFasta
- `bio_files_processor.py`

In [14]:
# Iterate over the records of the example FASTA file and print each one,
# followed by a blank separator line.
with OpenFasta('data/example_fasta.fasta', 'r') as fasta_file:
    for record in fasta_file:
        print(record, end='\n\n')

GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+) 
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG

GTD678345 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+) 
TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT

GTD174893 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+) 
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT

GTD906783 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-) 
TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCG

## Function run_genscan
- `bioinf_starter_pack.py`

In [15]:
# Run GENSCAN on the example sequence. The captured output starts with a
# "Status: 200" line, so this call presumably performs a network request
# (TODO confirm against bioinf_starter_pack.py).
sequence_path = 'data/sequence.fasta'
result = run_genscan(sequence_file=sequence_path)
result

Status: 200

Predicted peptides: ['KPRAAEVVTLKKGRGPTLRTRGFAMAMSSGGSGGGVPEQEDSVLFRRGTGQHQVDPLTGVVVRVFLRLFPLRSLRTTSASSFLEQFGLLDSVEWRPCRDLAVAYFVHSSLAGVGESDDSDIWDDTALIKAYDKAVASFKHALKNGDICETSGKPKTTPKRKPAKKNKSQKKNTAASLQQWKVGDKCSAIWSEDGCIYPATIASIDFKRETCVVVYTGYGNREEQNLSDLLSPICEVANNIEQNAQENENESQVSTDESENSRSPGNKSDNIKPKSAPWNSFLPPPPPMPGPRLGPGKIIPPPPPICPDSLDDADALGSMLISWYMSGYHTGYYMKSKQNAND', 'MVEPVYRTLHPTSTEYTFFSAPHHTYSKIDHIVGSKALLSKRKRTEIITNCLSDHSAIKLELRNKKLTQNHSTTWKLNNLLLNDYWVHNEMKAEIKMFFETNENKDTTYQNFWDTFKAVCRGKFIALNAHKRKQERSKIDTLTSQLKELEKQEQTHSKASRRQEITKIRAELKEIETQKTLQKINPGAEIQTTIREYYKHFYANKLENLEEMDKFLDTYTLPRLNQEEVESLNGPITGSEIVAIINSLPTKKSPGPDGFTAEFYQRYKEELVPFLLKLFQSIEKEGILPNSFYEASIILIPKPGRDTTKKENFRLISLMNIDAKIINKILANRIQQHIKKLIQHDQVGFIPGMQGWFNIRKSINVIQHINRTKDKNHMIISIDAEKAFDKIQQSFMLKTLNK', 'MVLASARLLGRPQETKIMAEGFYLLHSTNTALMRVTNELHIAKLVNTSQSSL']

Discovered introns: {1.01: [288, 373], 1.02: [566, 13941], 1.03: [14014, 16491], 1.04: [16612, 17460], 1.05: [17662, 17820], 1.06: [17974, 21168], 1.07: [212

In [16]:
# The peptide strings reported as "Predicted peptides" in the output above.
result.cds_list

['KPRAAEVVTLKKGRGPTLRTRGFAMAMSSGGSGGGVPEQEDSVLFRRGTGQHQVDPLTGVVVRVFLRLFPLRSLRTTSASSFLEQFGLLDSVEWRPCRDLAVAYFVHSSLAGVGESDDSDIWDDTALIKAYDKAVASFKHALKNGDICETSGKPKTTPKRKPAKKNKSQKKNTAASLQQWKVGDKCSAIWSEDGCIYPATIASIDFKRETCVVVYTGYGNREEQNLSDLLSPICEVANNIEQNAQENENESQVSTDESENSRSPGNKSDNIKPKSAPWNSFLPPPPPMPGPRLGPGKIIPPPPPICPDSLDDADALGSMLISWYMSGYHTGYYMKSKQNAND',
 'MVEPVYRTLHPTSTEYTFFSAPHHTYSKIDHIVGSKALLSKRKRTEIITNCLSDHSAIKLELRNKKLTQNHSTTWKLNNLLLNDYWVHNEMKAEIKMFFETNENKDTTYQNFWDTFKAVCRGKFIALNAHKRKQERSKIDTLTSQLKELEKQEQTHSKASRRQEITKIRAELKEIETQKTLQKINPGAEIQTTIREYYKHFYANKLENLEEMDKFLDTYTLPRLNQEEVESLNGPITGSEIVAIINSLPTKKSPGPDGFTAEFYQRYKEELVPFLLKLFQSIEKEGILPNSFYEASIILIPKPGRDTTKKENFRLISLMNIDAKIINKILANRIQQHIKKLIQHDQVGFIPGMQGWFNIRKSINVIQHINRTKDKNHMIISIDAEKAFDKIQQSFMLKTLNK',
 'MVLASARLLGRPQETKIMAEGFYLLHSTNTALMRVTNELHIAKLVNTSQSSL']

In [17]:
# The mapping reported as "Discovered introns" above: a numeric label mapped
# to a two-element [start, end]-style list.
result.intron_list

{1.01: [288, 373],
 1.02: [566, 13941],
 1.03: [14014, 16491],
 1.04: [16612, 17460],
 1.05: [17662, 17820],
 1.06: [17974, 21168],
 1.07: [21280, 24519],
 1.08: [24547, 26921],
 1.09: [26928, 27032],
 2.03: [27029, 33245],
 2.02: [32604, 33932],
 2.01: [33370, 36327],
 2.0: [36290, 36625],
 3.03: [36622, 38226],
 3.02: [38131, 41053]}

## Class BiologicalSequence
- `bioinf_starter_pack.py`

#### RNA

In [18]:
# Demonstrate RNASequence: alphabet validation, GC content and complement,
# each printed on its own line in that order.
rna_seq = RNASequence('AUGC')
for rna_check in (rna_seq.is_alphabet_correct,
                  rna_seq.gc_content,
                  rna_seq.complement):
    print(rna_check())

True
50.0
UACG


#### Protein

In [19]:
# Demonstrate AminoAcidSequence: string form, slicing and the
# alternative-frame search.
protein = AminoAcidSequence('MYRHHWWMYYYYYYY')
print(protein)
first_ten_residues = protein[:10]
print(first_ten_residues)
alt_frames = protein.search_for_alt_frames()
print(alt_frames)

MYRHHWWMYYYYYYY
MYRHHWWMYY
[MYYYYYYY]


#### DNA

In [20]:
# Demonstrate DNASequence.complement on a four-base sequence.
seq = DNASequence('ATGC')
dna_complement = seq.complement()
print(dna_complement)

TACG
