## Imports

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from bio_files_processor import OpenFasta
from custom_random_forest import RandomForestClassifierCustom
from ultimate_tools import run_genscan

# Single seed reused for every random_state argument in this notebook so that
# the generated dataset, the train/test split and the forest are reproducible.
SEED = 12345

## RandomForestClassifierCustom

In [2]:
# Synthetic classification dataset with 100000 samples, seeded for reproducibility.
X, y = make_classification(
    n_samples=100000,
    random_state=SEED,
)

# Custom forest under test; the same instance is re-fitted in the timing cells below.
clf = RandomForestClassifierCustom(
    max_depth=8,
    max_features=20,
    random_state=SEED,
)

In [3]:
%%time

# Baseline run: fit and predict without passing n_jobs.
# The split is seeded with SEED, so it is identical to the split in the
# n_jobs=2 cell below. Note that the reported time also includes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
clf.fit(X_train, y_train)

# Kept for the equality check against the n_jobs=2 run.
result_1 = clf.predict(X_test)

CPU times: user 40.4 ms, sys: 51.2 ms, total: 91.7 ms
Wall time: 9.66 s


In [4]:
%%time

# Same experiment as the previous timed cell, but with n_jobs=2 passed to both
# fit and predict. The seeded split reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
# Re-fits the same clf instance that the baseline cell fitted.
clf.fit(X_train, y_train, n_jobs=2)

# Kept for the equality check against the baseline run.
result_2 = clf.predict(X_test, n_jobs=2)

CPU times: user 77.6 ms, sys: 114 ms, total: 192 ms
Wall time: 4.87 s


In [5]:
# Sanity check: predictions from the baseline run and the n_jobs=2 run must
# match element-wise (assumes both are NumPy arrays, since .all() is called
# on the comparison result — TODO confirm).
(result_1 == result_2).all()

True

## OpenFasta

In [8]:
# Read a single record from the example FASTA file and print it.
with OpenFasta("data/example.fasta") as fasta:
    first_record = fasta.read_record()
    print(first_record)

>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG


In [16]:
# Read every record from the example FASTA file and show only the first and
# the last one.
with OpenFasta("data/example.fasta") as fasta:
    records = fasta.read_records()
    # Explicit indexing is used instead of records[::len(records) - 1]: that
    # slice raises "ValueError: slice step cannot be zero" when the file holds
    # exactly one record. With zero or one record we show what is there.
    endpoints = [records[0], records[-1]] if len(records) > 1 else records[:1]
    print(endpoints)

[>GTD323452 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+)
ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG, >GTD129563 16S_rRNA NODE_4_length_428221_cov_75.638017:281055-282593(-)
CGGACGGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT]


## run_genscan

In [7]:
# Run run_genscan (imported from ultimate_tools) on the example FASTA file.
# NOTE(review): the printed result below starts with "Status: 200", so this
# presumably performs an HTTP request and needs network access — confirm.
result_genscan = run_genscan(sequence_file="data/nxf1_homo_sapiens.fasta")

In [8]:
# Print the text form of the run_genscan result obtained in the previous cell.
print(result_genscan)

Status: 200

Predicted peptide:
MADEGKSYSEHDDERVNFPQRKKKGRGPFRWKYGEGNRRSGRGGSGIRSSRLEEDDGDVAMSDAQDGPRVRYNPYTTRPNRRGDTWHDRDRIHVTVRRDRAPPERGGAGTSQDGTSKNWFKITIPYGRKYDKAWLLSMIQSKCSVPFTPIEFHYENTRAQFFVEDASTASALKAVNYKILDRENRRISIIINSSAPPHTILNELKPEQVEQLKLIMSKRYDGSQQALDLKGLRSDPDLVAQNIDVVLNRRSCMAATLRIIEENIPELLSLNLSNNRLYRLDDMSSIVQKAPNLKILNLSGNELKSERELDKIKGLKLEELWLDGNSLCDTFRDQSTYISAIRERFPKLLRLDGHELPPPIAFDVEAPTTLPPCKGSYFGTENLKSLVLHFLQQYYAIYDSGDRQGLLDAYHDGACCSLSIPFIPQNPARSSLAEYFKDSRNVKKLKDPTLRFRLLKHTRLNVVAFLNELPKTQHDVNSFVVDISAQTSTLLCFSVNGVFKEVDGKSRDSLRAFTRTFIAVPASNSGLCIVNDELFVRNASSEEIQRAFAMPAPTPSSSPVPTLSPEQQEMLQAFSTQSGMNLEWSQKCLQDNNWDYTRSAQAFTHLK

Predicted introns:
[['1.01', 154, 1503], ['1.02', 1691, 1909], ['1.03', 2064, 3221], ['1.04', 3306, 3435], ['1.05', 3541, 3662], ['1.06', 3744, 3850], ['1.07', 3921, 4064], ['1.08', 4154, 4280], ['1.09', 4389, 4995], ['1.10', 5106, 6906], ['1.11', 6944, 8095], ['1.12', 8165, 8243], ['1.13', 8300, 8914], ['1.14', 9023, 9136], ['1.15', 9196, 9300], ['1.16', 9

## Tests

In [9]:
# Run the pytest suite through a shell escape, using the python3.12 found on PATH.
# NOTE(review): that interpreter is not necessarily the one backing this kernel —
# confirm both share the same environment.
! python3.12 -m pytest

platform linux -- Python 3.12.3, pytest-8.2.0, pluggy-1.5.0
rootdir: /home/artemvaska/BI/Python/UltimateBioinformaticsTools
plugins: anyio-4.3.0
collected 8 items                                                              [0m

test_ubt.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                     [100%][0m

