# Jupyter Notebook GenoRobotics Full Pipeline

## Imports

In [1]:
# Load IPython's autoreload extension and enable mode 2, which re-imports
# modules before each cell runs, so edits to imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from lib.consensus.consensus import run_consensus
from lib.identification.identification import run_identification



## Define Your File and Folder Paths

- Modify the "input_fastq_filename" variable to the name of your input FASTQ file; the file is read from the "assets/input" directory.
  
- Modify the "output_base_dir" variable to point to the directory where you want the output files to be saved; a subfolder named after the input file is created inside it.

In [3]:
# --- Input ---------------------------------------------------------------
# Name of the FASTQ file to process; it is looked up under assets/input.
input_fastq_filename = "rbcL_Qiagen_tomato_5000.fastq"
input_fastq_path = os.path.join("assets", "input", input_fastq_filename)

# File name without its final extension; used as the run name below and
# passed to the consensus and identification steps.
base_name = os.path.splitext(input_fastq_filename)[0]

# --- Output --------------------------------------------------------------
# Each run gets its own folder: assets/output/<base_name>.
output_base_dir = os.path.join("assets", "output")
output_dir = os.path.join(output_base_dir, base_name)

# Create the run folder (and any missing parents); a no-op if it already exists.
os.makedirs(output_dir, exist_ok=True)

## Run Preprocessing (Optional)

In [3]:
# Optional preprocessing step, currently disabled.
# NOTE(review): no `preprocessing` function is imported in the imports cell
# of this notebook; import it before uncommenting the call below.
# preprocessing()

## Run Consensus Sequence Generation

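Select which consensus sequence generation method you want to use by setting `consensus_method` in the code cell below. That cell lists "80_20_best_sequence" and "80_20_longest_sequence" as the available choices; the names listed here may be outdated, so check them against `lib/consensus` before relying on them: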

- "majority" (default)

- "consensus"

- "consensus_with_ambiguities"

In [5]:
# Consensus method to run. Choose one of the following:
# - "80_20_best_sequence"
# - "80_20_longest_sequence"
# Kept as a named variable so the method can be changed in one obvious place
# instead of editing the call below.
consensus_method = "80_20_best_sequence"

# If you're on Windows and have to use WSL (Windows Subsystem for Linux),
# set wsl to True.
wsl = True

# Run the consensus pipeline on the input FASTQ; progress and tool logs are
# printed below the cell.
run_consensus(
    input_name=base_name,
    input_fastq_path=input_fastq_path,
    output_dir=output_dir,
    consensus_method=consensus_method,
    wsl=wsl,
)

Running consensus pipeline...
Running consensus pipeline with 80_20_best_sequence method...
Running read alignment with minimap2 on top 20% sequences...


ERROR:root:Error: [M::mm_idx_gen::0.099*0.37] collected minimizers
[M::mm_idx_gen::0.111*0.59] sorted minimizers
[M::main::0.111*0.59] loaded/built the index for 1000 target sequence(s)
[M::mm_mapopt_update::0.113*0.59] mid_occ = 608
[M::mm_idx_stat] kmer size: 15; skip: 5; is_hpc: 0; #seq: 1000
[M::mm_idx_stat::0.114*0.59] distinct minimizers: 61942 (78.57% are singletons); average occurrences: 3.831; average spacing: 2.966; total length: 703802
[M::worker_pipeline::2.593*2.53] mapped 1000 sequences
[M::main] Version: 2.24-r1122
[M::main] CMD: minimap2 -x ava-ont assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq
[M::main] Real time: 2.605 sec; CPU: 6.569 sec; Peak RSS: 0.062 GB



Generating consensus sequence with racon on top 20% sequences...


ERROR:root:Error: [racon::Polisher::initialize] loaded target sequences 0.017505 s
[racon::Polisher::initialize] loaded sequences 0.022180 s
[racon::Polisher::initialize] loaded overlaps 1.056981 s
[racon::Polisher::initialize] aligning overlaps [=>                  ] 0.049785 s
[racon::Polisher::initialize] aligning overlaps [==>                 ] 0.057227 s
[racon::Polisher::initialize] aligning overlaps [===>                ] 0.062695 s
[racon::Polisher::initialize] aligning overlaps [====>               ] 0.067247 s
[racon::Polisher::initialize] aligning overlaps [=====>              ] 0.071263 s
[racon::Polisher::initialize] transformed data into windows 0.001117 s
[racon::Polisher::polish] generating consensus [=>                  ] 0.440261 s
[racon::Polisher::polish] generating consensus [==>                 ] 0.593741 s
[racon::Polisher::polish] generating consensus [===>                ] 0.633801 s
[racon::Polisher::polish] generating consensus [====>               ] 0.638918

Multiple sequences found in assets\output\rbcL_Qiagen_tomato_5000\rbcL_Qiagen_tomato_5000_top20_consensus.fasta. Selecting the best alignment...
Running read alignment with minimap2 on remaining 80% sequences...


ERROR:root:Error: [M::mm_idx_gen::0.043*0.23] collected minimizers
[M::mm_idx_gen::0.043*0.23] sorted minimizers
[M::main::0.044*0.23] loaded/built the index for 1 target sequence(s)
[M::mm_mapopt_update::0.044*0.23] mid_occ = 10
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.044*0.24] distinct minimizers: 129 (100.00% are singletons); average occurrences: 1.000; average spacing: 5.961; total length: 769
[M::worker_pipeline::0.302*0.32] mapped 4000 sequences
[M::main] Version: 2.24-r1122
[M::main] CMD: minimap2 -x map-ont assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20_consensus.fasta assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_remaining80.fastq
[M::main] Real time: 0.313 sec; CPU: 0.097 sec; Peak RSS: 0.005 GB



Generating final consensus sequence with racon...


ERROR:root:Error: [racon::Polisher::initialize] loaded target sequences 0.001826 s
[racon::Polisher::initialize] loaded sequences 0.060633 s
[racon::Polisher::initialize] loaded overlaps 0.014488 s
[racon::Polisher::initialize] aligning overlaps [=>                  ] 0.026338 s
[racon::Polisher::initialize] aligning overlaps [==>                 ] 0.034707 s
[racon::Polisher::initialize] aligning overlaps [===>                ] 0.042996 s
[racon::Polisher::initialize] aligning overlaps [====>               ] 0.050167 s
[racon::Polisher::initialize] aligning overlaps [=====>              ] 0.056922 s
[racon::Polisher::initialize] transformed data into windows 0.001254 s
[racon::Polisher::polish] generated consensus 4.952049 s
[racon::Polisher::] total = 5.154822 s



Deleting intermediate files...
Minimap2 alignment took 0.48 seconds.
Total Racon iterations took 10.34 seconds.
Total time taken for the pipeline: 10.82 seconds.


## Run Identification of Consensus Sequence

In [90]:
# Choose the database matching the gene you are trying to identify:
# matK, rbcL, psbA-trnH or ITS.
# NOTE(review): db is left as None here although the input file name suggests
# rbcL; how run_identification treats None is not visible from this
# notebook — confirm before relying on it.
db = None

# Identify the consensus sequence produced for this run (base_name) against db.
run_identification(base_name, db)