# Jupyter Notebook GenoRobotics Full Pipeline

## Imports

In [1]:
# Load IPython's autoreload extension and enable mode 2, which reloads
# imported modules before each cell executes, so edits to the project's
# lib/ code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

from lib.consensus.consensus import run_consensus
from lib.identification.identification import run_identification
from lib.general_helpers.process_fastq import concatenate_fastq

## Define Your File and Folder Paths

- Set the `input_fastq_filename` variable to the name of the fastq file you want to use as input. The file is looked up in the `assets/input` folder.
  
- Set `wsl` to `True` if you're on Windows and want to use WSL; otherwise leave it as `False`.

In [3]:
# Name of the FASTQ file to process (file name only, not a full path).
input_fastq_filename = "rbcL_Qiagen_tomato_5000.fastq"

# Flag forwarded to run_consensus; switch to True on Windows when using WSL.
wsl = False

In [4]:
# File name without its extension; passed to the pipeline steps as the run name.
base_name = os.path.splitext(input_fastq_filename)[0]

# Location of the input file, relative to the working directory: assets/input/<file name>.
input_fastq_path = os.path.join("assets", "input", input_fastq_filename)

## Run Preprocessing (Optional)

- Preprocessing is optional. If you want to preprocess your data, uncomment the `concatenate_fastq` call in the following cell and run it. Otherwise, skip that cell.
- For now, preprocessing consists of concatenating all the fastq files in a folder into one file. This is useful if you have multiple fastq files for one sample. You'll have to change the first argument of the `concatenate_fastq` call to point to the folder containing your fastq files.

In [5]:
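# Optional preprocessing: uncomment the call below to concatenate the FASTQ files of a folder (here assets/input/barcode74).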
#concatenate_fastq(os.path.join("assets", "input", "barcode74"), input_fastq_path)

## Run Consensus Sequence Generation

Select which consensus sequence generation method you want to use by setting the "consensus_method" variable to one of:

- "80_20_best_sequence" (used in the cell below)

- "80_20_longest_sequence"

- "straightforward_best_sequence"

In [6]:
# Consensus methods to choose from:
#   - "80_20_best_sequence"
#   - "80_20_longest_sequence"
#   - "straightforward_best_sequence"
consensus_method = "80_20_best_sequence"

# Kept as the cell's last expression so the value returned by
# run_consensus is displayed as the cell output.
run_consensus(
    input_name=base_name,
    input_fastq_path=input_fastq_path,
    consensus_method=consensus_method,
    wsl=wsl,
)

Logging set up at assets/output/post/rbcL_Qiagen_tomato_5000/consensus/rbcL_Qiagen_tomato_5000_consensus_pipeline_log.log


(14.665834903717041, 2.9280059337615967, 11.737828969955444)

## Run Identification of Consensus Sequence
- Run the following cell to identify the consensus sequence.
- Change db to the database you want to use. Options are "matK", "rbcL", "psbA-trnH" and "ITS". If you want to use all of them, set db to None.

In [7]:
# Identification methods to choose from:
#   - "blastn"
identification_method = "blastn"

# Database for the gene being identified: "matK", "rbcL", "psbA-trnH" or "ITS".
# NOTE(review): the input file name starts with "rbcL" while db is "matK" —
# confirm this is the intended database.
db = "matK"

# Kept as the cell's last expression so the returned result is displayed.
run_identification(
    base_name,
    db=db,
    identification_method=identification_method,
)

Logging set up at assets/output/post/rbcL_Qiagen_tomato_5000/identification/rbcL_Qiagen_tomato_5000_identification_pipeline_log.log


({'matK': {'species': 'Solanum lycopersicum',
   'alignment': 98.96193771626297,
   'evalue': 0.0}},
 1.3930041790008545)