# Jupyter Notebook GenoRobotics Full Pipeline

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from lib.consensus.consensus import run_consensus
from lib.identification.identification import run_identification
from lib.general_helpers.process_fastq import concatenate_fastq, extract_gz

## Define Your File and Folder Paths

- Modify the "input_src" variable to point to the directory containing the input files. 
  
- Modify the "output_src" variable to point to the directory where you want the output files to be saved.

In [19]:
expedition_folder = "matK_rbcL_trnh_ITS_27samples_Jardin_botanique_1st"
consensus_method = "80_20_best_sequence"
wsl = True

In [20]:
input_base = 'assets/input'

## Run Identification of Consensus Sequence

In [21]:
for root, dirs, files in os.walk(input_base):
        if root.endswith('fastq_pass'):
            i = 0
            for barcode_folder in dirs:
                print("Processing barcode folder: ", barcode_folder, " still in progress : ", i/len(dirs)*100, "%")
                barcode_path = os.path.join(root, barcode_folder)
                output_fastq = os.path.join(barcode_path, f"{barcode_folder}.fastq")
                intermediate_files = []

                # Extract .gz files and store paths for later deletion
                if not os.path.isfile(output_fastq):
                    for file in os.listdir(barcode_path):
                        if file.endswith('.gz'):
                            src_file = os.path.join(barcode_path, file)
                            dst_file = os.path.join(barcode_path, file[:-3])
                            extract_gz(src_file, dst_file)
                            intermediate_files.append(dst_file)

                    concatenate_fastq(barcode_path, output_fastq)

                    run_consensus(barcode_folder, output_fastq, consensus_method, barcode_path, wsl=wsl)

                if os.path.getsize(os.path.join(barcode_path, f"{barcode_folder}_final_consensus.fasta")) != 0 :
                    print("Running identification for barcode folder: ", barcode_folder)
                    run_identification(barcode_folder, expedition_name=expedition_folder, input_path=barcode_path)


                # Delete intermediary files
                for file in intermediate_files:
                    os.remove(file)

                i += 1
                print("\n")

Processing barcode folder:  barcode02  still in progress :  0.0 %


Processing barcode folder:  barcode07  still in progress :  2.272727272727273 %


Processing barcode folder:  barcode10  still in progress :  4.545454545454546 %


Processing barcode folder:  barcode11  still in progress :  6.8181818181818175 %


Processing barcode folder:  barcode18  still in progress :  9.090909090909092 %


Processing barcode folder:  barcode19  still in progress :  11.363636363636363 %


Processing barcode folder:  barcode20  still in progress :  13.636363636363635 %


Processing barcode folder:  barcode33  still in progress :  15.909090909090908 %


Processing barcode folder:  barcode45  still in progress :  18.181818181818183 %


Processing barcode folder:  barcode46  still in progress :  20.454545454545457 %


Processing barcode folder:  barcode52  still in progress :  22.727272727272727 %


Processing barcode folder:  barcode62  still in progress :  25.0 %


Processing barcode folder:  barcode6