# E. faecium nanopore workflow
This notebook documents the whole project, including a guide for the steps that run on the HPC cluster.

In [None]:
#-------------[ IMPORTS ]-------------
import yaml
import os

In [None]:
#-------------[ CONFIG ]-------------
# config.yaml is looked up one directory above the current working directory
config_path = os.path.join('..', 'config.yaml')

# Parse the YAML settings into the `config` mapping
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)
print("Configuration loaded successfully.")


def _one_level_up(config_key):
    """Return the value stored under `config_key`, joined onto the parent directory ('..')."""
    return os.path.join('..', config[config_key])


# Raw FASTQ input
raw_fastq_path = _one_level_up('raw_fastq_path')
print(f"the raw fastq path is: {raw_fastq_path}")

# Fastplong: output directory, the HTML/JSON report files inside it,
# and the path for the processed (filtered) file
fastplong_output_path = _one_level_up('fastplong_output_path')
fastplong_html_path = os.path.join(fastplong_output_path, 'fastplong_report.html')
fastplong_json_path = os.path.join(fastplong_output_path, 'fastplong_report.json')
print(f"The Fastplong output path is: {fastplong_output_path}")
fastplong_filtered_path = _one_level_up('fastplong_filtered_path')
print(f"The Fastplong filtered path is: {fastplong_filtered_path}")

# MultiQC: input and output locations
multiqc_input_path = _one_level_up('multiqc_input_path')
print(f"The MultiQC input path is: {multiqc_input_path}")
multiqc_output_path = _one_level_up('multiqc_output_path')
print(f"The MultiQC output path is: {multiqc_output_path}")

# Bakta: database, input genome and output locations
bakta_db_path = _one_level_up('bakta_db_path')
print(f"The Bakta database path is: {bakta_db_path}")
bakta_input_path = _one_level_up('bakta_input_path')
print(f"The Bakta input genome path is: {bakta_input_path}")
bakta_output_path = _one_level_up('bakta_output_path')
print(f"The Bakta output path is: {bakta_output_path}")




## Initial processing and quality control

In [None]:
#-------------[ FASTPLONG ANALYSIS ]-------------
# make output directory if it doesn't exist
!mkdir -p {fastplong_output_path}

# Run Fastplong to filter and generate reports
!fastplong \
  -i {raw_fastq_path} \
  -o {fastplong_filtered_path} \
  -h {fastplong_html_path} \
  -j {fastplong_json_path}

In [None]:
#-------------[ INITIAL MULTIQC ]-------------

!multiqc {multiqc_input_path} \
    --title "Initial QC" \
    --filename "initial_QC" \
    --outdir {multiqc_output_path} \
    --dirs --dirs-depth 2 --force

Next, we move over to the cluster to run Kraken2, Flye, and QUAST.

## High Performance Cluster: Contamination Check and Assembly
Resource-intensive tasks, and those that need very large databases, are better run on an HPC cluster.


**TODO:** Detail the HPC work here.

## Annotation
We will perform a final round of polishing with Medaka, so that the gene sequences are as accurate as possible, before running Bakta to annotate the genome.

In [None]:
#-------------[ MULTIQC REPORT POST HPC  ]-------------
!multiqc {multiqc_input_path} \
    --title "Post_HPC_QC" \
    --filename "Post_HPC_QC" \
    --outdir {multiqc_output_path} \
    --dirs --dirs-depth 2 --force

In [None]:
#-------------[ BAKTA ANALYSIS  ]-------------
!bakta  --db {bakta_db_path} \
        --verbose \
        --output {bakta_output_path} \
        --threads 8 \
        {bakta_input_path}
