# Project: <project_name>
## sample subset: <primer_set>
## analysis date: <analysis_date>
## user: <name>

### Description
<High-level description of samples, primers, and references used>

### General steps
- prep fastq files
- fastqc on raw fastq files
- clean fastq files using seqyclean
- fastqc on filtered fastq files
- align using bwa
- trim primers using ivar trim
- generate consensus sequence with ivar consensus
- look at bam files

In [None]:
# import all python modules
# general
import os
import glob
import shutil
import re

#dates 
from datetime import date
from datetime import time
from datetime import datetime
from datetime import timedelta

# data
import pandas as pd
import numpy as np

# biopython
import Bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Align.Applications import MafftCommandline
from Bio import AlignIO

import subprocess

# Docker invocations for every external tool used in this notebook.
# DOCKER_BASE mounts the current working directory at /data inside the
# container and runs as the calling user (so output files are not owned by root).
DOCKER_BASE = 'docker run --rm -v ${PWD}:/data -u $(id -u):$(id -g)'

# Image tag restored to 0.12.1: a filesystem path had been pasted into the
# middle of the tag, which made the image reference invalid.
FASTQC_DOCKER = f'{DOCKER_BASE} staphb/fastqc:0.12.1'
MULTIQC_DOCKER = f'{DOCKER_BASE} staphb/multiqc:1.8'
SEQYCLEAN_DOCKER = f'{DOCKER_BASE} staphb/seqyclean:1.10.09'
BWA_DOCKER = f'{DOCKER_BASE} staphb/bwa:0.7.17'
SAMTOOLS_DOCKER = f'{DOCKER_BASE} staphb/samtools:1.10'
IVAR_DOCKER = f'{DOCKER_BASE} staphb/ivar:1.4.2'

# General Set Up

Edit the input variables below as needed.

In [None]:
# ---- Project identity ----
# Name used to label the output tables, e.g. test_h5_0005_nextseq
project_name = 'test_h5_0005_nextseq'

# Absolute path of the working directory,
# e.g. /home/sam_baird/flu/h5/test_h5_0005_nextseq/avrl_h5n1_250bp/
wd = '/home/sam_baird/flu/h5/test_h5_0005_nextseq_template_test/avrl_h5n1_250bp/'

# ---- Input files (filenames only, no paths) ----

# Sample workbook. When a project mixes several primer sets, point this at the
# sub-workbook for a single primer set,
# e.g. test_h5_0005_nextseq_workbook_avrl_h5n1_250bp.tsv
workbook = 'test_h5_0005_nextseq_workbook_avrl_h5n1_250bp.tsv'

# Primer scheme (BED) and the contaminants FASTA handed to seqyclean
primer_bed = 'AVRL_H5N1_250bpAmpWGS_v1.bed'
seqyclean_contaminants_fasta = 'Adapters_plus_PhiX_174.fasta'

# Reference sequences: {short name: filename}.
# The short name is used to name subdirectories and output files; each
# reference may be a single gene segment or a multi-FASTA.
# e.g. 'h5n1_bovine': 'A_Bovine_texas_029328-01_UtoT.fasta'
reference_sequences = {
    'h5n1_bovine': 'A_Bovine_texas_029328-01_UtoT.fasta',
}

# Set up directories and paths

In [None]:
# Every relative path used from here on is resolved against the working directory.
os.chdir(wd)

# Input directories the user fills in by hand; create them when absent.
refs_dir = 'references'
fastq_dir = 'fastq_files'
for required_dir in (refs_dir, fastq_dir):
    if not os.path.exists(required_dir):
        os.mkdir(required_dir)

# Map each reference short name to the path of its FASTA inside refs_dir.
reference_sequence_paths = {
    short_name: os.path.join(refs_dir, filename)
    for short_name, filename in reference_sequences.items()
}

**Manual steps needed**: In the working directory, add the:
- workbook
- FASTQ files to the `fastq_files` directory
- reference genomes to the `references` directory
- seqyclean contaminants FASTA to the `references` directory
- primer BED file to the `references` directory

# Read workbook

In [None]:
# Load the tab-separated sample workbook from the working directory.
path_wb = os.path.join(wd, workbook)
wb = pd.read_csv(path_wb, delimiter='\t')

# The sample_name column supplies the sample list used by every later cell.
sample_names = wb.sample_name.tolist()

# Display the workbook for a visual check.
wb

# FastQC on raw FASTQ files
Assess the quality of the raw FASTQ files

In [None]:
fastqc_raw_dir = os.path.join(wd, 'fastqc_raw')
if not os.path.exists(fastqc_raw_dir):
    os.mkdir(fastqc_raw_dir)

for sample in sample_names:
    print(sample)
    
    fastq_1 = f'{sample}_R1.fastq.gz'
    fastq_2 = f'{sample}_R2.fastq.gz'

    outdir = os.path.join(fastqc_raw_dir, sample)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
        
    !{FASTQC_DOCKER} fastqc --outdir fastqc_raw/{sample} fastq_files/{fastq_1}
    !{FASTQC_DOCKER} fastqc --outdir fastqc_raw/{sample} fastq_files/{fastq_2}
    print()

# MultiQC on raw FASTQ files
Combine FastQC results into a single report

In [None]:
multiqc_raw_dir = 'multiqc_raw'
if not os.path.exists(multiqc_raw_dir):
    os.mkdir(multiqc_raw_dir)

!{MULTIQC_DOCKER} multiqc fastqc_raw --outdir {multiqc_raw_dir}

# TSV report on raw FASTQ files

In [None]:
# (row label in fastqc_data.txt, suffix of the summary column) in output order
_FASTQC_METRICS = (
    ('Total Sequences', 'total_reads'),
    ('Sequences flagged as poor quality', 'flagged_reads_as_poor_quality'),
    ('Sequence length', 'read_len'),
)


def _first_match(pattern):
    """Return the first path (in sorted order) matching the glob *pattern*.

    Raises:
        FileNotFoundError: if nothing matches, so a missing FastQC output is
            reported by name rather than as a bare IndexError.
    """
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'no file matches {pattern}')
    return matches[0]


def _read_fastqc_metrics(fastqc_data_path):
    """Read the metrics listed in _FASTQC_METRICS from a fastqc_data.txt file.

    Returns a dict {column suffix: value}; values are kept as the strings
    found in the second tab-separated field, and '' when a row is absent.
    """
    labels = dict(_FASTQC_METRICS)
    metrics = {suffix: '' for _, suffix in _FASTQC_METRICS}
    remaining = set(labels)
    with open(fastqc_data_path) as handle:
        for line in handle:
            fields = line.rstrip('\n').split('\t')
            if len(fields) > 1 and fields[0] in remaining:
                metrics[labels[fields[0]]] = fields[1]
                remaining.discard(fields[0])
                if not remaining:
                    break
    return metrics


def summarize_reads(sample_names, fastqc_dir):
    """Extract per-sample read metrics from FastQC output and save them as TSV.

    For every sample, the R1 and R2 FastQC zip archives found in
    ``<fastqc_dir>/<sample>/`` are unpacked in place, the total read count,
    the number of reads flagged as poor quality and the read length are read
    from each ``fastqc_data.txt``, and a one-row table is written to
    ``<fastqc_dir>/<sample>/<sample>_summary_metrics.tsv``.

    Args:
        sample_names: iterable of sample names.
        fastqc_dir: directory holding one FastQC output subdirectory per sample.

    Returns:
        A DataFrame with one row per sample (the same rows that are written to
        the per-sample TSV files); empty when ``sample_names`` is empty.

    Raises:
        FileNotFoundError: if an expected FastQC archive or data file is missing.
    """
    import zipfile  # local import: only this function needs it

    columns = ['sample_name'] + [
        f'r{read}_{suffix}' for read in ('1', '2') for _, suffix in _FASTQC_METRICS
    ]
    summaries = []

    for sample in sample_names:
        print(sample)
        sample_dir = os.path.join(fastqc_dir, sample)
        row = {'sample_name': sample}

        for read in ('1', '2'):
            # unpack the report next to its archive, overwriting earlier extractions
            zip_path = _first_match(os.path.join(sample_dir, f'{sample}*{read}_fastqc.zip'))
            with zipfile.ZipFile(zip_path) as archive:
                archive.extractall(sample_dir)

            # The read number is part of the pattern for BOTH reads; a plain
            # '*_fastqc' would also match the other read's directory.
            data_path = _first_match(
                os.path.join(sample_dir, f'{sample}*{read}_fastqc', 'fastqc_data.txt')
            )
            for suffix, value in _read_fastqc_metrics(data_path).items():
                row[f'r{read}_{suffix}'] = value

        df = pd.DataFrame([row], columns=columns)
        outfile = os.path.join(sample_dir, f'{sample}_summary_metrics.tsv')
        df.to_csv(outfile, sep='\t', index=False)
        summaries.append(df)

    if not summaries:
        return pd.DataFrame(columns=columns)
    return pd.concat(summaries, ignore_index=True)

# Write the per-sample summary TSVs for the raw-read FastQC reports.
summarize_reads(sample_names=sample_names, fastqc_dir=fastqc_raw_dir)

# Seqyclean on raw FASTQ files
Filter by quality and remove known contaminants

In [None]:
seqyclean_dir = 'seqyclean'
if not os.path.exists(seqyclean_dir):
    os.mkdir(seqyclean_dir)

seqyclean_commands = []
for sample in sample_names:    
    fastq_1 = f'fastq_files/{sample}_R1.fastq.gz'
    fastq_2 = f'fastq_files/{sample}_R2.fastq.gz'         
    out_name = f'seqyclean/{sample}_clean'

    command = f'{SEQYCLEAN_DOCKER} seqyclean -minlen 25 -qual 30 30 -gz -1 {fastq_1} -2 {fastq_2} -c references/Adapters_plus_PhiX_174.fasta -o {out_name}'
    seqyclean_commands.append(command)

seqyclean_commands = '\n' + '\n'.join(seqyclean_commands) + '\n'
! echo {seqyclean_commands} | parallel 

# FastQC on cleaned FASTQ files

In [None]:
fastqc_clean_dir = os.path.join(wd, 'fastqc_clean')
if not os.path.exists(fastqc_clean_dir):
    os.mkdir(fastqc_clean_dir)

for sample in sample_names:
    print(sample)
    
    fastq_1 = f'{sample}_clean_PE1.fastq.gz'
    fastq_2 = f'{sample}_clean_PE2.fastq.gz'

    outdir = os.path.join(fastqc_clean_dir, sample)
    if not os.path.exists(outdir):
        os.mkdir(outdir)
                
    !{FASTQC_DOCKER} fastqc --outdir fastqc_clean/{sample} seqyclean/{fastq_1}
    !{FASTQC_DOCKER} fastqc --outdir fastqc_clean/{sample} seqyclean/{fastq_2}
    print()

# MultiQC on cleaned FASTQ files

In [None]:
multiqc_clean_dir = 'multiqc_clean'
if not os.path.exists(multiqc_clean_dir):
    os.mkdir(multiqc_clean_dir)

!{MULTIQC_DOCKER} multiqc fastqc_clean --outdir {multiqc_clean_dir}

# TSV report on cleaned FASTQ files

In [None]:
# Write the per-sample summary TSVs for the cleaned-read FastQC reports and
# display whatever summarize_reads hands back.
clean_reads_summary = summarize_reads(sample_names=sample_names, fastqc_dir=fastqc_clean_dir)
clean_reads_summary

# Get summary metrics on reads

In [None]:
# Combine the per-sample FastQC summaries of the raw and the filtered reads
# into one QC table and save it as <project_name>_reads_QC_summary.csv.
df_raw_list = []
df_filtered_list = []

for sample in sample_names:
    summary_name = f'{sample}_summary_metrics.tsv'

    # summary of the raw reads
    path = os.path.join(wd, 'fastqc_raw', sample, summary_name)
    df_raw_list.append(pd.read_csv(path, sep='\t'))

    # summary of the filtered reads
    path = os.path.join(wd, 'fastqc_clean', sample, summary_name)
    df_filtered_list.append(pd.read_csv(path, sep='\t'))

# one row per sample, indexed by sample name so the two tables can be joined
raw_df = pd.concat(df_raw_list).reset_index(drop=True).set_index('sample_name')
filtered_df = pd.concat(df_filtered_list).reset_index(drop=True).set_index('sample_name')

# identical column names are told apart by the _raw / _filtered suffixes
combined_df = raw_df.join(filtered_df, how='left', lsuffix='_raw', rsuffix='_filtered')
combined_df = combined_df.reset_index()

# Paired totals and their difference are taken from the R1 counts.
combined_df['total_reads_diff'] = combined_df['r1_total_reads_raw'] - combined_df['r1_total_reads_filtered']
combined_df['raw_total_reads_paired'] = combined_df['r1_total_reads_raw']
combined_df['filtered_total_reads_paired'] = combined_df['r1_total_reads_filtered']

col_order = ['sample_name', 'raw_total_reads_paired', 'filtered_total_reads_paired',
             'total_reads_diff', 'r1_total_reads_raw', 'r1_flagged_reads_as_poor_quality_raw',
             'r1_read_len_raw', 'r2_total_reads_raw', 'r2_flagged_reads_as_poor_quality_raw',
             'r2_read_len_raw', 'r1_total_reads_filtered', 'r1_flagged_reads_as_poor_quality_filtered',
             'r1_read_len_filtered', 'r2_total_reads_filtered',
             'r2_flagged_reads_as_poor_quality_filtered', 'r2_read_len_filtered']

# .copy() so that adding project_name below writes to an independent frame
# (avoids a possible SettingWithCopyWarning on the column selection).
combined_df = combined_df[col_order].copy()
combined_df['project_name'] = project_name

outfile = os.path.join(wd, f'{project_name}_reads_QC_summary.csv')
combined_df.to_csv(outfile, index=False)

combined_df

# Align to each reference using BWA

In [None]:
alignment_dir = 'bwa_alignment'
if not os.path.exists(alignment_dir):
    os.mkdir(alignment_dir)

for sample in sample_names:
    # align each sample to each reference
    for ref in reference_sequence_paths:

        fastq_1 = os.path.join(seqyclean_dir, f'{sample}_clean_PE1.fastq.gz')
        fastq_2 = os.path.join(seqyclean_dir, f'{sample}_clean_PE2.fastq.gz')

        ref_base_name = ref.split('/')[-1]

        # each reference gets its own subdirectory
        out_subdir = os.path.join(alignment_dir, ref_base_name)
        if not os.path.exists(out_subdir):
            os.mkdir(out_subdir)

        outsam = os.path.join(out_subdir, f'{sample}.sam')
        outbam = os.path.join(out_subdir, f'{sample}.aln.sorted.bam')

        !{BWA_DOCKER} bwa index -p {ref_base_name} -a is {reference_sequence_paths[ref]}
        !{BWA_DOCKER} bwa mem -t 6 {ref_base_name} {fastq_1} {fastq_2} -f {outsam}
        !{SAMTOOLS_DOCKER} samtools view -bS {outsam} | samtools sort -o {outbam}
        !rm {outsam}  # save storage space

# Trim primers from alignment using ivar trim

In [None]:
raw_alignment_dir = 'bwa_alignment'
trimmed_alignment_dir = 'bwa_alignment_trimmed'

for sample in sample_names:
    for ref in reference_sequence_paths:
        primer_bed_path = os.path.join(refs_dir, primer_bed)

        out_subdir = os.path.join(trimmed_alignment_dir, ref)
        os.makedirs(out_subdir, exist_ok=True)

        in_subdir = os.path.join(raw_alignment_dir, ref)
        aln_bam = os.path.join(in_subdir, f'{sample}.aln.sorted.bam')

        trimmed_bam = os.path.join(out_subdir, f'{sample}_trimmed.bam')
        trimmed_sorted_bam = os.path.join(out_subdir, f'{sample}_trimmed.sorted.bam')

        !{IVAR_DOCKER} ivar trim -e -i {aln_bam} -b {primer_bed_path} -p {trimmed_bam}
        !{SAMTOOLS_DOCKER} samtools sort {trimmed_bam} -o {trimmed_sorted_bam}
        !{SAMTOOLS_DOCKER} samtools index {trimmed_sorted_bam}


# TODO Determine consensus sequence using ivar consensus

In [None]:
consensus_dir = 'consensus_sequences'
if not os.path.exists(consensus_dir):
    os.mkdir(consensus_dir)

for sample in sample_names:
    for ref in reference_sequence_paths:
        out_subdir_consensus = os.path.join(consensus_dir, ref)
        if not os.path.exists(out_subdir_consensus):
            os.mkdir(out_subdir_consensus)

        trimmed_sorted_bam = os.path.join(trimmed_alignment_dir, ref, f'{sample}_trimmed.sorted.bam')
        pileup_txt = os.path.join(trimmed_alignment_dir, ref, f'{sample}_pileup.txt')

        generate pileup
        !{SAMTOOLS_DOCKER} samtools faidx {reference_sequence_paths[ref]}
        !{SAMTOOLS_DOCKER} samtools mpileup -A -aa -d 600000 -B -Q 20 -q 20 -f {reference_sequence_paths[ref]} {trimmed_sorted_bam} -o {pileup_txt}

        # if reference is a multifasta, write separate consensus for each gene segment
        records = SeqIO.parse(reference_sequence_paths[ref], format='fasta')
        num_records = len(list(records))
        if num_records > 1:
            for segment in SeqIO.parse(reference_sequence_paths[ref], format='fasta'):
                print('hi')
                out_subdir_consensus = os.path.join(consensus_dir, ref, segment.id)
                consensus_prefix = os.path.join(out_subdir_consensus, f'{sample}_{ref}_{segment.id}_consensus')
                segment_id = segment.id
                command = f"cat {pileup_txt} | grep {segment_id} | ivar consensus -p {consensus_prefix} -q 20 -t 0.6 -m 10"
                print(command)
                !{IVAR_DOCKER} sh -c "{command}"
                
        else:
            out_subdir_consensus = os.path.join(consensus_dir, ref)
            consensus_prefix = os.path.join(out_subdir_consensus, f'{sample}_{ref}_consensus')
            !{IVAR_DOCKER} sh -c "cat {pileup_txt} | ivar consensus -p {consensus_prefix} -q 20 -t 0.6 -m 10"

# TODO Calculate alignment metrics

In [None]:
aln_metrics_dir = 'alignment_metrics'
if not os.path.exists(aln_metrics_dir):
    os.mkdir(aln_metrics_dir)
    
for sample in sample_names:
    # grab bam file
    aln_dir = os.path.join('bwa_alignment_trimmed', ref)
    bam_file = os.path.join(aln_dir, f'{sample}_trimmed.sorted.bam')

    # set outfile
    out_subdir_aln_metrics = os.path.join(aln_metrics_dir, ref)
    if not os.path.exists(out_subdir_aln_metrics):
        os.mkdir(out_subdir_aln_metrics)

    out_bam_coverage = os.path.join(out_subdir_aln_metrics, f'{sample}_coverage.txt')
    out_bam_stats = os.path.join(out_subdir_aln_metrics, f'{sample}_stats.txt')

    # run samtools
    ! {SAMTOOLS_DOCKER} samtools coverage -o {out_bam_coverage} {bam_file}
    ! {SAMTOOLS_DOCKER} sh -c "samtools stats {bam_file} > {out_bam_stats}"

# TODO Calculate percent coverage

In [None]:
# Reference length with and without primers.
# reference_lengths maps each reference short name to
#   (total_length, primer_covered_length)                for a single-record FASTA
#   {segment id: (total_length, primer_covered_length)}  for a multi-FASTA
# Span of primers is assumed to be the (highest end coordinate) - (lowest start coordinate)

# The primer BED is the same for every reference, so read it once.
bed_file_path = os.path.join(refs_dir, primer_bed)
bed_df = pd.read_csv(bed_file_path, sep='\t', header=None)

reference_lengths = {}
for ref, ref_path in reference_sequence_paths.items():
    # Materialise the records: SeqIO.parse returns a one-shot iterator that
    # can be neither counted and re-read nor indexed.
    records = list(SeqIO.parse(ref_path, 'fasta'))
    if not records:
        raise ValueError(f'no sequences found in {ref_path}')

    if len(records) > 1:
        segment_lengths = {}
        for segment in records:
            total_length = len(segment.seq)

            # primers belonging to this segment (BED column 0 holds the sequence name)
            segment_bed_df = bed_df[bed_df[0] == segment.id]
            primer_covered_length = segment_bed_df[2].max() - segment_bed_df[1].min()

            segment_lengths[segment.id] = (total_length, primer_covered_length)
        reference_lengths[ref] = segment_lengths
    else:
        total_length = len(records[0].seq)
        primer_covered_length = bed_df[2].max() - bed_df[1].min()
        reference_lengths[ref] = (total_length, primer_covered_length)

reference_lengths