# Hubert Pausch code

## FASTQ files to BAM

I am copying and documenting here the code written by Hubert Pausch that takes paired-end FASTQ files from the raw data to a BAM alignment. The complete pipeline:

In [None]:
#################################################
#       This script reads paired end fastq.gz files
#       - carries out QC using fastp
#       - derives the read group (RG) information from the read names of
#         the trimmed R1 fastq file
#       - performs the alignment to ${ref} using bwa mem
#       - followed by samblaster to mark duplicates
#       - and sambamba for coordinate sorting
#       - and eventually produces a bam file plus flagstat statistics
#
#       INPUT (environment variables, e.g. passed with `bsub -env`):
#       ref=/PATH/TO/REF/SEQ/seq.fa
#       input_folder=/PATH/TO/INPUT/FASTQ/      (used as prefix: keep the trailing "/")
#       sample_id=[sample_id]
#       output_folder=/PATH/TO/NEW/BAM/FILE/    (used as prefix: keep the trailing "/")
#       temp_scratch_directory=/PATH/TO/SCRATCH/
#       SM_tag=[written to the LB and SM fields of every @RG line]
#       seq_center=[written to the CN field of every @RG line]
#################################################

# Abort right away when a required input is missing. Without these checks the
# concatenated paths below collapse and the final "rm -rf" could remove the
# wrong directory.
: "${ref:?ref must be set}"
: "${input_folder:?input_folder must be set}"
: "${sample_id:?sample_id must be set}"
: "${output_folder:?output_folder must be set}"
: "${temp_scratch_directory:?temp_scratch_directory must be set}"

# A pipeline now fails when ANY stage fails (e.g. bwa), not only the last one.
set -o pipefail

# Print an error message to stderr and abort the job with a non-zero status.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

module load gcc/4.8.2 bwa/0.7.12 samtools/1.3 # tools needed by the steps below

#################################################

logfile="${output_folder}logfile_${sample_id}"
# start a fresh log file in the output folder
echo "modules loaded at $(date)" > "${logfile}" \
  || die "cannot write log file ${logfile}"

scratch_folder=${temp_scratch_directory}

# Work inside <scratch>/<sample_id>/. tmp_scratch is necessary because sambamba
# produces large temporary files to facilitate fast coordinate sorting.
cd "${scratch_folder}" || die "cannot enter scratch folder ${scratch_folder}"
mkdir -p "${sample_id}/tmp_scratch" || die "cannot create ${sample_id}/tmp_scratch"
cd "${sample_id}" || die "cannot enter ${scratch_folder}/${sample_id}"
sample_dir=${PWD}
tmp_dir="${sample_dir}/tmp_scratch"

# Clear the scratch on every exit path (success or failure).
cleanup() {
  cd "${HOME}" || cd /
  rm -rf -- "${sample_dir:?}"
}
trap cleanup EXIT


# first run fastp on the data and write the trimmed fastqs to the scratch folder
# flags (see https://github.com/OpenGene/fastp):
#   -q 15: a base counts as qualified at phred quality >= 15
#   -u 40: up to 40% unqualified bases per read are allowed
#   -g:    force polyG tail trimming (required for NovaSeq)
# stdout is discarded; the HTML report is written to ./fastp.html
/cluster/work/pausch/group_bin/fastp \
  -i "${input_folder}${sample_id}_R1.fastq.gz" -o "${sample_id}_R1.fastq.gz" \
  -I "${input_folder}${sample_id}_R2.fastq.gz" -O "${sample_id}_R2.fastq.gz" \
  -q 15 -u 40 -g >/dev/null \
  || die "fastp failed for ${sample_id}"

# copy the QC report to the output directory
cp -- fastp.html "${output_folder}${sample_id}_fastp.html" \
  || die "cannot copy the fastp report"
printf "\nfinished read trimming" >> "${logfile}"


# Collect the read groups from the trimmed R1 fastq. Read names are split on
# ":"; fields 3 and 4 are taken as flowcell id and lane. Both awk calls keep
# odd-numbered lines only, so together they select the first line of every
# 4-line fastq record. There may be more than one read group per file.
read_groups=$(zcat "${sample_id}_R1.fastq.gz" \
  | awk -F":" 'NR%2==1 {print $3,$4}' \
  | awk 'NR%2==1 {print $1":"$2}' \
  | sort -u) \
  || die "cannot extract read groups from ${sample_id}_R1.fastq.gz"
[[ -n "${read_groups}" ]] || die "no read group found in ${sample_id}_R1.fastq.gz"

# Build the @RG header. Tabs and line breaks are stored as the two-character
# escapes \t and \n: bwa expands them itself, and newer bwa releases reject
# literal tabs in -R ("the read group line contained literal <tab> characters").
rg_sep='\n'
header=''
while IFS= read -r rg; do
  flowcell=${rg%%:*}
  lane=${rg#*:}
  header+="@RG\\tID:${flowcell}.${lane}\\tPL:illumina\\tPU:${rg}\\tLB:${SM_tag}\\tSM:${SM_tag}\\tCN:${seq_center}${rg_sep}"
done <<< "${read_groups}"

# keep a readable copy of the header(s) next to the results (%b expands \t and \n)
printf '\nHeader(s):\n%b' "${header}" >> "${output_folder}${sample_id}.header"

# bwa mem aligns the fastq pair against the reference
#   -M marks shorter split hits as secondary, -t 12 uses 12 threads,
#   -R passes the read group header line(s), without the final \n
# samblaster marks duplicate reads and samtools view writes an unsorted BAM file
bwa mem -M -t 12 -R "${header%"${rg_sep}"}" "${ref}" \
  "${sample_id}_R1.fastq.gz" "${sample_id}_R2.fastq.gz" \
  | /cluster/home/pauschh/programs/samblaster/samblaster -M \
  | samtools view -Sb - > samp.out.bam \
  || die "bwa mem | samblaster | samtools view failed for ${sample_id}"

printf "\nfinished bwa mem | samblaster | samtools view" >> "${logfile}"


# sambamba performs an extremely fast coordinate sort
/cluster/home/pauschh/bin/sambamba_v0.6.6 sort -m 6G --tmpdir "${tmp_dir}" \
  --out "${sample_id}.bam" --nthreads 10 samp.out.bam \
  || die "sambamba sort failed for ${sample_id}"


# get the alignment statistics from the BAM file
/cluster/home/pauschh/bin/sambamba_v0.6.6 flagstat --nthreads 10 "${sample_id}.bam" \
  > "${sample_id}.stats" \
  || die "sambamba flagstat failed for ${sample_id}"

# Copy the sorted BAM (and whatever else matches <sample_id>.b*, e.g. the index
# that later steps expect as <sample_id>.bam.bai) and the stats to the output folder.
# BAM files of the same sample are merged in a later step.
cp -- "${sample_id}".b* "${output_folder}" || die "cannot copy the BAM file(s)"
cp -- "${sample_id}.stats" "${output_folder}" || die "cannot copy the stats file"

# this line is only reached when every step above succeeded
printf '\njob finished at %s' "$(date)" >> "${logfile}"

# the scratch folder is removed by the EXIT trap (cleanup)

######################

## BAM files to FASTQ

The following lines of code describe how BAM files are converted back into fastq files using Samtools. The resulting fastq files are then aligned again, similar to the steps above. All commands have been annotated for the sake of understanding.

In [None]:
# Convert the BAM file of every "Ubern" animal back to paired fastq files,
# one LSF job per animal.
fastq_dir=/cluster/work/pausch/temp_scratch/pausch/fastqs2/
collate_scratch=${fastq_dir}scratch
bam_dir=/cluster/work/pausch/inputs/bam/BTA_UCD12/
samtools_bin=/cluster/work/pausch/group_bin/samtools-1.8/bin/samtools

# output folder for the fastq files and a scratch folder below it
mkdir "${fastq_dir}" "${collate_scratch}"

########################### LOOP
# Animal ids: first column of every line that contains "Ubern"; the header
# line (line 1) is skipped.
animals=$(awk 'NR > 1 && /Ubern/ {print $1}' /cluster/work/pausch/inputs/ref/BTA/individual_information.csv)

for animal in ${animals}; do
  # per-animal folder that holds the temporary files of samtools collate
  mkdir "${collate_scratch}/${animal}/" "${collate_scratch}/${animal}/tmp_scratch"

  # bsub: 2 cores (-n), 12 h run time (-W), 6000 MB memory and 4000 MB scratch
  #       (-R rusage), job name (-J), submission environment passed on (-env "all")
  # samtools collate groups reads by name; -u uncompressed output, -O write to
  #       stdout, -n 64 temporary files, created with the prefix .../tmp_scratch/tmp
  # samtools fastq reads that stream from stdin (trailing "-") and writes the two
  #       mates to the -1 / -2 files; -N appends /1 and /2 to the read names
  bsub -n 2 -W 12:00 -R "rusage[mem=6000,scratch=4000]" -J "extract_fastqs" -env "all" \
    "${samtools_bin} collate -uOn 64 ${bam_dir}${animal}.bam ${collate_scratch}/${animal}/tmp_scratch/tmp | ${samtools_bin} fastq -N -1 ${fastq_dir}${animal}_R1.fastq.gz -2 ${fastq_dir}${animal}_R2.fastq.gz -"
done
########################### END LOOP

## Splitting fastq files with multiple readgroups

gdc-fastq-splitter (https://github.com/kmhernan/gdc-fastq-splitter) is used as software for splitting fastq files with multiple readgroups

In [None]:
########################### LOOP
# Submit one gdc-fastq-splitter job per fastq pair found in the fastqs2 folder.

# Sample ids: base name of every *_R1.fastq.gz file with "_R1.fastq.gz" removed
sample_ids=$(
  ls /cluster/work/pausch/temp_scratch/pausch/fastqs2/*_R1.fastq.gz \
    | xargs -n 1 basename \
    | sed 's/_R1.fastq.gz//'
)

for sample_id in ${sample_ids}; do
  # where the split fastq files are written and where the unsplit ones are read from
  output_folder=/cluster/work/pausch/new_bams/droegemueller_2/
  raw_data_folder=/cluster/work/pausch/temp_scratch/pausch/fastqs2/

  # PYTHONPATH is unset inside the job before the splitter from its own venv is
  # started; -o is the output prefix, followed by the R1 and R2 fastq files
  splitter_cmd="unset PYTHONPATH;/cluster/home/pauschh/programs/gdc-fastq-splitter/venv/bin/gdc-fastq-splitter -o ${output_folder}${sample_id}_ ${raw_data_folder}${sample_id}_R1.fastq.gz ${raw_data_folder}${sample_id}_R2.fastq.gz"

  # bsub: 1 core (-n), 16 h run time (-W), 3000 MB memory and 1000 MB scratch
  #       (-R rusage), job name (-J), submission environment passed on (-env "all")
  bsub -n 1 -W 16:00 -R "rusage[mem=3000,scratch=1000]" -J "fastq_split" -env "all" "${splitter_cmd}"
done
########################### END LOOP

## Quality control, alignment of fastq files, generation of BAM files and sorting

This loop is mainly preparing the system for running align_fastq.sh: custom made pipeline including fastp for quality control, bwa for read alignment and sambamba for sorting the resulting BAM file:

In [None]:
########################### LOOP
#++++++++++++++++++++++++
# Submit one align_fastq.sh job per split fastq pair.
# in this case SM must be passed to align_fastq.sh - GATK will complain otherwise

# Print the sample (SM) part of a split-fastq base name.
# Arguments: $1 - base name of the form <SM>_<x>_<y>; the last two
#                 "_"-separated fields are dropped (presumably flowcell and
#                 lane added by the splitter - confirm). <SM> itself may
#                 contain underscores.
# Outputs:   <SM> on stdout; names with fewer than two "_" are printed unchanged
sm_from_sample_name() {
  printf '%s\n' "${1%_*_*}"
}

split_fastq_dir=/cluster/work/pausch/new_bams/droegemueller_2/

# Let the shell glob the R1 files instead of parsing the output of ls
for r1_fastq in "${split_fastq_dir}"*_R1.fastq.gz; do
  # an unmatched glob stays literal: skip it so that no bogus job is submitted
  [[ -e "${r1_fastq}" ]] || continue

  # sample_name: file name without folder and without the _R1.fastq.gz suffix
  sample_name=${r1_fastq##*/}
  sample_name=${sample_name%_R1.fastq.gz}
  SM=$(sm_from_sample_name "${sample_name}")

  # bsub: 16 cores (-n), 8 h run time (-W), job report at the end (-N), 3500 MB
  #       memory and 1000 MB scratch (-R rusage), job name (-J)
  # -env passes the submission environment ("all") plus the variables that
  #       align_fastq.sh reads: ref, input_folder, output_folder, sample_id,
  #       temp_scratch_directory, SM_tag and seq_center
  # the job script itself is read from stdin
  bsub -n 16 -W 08:00 -N -R "rusage[mem=3500,scratch=1000]" -J "bwa mem" -env "all, ref=/cluster/work/pausch/inputs/ref/BTA/UCD1.2/ARS-UCD1.2_Btau5.0.1Y.fa,input_folder=${split_fastq_dir},output_folder=${split_fastq_dir}aligned/,sample_id=${sample_name},temp_scratch_directory=/cluster/work/pausch/temp_scratch/pausch/,SM_tag=${SM},seq_center=UBern" < /cluster/home/pauschh/scripts/align_fastq.sh
done
########################### END LOOP

## BAM files are merged, BAI files and stats are generated

This conditional loop helps running sambamba and Picard tools for BAM merging, BAI generation and creation of stats that help understanding the quality. Only one BAM file per sample will remain.

In [None]:
########################### LOOP
# Produce one BAM per sample in ${output_folder}merge/:
#   - samples with several aligned BAMs: merge, mark duplicates, flagstat (LSF job)
#   - samples with exactly one aligned BAM: copy BAM, index and stats
output_folder=/cluster/work/pausch/new_bams/droegemueller_2/aligned/
input_folder=/cluster/work/pausch/new_bams/droegemueller_2/
merge_folder="${output_folder}merge/"
sambamba=/cluster/home/pauschh/bin/sambamba_v0.6.6

# List the aligned BAM files that belong to exactly one sample.
# Arguments: $1 - folder holding the BAMs (with trailing "/")
#            $2 - sample name (SM)
# Outputs:   one path per line for every <SM>_<x>_<y>.bam whose name, with the
#            last two "_"-separated fields dropped, equals $2. A sample whose
#            name is only a prefix of another one (BSW1 vs BSW10, A vs A_B) is
#            therefore not mixed up with it. Prints nothing if there is no match.
sample_bams() {
  local dir=$1 sm=$2 bam base
  for bam in "${dir}${sm}"_*_*.bam; do
    [[ -e "${bam}" ]] || continue
    base=${bam##*/}
    base=${base%.bam}
    if [[ "${base%_*_*}" == "${sm}" ]]; then
      printf '%s\n' "${bam}"
    fi
  done
}

# Unique sample names, derived from the split fastq files <SM>_<x>_<y>_R1.fastq.gz
sample_names=$(
  for r1_fastq in "${input_folder}"*_R1.fastq.gz; do
    [[ -e "${r1_fastq}" ]] || continue
    sample_name=${r1_fastq##*/}
    sample_name=${sample_name%_R1.fastq.gz}
    printf '%s\n' "${sample_name%_*_*}"
  done | sort -u
)

for SM in ${sample_names}; do
  bams=()
  while IFS= read -r bam; do
    bams+=("${bam}")
  done < <(sample_bams "${output_folder}" "${SM}")

  if (( ${#bams[@]} > 1 )); then
    # Several BAMs for this sample: submit a merge job.
    # bsub: 4 cores (-n), 9 h run time (-W), job report at the end (-N), 5500 MB
    #       memory and 1000 MB scratch (-R rusage), job name (-J)
    # The job runs, each step only if the previous one succeeded (&&):
    #   1. sambamba merge (4 threads) of the listed BAMs into merge/<SM>.bam
    #   2. module load of gcc and java
    #   3. Picard MarkDuplicates -> merge/<SM>_dedup.bam (+ index, + metrics file)
    #   4. sambamba flagstat -> merge/<SM>_dedup.stats
    printf -v bam_args ' %q' "${bams[@]}"
    merged_bam="${merge_folder}${SM}.bam"
    dedup_prefix="${merge_folder}${SM}_dedup"
    bsub -n 4 -W 09:00 -N -R "rusage[mem=5500,scratch=1000]" -J "bam merge" "${sambamba} merge -t 4 ${merged_bam}${bam_args} && module load gcc/4.8.2 java/1.8.0_73 && java -Xmx8g -jar /cluster/home/pauschh/programs/picard.jar MarkDuplicates I=${merged_bam} O=${dedup_prefix}.bam METRICS_FILE=${merge_folder}${SM}_metrics.txt ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 CREATE_INDEX=true TMP_DIR=/cluster/work/pausch/temp_scratch/pausch/test/ && ${sambamba} flagstat -t 4 ${dedup_prefix}.bam > ${dedup_prefix}.stats"
  elif (( ${#bams[@]} == 1 )); then
    # Exactly one BAM: nothing to merge, copy BAM, index and stats under the sample name
    single_bam=${bams[0]}
    cp -- "${single_bam}" "${merge_folder}${SM}.bam"
    cp -- "${single_bam}.bai" "${merge_folder}${SM}.bam.bai"
    cp -- "${single_bam%.bam}.stats" "${merge_folder}${SM}.stats"
  else
    # No BAM at all (alignment missing or failed): report instead of copying nothing
    printf 'WARNING: no BAM file found for sample %s in %s\n' "${SM}" "${output_folder}" >&2
  fi
done
########################### END LOOP