Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 9461 lines (8100 sloc) 386 KB
#!/usr/bin/env perl
use strict;
use warnings;
use IO::Handle;
use Cwd;
$|++;
use Getopt::Long;
use FindBin qw($RealBin);
use lib "$RealBin/../lib";
## This program is Copyright (C) 2010-19, Felix Krueger (felix.krueger@babraham.ac.uk)
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
my $parent_dir = getcwd();
my $bismark_version = 'v0.22.1_dev';
my $start_run = time();
my $command_line = join (" ",@ARGV);
### before processing the command line we will replace --solexa1.3-quals with --phred64-quals as the '.' in the option name will cause Getopt::Long to fail
foreach my $arg (@ARGV){
if ($arg eq '--solexa1.3-quals'){
$arg = '--phred64-quals';
}
}
my @filenames; # will be populated by processing the command line
my ($genome_folder,$CT_index_basename,$GA_index_basename,$path_to_bowtie,$path_to_hisat2,$sequence_file_format,$aligner_options,$directional,$unmapped,$ambiguous,$phred64,$output_dir,$bowtie2,$hisat2,$sam_no_hd,$skip,$upto,$temp_dir,$non_bs_mm,$insertion_open,$insertion_extend,$deletion_open,$deletion_extend,$gzip,$bam,$samtools_path,$pbat,$prefix,$old_flag,$basename,$score_min_intercept,$score_min_slope,$bt2_large_index,$multicore,$rg_tag,$rg_id,$rg_sample,$ambig_bam,$cram,$cram_ref,$nucleotide_coverage,$dovetail,$aligner_version,$slam,$icpc,$local) = process_command_line();
my @fhs; # stores alignment process names, bisulfite index location, bowtie filehandles and the number of times sequences produced an alignment
my %chromosomes; # stores the chromosome sequences of the mouse genome
my %SQ_order; # stores the order of sequences in the reference. This is to produce SAM/BAM files with a known order of chromosomes
my %counting; # counting various events
my $final_output_filename; # required for the nucleotide coverage report
my @pids; # storing the process IDs of child processes in parallel mode
my $seqID_contains_tabs;
my $verbose = 0;
if ($multicore > 1){
warn "Running Bismark Parallel version. Number of parallel instances to be spawned: $multicore\n\n";
}
sub multi_process_handling{
my $offset = 1;
my $process_id;
if ($multicore > 1){
until ($offset == $multicore){
# warn "multicore: $multicore\noffset: $offset\n";
my $fork = fork;
if (defined $fork){
if ($fork != 0){
$process_id = $fork;
push @pids, $process_id;
if ($offset < $multicore){
++$offset;
# warn "I am the parent process, child pid: $fork\nIncrementing offset counter to: $offset\n\n";
}
else{
# warn "Reached the number of maximum multicores. Proceeeding to processing...\n";
}
}
elsif ($fork == 0){
# warn "I am a child process, pid: $fork\nOffset counter is: $offset\nProceeding to processing...\n";
$process_id = $fork;
last;
}
}
else{
die "[FATAL ERROR]: Forking unsuccessful. This normally means that something is fundamentally not working with the fork command. Please run again without the --parallel option, or ask your system admin to look into this.\n";
}
}
# warn "\nThe Thread Identity\n===================\n";
if ($process_id){
# print "I am the parent process. My children are called:\n";
# print join ("\t",@pids),"\n";
# print "I am going to process the following line count: $offset\n\n";
}
elsif($process_id == 0){
# warn "I am a child process: Process ID: $process_id\n";
# warn "I am going to process the following line count: $offset\n\n";
}
else{
die "Process ID was: '$process_id'\n";
}
}
else{
warn "Single-core mode: setting pid to 1\n";
$process_id = 1;
}
return ($process_id,$offset);
}
sub subset_input_file_FastQ{
my ($filename,$process_id,$offset) = @_;
if ($filename =~ /gz$/){
open (OFFSET,"gunzip -c $filename |") or die "Couldn't read from file '$filename': $!\n";
}
else{
open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n";
}
# warn "offset is $offset\n";
my $temp = $filename;
$temp .= ".temp.$offset";
$temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information
if ($gzip){
$temp .= '.gz';
open (TEMPFQ,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
}
else{
open (TEMPFQ,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
}
my $line_count = 0;
my $seqs_processed = 0;
if (defined $upto){
# warn "Before we begin: -u was set to: -u $upto\n";
}
while (1){
my $l1 = <OFFSET>;
my $l2 = <OFFSET>;
my $l3 = <OFFSET>;
my $l4 = <OFFSET>;
last unless ($l4);
++$line_count;
# If user only want to process a subset of the input file
if (defined $upto){
if ($seqs_processed == $upto){
last;
}
}
if ( ($line_count - $offset)%$multicore == 0){
# warn "line count: $line_count\noffset: $offset\n";
# warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
# warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
print TEMPFQ "$l1$l2$l3$l4";
$seqs_processed++;
}
else{
# warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
next;
}
}
close OFFSET; # or warn $!;
close TEMPFQ or warn "Failed to close file handle TEMPFQ: $!\n";
warn "Finished subdividing $filename for PID: $process_id and offset $offset (sequences written out: $seqs_processed)\n\n";
return ($temp); # returning the subset filename
}
sub subset_input_file_FastA{
my ($filename,$process_id,$offset) = @_;
if ($filename =~ /gz$/){
open (OFFSET,"gunzip -c $filename |") or die "Couldn't read from file '$filename': $!\n";
}
else{
open (OFFSET,$filename) or die "Couldn't read from file '$filename': $!\n";
}
# warn "offset is $offset\n";
my $temp = $filename;
$temp .= ".temp.$offset";
$temp =~ s/^.*\///; # replacing everything upto and including the last /, i.e. removing file path information
if ($gzip){
$temp .= '.gz';
open (TEMPFA,"| gzip -c - > ${temp_dir}${temp}") or die "Can't write to file ${temp_dir}${temp}: $!\n";
}
else{
open (TEMPFA,'>',"${temp_dir}${temp}") or die "Failed to write output ${temp_dir}${temp}: $!\n";
}
warn "Writing temporary infile to $temp\n";
my $line_count = 0;
my $seqs_processed = 0;
while (1){
my $l1 = <OFFSET>;
my $l2 = <OFFSET>;
last unless ($l2);
++$line_count;
# If user only want to process a subset of the input file
if (defined $upto){
if ($seqs_processed == $upto){
last;
}
}
if ( ($line_count - $offset)%$multicore == 0){
# warn "line count: $line_count\noffset: $offset\n";
# warn "Modulus: ",($line_count - $offset)%$multicore,"\n";
# warn "processing this line $line_count (processID: $process_id with \$offset $offset)\n";
print TEMPFA "$l1$l2";
$seqs_processed++;
}
else{
# warn "skipping line $line_count for processID: $process_id with \$offset $offset)\n";
next;
}
}
close OFFSET or warn $!;
close TEMPFA or warn "Failed to close file handle TEMPFQ: $!\n";
warn "Finished subdividing $filename for PID: $process_id and offset $offset (sequences processed: $seqs_processed)\n\n";
return ($temp); # returning the subset filename
}
####
####
foreach my $filename (@filenames){
my $original_filename = $filename;
my $original_filename_1;
my $original_filename_2;
chdir $parent_dir or die "Unable to move to initial working directory'$parent_dir' $!\n";
### resetting the counting hash and fhs
reset_counters_and_fhs($filename);
@pids = ();
$seqID_contains_tabs = 0;
### if 2 or more files are provided we can hold the genome in memory and don't need to read it in a second time
unless (%chromosomes){
my $cwd = getcwd(); # storing the path of the current working directory
warn "Current working directory is: $cwd\n\n";
read_genome_into_memory($cwd);
}
### As of version 0.14.0 we support multi-threading. In a first instance we accomplish this by
### splitting the input file(s) into several smaller subfiles and merging the results back at
### the end.
# get general settings (also for single-threaded use)
my ($pid,$offset) = multi_process_handling ();
my ($single_end,$paired_end);
### PAIRED-END ALIGNMENTS
if ($filename =~ ','){
$single_end = 0;
$paired_end = 1;
my ($C_to_T_infile_1,$G_to_A_infile_1); # to be made from mate1 file
$fhs[0]->{name} = 'CTread1GAread2CTgenome';
$fhs[1]->{name} = 'GAread1CTread2GAgenome';
$fhs[2]->{name} = 'GAread1CTread2CTgenome';
$fhs[3]->{name} = 'CTread1GAread2GAgenome';
warn "\nPaired-end alignments will be performed\n",'='x39,"\n\n";
my ($filename_1,$filename_2) = (split (/,/,$filename));
$original_filename_1 = $filename_1;
$original_filename_2 = $filename_2;
warn "The provided filenames for paired-end alignments are $filename_1 and $filename_2\n";
### subsetting the input file(s)
unless ($multicore == 1){ # not needed in single-core mode
# warn "My PID: $pid\nMy offset: $offset\n";
if ($sequence_file_format eq 'FASTA'){
my $temp_filename_1 = subset_input_file_FastA($filename_1,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n";
$filename_1 = "${temp_dir}$temp_filename_1";
my $temp_filename_2 = subset_input_file_FastA($filename_2,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n";
$filename_2 = "${temp_dir}$temp_filename_2";
}
else{ # FastQ format, default
my $temp_filename_1 = subset_input_file_FastQ($filename_1,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename_1< as new in-file 1 (instead of >$filename_1<)\n";
$filename_1 = "${temp_dir}$temp_filename_1";
my $temp_filename_2 = subset_input_file_FastQ($filename_2,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename_2< as new in-file 2 (instead of >$filename_2<)\n";
$filename_2 = "${temp_dir}$temp_filename_2";
}
}
### additional variables only for paired-end alignments
my ($C_to_T_infile_2,$G_to_A_infile_2); # to be made from mate2 file
my $read1_count; # to see if R1 and R2 have the same length
my $read2_count;
### FastA format
if ($sequence_file_format eq 'FASTA'){
warn "Input files are in FastA format\n";
if ($directional){
($C_to_T_infile_1,$read1_count) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
($G_to_A_infile_2,$read2_count) = biTransformFastAFiles_paired_end ($filename_2,2);
$fhs[0]->{inputfile_1} = $C_to_T_infile_1;
$fhs[0]->{inputfile_2} = $G_to_A_infile_2;
$fhs[1]->{inputfile_1} = undef;
$fhs[1]->{inputfile_2} = undef;
$fhs[2]->{inputfile_1} = undef;
$fhs[2]->{inputfile_2} = undef;
$fhs[3]->{inputfile_1} = $C_to_T_infile_1;
$fhs[3]->{inputfile_2} = $G_to_A_infile_2;
}
elsif($pbat){ # PBAT-Seq
($G_to_A_infile_1,$read1_count) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$read2_count) = biTransformFastAFiles_paired_end ($filename_2,2);
$fhs[0]->{inputfile_1} = undef;
$fhs[0]->{inputfile_2} = undef;
$fhs[1]->{inputfile_1} = $G_to_A_infile_1;
$fhs[1]->{inputfile_2} = $C_to_T_infile_2;
$fhs[2]->{inputfile_1} = $G_to_A_infile_1;
$fhs[2]->{inputfile_2} = $C_to_T_infile_2;
$fhs[3]->{inputfile_1} = undef;
$fhs[3]->{inputfile_2} = undef;
}
else{
($C_to_T_infile_1,$G_to_A_infile_1,$read1_count) = biTransformFastAFiles_paired_end ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$G_to_A_infile_2,$read2_count) = biTransformFastAFiles_paired_end ($filename_2,2);
$fhs[0]->{inputfile_1} = $C_to_T_infile_1;
$fhs[0]->{inputfile_2} = $G_to_A_infile_2;
$fhs[1]->{inputfile_1} = $G_to_A_infile_1;
$fhs[1]->{inputfile_2} = $C_to_T_infile_2;
$fhs[2]->{inputfile_1} = $G_to_A_infile_1;
$fhs[2]->{inputfile_2} = $C_to_T_infile_2;
$fhs[3]->{inputfile_1} = $C_to_T_infile_1;
$fhs[3]->{inputfile_2} = $G_to_A_infile_2;
}
unless ($read1_count eq $read2_count){
die "[FATAL ERROR]:\tNumber of bisulfite transformed reads are not equal between Read 1 (\#$read1_count) and Read 2 (\#$read2_count).\nPossible causes: file truncation, or as a result of specifying read pairs that do not belong to each other?! Please re-specify file names! Exiting...\n\n";
}
if ($bowtie2){
paired_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
else{ # HISAT2
paired_end_align_fragments_to_bisulfite_genome_fastA_hisat2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
}
### FastQ format
else{
warn "Input files are in FastQ format\n";
if ($directional){
if ($slam){
($C_to_T_infile_1,$read1_count) = biTransformFastQFiles_paired_end_slam ($filename_1,1); # also passing the read number
($G_to_A_infile_2,$read2_count) = biTransformFastQFiles_paired_end_slam ($filename_2,2);
}
else{
($C_to_T_infile_1,$read1_count) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
($G_to_A_infile_2,$read2_count) = biTransformFastQFiles_paired_end ($filename_2,2);
}
$fhs[0]->{inputfile_1} = $C_to_T_infile_1;
$fhs[0]->{inputfile_2} = $G_to_A_infile_2;
$fhs[1]->{inputfile_1} = undef;
$fhs[1]->{inputfile_2} = undef;
$fhs[2]->{inputfile_1} = undef;
$fhs[2]->{inputfile_2} = undef;
$fhs[3]->{inputfile_1} = $C_to_T_infile_1;
$fhs[3]->{inputfile_2} = $G_to_A_infile_2;
}
elsif($pbat){ # PBAT-Seq
### At the moment we are only performing alignments only with uncompressed FastQ files
if ($slam){
($G_to_A_infile_1,$read1_count) = biTransformFastQFiles_paired_end_slam ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$read2_count) = biTransformFastQFiles_paired_end_slam ($filename_2,2);
}
else{
($G_to_A_infile_1,$read1_count) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$read2_count) = biTransformFastQFiles_paired_end ($filename_2,2);
}
$fhs[0]->{inputfile_1} = undef;
$fhs[0]->{inputfile_2} = undef;
$fhs[1]->{inputfile_1} = $G_to_A_infile_1;
$fhs[1]->{inputfile_2} = $C_to_T_infile_2;
$fhs[2]->{inputfile_1} = $G_to_A_infile_1;
$fhs[2]->{inputfile_2} = $C_to_T_infile_2;
$fhs[3]->{inputfile_1} = undef;
$fhs[3]->{inputfile_2} = undef;
}
else{ # non-directional
if ($slam){
($C_to_T_infile_1,$G_to_A_infile_1,$read1_count) = biTransformFastQFiles_paired_end_slam ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$G_to_A_infile_2,$read2_count) = biTransformFastQFiles_paired_end_slam ($filename_2,2);
}
else{
($C_to_T_infile_1,$G_to_A_infile_1,$read1_count) = biTransformFastQFiles_paired_end ($filename_1,1); # also passing the read number
($C_to_T_infile_2,$G_to_A_infile_2,$read2_count) = biTransformFastQFiles_paired_end ($filename_2,2);
}
$fhs[0]->{inputfile_1} = $C_to_T_infile_1;
$fhs[0]->{inputfile_2} = $G_to_A_infile_2;
$fhs[1]->{inputfile_1} = $G_to_A_infile_1;
$fhs[1]->{inputfile_2} = $C_to_T_infile_2;
$fhs[2]->{inputfile_1} = $G_to_A_infile_1;
$fhs[2]->{inputfile_2} = $C_to_T_infile_2;
$fhs[3]->{inputfile_1} = $C_to_T_infile_1;
$fhs[3]->{inputfile_2} = $G_to_A_infile_2;
}
unless ($read1_count eq $read2_count){
die "[FATAL ERROR]:\tNumber of bisulfite transformed reads are not equal between Read 1 (\#$read1_count) and Read 2 (\#$read2_count).\nPossible causes: file truncation, or as a result of specifying read pairs that do not belong to each other?! Please re-specify file names! Exiting...\n\n";
}
if ($bowtie2){
paired_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
else{ #
paired_end_align_fragments_to_bisulfite_genome_fastQ_hisat2 ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2);
}
}
start_methylation_call_procedure_paired_ends($filename_1,$filename_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
### Else we are performing SINGLE-END ALIGNMENTS
else{
warn "\nSingle-end alignments will be performed\n",'='x39,"\n\n";
$single_end = 1;
$paired_end = 0;
### subsetting the input file(s)
unless ($multicore == 1){ # not needed in single-core mode
# warn "My PID: $pid\nMy offset: $offset\n";
if ($sequence_file_format eq 'FASTA'){
my $temp_filename = subset_input_file_FastA($filename,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n";
$filename = "${temp_dir}$temp_filename";
}
else{ # FastQ format, default
my $temp_filename = subset_input_file_FastQ($filename,$pid,$offset);
warn "Using the subset file >${temp_dir}$temp_filename< as new in-file (instead of >$filename<)\n";
$filename = "${temp_dir}$temp_filename";
}
}
### Initialising bisulfite conversion filenames
my ($C_to_T_infile,$G_to_A_infile);
### FastA format
if ($sequence_file_format eq 'FASTA'){
warn "Input file is in FastA format\n";
if ($directional){
($C_to_T_infile) = biTransformFastAFiles ($filename);
$fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
}
else{
($C_to_T_infile,$G_to_A_infile) = biTransformFastAFiles ($filename);
$fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
$fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
}
### Creating 4 different bowtie filehandles and storing the first entry
if ($bowtie2){
single_end_align_fragments_to_bisulfite_genome_fastA_bowtie2 ($C_to_T_infile,$G_to_A_infile);
}
else{
single_end_align_fragments_to_bisulfite_genome_fastA_hisat2 ($C_to_T_infile,$G_to_A_infile);
}
}
## FastQ format
else{
warn "Input file is in FastQ format\n";
if ($directional){
if ($slam){
($C_to_T_infile) = biTransformFastQFiles_slam ($filename);
}
else{
($C_to_T_infile) = biTransformFastQFiles ($filename);
}
$fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
}
elsif($pbat){
if ($slam){
($G_to_A_infile) = biTransformFastQFiles_slam ($filename);
}
else{
($G_to_A_infile) = biTransformFastQFiles ($filename);
}
$fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $G_to_A_infile; # PBAT-Seq only uses the G to A converted files
}
else{
if ($slam){
($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles_slam ($filename);
}
else{
($C_to_T_infile,$G_to_A_infile) = biTransformFastQFiles ($filename);
}
$fhs[0]->{inputfile} = $fhs[1]->{inputfile} = $C_to_T_infile;
$fhs[2]->{inputfile} = $fhs[3]->{inputfile} = $G_to_A_infile;
}
### Creating up to 4 different filehandles and storing the first entry
if ($pbat){
if ($bowtie2){ # as of version 0.10.2 we also support PBAT alignments for Bowtie 2
single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 (undef,$G_to_A_infile);
}
else{ # HISAT2
single_end_align_fragments_to_bisulfite_genome_fastQ_hisat2 (undef,$G_to_A_infile);
}
}
elsif ($bowtie2){
single_end_align_fragments_to_bisulfite_genome_fastQ_bowtie2 ($C_to_T_infile,$G_to_A_infile);
}
else{ # HISAT2
single_end_align_fragments_to_bisulfite_genome_fastQ_hisat2 ($C_to_T_infile,$G_to_A_infile);
}
}
start_methylation_call_procedure_single_ends($filename,$C_to_T_infile,$G_to_A_infile,$pid);
}
### MERGING AND DELETING TEMP FILES // TIDYING UP AFTER A MULTICORE PROCESS
if ($pid){ # only performing this for the parent process
if ($multicore > 1){
warn "Now waiting for all child processes to complete\n";
### we need to ensure that we wait for all child processes to be finished before continuing
# warn "here are the child IDs: @pids\n";
# warn "Looping through the child process IDs:\n";
foreach my $id (@pids){
# print "$id\t";
my $kid = waitpid ($id,0);
# print "Returned: $kid\nExit status: $?\n";
unless ($? == 0){
warn "\nChild process terminated with exit signal: '$?'\n\n";
}
}
# regenerating names for temporary files
my @temp_input;
my @temp_output;
my @temp_reports;
my @temp_unmapped_1; # will store single end reads or R1 of paired-end
my @temp_unmapped_2;
my @temp_ambiguous_1; # will store single end reads or R1 of paired-end
my @temp_ambiguous_2;
my @temp_ambig_bam;
for (1..$offset){
# Temp Input Files
if ($single_end){
if ($gzip){
push @temp_input, "${original_filename}.temp.${_}.gz";
}
else{
push @temp_input, "${original_filename}.temp.${_}";
}
}
elsif($paired_end){
if ($gzip){
push @temp_input, "${original_filename_1}.temp.${_}.gz";
push @temp_input, "${original_filename_2}.temp.${_}.gz";
}
else{
push @temp_input, "${original_filename_1}.temp.${_}";
push @temp_input, "${original_filename_2}.temp.${_}";
}
}
# if files had a prefix we need to specify it
my $add_prefix;
if (defined $prefix){
$add_prefix = "${prefix}.";
}
else{
$add_prefix = '';
}
if ($single_end){
# Temp Output Files
my $pathless_filename = ${original_filename}; # 10 Jan 2017
$pathless_filename =~ s/.*\///; # deleting path information
if ($bowtie2){
if ($gzip){
push @temp_output, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_bismark_bt2.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_bismark_bt2_SE_report.txt";
push @temp_ambig_bam, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_bismark_bt2.ambig.bam"; # only for Bowtie 2
}
else{
push @temp_output, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_bismark_bt2.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_bismark_bt2_SE_report.txt";
push @temp_ambig_bam, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_bismark_bt2.ambig.bam"; # only for Bowtie 2
}
}
else{ # HISAT2
if ($gzip){
push @temp_output, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_bismark_hisat2.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_bismark_hisat2_SE_report.txt";
}
else{
push @temp_output, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_bismark_hisat2.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_bismark_hisat2_SE_report.txt";
}
}
if ($unmapped){
if ($gzip){
push @temp_unmapped_1, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_unmapped_reads.fq";
}
else{
push @temp_unmapped_1, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_unmapped_reads.fq";
}
}
if ($ambiguous){
if ($gzip){
push @temp_ambiguous_1, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}.gz_ambiguous_reads.fq";
}
else{
push @temp_ambiguous_1, "${output_dir}${add_prefix}${pathless_filename}.temp.${_}_ambiguous_reads.fq";
}
}
}
elsif($paired_end){
# Temp Output Files
my $pathless_filename_1 = ${original_filename_1}; # 10 Jan 2017
my $pathless_filename_2 = ${original_filename_2};
$pathless_filename_1 =~ s/.*\///; # deleting path information
$pathless_filename_2 =~ s/.*\///; # deleting path information
if ($bowtie2){
if ($gzip){
push @temp_output, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_bismark_bt2_pe.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_bismark_bt2_PE_report.txt";
push @temp_ambig_bam, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2
}
else{
push @temp_output, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_bismark_bt2_pe.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_bismark_bt2_PE_report.txt";
push @temp_ambig_bam, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_bismark_bt2_pe.ambig.bam"; # only for Bowtie 2
}
}
else{
if ($gzip){
push @temp_output, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_bismark_hisat2_pe.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_bismark_hisat2_PE_report.txt";
}
else{
push @temp_output, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_bismark_hisat2_pe.bam";
push @temp_reports, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_bismark_hisat2_PE_report.txt";
}
}
if ($unmapped){
if ($gzip){
push @temp_unmapped_1, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_unmapped_reads_1.fq";
push @temp_unmapped_2, "${output_dir}${add_prefix}${pathless_filename_2}.temp.${_}.gz_unmapped_reads_2.fq";
}
else{
push @temp_unmapped_1, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_unmapped_reads_1.fq";
push @temp_unmapped_2, "${output_dir}${add_prefix}${pathless_filename_2}.temp.${_}_unmapped_reads_2.fq";
}
}
if ($ambiguous){
if ($gzip){
push @temp_ambiguous_1, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}.gz_ambiguous_reads_1.fq";
push @temp_ambiguous_2, "${output_dir}${add_prefix}${pathless_filename_2}.temp.${_}.gz_ambiguous_reads_2.fq";
}
else{
push @temp_ambiguous_1, "${output_dir}${add_prefix}${pathless_filename_1}.temp.${_}_ambiguous_reads_1.fq";
push @temp_ambiguous_2, "${output_dir}${add_prefix}${pathless_filename_2}.temp.${_}_ambiguous_reads_2.fq";
}
}
}
}
warn "\n\nRight, cleaning up now...\n\n";
# deleting temp files;
warn "Deleting temporary sequence files...\n";
foreach my $temp (@temp_input){
#print "$temp\t";
$temp =~ s/.*\///; # deleting path information
print "${temp_dir}${temp}\t";
unlink "${temp_dir}${temp}" or warn "Failed to delete temporary FastQ file ${temp_dir}$temp: $!\n";
}
print "\n\n";
# merging temp BAM files
if ($single_end){
merge_individual_BAM_files(\@temp_output,$original_filename,$single_end);
}
else{
merge_individual_BAM_files(\@temp_output,$original_filename_1,$single_end);
}
# deleting temp BAM files
warn "Deleting temporary BAM files...\n";
foreach my $temp (@temp_output){
# print "$temp\t";
$temp =~ s/.*\///; # deleting path information
print "${output_dir}${temp}\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary BAM file ${output_dir}${temp}: $!\n";
}
print "\n\n";
### AMBIGUOUS BAM files
if ($ambig_bam){
# merging temp AMBIG BAM files
if ($single_end){
merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename,$single_end);
}
else{
merge_individual_ambig_BAM_files(\@temp_ambig_bam,$original_filename_1,$single_end);
}
# deleting temp BAM files
warn "Deleting temporary ambiguous BAM files...\n";
foreach my $temp (@temp_ambig_bam){
# print "$temp\t";
$temp =~ s/.*\///; # deleting path information
print "${output_dir}${temp}\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous BAM file ${output_dir}${temp}: $!\n";
}
print "\n\n";
}
if ($unmapped){
if ($single_end){
merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename,$single_end);
}
else{
merge_individual_unmapped_files(\@temp_unmapped_1,$original_filename_1,$single_end,'_1');
merge_individual_unmapped_files(\@temp_unmapped_2,$original_filename_2,$single_end,'_2');
}
# deleting temp unmapped files
warn "Deleting temporary unmapped files...\n";
foreach my $temp (@temp_unmapped_1){
print "temp\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n";
}
if ($paired_end){
foreach my $temp (@temp_unmapped_2){
print "$temp\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary unmapped FastQ file ${output_dir}$temp: $!\n";
}
}
print "\n\n";
}
if ($ambiguous){
if ($single_end){
merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename,$single_end);
}
else{
merge_individual_ambiguous_files(\@temp_ambiguous_1,$original_filename_1,$single_end,'_1');
merge_individual_ambiguous_files(\@temp_ambiguous_2,$original_filename_2,$single_end,'_2');
}
# deleting temp ambiguous files
warn "Deleting temporary ambiguous files...\n";
foreach my $temp (@temp_ambiguous_1){
print "$temp\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n";
}
if ($paired_end){
foreach my $temp (@temp_ambiguous_2){
print "$temp\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary ambiguous FastQ file ${output_dir}$temp: $!\n";
}
}
print "\n\n";
}
# resetting the counters once more so we can add all data from all temporary reports
reset_counters_and_fhs($original_filename);
### Merging the Bismark mapping report files
if ($single_end){
merge_individual_mapping_reports(\@temp_reports,$original_filename,$single_end);
print_final_analysis_report_single_end('mock_file1','mock_file_2','mock_pid','mergeThis');
}
else{
merge_individual_mapping_reports(\@temp_reports,$original_filename_1,$single_end,$original_filename_2);
print_final_analysis_report_paired_ends('mock_file1','mock_file_2','mock_file3','mock_file_4','mock_pid','mergeThis');
}
# deleting temp report files
warn "Deleting temporary report files...\n";
foreach my $temp (@temp_reports){
print "$temp\t";
unlink "${output_dir}${temp}" or warn "Failed to delete temporary report file $output_dir$temp: $!\n";
}
print "\n\n";
}
}
if ($pid){ # only for the Parent
### Produce Run Time
my $end_run = time();
my $run_time = $end_run - $start_run;
my $days = int($run_time/(24*60*60));
my $hours = ($run_time/(60*60))%24;
my $mins = ($run_time/60)%60;
my $secs = $run_time%60;
warn "Bismark completed in ${days}d ${hours}h ${mins}m ${secs}s\n";
print REPORT "Bismark completed in ${days}d ${hours}h ${mins}m ${secs}s\n";
warn "\n====================\nBismark run complete\n====================\n\n";
if ($nucleotide_coverage){
warn "Now calculating observed and expected nucleotide coverage statistics... \n\n";
if ($final_output_filename =~ /(bam|cram)|/){
my @args;
push @args, "--genome $genome_folder";
push @args, "--dir '$output_dir'";
push @args, "--samtools_path $samtools_path";
push @args, $final_output_filename;
print "@args","\n"; sleep(3);
system ("$RealBin/bam2nuc @args");
warn "Finished bam2nuc calculation ...\n\n";
}
else{
warn "Nucleotide coverage statistics are currently only available for BAM or CRAM files\n\n";
}
}
}
else{
# If multiple files were supplied as the command line, like so:
# -1 R1.fastq,simulated_1.fastq,ZZZ_R1.fastq -2 R2.fastq,simulated_2.fastq,ZZZ_R2.fastq --multicore 4
# we need to exit from the child processes if we don't want a steady increase of new Bismark instances! Fixed 30 10 2017
# warn "Terminating Child process\n\n";
exit;
}
}
sub merge_individual_mapping_reports{
my ($temp_reports,$original_filename_1,$single_end,$original_filename_2) = @_;
my $report_file = $original_filename_1;
$report_file =~ s/.*\///; # removing path information
$report_file =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
if ($prefix){
$report_file = "${prefix}.${report_file}";
}
if ($basename){ # Output file basename is set using the -B argument
$report_file = ${basename};
}
if ($single_end){
if ($bowtie2){
$report_file .= '_bismark_bt2_SE_report.txt';
}
else{
$report_file .= '_bismark_hisat2_SE_report.txt';
}
}
else{
if ($bowtie2){
$report_file .= '_bismark_bt2_PE_report.txt';
}
else{
$report_file .= '_bismark_hisat2_PE_report.txt';
}
}
warn "Writing report to ${output_dir}${report_file}\n";
open (REPORT,'>',"$output_dir$report_file") or die "Failed to write to ${output_dir}${report_file}: $!\n";
foreach my $temp(@$temp_reports){
$temp =~ s/.*\///; # removing path information
}
warn "Now merging temporary reports @$temp_reports into >>> ${output_dir}${report_file} <<<\n";
if ($single_end){
print REPORT "Bismark report for: $original_filename_1 (version: $bismark_version)\n";
}
else{ # paired-end
print REPORT "Bismark report for: $original_filename_1 and $original_filename_2 (version: $bismark_version)\n";
}
my $first = 0;
foreach my $temp(@$temp_reports){
# $temp =~ s/.*\///; # removing path information
warn "Merging from file >> $temp <<\n";
open (IN,"${output_dir}${temp}") or die "Failed to read from temporary mapping report '${output_dir}${temp}'\n";
### this is printing the first couple of lines
while (<IN>){
chomp;
if ($_ =~ /^Bismark report/){
next;
}
unless ($first){ # only happens for the first run we are processing
if ($_ =~ /^Final Alignment/){
++$first;
last;
}
else{
print REPORT "$_\n";
}
}
}
close IN or warn "Failed to close filehandle\n";
### Simon says: You are going to regret this in the future. Just for the record. He might be right...
read_alignment_report($temp,$single_end);
}
warn "\n";
}
sub read_alignment_report{
my ($report,$single_end) = @_;
my $unique;
my $no_aln;
my $multiple;
my $no_genomic;
my $total_seqs;
my $bismark_version;
my $input_filename;
my $unique_text;
my $no_aln_text;
my $multiple_text;
my $total_seq_text;
my $total_C_count;
my ($meth_CpG,$meth_CHG,$meth_CHH,$meth_unknown);
my ($unmeth_CpG,$unmeth_CHG,$unmeth_CHH,$unmeth_unknown);
my $number_OT;
my $number_CTOT;
my $number_CTOB;
my $number_OB;
open (ALN,"${output_dir}${report}") or die "Failed to read from temporary mapping report '$output_dir$report'\n";
while (<ALN>){
chomp;
### General Alignment stats
if ($_ =~ /^Sequence pairs analysed in total:/ ){ ## Paired-end
(undef,$total_seqs) = split /\t/;
# warn "Total paired seqs: >> $total_seqs <<\n";
}
elsif ($_ =~ /^Sequences analysed in total:/ ){ ## Single-end
(undef,$total_seqs) = split /\t/;
# warn "total single-end seqs >> $total_seqs <<\n";
}
elsif($_ =~ /^Number of paired-end alignments with a unique best hit:/){ ## Paired-end
(undef,$unique) = split /\t/;
# warn "Unique PE>> $unique <<\n";
}
elsif($_ =~ /^Number of alignments with a unique best hit from/){ ## Single-end
(undef,$unique) = split /\t/;
# warn "Unique SE>> $unique <<\n";
}
elsif($_ =~ /^Sequence pairs with no alignments under any condition:/){ ## Paired-end
(undef,$no_aln) = split /\t/;
# warn "No alignment PE >> $no_aln <<\n";
}
elsif($_ =~ /^Sequences with no alignments under any condition:/){ ## Single-end
(undef,$no_aln) = split /\t/;
# warn "No alignments SE>> $no_aln <<\n";
}
elsif($_ =~ /^Sequence pairs did not map uniquely:/){ ## Paired-end
(undef,$multiple) = split /\t/;
# warn "Multiple alignments PE >> $multiple <<\n";
}
elsif($_ =~ /^Sequences did not map uniquely:/){ ## Single-end
(undef,$multiple) = split /\t/;
# warn "Multiple alignments SE >> $multiple <<\n";
}
elsif($_ =~ /^Sequence pairs which were discarded because genomic sequence could not be extracted:/){ ## Paired-end
(undef,$no_genomic) = split /\t/;
# warn "No genomic sequence PE >> $no_genomic <<\n";
}
elsif($_ =~ /^Sequences which were discarded because genomic sequence could not be extracted:/){ ## Single-end
(undef,$no_genomic) = split /\t/;
# warn "No genomic sequence SE>> $no_genomic <<\n";
}
### Context Methylation
elsif($_ =~ /^Total number of C/ ){
(undef,$total_C_count) = split /\t/;
# warn "Total number C >> $total_C_count <<\n";
}
elsif($_ =~ /^Total methylated C\'s in CpG context:/ ){
(undef,$meth_CpG) = split /\t/;
# warn "meth CpG >> $meth_CpG <<\n" ;
}
elsif($_ =~ /^Total methylated C\'s in CHG context:/ ){
(undef,$meth_CHG) = split /\t/;
# warn "meth CHG >> $meth_CHG <<\n" ;
}
elsif($_ =~ /^Total methylated C\'s in CHH context:/ ){
(undef,$meth_CHH) = split /\t/;
# warn "meth CHH >> $meth_CHH <<\n" ;
}
elsif($_ =~ /^Total methylated C\'s in Unknown context:/ ){
(undef,$meth_unknown) = split /\t/;
# warn "meth Unknown >> $meth_unknown <<\n" ;
}
elsif($_ =~ /^Total unmethylated C\'s in CpG context:/ or $_ =~ /^Total C to T conversions in CpG context:/){
(undef,$unmeth_CpG) = split /\t/;
# warn "unmeth CpG >> $unmeth_CpG <<\n" ;
}
elsif($_ =~ /^Total unmethylated C\'s in CHG context:/ or $_ =~ /^Total C to T conversions in CHG context:/){
(undef,$unmeth_CHG) = split /\t/;
# warn "unmeth CHG >> $unmeth_CHG <<\n" ;
}
elsif($_ =~ /^Total unmethylated C\'s in CHH context:/ or $_ =~ /^Total C to T conversions in CHH context:/){
(undef,$unmeth_CHH) = split /\t/;
# warn "unmeth CHH >> $unmeth_CHH <<\n";
}
elsif($_ =~ /^Total unmethylated C\'s in Unknown context:/ or $_ =~ /^Total C to T conversions in Unknown context:/){
(undef,$unmeth_unknown) = split /\t/;
# warn "unmeth Unknown >> $unmeth_unknown <<\n" ;
}
### Strand Origin
elsif($_ =~ /^CT\/GA\/CT:/ ){ ## Paired-end
(undef,$number_OT) = split /\t/;
# warn "Number OT PE>> $number_OT <<\n" ;
}
elsif($_ =~ /^CT\/CT:/ ){ ## Single-end
(undef,$number_OT) = split /\t/;
# warn "Number OT SE>> $number_OT <<\n" ;
}
elsif($_ =~ /^GA\/CT\/CT:/ ){ ## Paired-end
(undef,$number_CTOT) = split /\t/;
# warn "Number CTOT PE >> $number_CTOT <<\n" ;
}
elsif($_ =~ /^GA\/CT:/ ){ ## Single-end
(undef,$number_CTOT) = split /\t/;
# warn "Number CTOT SE >> $number_CTOT <<\n" ;
}
elsif($_ =~ /^GA\/CT\/GA:/ ){ ## Paired-end
(undef,$number_CTOB) = split /\t/;
# warn "Number CTOB PE >> $number_CTOB <<\n" ;
}
elsif($_ =~ /^GA\/GA:/ ){ ## Single-end
(undef,$number_CTOB) = split /\t/;
# warn "Number CTOB SE >> $number_CTOB <<\n";
}
elsif($_ =~ /^CT\/GA\/GA:/ ){ ## Paired-end
(undef,$number_OB) = split /\t/;
# warn "Number OB PE >> $number_OB <<\n";
}
elsif($_ =~ /^CT\/GA:/ ){ ## Single-end
(undef,$number_OB) = split /\t/;
# warn "Number OB SE >> $number_OB <<\n";
}
}
$counting{sequences_count} += $total_seqs;
$counting{unique_best_alignment_count} += $unique;
$counting{no_single_alignment_found} += $no_aln;
$counting{unsuitable_sequence_count} += $multiple;
$counting{genomic_sequence_could_not_be_extracted_count} += $no_genomic;
$counting{total_meCHH_count} += $meth_CHH;
$counting{total_meCHG_count} += $meth_CHG;
$counting{total_meCpG_count} += $meth_CpG;
if ($bowtie2){
$counting{total_meC_unknown_count} += $meth_unknown;
}
$counting{total_unmethylated_CHH_count} += $unmeth_CHH;
$counting{total_unmethylated_CHG_count} += $unmeth_CHG;
$counting{total_unmethylated_CpG_count} += $unmeth_CpG;
if ($bowtie2){
$counting{total_unmethylated_C_unknown_count} += $unmeth_unknown;
}
if ($single_end){
$counting{CT_CT_count} += $number_OT;
$counting{CT_GA_count} += $number_OB;
$counting{GA_CT_count} += $number_CTOT;
$counting{GA_GA_count} += $number_CTOB;
}
else{
# paired-end
$counting{GA_CT_CT_count} += $number_CTOT;
$counting{CT_GA_CT_count} += $number_OT;
$counting{GA_CT_GA_count} += $number_CTOB;
$counting{CT_GA_GA_count} += $number_OB;
}
}
sub merge_individual_ambiguous_files{
my ($temp_ambiguous,$original_filename,$single_end,$paired_information) = @_;
my $ambiguous_file = $original_filename;
$ambiguous_file =~ s/.*\///; # removing path information
if ($prefix){
$ambiguous_file = "${prefix}.${ambiguous_file}";
}
if ($single_end){
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file = "${basename}_ambiguous_reads.fq.gz";
}
else{
$ambiguous_file = "${basename}_ambiguous_reads.fa.gz";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file =~ s/$/_ambiguous_reads.fq.gz/;
}
else{
$ambiguous_file =~ s/$/_ambiguous_reads.fa.gz/;
}
}
}
else{ # paired-end
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fq.gz";
}
else{
$ambiguous_file = "${basename}_ambiguous_reads${paired_information}.fa.gz";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fq.gz/;
}
else{
$ambiguous_file =~ s/$/_ambiguous_reads${paired_information}.fa.gz/;
}
}
}
foreach my $temp(@$temp_ambiguous){
$temp =~ s/.*\///; # removing path information
}
open (AMBIGUOUS,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
warn "Now merging ambiguous sequences @$temp_ambiguous into >>> $output_dir$ambiguous_file <<<\n";
foreach my $temp(@$temp_ambiguous){
warn "Merging from file >> $temp <<\n";
if ($temp =~ /gz$/){
open (IN,"gunzip -c ${output_dir}$temp |") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n";
}
else{
open (IN,"${output_dir}$temp") or die "Failed to read from ambiguous temp file '${output_dir}$temp'\n";
}
while (<IN>){
print AMBIGUOUS;
}
close IN or warn "Failed to close filehandle\n";
}
warn "\n";
close AMBIGUOUS or warn "Failed to close output filehandle AMBIGUOUS\n\n";
}
sub merge_individual_unmapped_files{
my ($temp_unmapped,$original_filename,$single_end,$paired_information) = @_;
my $unmapped_file = $original_filename;
$unmapped_file =~ s/.*\///; # removing path information
if ($prefix){
$unmapped_file = "${prefix}.${unmapped_file}";
}
if ($single_end){
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file = "${basename}_unmapped_reads.fq.gz";
}
else{
$unmapped_file = "${basename}_unmapped_reads.fa.gz";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file =~ s/$/_unmapped_reads.fq.gz/;
}
else{
$unmapped_file =~ s/$/_unmapped_reads.fa.gz/;
}
}
}
else{ # paired-end
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file = "${basename}_unmapped_reads${paired_information}.fq.gz";
}
else{
$unmapped_file = "${basename}_unmapped_reads${paired_information}.fa.gz";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file =~ s/$/_unmapped_reads${paired_information}.fq.gz/;
}
else{
$unmapped_file =~ s/$/_unmapped_reads${paired_information}.fa.gz/;
}
}
}
foreach my $temp(@$temp_unmapped){
$temp =~ s/.*\///; # removing path information
}
open (UNMAPPED,"| gzip -c - > ${output_dir}${unmapped_file}") or die "Failed to write to ${output_dir}${unmapped_file}: $!\n";
warn "Now merging unmapped sequences @$temp_unmapped into >>> ${output_dir}${unmapped_file} <<<\n";
foreach my $temp(@$temp_unmapped){
warn "Merging from file >> $temp <<\n";
if ($temp =~ /gz$/){
open (IN,"gunzip -c ${output_dir}${temp} |") or die "Failed to read from unmapped temp file '${output_dir}$temp'\n";
}
else{
open (IN,"${output_dir}${temp}") or die "Failed to read from unmapped temp file '${output_dir}${temp}'\n";
}
while (<IN>){
print UNMAPPED;
}
close IN or warn "Failed to close filehandle\n";
}
warn "\n";
close UNMAPPED or warn "Failed to close output filehandle UNMAPPED\n\n";
}
sub merge_individual_BAM_files{
my ($tempbam,$original_filename,$single_end) = @_;
my $merged_name = $original_filename;
#warn "merged name is: $merged_name\n";
$merged_name =~ s/.*\///; # deleting path information
# warn "merged name is: $merged_name\n";
$merged_name =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
# warn "merged name is: $merged_name\n"; sleep(5);
foreach my $temp_bam(@$tempbam){
$temp_bam =~ s/.*\///; # deleting path information
}
if ($prefix){
$merged_name = "$prefix.$merged_name";
}
if ($single_end){
if ($bowtie2){ # BAM format is the default for Bowtie 2
$merged_name .= '_bismark_bt2.bam';
}
else{ # BAM is the default output
$merged_name .= '_bismark_hisat2.bam';
}
if ($basename){ # Output file basename is set using the -B argument
$merged_name = "${basename}.bam";
}
}
else{
if ($bowtie2){ # BAM format is the default for Bowtie 2
$merged_name .= '_bismark_bt2_pe.bam';
}
else{ # BAM is the default output
$merged_name .= '_bismark_hisat2_pe.bam';
}
if ($basename){ # Output file basename is set using the -B argument
$merged_name = "${basename}_pe.bam";
}
}
if ($cram){
$merged_name =~ s/bam$/cram/;
warn "At this stage we write out a single CRAM file and delete all temporary BAM files\n";
warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";
$final_output_filename = "${output_dir}${merged_name}";
open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to CRAM file $merged_name: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
}
else{
$final_output_filename = "${output_dir}${merged_name}";
warn "Now merging BAM files @$tempbam into >>> $merged_name <<<\n";
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n";
}
my $first = 0;
foreach my $temp_bam(@$tempbam){
# $temp_bam =~ s/.*\///; # deleting path information
warn "Merging from file >> $temp_bam <<\n";
if ($first > 0){
open (IN,"$samtools_path view ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";
}
else{ # only for the first file we print the header as well
open (IN,"$samtools_path view -h ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";
}
while (<IN>){
print OUT;
}
close IN or warn "Failed to close filehandle\n";
++$first;
}
warn "\n";
close OUT or warn "Failed to close output filehandle\n\n";
}
sub merge_individual_ambig_BAM_files{
my ($tempbam,$original_filename,$single_end) = @_;
my $merged_name = $original_filename;
# warn "merged name is: $merged_name\n";
$merged_name =~ s/.*\///; # deleting path information
# warn "merged name is: $merged_name\n"; sleep(1);
foreach my $temp_bam(@$tempbam){
$temp_bam =~ s/.*\///; # deleting path information
}
if ($prefix){
$merged_name = "$prefix.$merged_name";
}
if ($single_end){
if ($bowtie2){ # BAM format is the default for Bowtie 2
$merged_name .= '_bismark_bt2.ambig.bam';
}
if ($basename){ # Output file basename is set using the -B argument
$merged_name = "${basename}.ambig.bam";
}
}
else{
if ($bowtie2){ # BAM format is the default for Bowtie 2
$merged_name .= '_bismark_bt2_pe.ambig.bam';
}
if ($basename){ # Output file basename is set using the -B argument
$merged_name = "${basename}_pe.ambig.bam";
}
}
warn "Now merging ambiguous BAM files @$tempbam into >>> $merged_name <<<\n";
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > ${output_dir}${merged_name}") or die "Failed to write to $merged_name: $!\n";
my $first = 0;
foreach my $temp_bam(@$tempbam){
# $temp_bam =~ s/.*\///; # deleting path information
warn "Merging from file >> $temp_bam <<\n";
if ($first > 0){
open (IN,"$samtools_path view ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";
}
else{ # only for the first file we print the header as well
open (IN,"$samtools_path view -h ${output_dir}${temp_bam} |") or die "Failed to read from BAM file ${output_dir}${temp_bam}\n";
}
while (<IN>){
print OUT;
}
close IN or warn "Failed to close filehandle\n";
++$first;
}
warn "\n";
close OUT or warn "Failed to close output filehandle\n\n";
}
sub start_methylation_call_procedure_single_ends {
my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;
my ($dir,$filename);
if ($sequence_file =~ /\//){
($dir,$filename) = $sequence_file =~ m/(.*\/)(.*)$/;
}
else{
$filename = $sequence_file;
}
### printing all alignments to a results file
my $outfile = $filename;
# warn "Outfile: $outfile\n";
$outfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
# warn "Outfile: $outfile\n";sleep(5);
if ($prefix){
$outfile = "$prefix.$outfile";
}
if ($bowtie2){ # SAM format is the default for Bowtie 2
$outfile =~ s/$/_bismark_bt2.sam/;
}
else{ # SAM output is the default for HISAT2
$outfile =~ s/$/_bismark_hisat2.sam/;
}
if ($basename){ # Output file basename is set using the -B argument
$outfile = "${basename}.sam";
}
$bam = 0 unless (defined $bam);
if ($ambig_bam){
my $ambig_bam_out = $outfile;
$ambig_bam_out =~ s/sam$/ambig.bam/;
warn "Ambiguous BAM output: $ambig_bam_out\n";
open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n";
}
if ($cram){ ### Samtools is installed, writing out CRAM directly. This qill require Samtools version 1.2 or higher!
### for multicore processing we write out BAM files by default and merge them together as a single CRAM file in the merging step later on.
### This avoids having to change all the the file endings on the way
if($multicore > 1){
$outfile =~ s/sam$/bam/;
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
else{ # single-core mode
$outfile =~ s/sam$/cram/;
$final_output_filename = "${output_dir}${outfile}";
open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
}
}
elsif($bam == 1){ ### Samtools is installed, writing out BAM directly
$outfile =~ s/sam$/bam/;
$final_output_filename = "${output_dir}${outfile}";
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
$outfile .= '.gz';
open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
else{ # uncompressed ouput, default
open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
warn "\n>>> Writing bisulfite mapping results to $output_dir$outfile <<<\n\n";
sleep(1);
### printing alignment and methylation call summary to a report file
my $reportfile = $filename;
$reportfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
if ($prefix){
$reportfile = "$prefix.$reportfile";
}
if ($bowtie2){
$reportfile =~ s/$/_bismark_bt2_SE_report.txt/;
}
else{
$reportfile =~ s/$/_bismark_hisat2_SE_report.txt/;
}
if ($basename){ # Output file basename is set using the -B argument
$reportfile = "${basename}_SE_report.txt";
}
open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
print REPORT "Bismark report for: $sequence_file (version: $bismark_version)\n";
if ($unmapped){
my $unmapped_file = $filename;
if ($prefix){
$unmapped_file = "$prefix.$unmapped_file";
}
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file = "${basename}_unmapped_reads.fq";
}
else{
$unmapped_file = "${basename}_unmapped_reads.fa";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$unmapped_file =~ s/$/_unmapped_reads.fq/;
}
else{
$unmapped_file =~ s/$/_unmapped_reads.fa/;
}
}
if ($multicore > 1){ # multicore runs already output gzipped unmapped files
open (UNMAPPED,'>',"$output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
}
else{
$unmapped_file .= '.gz';
open (UNMAPPED,"| gzip -c - > $output_dir$unmapped_file") or die "Failed to write to $unmapped_file: $!\n";
}
warn "Unmapped sequences will be written to $output_dir$unmapped_file\n";
}
if ($ambiguous){
my $ambiguous_file = $filename;
if ($prefix){
$ambiguous_file = "$prefix.$ambiguous_file";
}
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file = "${basename}_ambiguous_reads.fq";
}
else{
$ambiguous_file = "${basename}_ambiguous_reads.fa";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$ambiguous_file =~ s/$/_ambiguous_reads.fq/;
}
else{
$ambiguous_file =~ s/$/_ambiguous_reads.fa/;
}
}
if ($multicore > 1){ # multicore runs already output gzipped amobiguous files
open (AMBIG,'>',"$output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
}
else{
$ambiguous_file .= '.gz';
open (AMBIG,"| gzip -c - > $output_dir$ambiguous_file") or die "Failed to write to $ambiguous_file: $!\n";
}
warn "Ambiguously mapping sequences will be written to $output_dir$ambiguous_file\n";
}
if ($directional){
print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n";
}
elsif ($pbat){
print REPORT "Option '--pbat' specified: alignments to original strands (OT and OB) strands were ignored (i.e. not performed)\n";
}
else{
print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n";
}
if ($bowtie2){
print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $aligner_options\n\n";
}
else{
print REPORT "Bismark was run with HISAT2 against the bisulfite genome of $genome_folder with the specified options: $aligner_options\n\n";
}
unless ($sam_no_hd){
generate_SAM_header();
}
### Input file is in FastA format
if ($sequence_file_format eq 'FASTA'){
process_single_end_fastA_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
}
### Input file is in FastQ format
else{
process_single_end_fastQ_file_for_methylation_call($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid);
}
}
sub start_methylation_call_procedure_paired_ends {
my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
my ($dir_1,$filename_1);
if ($sequence_file_1 =~ /\//){
($dir_1,$filename_1) = $sequence_file_1 =~ m/(.*\/)(.*)$/;
}
else{
$filename_1 = $sequence_file_1;
}
my ($dir_2,$filename_2);
if ($sequence_file_2 =~ /\//){
($dir_2,$filename_2) = $sequence_file_2 =~ m/(.*\/)(.*)$/;
}
else{
$filename_2 = $sequence_file_2;
}
### printing all alignments to a results file
my $outfile = $filename_1;
# warn "Outfile: $outfile\n";
$outfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
# warn "Outfile: $outfile\n";sleep(5);
if ($prefix){
$outfile = "$prefix.$outfile";
}
if ($bowtie2){ # SAM format is the default Bowtie 2 output
$outfile =~ s/$/_bismark_bt2_pe.sam/;
}
else{ # SAM format is the default for HISAT2
$outfile =~ s/$/_bismark_hisat2_pe.sam/;
}
if ($basename){ # Output file basename is set using the -B argument
$outfile = "${basename}_pe.sam";
}
if ($ambig_bam){
my $ambig_bam_out = $outfile;
$ambig_bam_out =~ s/sam$/ambig.bam/;
warn "Ambiguous BAM output: $ambig_bam_out\n";
open (AMBIBAM,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$ambig_bam_out") or die "Failed to write to $ambig_bam_out: $!\n";
}
$bam = 0 unless (defined $bam);
if ($cram){ ### Samtools is installed, writing out CRAM directly. This qill require Samtools version 1.2 or higher!
if ($multicore > 1){
$outfile =~ s/sam$/bam/;
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
else{ # single-core mode
$outfile =~ s/sam$/cram/;
$final_output_filename = "${output_dir}${outfile}";
open (OUT,"| $samtools_path view -h -C -T $cram_ref 2>/dev/null - > $output_dir$outfile") or die "Failed to write to CRAM file $outfile: $!\nPlease note that this option requires Samtools version 1.2 or higher!\n\n";
}
}
elsif ($bam == 1){ ### Samtools is installed, writing out BAM directly
$outfile =~ s/sam$/bam/;
$final_output_filename = "${output_dir}${outfile}";
open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
$outfile .= '.gz';
open (OUT,"| gzip -c - > $output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
else{ # uncompressed ouput, default
open (OUT,'>',"$output_dir$outfile") or die "Failed to write to $outfile: $!\n";
}
warn "\n>>> Writing bisulfite mapping results to $outfile <<<\n\n";
sleep(1);
### printing alignment and methylation call summary to a report file
my $reportfile = $filename_1;
$reportfile =~ s/(\.fastq\.gz|\.fq\.gz|\.fastq|\.fq)$//; # attempting to remove fastq.gz etc to make filename a little shorter
if ($prefix){
$reportfile = "$prefix.$reportfile";
}
if ($bowtie2){
$reportfile =~ s/$/_bismark_bt2_PE_report.txt/;
}
else{
$reportfile =~ s/$/_bismark_hisat2_PE_report.txt/;
}
if ($basename){ # Output file basename is set using the -B argument
$reportfile = "${basename}_PE_report.txt";
}
open (REPORT,'>',"$output_dir$reportfile") or die "Failed to write to $reportfile: $!\n";
print REPORT "Bismark report for: $sequence_file_1 and $sequence_file_2 (version: $bismark_version)\n";
if ($bowtie2){
print REPORT "Bismark was run with Bowtie 2 against the bisulfite genome of $genome_folder with the specified options: $aligner_options\n";
}
else{
print REPORT "Bismark was run with HISAT2 against the bisulfite genome of $genome_folder with the specified options: $aligner_options\n";
}
### Unmapped read output
if ($unmapped){
my $unmapped_1 = $filename_1;
my $unmapped_2 = $filename_2;
if ($prefix){
$unmapped_1 = "$prefix.$unmapped_1";
$unmapped_2 = "$prefix.$unmapped_2";
}
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$unmapped_1 = "${basename}_unmapped_reads_1.fq";
$unmapped_2 = "${basename}_unmapped_reads_2.fq";
}
else{
$unmapped_1 = "${basename}_unmapped_reads_1.fa";
$unmapped_2 = "${basename}_unmapped_reads_2.fa";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$unmapped_1 =~ s/$/_unmapped_reads_1.fq/;
$unmapped_2 =~ s/$/_unmapped_reads_2.fq/;
}
else{
$unmapped_1 =~ s/$/_unmapped_reads_1.fa/;
$unmapped_2 =~ s/$/_unmapped_reads_2.fa/;
}
}
if ($multicore > 1){ # unmapped files are merged into .gz files in multicore runs anyway
open (UNMAPPED_1,'>',"$output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
open (UNMAPPED_2,'>',"$output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
}
else{
$unmapped_1 .= '.gz';
$unmapped_2 .= '.gz';
open (UNMAPPED_1,"| gzip -c - > $output_dir$unmapped_1") or die "Failed to write to $unmapped_1: $!\n";
open (UNMAPPED_2,"| gzip -c - > $output_dir$unmapped_2") or die "Failed to write to $unmapped_2: $!\n";
}
warn "Unmapped sequences will be written to $unmapped_1 and $unmapped_2\n";
}
if ($ambiguous){
my $amb_1 = $filename_1;
my $amb_2 = $filename_2;
if ($prefix){
$amb_1 = "$prefix.$amb_1";
$amb_2 = "$prefix.$amb_2";
}
if ($basename){ # Output file basename is set using the -B argument
if ($sequence_file_format eq 'FASTQ'){
$amb_1 = "${basename}_ambiguous_reads_1.fq";
$amb_2 = "${basename}_ambiguous_reads_2.fq";
}
else{
$amb_1 = "${basename}_ambiguous_reads_1.fa";
$amb_2 = "${basename}_ambiguous_reads_2.fa";
}
}
else{
if ($sequence_file_format eq 'FASTQ'){
$amb_1 =~ s/$/_ambiguous_reads_1.fq/;
$amb_2 =~ s/$/_ambiguous_reads_2.fq/;
}
else{
$amb_1 =~ s/$/_ambiguous_reads_1.fa/;
$amb_2 =~ s/$/_ambiguous_reads_2.fa/;
}
}
if ($multicore > 1){ # ambiguous files are merged into .gz files in multicore runs anyway
open (AMBIG_1,'>',"$output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
open (AMBIG_2,'>',"$output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
}
else{
$amb_1 .= '.gz';
$amb_2 .= '.gz';
open (AMBIG_1,"| gzip -c - > $output_dir$amb_1") or die "Failed to write to $amb_1: $!\n";
open (AMBIG_2,"| gzip -c - > $output_dir$amb_2") or die "Failed to write to $amb_2: $!\n";
}
warn "Ambiguously mapping sequences will be written to $amb_1 and $amb_2\n";
}
if ($directional){
print REPORT "Option '--directional' specified (default mode): alignments to complementary strands (CTOT, CTOB) were ignored (i.e. not performed)\n\n";
}
elsif ($pbat){
print REPORT "Option '--pbat' specified: alignments to original strands (OT, OB) were ignored (i.e. not performed)\n\n";
}
else{
print REPORT "Option '--non_directional' specified: alignments to all strands were being performed (OT, OB, CTOT, CTOB)\n\n";
}
unless ($sam_no_hd){
generate_SAM_header();
}
### Input files are in FastA format
if ($sequence_file_format eq 'FASTA'){
process_fastA_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
### Input files are in FastQ format
else{
process_fastQ_files_for_paired_end_methylation_calls($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
}
sub print_final_analysis_report_single_end{
my ($C_to_T_infile,$G_to_A_infile,$pid,$merge_multi) = @_;
if ($merge_multi){
warn "Printing a final merged alignment report for all individual sub-reports\n\n";
}
else{
### All sequences from the original sequence file have been analysed now
### deleting temporary C->T or G->A infiles
if ($directional){
my $deletion_successful = unlink "$temp_dir$C_to_T_infile";
if ($deletion_successful == 1){
warn "\nSuccessfully deleted the temporary file $temp_dir$C_to_T_infile\n\n";
}
else{
warn "Could not delete temporary file $C_to_T_infile properly $!\n";
}
}
elsif ($pbat){
my $deletion_successful = unlink "$temp_dir$G_to_A_infile";
if ($deletion_successful == 1){
warn "\nSuccessfully deleted the temporary file $temp_dir$G_to_A_infile\n\n";
}
else{
warn "Could not delete temporary file $G_to_A_infile properly $!\n";
}
}
else{
my $deletion_successful = unlink "$temp_dir$C_to_T_infile","$temp_dir$G_to_A_infile";
if ($deletion_successful == 2){
warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile and $temp_dir$G_to_A_infile\n\n";
}
else{
warn "Could not delete temporary files properly $!\n";
}
}
}
### printing a final report for the alignment procedure
print REPORT "Final Alignment report\n",'='x22,"\n";
warn "Final Alignment report\n",'='x22,"\n";
# foreach my $index (0..$#fhs){
# print "$fhs[$index]->{name}\n";
# print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
# print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
# }
### printing a final report for the methylation call procedure
warn "Sequences analysed in total:\t$counting{sequences_count}\n";
print REPORT "Sequences analysed in total:\t$counting{sequences_count}\n";
my $percent_alignable_sequences;
if ($counting{sequences_count} == 0){
$percent_alignable_sequences = 0;
}
else{
$percent_alignable_sequences = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
}
warn "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n\n";
print REPORT "Number of alignments with a unique best hit from the different alignments:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequences}%\n";
### percentage of low complexity reads overruled because of low complexity (thereby creating a bias for highly methylated reads),
### only calculating the percentage if there were any overruled alignments
if ($counting{low_complexity_alignments_overruled_count}){
my $percent_overruled_low_complexity_alignments = sprintf ("%.1f",$counting{low_complexity_alignments_overruled_count}*100/$counting{sequences_count});
# print REPORT "Number of low complexity alignments which were overruled to have a unique best hit rather than discarding them:\t$counting{low_complexity_alignments_overruled_count}\t(${percent_overruled_low_complexity_alignments}%)\n";
}
print "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
print "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
print "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
print "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
print join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
print REPORT "Sequences with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
print REPORT "Sequences did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
print REPORT "Sequences which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
print REPORT "Number of sequences with unique best (first) alignment came from the bowtie output:\n";
print REPORT join ("\n","CT/CT:\t$counting{CT_CT_count}\t((converted) top strand)","CT/GA:\t$counting{CT_GA_count}\t((converted) bottom strand)","GA/CT:\t$counting{GA_CT_count}\t(complementary to (converted) top strand)","GA/GA:\t$counting{GA_GA_count}\t(complementary to (converted) bottom strand)"),"\n\n";
if ($directional){
print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
}
### detailed information about Cs analysed
warn "Final Cytosine Methylation Report\n",'='x33,"\n";
my $total_number_of_C = $counting{total_meCHH_count}+$counting{total_meCHG_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CpG_count};
warn "Total number of C's analysed:\t$total_number_of_C\n\n";
warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
my $percent_meCHG;
if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
$percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
}
my $percent_meCHH;
if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
$percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
}
my $percent_meCpG;
if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
$percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
}
my $percent_meC_unknown;
if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
$percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
}
### printing methylated CpG percentage if applicable
if ($percent_meCpG){
warn "C methylated in CpG context:\t${percent_meCpG}%\n";
print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
}
### printing methylated C percentage (CHG context) if applicable
if ($percent_meCHG){
warn "C methylated in CHG context:\t${percent_meCHG}%\n";
print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
}
### printing methylated C percentage (CHH context) if applicable
if ($percent_meCHH){
warn "C methylated in CHH context:\t${percent_meCHH}%\n";
print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
}
### printing methylated C percentage (Unknown C context) if applicable
if ($percent_meC_unknown){
warn "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
print REPORT "C methylated in Unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in Unknown context (CN or CHN) if value was 0\n";
}
print REPORT "\n\n";
warn "\n\n";
if ($seqID_contains_tabs){
warn "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
print REPORT "The sequence IDs in the provided file contain tab-stops which might prevent sequence alignments. If this happened, please replace all tab characters within the seqID field with spaces before running Bismark.\n\n";
}
}
sub print_final_analysis_report_paired_ends{
my ($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid,$merge_multi) = @_;
if ($merge_multi){
warn "Printing a final merged alignment report for all individual sub-reports\n\n";
}
else{
### All sequences from the original sequence file have been analysed now, therefore deleting temporary C->T or G->A infiles
if ($directional){
my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_2";
if ($deletion_successful == 2){
warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2\n\n";
}
else{
warn "Could not delete temporary files $temp_dir$C_to_T_infile_1 and $temp_dir$G_to_A_infile_2 properly: $!\n";
}
}
elsif($pbat){
# PBAT data should only have 2 files to delete, similar to directional files
my $deletion_successful = unlink "$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2";
if ($deletion_successful == 2){
warn "\nSuccessfully deleted the temporary files $temp_dir$G_to_A_infile_1 and $temp_dir$C_to_T_infile_2\n\n";
}
else{
warn "Could not delete temporary files $temp_dir$G_to_A_infile_1 and $temp_dir$C_to_T_infile_2 properly: $!\n";
}
}
else{ # non-directional
my $deletion_successful = unlink "$temp_dir$C_to_T_infile_1","$temp_dir$G_to_A_infile_1","$temp_dir$C_to_T_infile_2","$temp_dir$G_to_A_infile_2";
if ($deletion_successful == 4){
warn "\nSuccessfully deleted the temporary files $temp_dir$C_to_T_infile_1, $temp_dir$G_to_A_infile_1, $temp_dir$C_to_T_infile_2 and $temp_dir$G_to_A_infile_2\n\n";
}
else{
warn "Could not delete temporary files properly: $!\n";
}
}
}
### printing a final report for the alignment procedure
warn "Final Alignment report\n",'='x22,"\n";
print REPORT "Final Alignment report\n",'='x22,"\n";
# foreach my $index (0..$#fhs){
# print "$fhs[$index]->{name}\n";
# print "$fhs[$index]->{seen}\talignments on the correct strand in total\n";
# print "$fhs[$index]->{wrong_strand}\talignments were discarded (nonsensical alignments)\n\n";
# }
### printing a final report for the methylation call procedure
warn "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
print REPORT "Sequence pairs analysed in total:\t$counting{sequences_count}\n";
my $percent_alignable_sequence_pairs;
if ($counting{sequences_count} == 0){
$percent_alignable_sequence_pairs = 0;
}
else{
$percent_alignable_sequence_pairs = sprintf ("%.1f",$counting{unique_best_alignment_count}*100/$counting{sequences_count});
}
print "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}%\n\n";
print REPORT "Number of paired-end alignments with a unique best hit:\t$counting{unique_best_alignment_count}\nMapping efficiency:\t${percent_alignable_sequence_pairs}% \n";
print "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
print "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
print "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
print "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
print join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
print REPORT "Sequence pairs with no alignments under any condition:\t$counting{no_single_alignment_found}\n";
print REPORT "Sequence pairs did not map uniquely:\t$counting{unsuitable_sequence_count}\n";
print REPORT "Sequence pairs which were discarded because genomic sequence could not be extracted:\t$counting{genomic_sequence_could_not_be_extracted_count}\n\n";
print REPORT "Number of sequence pairs with unique best (first) alignment came from the bowtie output:\n";
print REPORT join ("\n","CT/GA/CT:\t$counting{CT_GA_CT_count}\t((converted) top strand)","GA/CT/CT:\t$counting{GA_CT_CT_count}\t(complementary to (converted) top strand)","GA/CT/GA:\t$counting{GA_CT_GA_count}\t(complementary to (converted) bottom strand)","CT/GA/GA:\t$counting{CT_GA_GA_count}\t((converted) bottom strand)"),"\n\n";
### detailed information about Cs analysed
if ($directional){
print "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
print REPORT "Number of alignments to (merely theoretical) complementary strands being rejected in total:\t$counting{alignments_rejected_count}\n\n";
}
warn "Final Cytosine Methylation Report\n",'='x33,"\n";
print REPORT "Final Cytosine Methylation Report\n",'='x33,"\n";
my $total_number_of_C = $counting{total_meCHG_count}+ $counting{total_meCHH_count}+$counting{total_meCpG_count}+$counting{total_unmethylated_CHG_count}+$counting{total_unmethylated_CHH_count}+$counting{total_unmethylated_CpG_count};
warn "Total number of C's analysed:\t$total_number_of_C\n\n";
warn "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
warn "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
warn "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
warn "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
warn "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
warn "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
warn "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
warn "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
print REPORT "Total number of C's analysed:\t$total_number_of_C\n\n";
print REPORT "Total methylated C's in CpG context:\t$counting{total_meCpG_count}\n";
print REPORT "Total methylated C's in CHG context:\t$counting{total_meCHG_count}\n";
print REPORT "Total methylated C's in CHH context:\t$counting{total_meCHH_count}\n";
print REPORT "Total methylated C's in Unknown context:\t$counting{total_meC_unknown_count}\n\n";
print REPORT "Total unmethylated C's in CpG context:\t$counting{total_unmethylated_CpG_count}\n";
print REPORT "Total unmethylated C's in CHG context:\t$counting{total_unmethylated_CHG_count}\n";
print REPORT "Total unmethylated C's in CHH context:\t$counting{total_unmethylated_CHH_count}\n";
print REPORT "Total unmethylated C's in Unknown context:\t$counting{total_unmethylated_C_unknown_count}\n\n";
my $percent_meCHG;
if (($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}) > 0){
$percent_meCHG = sprintf("%.1f",100*$counting{total_meCHG_count}/($counting{total_meCHG_count}+$counting{total_unmethylated_CHG_count}));
}
my $percent_meCHH;
if (($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}) > 0){
$percent_meCHH = sprintf("%.1f",100*$counting{total_meCHH_count}/($counting{total_meCHH_count}+$counting{total_unmethylated_CHH_count}));
}
my $percent_meCpG;
if (($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}) > 0){
$percent_meCpG = sprintf("%.1f",100*$counting{total_meCpG_count}/($counting{total_meCpG_count}+$counting{total_unmethylated_CpG_count}));
}
my $percent_meC_unknown;
if (($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}) > 0){
$percent_meC_unknown = sprintf("%.1f",100*$counting{total_meC_unknown_count}/($counting{total_meC_unknown_count}+$counting{total_unmethylated_C_unknown_count}));
}
### printing methylated CpG percentage if applicable
if ($percent_meCpG){
warn "C methylated in CpG context:\t${percent_meCpG}%\n";
print REPORT "C methylated in CpG context:\t${percent_meCpG}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CpG context if value was 0\n";
}
### printing methylated C percentage in CHG context if applicable
if ($percent_meCHG){
warn "C methylated in CHG context:\t${percent_meCHG}%\n";
print REPORT "C methylated in CHG context:\t${percent_meCHG}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CHG context if value was 0\n";
}
### printing methylated C percentage in CHH context if applicable
if ($percent_meCHH){
warn "C methylated in CHH context:\t${percent_meCHH}%\n";
print REPORT "C methylated in CHH context:\t${percent_meCHH}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in CHH context if value was 0\n";
}
### printing methylated C percentage (Unknown C context) if applicable
if ($percent_meC_unknown){
warn "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
print REPORT "C methylated in unknown context (CN or CHN):\t${percent_meC_unknown}%\n";
}
else{
warn "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
print REPORT "Can't determine percentage of methylated Cs in unknown context (CN or CHN) if value was 0\n";
}
print REPORT "\n\n";
warn "\n\n";
}
sub process_single_end_fastA_file_for_methylation_call{
my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;
### this is a FastA sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
### the C->T or G->A version
### gzipped version of the infile
if ($sequence_file =~ /\.gz$/){
open (IN,"gunzip -c $sequence_file |") or die $!;
}
else{
open (IN,$sequence_file) or die $!;
}
my $count = 0;
warn "\nReading in the sequence file $sequence_file\n";
while (1) {
# last if ($counting{sequences_count} > 100);
my $identifier = <IN>;
my $sequence = <IN>;
last unless ($identifier and $sequence);
chomp $sequence;
chomp $identifier;
$identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
++$count;
if ($skip){
next unless ($count > $skip);
}
if ($upto){
last if ($count > $upto);
}
$counting{sequences_count}++;
if ($counting{sequences_count}%1000000==0) {
warn "Processed $counting{sequences_count} sequences so far\n";
}
$identifier =~ s/^>//; # deletes the > at the beginning of FastA headers
my $return = check_results_single_end (uc$sequence,$identifier);
unless ($return){
$return = 0;
}
# print the sequence to ambiguous.out if --ambiguous was specified
if ($ambiguous and $return == 2){
print AMBIG ">$identifier\n";
print AMBIG "$sequence\n";
}
# print the sequence to <unmapped.out> file if --un was specified
elsif ($unmapped and $return == 1){
print UNMAPPED ">$identifier\n";
print UNMAPPED "$sequence\n";
}
}
print "Processed $counting{sequences_count} sequences in total\n\n";
close OUT or warn "Failed to close filehandle OUT: $!\n";
print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);
}
sub process_single_end_fastQ_file_for_methylation_call{
my ($sequence_file,$C_to_T_infile,$G_to_A_infile,$pid) = @_;
### this is the Illumina sequence file; we need the actual sequence to compare it against the genomic sequence in order to make a methylation call.
### Now reading in the sequence file sequence by sequence and see if the current sequence was mapped to one (or both) of the converted genomes in either
### the C->T or G->A version
### gzipped version of the infile
if ($sequence_file =~ /\.gz$/){
open (IN,"gunzip -c $sequence_file |") or die $!;
}
else{
open (IN,$sequence_file) or die $!;
}
my $count = 0;
warn "\nReading in the sequence file $sequence_file\n";
while (1) {
my $identifier = <IN>;
my $sequence = <IN>;
my $identifier_2 = <IN>;
my $quality_value = <IN>;
last unless ($identifier and $sequence and $identifier_2 and $quality_value);
chomp $identifier;
$identifier = fix_IDs($identifier); # this is to avoid problems with truncated read ID when they contain white spaces
$identifier .= "\n";
++$count;
if ($skip){
next unless ($count > $skip);
}
if ($upto){
last if ($count > $upto);
}
$counting{sequences_count}++;
if ($counting{sequences_count}%1000000==0) {
warn "Processed $counting{sequences_count} sequences so far\n";
}
chomp $sequence;
chomp $identifier;
chomp $quality_value;
$identifier =~ s/^\@//; # deletes the @ at the beginning of Illumin FastQ headers
my $return = check_results_single_end (uc$sequence,$identifier,$quality_value);
unless ($return){
$return = 0;
}
# print the sequence to ambiguous.out if --ambiguous was specified
if ($ambiguous and $return == 2){
print AMBIG "\@$identifier\n";
print AMBIG "$sequence\n";
print AMBIG $identifier_2;
print AMBIG "$quality_value\n";
}
# print the sequence to <unmapped.out> file if --un was specified
elsif ($unmapped and $return == 1){
print UNMAPPED "\@$identifier\n";
print UNMAPPED "$sequence\n";
print UNMAPPED $identifier_2;
print UNMAPPED "$quality_value\n";
}
}
print "Processed $counting{sequences_count} sequences in total\n\n";
close OUT or warn "Failed to close filehandle OUT: $!\n";
print_final_analysis_report_single_end($C_to_T_infile,$G_to_A_infile,$pid);
if ($ambig_bam){
close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n";
}
}
sub process_fastA_files_for_paired_end_methylation_calls{
my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
### Processing the two FastA sequence files; we need the actual sequences of both reads to compare them against the genomic sequence in order to
### make a methylation call. The sequence idetifier per definition needs to be the same for a sequence pair used for paired-end mapping.
### Now reading in the sequence files sequence by sequence and see if the current sequences produced an alignment to one (or both) of the
### converted genomes (either the C->T or G->A version)
### gzipped version of the infiles
if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
open (IN1,"gunzip -c $sequence_file_1 |") or die "Failed to open gunzip -c pipe to $sequence_file_1 $!\n";
open (IN2,"gunzip -c $sequence_file_2 |") or die "Failed to open gunzip -c pipe to $sequence_file_2 $!\n";
}
else{
open (IN1,$sequence_file_1) or die $!;
open (IN2,$sequence_file_2) or die $!;
}
warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
my $count = 0;
while (1) {
# reading from the first input file
my $identifier_1 = <IN1>;
my $sequence_1 = <IN1>;
# reading from the second input file
my $identifier_2 = <IN2>;
my $sequence_2 = <IN2>;
last unless ($identifier_1 and $sequence_1 and $identifier_2 and $sequence_2);
chomp $sequence_1;
chomp $identifier_1;
chomp $sequence_2;
chomp $identifier_2;
$identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
$identifier_2 = fix_IDs($identifier_2);
++$count;
if ($skip){
next unless ($count > $skip);
}
if ($upto){
last if ($count > $upto);
}
$counting{sequences_count}++;
if ($counting{sequences_count}%1000000==0) {
warn "Processed $counting{sequences_count} sequence pairs so far\n";
}
my $orig_identifier_1 = $identifier_1;
my $orig_identifier_2 = $identifier_2;
$identifier_1 =~ s/^>//; # deletes the > at the beginning of FastA headers
my $return = check_results_paired_end (uc$sequence_1,uc$sequence_2,$identifier_1);
unless ($return){
$return = 0;
}
# print the sequences to ambiguous_1 and _2 if --ambiguous was specified
if ($ambiguous and $return == 2){
print AMBIG_1 "$orig_identifier_1\n";
print AMBIG_1 "$sequence_1\n";
print AMBIG_2 "$orig_identifier_2\n";
print AMBIG_2 "$sequence_2\n";
}
# print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
elsif ($unmapped and $return == 1){
print UNMAPPED_1 "$orig_identifier_1\n";
print UNMAPPED_1 "$sequence_1\n";
print UNMAPPED_2 "$orig_identifier_2\n";
print UNMAPPED_2 "$sequence_2\n";
}
}
warn "Processed $counting{sequences_count} sequences in total\n\n";
close OUT or die $!;
print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
sub process_fastQ_files_for_paired_end_methylation_calls{
my ($sequence_file_1,$sequence_file_2,$C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid) = @_;
### Processing the two Illumina sequence files; we need the actual sequence of both reads to compare them against the genomic sequence in order to
### make a methylation call. The sequence identifier per definition needs to be same for a sequence pair used for paired-end alignments.
### Now reading in the sequence files sequence by sequence and see if the current sequences produced a paired-end alignment to one (or both)
### of the converted genomes (either C->T or G->A version)
### gzipped version of the infiles
if ($sequence_file_1 =~ /\.gz$/ and $sequence_file_2 =~ /\.gz$/){
open (IN1,"gunzip -c $sequence_file_1 |") or die "Failed to open gunzip -c pipe to $sequence_file_1 $!\n";
open (IN2,"gunzip -c $sequence_file_2 |") or die "Failed to open gunzip -c pipe to $sequence_file_2 $!\n";
}
else{
open (IN1,$sequence_file_1) or die $!;
open (IN2,$sequence_file_2) or die $!;
}
my $count = 0;
warn "\nReading in the sequence files $sequence_file_1 and $sequence_file_2\n";
### Both files are required to have the exact same number of sequences, therefore we can process the sequences jointly one by one
while (1) {
# reading from the first input file
my $identifier_1 = <IN1>;
my $sequence_1 = <IN1>;
my $ident_1 = <IN1>; # not needed
my $quality_value_1 = <IN1>; # not needed
# reading from the second input file
my $identifier_2 = <IN2>;
my $sequence_2 = <IN2>;
my $ident_2 = <IN2>; # not needed
my $quality_value_2 = <IN2>; # not needed
last unless ($identifier_1 and $sequence_1 and $quality_value_1 and $identifier_2 and $sequence_2 and $quality_value_2);
chomp $sequence_1;
chomp $identifier_1;
chomp $sequence_2;
chomp $identifier_2;
chomp $quality_value_1;
chomp $quality_value_2;
$identifier_1 = fix_IDs($identifier_1); # this is to avoid problems with truncated read ID when they contain white spaces
$identifier_2 = fix_IDs($identifier_2);
++$count;
if ($skip){
next unless ($count > $skip);
}
if ($upto){
last if ($count > $upto);
}
$counting{sequences_count}++;
if ($counting{sequences_count}%1000000==0) {
warn "Processed $counting{sequences_count} sequence pairs so far\n";
}
my $orig_identifier_1 = $identifier_1;
my $orig_identifier_2 = $identifier_2;
$identifier_1 =~ s/^\@//; # deletes the @ at the beginning of the FastQ ID
my $return = check_results_paired_end (uc$sequence_1,uc$sequence_2,$identifier_1,$quality_value_1,$quality_value_2);
unless ($return){
$return = 0;
}
# print the sequences to ambiguous_1 and _2 if --ambiguous was specified
if ($ambiguous and $return == 2){
# seq_1
print AMBIG_1 "$orig_identifier_1\n";
print AMBIG_1 "$sequence_1\n";
print AMBIG_1 $ident_1;
print AMBIG_1 "$quality_value_1\n";
# seq_2
print AMBIG_2 "$orig_identifier_2\n";
print AMBIG_2 "$sequence_2\n";
print AMBIG_2 $ident_2;
print AMBIG_2 "$quality_value_2\n";
}
# print the sequences to unmapped_1.out and unmapped_2.out if --un was specified
elsif ($unmapped and $return == 1){
# seq_1
print UNMAPPED_1 "$orig_identifier_1\n";
print UNMAPPED_1 "$sequence_1\n";
print UNMAPPED_1 $ident_1;
print UNMAPPED_1 "$quality_value_1\n";
# seq_2
print UNMAPPED_2 "$orig_identifier_2\n";
print UNMAPPED_2 "$sequence_2\n";
print UNMAPPED_2 $ident_2;
print UNMAPPED_2 "$quality_value_2\n";
}
}
warn "Processed $counting{sequences_count} sequences in total\n\n";
close OUT or warn "Failed to close filehandle OUT: $!\n\n";
if ($ambig_bam){
close AMBIBAM or warn "Had trouble closing filehandle AMBIBAM: $!\n\n";
}
print_final_analysis_report_paired_ends($C_to_T_infile_1,$G_to_A_infile_1,$C_to_T_infile_2,$G_to_A_infile_2,$pid);
}
########################################
### BOWTIE 2 // HISAT2 // SINGLE-END ###
########################################
sub check_results_single_end{
my ($sequence,$identifier,$quality_value) = @_;
# warn "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\nCurrent sequence: $sequence\t$identifier\t$quality_value\n";
unless ($quality_value){ # FastA sequences get assigned a quality value of Phred 40 throughout
$quality_value = 'I'x(length$sequence);
}
# as of version Bowtie 2 2.0.0 beta7, when input reads are unpaired, Bowtie 2 no longer removes the trailing /1 or /2 from the read name.
# $identifier =~ s/\/[1234567890]+$//; # some sequencers don't just have /1 or /2 at the end of read IDs
# print "sequence $sequence\nid $identifier\nquality: '$quality_value'\n";
my $alignment_ambiguous = 0;
my $first_ambig_alignment; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified
my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far
my $amb_same_thread = 0; ## if a reads primary and secondary alignments have the same alignment score we set this to true.
my %alignments = ();
### reading from the Bowtie 2 output filehandles
foreach my $index (0..$#fhs){
# if ($fhs[$index]->{last_line} =~ /\d+S/){
#print "Index: $index\n";
#print "$fhs[$index]->{last_line}\n";
#print "$fhs[$index]->{last_seq_id}\n";
# }
### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
next unless ($fhs[$index]->{last_line} and defined $fhs[$index]->{last_seq_id});
### if the sequence we are currently looking at produced an alignment we are doing various things with it
# print "last seq id: $fhs[$index]->{last_seq_id} and identifier: $identifier\n";
if ($fhs[$index]->{last_seq_id} eq $identifier) {
my ($id,$flag,$mapped_chromosome,$position,$mapping_quality,$cigar,$bowtie_sequence,$qual) = (split (/\t/,$fhs[$index]->{last_line}))[0,1,2,3,4,5,9,10];
## If a sequence has no reported alignments there will be a single output line with a bit-wise flag value of 4. We can store the next alignment and move on to the next Bowtie 2 instance
if ($flag == 4){
## reading in the next alignment, which must be the next sequence
my $newline = $fhs[$index]->{fh}-> getline();
if ($newline){
chomp $newline;
my ($seq_id) = split (/\t/,$newline);
$fhs[$index]->{last_seq_id} = $seq_id;
$fhs[$index]->{last_line} = $newline;
if ($seq_id eq $identifier){
die "Sequence with ID $identifier did not produce any alignment, but next seq-ID was also $fhs[$index]->{last_seq_id}!\n";
}
next; # next instance
}
else{
# assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
$fhs[$index]->{last_seq_id} = undef;
$fhs[$index]->{last_line} = undef;
next;
}
}
# if there are one or more proper alignments we can extract the chromosome number
my $chromosome;
# warn "FLAG: $flag\nCHR: $mapped_chromosome\n";sleep(1);
if ($mapped_chromosome =~ s/_(CT|GA)_converted$//){
# warn "FLAG: $flag\nCHR: $mapped_chromosome\n";sleep(1);
$chromosome = $mapped_chromosome;
}
else{
die "Chromosome number extraction failed for $mapped_chromosome\n";
}
### We will use the optional field to determine the best alignment. Later on we extract the number of mismatches and/or indels from the CIGAR string
my ($alignment_score,$second_best,$MD_tag);
my @fields = split (/\t/,$fhs[$index]->{last_line});
foreach (11..$#fields){
if ($fields[$_] =~ /AS:i:(.*)/){
$alignment_score = $1;
}
elsif ($fields[$_] =~ /ZS:i:(.*)/){
$second_best = $1;
}
elsif ($fields[$_] =~ /MD:Z:(.*)/){
$MD_tag = $1;
}
else{
if ($bowtie2){
if ($fields[$_] =~ /XS:i:(.*)/){
$second_best = $1;
}
elsif ($fields[$_] =~ /ZS:i:(.*)/){ # HISAT2 uses ZS:i: instead of XS:i:
$second_best = $1;
}
}
}
}
my $overwrite = 0; # If we get 2 alignments to the very same position, e.g. to OT with and AS of -156 and to CTOB with and AS of 0 we need the latter to trump the former, else
# the read will be assigned to the wrong strand which may result in incorrect methylation calls.
# this was brought to our attention by Sylvain Foret (ANU Canberra), 13 April 2016
if (!defined $best_AS_so_far){
$best_AS_so_far = $alignment_score;
$overwrite++;
# warn "First alignment score, setting \$best_AS_so_far to $best_AS_so_far\n";
if ($ambig_bam){ # also setting the first_ambig_alignment
$first_ambig_alignment = $fhs[$index]->{last_line};
$first_ambig_alignment =~ s/_(CT|GA)_converted//;
# warn "$first_ambig_alignment\n"; sleep(1);
}
}
else{
if ($alignment_score >= $best_AS_so_far){ # AS are generally negative with a maximum of 0;
# 19 07 2016: changed this to >= so that equally good alignments are also added. Ambiguous alignments from different threads will be identified later on
$overwrite++;
# 22 07 2016: resetting the ambiguous score within same thread only if the current alignment is really better than the previous one
if ($alignment_score > $best_AS_so_far){
# warn "Resetting amb within thread value to 0\n";
$amb_same_thread = 0;
if ($ambig_bam){ # also setting a new first_ambig_alignment
$first_ambig_alignment = $fhs[$index]->{last_line};
$first_ambig_alignment =~ s/_(CT|GA)_converted//;
# warn "$first_ambig_alignment\n"; sleep(1);
}
}
$best_AS_so_far = $alignment_score; # 26 06 2017: moved this down so that the $amb_same_thread gets a chance to reset
# warn "Found better or equal alignment score ($alignment_score), setting \$best_AS_so_far to $best_AS_so_far\n";
}
else{
# warn "Current alignment (AS $alignment_score) isn't better than the best so far ($best_AS_so_far). Not changing anything\n";
}
}
# warn "First best alignment_score is: '$alignment_score'\n";
# warn "MD tag is: '$MD_tag'\n";
die "Failed to extract alignment score ($alignment_score) and MD tag ($MD_tag) from line $fhs[$index]->{last_line}!\n" unless (defined $alignment_score and defined $MD_tag);
if (defined $second_best){
# warn "second best alignment_score is: '$second_best'\n\n"; sleep(1);
# If the first alignment score is the same as the alignment score of the second best hit we keep a memory of this
if ($alignment_score == $second_best){
# checking to see if this read produced the best alignment
if ($alignment_score == $best_AS_so_far){ # yes this read is the best one so far, however it is ambiguous
# warn "Read is ambiguous within the same thread, or otherwise as good as the best one so far. Setting \$amb_same_thread to 1 for currently best AS: $best_AS_so_far\n";
$amb_same_thread = 1;
}
else{
# warn "This read has a worse alignments score than the best alignment so far and will be ignored even though it is ambiguous in itself\n";
}
### if there is a better alignment later on -> fine. If not, the read will get booted altogether
## need to read and discard all additional ambiguous reads until we reach the next sequence
until ($fhs[$index]->{last_seq_id} ne $identifier){
my $newline = $fhs[$index]->{fh}-> getline();
if ($newline){
chomp $newline;
my ($seq_id) = split (/\t/,$newline);
$fhs[$index]->{last_seq_id} = $seq_id;
$fhs[$index]->{last_line} = $newline;
}
else{
# assigning undef to last_seq_id and last_line and jumping to the next index (end of HISAT2 output)
$fhs[$index]->{last_seq_id} = undef;
$fhs[$index]->{last_line} = undef;
last; # break free in case we have reached the end of the alignment output
}
}
# warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
}
else{ # the next best alignment has a lower alignment score than the current read, so we can safely store the current alignment
my $alignment_location = join (":",$chromosome,$position);
### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
if ($overwrite){
$alignments{$alignment_location}->{seq_id} = $id;
$alignments{$alignment_location}->{alignment_score} = $alignment_score;
$alignments{$alignment_location}->{alignment_score_second_best} = $second_best;
$alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
$alignments{$alignment_location}->{index} = $index;
$alignments{$alignment_location}->{chromosome} = $chromosome;
$alignments{$alignment_location}->{position} = $position;
$alignments{$alignment_location}->{CIGAR} = $cigar;
$alignments{$alignment_location}->{MD_tag} = $MD_tag;
}
### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
until ($fhs[$index]->{last_seq_id} ne $identifier){
my $newline = $fhs[$index]->{fh}-> getline();
if ($newline){
chomp $newline;
my ($seq_id) = split (/\t/,$newline);
$fhs[$index]->{last_seq_id} = $seq_id;
$fhs[$index]->{last_line} = $newline;
}
else{
# assigning undef to last_seq_id and last_line and jumping to the next index (end of HISAT2 output)
$fhs[$index]->{last_seq_id} = undef;
$fhs[$index]->{last_line} = undef;
last; # break free in case we have reached the end of the alignment output
}
}
# warn "Index: $index\tThe current Seq-ID is $identifier, skipped all ambiguous sequences until the next ID which is: $fhs[$index]->{last_seq_id}\n";
}
}
else{ # there is no second best hit, so we can just store this one and read in the next sequence
my $alignment_location = join (":",$chromosome,$position);
# warn "There is no second best hit. Overwrite status: $overwrite\n";
### If a sequence aligns to exactly the same location with a perfect match twice the sequence does either not contain any C or G, or all the Cs (or Gs on the reverse
### strand) were methylated and therefore protected. Alternatively it will align better in one condition than in the other. In any case, it is not needed to overwrite
### the same positional entry with a second entry for the same location, as the genomic sequence extraction and methylation call would not be affected by this. The only
### thing which would change is the index number for the found alignment). We will continue to assign these alignments to the first indexes 0 and 1, i.e. OT and OB
if ($overwrite){
$alignments{$alignment_location}->{seq_id} = $id;
$alignments{$alignment_location}->{alignment_score} = $alignment_score;
$alignments{$alignment_location}->{alignment_score_second_best} = undef;
$alignments{$alignment_location}->{bowtie_sequence} = $bowtie_sequence;
$alignments{$alignment_location}->{index} = $index;
$alignments{$alignment_location}->{chromosome} = $chromosome;
$alignments{$alignment_location}->{position} = $position;
$alignments{$alignment_location}->{MD_tag} = $MD_tag;
$alignments{$alignment_location}->{CIGAR} = $cigar;
}
### now reading and discarding all (inferior) alignments of this sequencing read until we hit the next sequence
until ($fhs[$index]->{last_seq_id} ne $identifier){
my $newline = $fhs[$index]->{fh}-> getline();
if ($newline){
# warn "$newline\n";
chomp $newline;
my ($seq_id) = split (/\t/,$newline);
$fhs[$index]->{last_seq_id} = $seq_id;
$fhs[$index]->{last_line} = $newline;
}
else{
# assigning undef to last_seq_id and last_line and jumping to the next index (end of HISAT2 output)
$fhs[$index]->{last_seq_id} = undef;
$fhs[$index]->{last_line} = undef;
last; # break free in case we have reached the end of the alignment output
}
}
}
}
}
### If there were several equally good alignments for the best alignment score we will boot the read
if ($amb_same_thread){
$alignment_ambiguous = 1;
# warn "\$alignment_ambiguous now: $alignment_ambiguous\n";
}
else{
# warn "alignment won't be considered ambiguous. This time....\n";
}
### if the read produced several ambiguous alignments already now can returning already now. If --ambiguous or --unmapped was specified the read sequence will be printed out.
if ($alignment_ambiguous == 1){
$counting{unsuitable_sequence_count}++;
### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
# my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
# print "$ambiguous_read_output\n";
if ($ambig_bam){
# warn "Sequence is ambiguous, printing out BAM file:\n";
print AMBIBAM "$first_ambig_alignment\n";
}
if ($ambiguous){
return 2; # => exits to next sequence, and prints it out to _ambiguous_reads.fq.gz if '--ambiguous' was specified
}
elsif ($unmapped){
return 1; # => exits to next sequence, and prints it out to _unmapped_reads.fq.gz if '--unmapped' but not '--ambiguous' was specified
}
else{
return 0;
}
}
### if there was no alignment found for a certain sequence at all we continue with the next sequence in the sequence file
unless(%alignments){
$counting{no_single_alignment_found}++;
# my $unmapped_read_output = join("\t",$identifier,'4','*','0','0','*','*','0','0',$sequence,$quality_value);
# print "$unmapped_read_output\n";
if ($unmapped){
return 1; # => exits to next sequence, and prints it out to _unmapped_reads.txt if '--unmapped' was specified
}
else{
return 0; # default
}
}
#######################################################################################################################################################
### If the sequence was not rejected so far we are now looking if there is a unique best alignment among all alignment instances. If there is only one
### single best position we are going to store the alignment information in the $meth_call variable. If there are multiple hits with the same (highest)
### alignment score we are discarding the sequence altogether.
### For end-to-end alignments the maximum alignment score can be 0, each mismatch can receive penalties up to 6, and each gap receives penalties for
### opening (5) and extending (3 per bp) the gap.
#######################################################################################################################################################
my $methylation_call_params; # hash reference which will store all information we need for the methylation call
my $sequence_fails = 0; # Going to use $sequence_fails as a 'memory' if a sequence could not be aligned uniquely (set to 1 then)
### print contents of %alignments for debugging
# if (scalar keys %alignments > 1){
# print "\n******\n";
# foreach my $alignment_location (sort {$a cmp $b} keys %alignments){
# print "Loc: $alignment_location\n";
# print "ID: $alignments{$alignment_location}->{seq_id}\n";
# print "AS: $alignments{$alignment_location}->{alignment_score}\n";
# print "Seq: $alignments{$alignment_location}->{bowtie_sequence}\n";
# print "Index $alignments{$alignment_location}->{index}\n";
# print "Chr: $alignments{$alignment_location}->{chromosome}\n";
# print "pos: $alignments{$alignment_location}->{position}\n";
# print "MD: $alignments{$alignment_location}->{MD_tag}\n\n";
# }
# print "\n******\n";
# }
### if there is only 1 entry in the hash with we accept it as the best alignment
if (scalar keys %alignments == 1){
for my $unique_best_alignment (keys %alignments){
$methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$unique_best_alignment}->{bowtie_sequence};
$methylation_call_params->{$identifier}->{chromosome} = $alignments{$unique_best_alignment}->{chromosome};
$methylation_call_params->{$identifier}->{position} = $alignments{$unique_best_alignment}->{position};
$methylation_call_params->{$identifier}->{index} = $alignments{$unique_best_alignment}->{index};
$methylation_call_params->{$identifier}->{alignment_score} = $alignments{$unique_best_alignment}->{alignment_score};
$methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$unique_best_alignment}->{alignment_score_second_best};
$methylation_call_params->{$identifier}->{MD_tag} = $alignments{$unique_best_alignment}->{MD_tag};
$methylation_call_params->{$identifier}->{CIGAR} = $alignments{$unique_best_alignment}->{CIGAR};
}
}
### otherwise we are going to find out if there is a best match among the multiple alignments, or whether there are 2 or more equally good alignments (in which case
### we boot the sequence altogether
elsif (scalar keys %alignments >= 2 and scalar keys %alignments <= 4){
my $best_alignment_score;
my $best_alignment_location;
foreach my $alignment_location (sort {$alignments{$b}->{alignment_score} <=> $alignments{$a}->{alignment_score}} keys %alignments){
# print "$alignments{$alignment_location}->{alignment_score}\n";
unless (defined $best_alignment_score){
$best_alignment_score = $alignments{$alignment_location}->{alignment_score};
$best_alignment_location = $alignment_location;
# print "setting best alignment score: $best_alignment_score\n";
}
else{
### if the second best alignment has the same alignment score as the first one, the sequence will get booted
if ($alignments{$alignment_location}->{alignment_score} == $best_alignment_score){
# warn "Same alignment score, the sequence will get booted!\n";
$sequence_fails = 1;
last; # exiting after the second alignment since we know that the sequence has ambiguous alignments
}
### else we are going to store the best alignment for further processing
else{
$methylation_call_params->{$identifier}->{bowtie_sequence} = $alignments{$best_alignment_location}->{bowtie_sequence};
$methylation_call_params->{$identifier}->{chromosome} = $alignments{$best_alignment_location}->{chromosome};
$methylation_call_params->{$identifier}->{position} = $alignments{$best_alignment_location}->{position};
$methylation_call_params->{$identifier}->{index} = $alignments{$best_alignment_location}->{index};
$methylation_call_params->{$identifier}->{alignment_score} = $alignments{$best_alignment_location}->{alignment_score};
$methylation_call_params->{$identifier}->{MD_tag} = $alignments{$best_alignment_location}->{MD_tag};
$methylation_call_params->{$identifier}->{CIGAR} = $alignments{$best_alignment_location}->{CIGAR};
if (defined $alignments{$best_alignment_location}->{alignment_score_second_best} and $alignments{$best_alignment_location}-> {alignment_score_second_best} > $alignments{$alignment_location}->{alignment_score}) {
$methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$best_alignment_location}->{alignment_score_second_best};
}
else{
$methylation_call_params->{$identifier}->{alignment_score_second_best} = $alignments{$alignment_location}->{alignment_score};
}
last; # exiting after processing the second alignment since the sequence produced a unique best alignment
}
}
}
}
else{
die "There are too many potential hits for this sequence (1-4 expected, but found: ",scalar keys %alignments,")\n";;
}
### skipping the sequence completely if there were multiple alignments with the same best alignment score at different positions
if ($sequence_fails == 1){
$counting{unsuitable_sequence_count}++;
### report that the sequence has multiple hits with bitwise flag 256. We can print the sequence to the result file straight away and skip everything else
# my $ambiguous_read_output = join("\t",$identifier,'256','*','0','0','*','*','0','0',$sequence,$quality_value);
# print OUT "$ambiguous_read_output\n";
if ($ambiguous){
return 2; # => exits to next sequence, and prints it out (in FastQ format) to _ambiguous_reads.txt if '--ambiguous' was specified
}
elsif ($unmapped){
return 1; # => exits to next sequence, and prints it out (in FastQ format) to _unmapped_reads.txt if '--unmapped' but not '--ambiguous' was specified
}
else{
return 0; # => exits to next sequence (default)
}
}
### --DIRECTIONAL
### If the option --directional has been specified the user wants to consider only alignments to the original top strand or the original bottom strand. We will therefore
### discard all alignments to strands complementary to the original strands, as they should not exist in reality due to the library preparation protocol
if ($directional){
if ( ($methylation_call_params->{$identifier}->{index} == 2) or ($methylation_call_params->{$identifier}->{index} == 3) ){
# warn "Alignment rejected! (index was: $methylation_call_params->{$identifier}->{index})\n";
$counting{alignments_rejected_count}++;
return 0;
}
}
### If the sequence has not been rejected so far it has a unique best alignment
$counting{unique_best_alignment_count}++;
### Now we need to extract a genomic sequence that exactly corresponds to the reported alignment. This potentially means that we need to deal with insertions or deletions as well
extract_corresponding_genomic_sequence_single_end ($identifier,$methylation_call_params);
### check test to see if the genomic sequence we extracted has the same length as the observed sequence+2, and only then we perform the methylation call
if (length($methylation_call_params->{$identifier}->{unmodified_genomic_sequence}) != length($sequence)+2){
warn "Chromosomal sequence could not be extracted for\t$identifier\t$methylation_call_params->{$identifier}->{chromosome}\t$methylation_call_params->{$identifier}->{position}\n";
$counting{genomic_sequence_could_not_be_extracted_count}++;
return 0;
}
# Compute MAPQ value
$methylation_call_params->{$identifier}->{mapq} = calc_mapq (length($sequence), undef,
$methylation_call_params->{$identifier}->{alignment_score},
$methylation_call_params->{$identifier}->{alignment_score_second_best});
### otherwise we are set to perform the actual methylation call
if ($slam){
$methylation_call_params->{$identifier}->{methylation_call} = methylation_call_slam($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
}
else{
$methylation_call_params->{$identifier}->{methylation_call} = methylation_call($identifier,$sequence,$methylation_call_params->{$identifier}->{unmodified_genomic_sequence},$methylation_call_params->{$identifier}->{read_conversion});
}
print_bisulfite_mapping_result_single_end ($identifier,$sequence,$methylation_call_params,$quality_value);
return 0; ## if a sequence got this far we do not want to print it to unmapped or ambiguous.out
}
sub methylation_call_slam{
my ($identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion) = @_;
### splitting both the actually observed sequence and the genomic sequence up into single bases so we can compare them one by one
# warn "SLAM-SEQ CALL\n\n";sleep (1);
my @seq = split(//,$sequence_actually_observed);
my @genomic = split(//,$genomic_sequence);
# print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n";
### Creating a match-string with different characters for non-cytosine bases (disregarding mismatches here), methyl-Cs or non-methyl Cs in either
### CpG, CHH or CHG context
### SLAM-SEQ: HERE we are not interested in cytosines and their context, but simply the "methylation" state of Ts
#################################################################
### . for bases not involving cytosines ###
### X for methylated C in CHG context (was protected) ###
### x for not methylated C in CHG context (was converted) ###
### H for methylated C in CHH context (was protected) ###
### h for not methylated C in CHH context (was converted) ###
### Z for methylated C in CpG context (was protected) ###
### z for not methylated C in CpG context (was converted) ###
### U for methylated C in unknown context (was protected) ###
### u for not methylated C in unknwon context (was converted) ###
#################################################################
my @match =();
warn "length of \@seq: ",scalar @seq,"\tlength of \@genomic: ",scalar @genomic,"\n" unless (scalar @seq eq (scalar@genomic-2)); ## CHH changed to -2
my $methyl_CHH_count = 0;
my $methyl_CHG_count = 0;
my $methyl_CpG_count = 0;
my $methyl_C_unknown_count = 0;
my $unmethylated_CHH_count = 0;
my $unmethylated_CHG_count = 0;
my $unmethylated_CpG_count = 0;
my $unmethylated_C_unknown_count = 0;
# print join ("\n",$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion),"\n~~~~\n"; sleep(1);
if ($read_conversion eq 'CT'){
for my $index (0..$#seq) {
if ($seq[$index] eq $genomic[$index]) {
### For SLAM-Seq we are only interested Ts. If the was T was not converted to C the RNA was not newly synthesized
if ($genomic[$index] eq 'T') {
++$unmethylated_CpG_count;
push @match,'z';
}
else {
push @match, '.';
}
}
elsif ($seq[$index] ne $genomic[$index]) {
if ($genomic[$index] eq 'T' and $seq[$index] eq 'C') {
++$methyl_CpG_count;
push @match,'Z'; # converted C, not methylated, in CpG context
}
### all other mismatches are not of interest for a methylation call
else {
push @match,'.';
}
}
else{
die "There can be only 2 possibilities\n";
}
}
}
elsif ($read_conversion eq 'GA'){
# print join ("\n",'***',$identifier,$sequence_actually_observed,$genomic_sequence,$read_conversion,'***'),"\n";
for my $index (0..$#seq) {
if ($seq[$index] eq $genomic[$index+2]) {
### The residue can only be an A if the T on the other strand was not converted to C, i.e. not newly synthesized
if ($genomic[$index+2] eq 'A') {
++$unmethylated_CpG_count;
push @match,'z';
}
else{
push @match, '.';
}
}
elsif ($seq[$index] ne $genomic[$index+2]) {
if ($genomic[$index+2] eq 'A' and $seq[$index] eq 'G') {
++$methyl_CpG_count;
push @match,'Z'; # converted C on opposing strand, not methylated, in CpG context
}
### all other mismatches are not of interest for a methylation call
else {
push @match,'.';
}
}
else{
die "There can be only 2 possibilities\n";
}
}
}
else{
die "Strand conversion info is required to perform a methylation call\n";
}
my $methylation_call = join ("",@match);
$counting{total_meCHH_count} += $methyl_CHH_count;
$counting{total_meCHG_count} += $methyl_CHG_count;
$counting{total_meCpG_count} += $methyl_CpG_count;
$counting{total_meC_unknown_count} += $methyl_C_unknown_count;
$counting{total_unmethylated_CHH_count} += $unmethylated_CHH_count;
$counting{total_unmethylated_CHG_count} += $unmethylated_CHG_count;
$counting{total_unmethylated_CpG_count} += $unmethylated_CpG_count;
$counting{total_unmethylated_C_unknown_count} += $unmethylated_C_unknown_count;
# print "\n$sequence_actually_observed\n$genomic_sequence\n",@match,"\n$read_conversion\n\n";
return $methylation_call;
}
########################################
### BOWTIE 2 // HISAT2 // PAIRED-END ###
########################################
sub check_results_paired_end{
my ($sequence_1,$sequence_2,$identifier,$quality_value_1,$quality_value_2) = @_;
### quality values are not given for FastA files, so they are initialised with a Phred quality of 40
unless ($quality_value_1){
$quality_value_1 = 'I'x(length$sequence_1);
}
unless ($quality_value_2){
$quality_value_2 = 'I'x(length$sequence_2);
}
# print "Read ID:$identifier\nLast ID [0]: $fhs[0]->{last_seq_id}\nLast ID [1]: $fhs[1]->{last_seq_id}\nLast ID [2]: $fhs[2]->{last_seq_id}\nLast ID [3]: $fhs[3]->{last_seq_id}\n\n"; sleep(1);
my %alignments;
my $alignment_ambiguous = 0;
my $first_ambig_alignment_line1; # storing the first ambiguous alignment so it can be written out in case '--ambig_bam' was specified R1
my $first_ambig_alignment_line2; # R2
my $best_AS_so_far; ## we need to keep a memory of the best alignment score so far
my $amb_same_thread = 0; ## if a read's primary and secondary alignments have the same alignment score we set this to true.
### reading from the Bowtie 2 output filehandles
### for paired end reads we are reporting alignments to the OT strand first (index 0), then the OB strand (index 3!!), similiar to the single end way.
### alignments to the complementary strands are reported afterwards (CTOT got index 1, and CTOB got index 2).
### This is needed so that alignments which either contain no single C or G or reads which contain only protected Cs are reported to the original strands (OT and OB)
### Before the complementary strands. Remember that it does not make any difference for the methylation calls, but it will matter if alignments to the complementary
### strands are not being reported when '--directional' is specified
foreach my $index (0,3,1,2){
### skipping this index if the last alignment has been set to undefined already (i.e. end of bowtie output)
next unless ($fhs[$index]->{last_line_1} and $fhs[$index]->{last_line_2} and defined $fhs[$index]->{last_seq_id});
### if the sequence pair we are currently looking at produced an alignment we are doing various things with it
if ($fhs[$index]->{last_seq_id} eq $identifier) {
my ($id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1) = (split (/\t/,$fhs[$index]->{last_line_1}))[0,1,2,3,4,5,9,10];
my ($id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2) = (split (/\t/,$fhs[$index]->{last_line_2}))[0,1,2,3,4,5,9,10];
# print "Index: $index\t$fhs[$index]->{last_line_1}\n";
# print "Index: $index\t$fhs[$index]->{last_line_2}\n";
# print join ("\t",$id_1,$flag_1,$mapped_chromosome_1,$position_1,$mapping_quality_1,$cigar_1,$bowtie_sequence_1,$qual_1),"\n";
# print join ("\t",$id_2,$flag_2,$mapped_chromosome_2,$position_2,$mapping_quality_2,$cigar_2,$bowtie_sequence_2,$qual_2),"\n";
$id_1 =~ s/\/1$//;
$id_2 =~ s/\/2$//;
### If a sequence has no reported alignments there will be a single output line per sequence with a bit-wise flag value of 77 for read 1 (1+4+8+64), or 141 for read 2 (1+4+8+128).
### We can store the next alignment and move on to the next Bowtie 2 instance
if ($flag_1 == 77 and $flag_2 == 141){
## reading in the next alignment, which must be the next sequence
my $newline_1 = $fhs[$index]->{fh}-> getline();
my $newline_2 = $fhs[$index]->{fh}-> getline();
if ($newline_1 and $newline_2){
chomp $newline_1;
chomp $newline_2;
my ($seq_id_1) = split (/\t/,$newline_1);
my ($seq_id_2) = split (/\t/,$newline_2);
$seq_id_1 =~ s/\/1$//;
$seq_id_2 =~ s/\/2$//;
$fhs[$index]->{last_seq_id} = $seq_id_1;
$fhs[$index]->{last_line_1} = $newline_1;
$fhs[$index]->{last_line_2} = $newline_2;
# print "current sequence ($identifier) did not map, reading in next sequence\n";
# print "$index\t$fhs[$index]->{last_seq_id}\n";
# print "$index\t$fhs[$index]->{last_line_1}\n";
# print "$index\t$fhs[$index]->{last_line_2}\n";
next; # next instance
}
else{
# assigning undef to last_seq_id and last_line and jumping to the next index (end of Bowtie 2 output)
$fhs[$index]->{last_seq_id} = undef;
$fhs[$index]->{last_line_1} = undef;
$fhs[$index]->{last_line_2} = undef;
next;
}
}
### If there are one or more proper alignments we can extract the chromosome number
my ($chromosome_1,$chromosome_2);
if ($mapped_chromosome_1 =~ s/_(CT|GA)_converted$//){
$chromosome_1 = $mapped_chromosome_1;
}
else{
die "Chromosome number extraction failed for $mapped_chromosome_1\n";
}
if ($mapped_chromosome_2 =~ s/_(CT|GA)_converted$//){
$chromosome_2 = $mapped_chromosome_2;
}
else{
die "Chromosome number extraction failed for $mapped_chromosome_2\n";
}
die "Paired-end alignments need to be on the same chromosome\n" unless ($chromosome_1 eq $chromosome_2);
### We will use the optional fields to determine the best alignments. Later on we extract the number of mismatches and/or indels from the CIGAR string
my ($alignment_score_1,$alignment_score_2,$second_best_1,$second_best_2,$MD_tag_1,$MD_tag_2);
my @fields_1 = split (/\t/,$fhs[$index]->{last_line_1});
my @fields_2 = split (/\t/,$fhs[$index]->{last_line_2});