UmiBam

#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;

### This script is supposed to remove alignments to the same position in the genome which can arise by e.g. PCR amplification
### Paired-end alignments are considered a duplicate if both partner reads start and end at the exact same position
### In addition to the mapping position we are using a UMI in the read ID to discriminate alignments from two distinct molecules to the same position

### 15 06 2020
### Adapting to also work with softclipped reads (CIGAR operation S)

### 16 08 2021
### Fixing auto-detection 

### 30 09 2021
### Looking to add Hi-C mode, where R1 and R2 map to different locations

my $dedup_version   = 'v0.2.0dev';
my $copyright_dates = "2016-21";
my $last_modified   = "01 Oct 2021";


my %umi_reporting = (); # only required for Hi-C, and --detailed_UMI_report

my $help;
my $representative;
my $single;
my $paired;
my $global_single;
my $global_paired;
my $vanilla;
my $samtools_path;
my $bam = 1;
my $umi;
my $mm;
my $double_umi;
my $hic;
my $detail;
my $version;

my $command_line = GetOptions ('help' => \$help,
			       's|single' => \$global_single,
			       'p|paired' => \$global_paired,
			       'samtools_path=s' => \$samtools_path,
			       'bam' => \$bam,
			       'umi' => \$umi,
			       'double_umi' => \$double_umi,
			       'mismatches|mm=i' => \$mm, # mismatches in the UMI sequence tolerated
			       'version' => \$version,
                   'hic'     => \$hic,
                   'detailed_UMI_report'   => \$detail,
    );

die "Please respecify command line options\n\n" unless ($command_line);

if ($help){
  print_helpfile();
  exit;
}

if ($version){
  print << "VERSION";

                           UMI Deduplication for BAM files

                          Deduplicator Version: $dedup_version
              Copyright $copyright_dates Felix Krueger, Babraham Bioinformatics
                      https://github.com/FelixKrueger/Umi-Grinder


VERSION
    exit;
  }


my @filenames = @ARGV;

unless (@filenames){
  print "Please provide one or more BAM files for deduplication\n\n";
  sleep (2);
  print_helpfile();
  exit;
}


### OPTIONS

if ($hic){
    warn "\n   >>> Hi-C mode was selected: Setting global paired-end mode <<<\n\n"; sleep (1);
    $global_paired = 1; # for Hi-C mode we set paired, but need to proceed differently at the processing step
    $global_single = 0;
}

if ($double_umi){
    warn "Setting --umi as well\n";
    $umi = 1; # setting $umi as well
}

if ($detail){
    unless ($hic){
        die "The detailed UMI report is only intended for use with Hi-C data. Please re-specify!\n\n";
    }
    unless ($umi){
        die "The detailed UMI report can only be used when the option --umi is selected. Please re-specify!\n";
    }
    if ($double_umi){
         die "The detailed UMI report can only be used when the option --umi (but not --double_umi!) is selected. Please re-specify!\n";
    }
}

unless ($global_single or $global_paired){
    if ($vanilla){
		die "Please specify either -s (single-end) or -p (paired-end) for deduplication. Reading this information from the \@PG header line only works for SAM/BAM files\n\n";
    }
    warn "\nNeither -s (single-end) nor -p (paired-end) selected for deduplication. Trying to extract this information for each file separately from the \@PG line of the SAM/BAM file\n";
}

if ($global_paired){
    if ($global_single){
        die "Please select either -s for single-end files or -p for paired-end files, but not both at the same time!\n\n";
    }
    if ($vanilla){

        if ($umi){
            die "Barcode deduplication only works with BAM output\n";
        }

        warn "Processing paired-end custom Bismark output file(s):\n";
        warn join ("\t",@filenames),"\n\n";
    }
    else{
        warn "Processing paired-end Bismark output file(s) (SAM format):\n";
        warn join ("\t",@filenames),"\n\n";
    }
}
else{
    if ($vanilla){
        warn "Processing single-end custom Bismark output file(s):\n";
        warn join ("\t",@filenames),"\n\n";
    }
    else{
        warn "Processing single-end Bismark output file(s) (SAM format):\n";
        warn join ("\t",@filenames),"\n\n";
    }
}

### UMI-mismatches: Currently only allowing 1 or 2 mismatches
if (defined $mm){
    if ($mm < 1){ # setting to FALSE
	    $mm = 0;
    }
    elsif($mm > 6){
	    die "The current maximum of mismatches in the UMI is 6, please respecify...\n\n";
    }
}


### PATH TO SAMTOOLS
if (defined $samtools_path){
    # if Samtools was specified as full command
    if ($samtools_path =~ /samtools$/){
        if (-e $samtools_path){
            # Samtools executable found
        }
        else{
            die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
        }
    }
    else{
        unless ($samtools_path =~ /\/$/){
            $samtools_path =~ s/$/\//;
        }
        $samtools_path .= 'samtools';
        if (-e $samtools_path){
            # Samtools executable found
        }
        else{
            die "Could not find an installation of Samtools at the location $samtools_path. Please respecify\n";
        }
    }
}
# Check whether Samtools is in the PATH if no path was supplied by the user
else{
    if (!system "which samtools >/dev/null 2>&1"){ # STDOUT is binned, STDERR is redirected to STDOUT. Returns 0 if Samtools is in the PATH
        $samtools_path = `which samtools`;
        chomp $samtools_path;
    }
}

if ($bam){
    if (defined $samtools_path){
	    $bam = 1;
    }
    else{
        warn "No Samtools found on your system, writing out a gzipped SAM file instead\n";
        $bam = 2;
    }
}
else{
    $bam = 0;
}


if ($representative){
    warn "\nIf there are several alignments to a single position in the genome the alignment with the most representative methylation call will be chosen (this might be the most highly amplified PCR product...)\n\n";
    sleep (1);
}
elsif($umi){
    warn "\nIf the input is a multiplexed sample with several alignments to a single position in the genome, only alignments with a unique UMI will be chosen\n";
    if ($mm){
	    warn "Number of tolerated edit distance to known UMIs for any given position: $mm\n";
    }
    warn "\n";
    sleep (1);
}
else{ # default; random (=first) alignment
    warn "\nIf there are several alignments to a single position in the genome the first alignment will be chosen. Since the input files are not in any way sorted this is a near-enough random selection of reads.\n\n";
    sleep (1);
}

foreach my $file (@filenames){

    if ($global_single){
        $paired = 0;
        $single = 1;
    }
    elsif($global_paired){
        $paired = 1;
	    $single = 0;
    }

    # Testing if the file appears to be truncated, in which case we bail with a big scary warning message
    if ($file =~ /(\.bam$)/){
	    bam_isTruncated($file);
    }
    
    %umi_reporting = (); # clearing; Only required when --detailed_UMI_report was specified

    unless($global_single or $global_paired){

        warn "Trying to determine the type of mapping from the SAM header line\n"; sleep(1);

        ### if the user did not specify whether the alignment file was single-end or paired-end we are trying to get this information from the @PG header line in the SAM/BAM file
        if ($file =~ /\.gz$/){
            open (DETERMINE,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
        }
        elsif ($file =~ /\.bam$/){
            open (DETERMINE,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
        }
        else{
            open (DETERMINE,$file) or die "Unable to read from $file: $!\n";
        }
        while (<DETERMINE>){
            last unless (/^\@/);
            if ($_ =~ /^\@PG/){
                warn "found a \@PG line:\n$_";

                # Paired-end test for known aligners
			    if ($_ =~ /ID:Bismark/ or $_ =~ /ID:hisat2/ or $_ =~ /ID:bowtie2/){
                    if ($_ =~ /\s+--?1\s+/ and $_ =~ /\s+--?2\s+/){ # allowing -1 and -2 or --1 and --2
                        warn "Treating file as paired-end data (extracted from \@PG line)\n"; sleep(1);
                        $paired = 1;
                        $single = 0;
                    }
                    else{
                        warn "Treating file as single-end data (extracted from \@PG line)\n"; sleep(1);
                        $paired = 0;
                        $single = 1;
                    }
                    last; # exiting after one attempt
                }
            }
        }
  
	    close DETERMINE or warn "$!\n";
        
        unless (defined $paired){
		    die "\nUnable to detect library type automatically, please specify whether the file is single- or paired-end manually\n\n";
	    }
    }

    if ($file =~ /(\.bam$|\.sam$)/){
	    bam_isEmpty($file);
    }

    ### OPTIONS
    unless ($single or $paired){
	    die "Please specify either -s (single-end) or -p (paired-end) for deduplication, or provide a SAM/BAM file that contains the \@PG header line\n\n";
    }

    ###
    if ($paired){
        if ($hic){
            warn "Testing for positional sorting is not desirable for Hi-C data. Skipping...\n\n";
        }    
        else{
	        test_positional_sorting($file);
        }   
    }

    ### writing to a report file
    my $report = $file;

    $report =~ s/\.gz$//;
    $report =~ s/\.sam$//;
    $report =~ s/\.bam$//;
    $report =~ s/\.txt$//;
    my $detail_report = $report;
    if ($umi){
        if ($mm){
            $report =~ s/$/.UMI_${mm}mm_deduplication_report.txt/;
            $detail_report =~ s/$/.UMI_${mm}mm_detailed_UMI_report.txt/;
        }
        else{
            $report =~ s/$/.UMI_deduplication_report.txt/;
            $detail_report =~ s/$/.UMI_detailed_UMI_report.txt/;
        }
    }
    else{
    	$report =~ s/$/.deduplication_report.txt/;
    }

    open (REPORT,'>',$report) or die "Failed to write to report file to $report: $!\n\n";

    if($umi){
        if ($mm){
            deduplicate_barcoded_umi_with_mismatches($file,$mm);
        }
        else{
            deduplicate_barcoded_umi($file);
        }

        ### PRINTING DETAILED UMI REPORT; should only be available for Hi-C data
        if ($detail){
            detailed_umi_reporting($detail_report); 
        }

    }
    ### as the default option we simply write out the first read for a position and discard all others. This is the fastest option
    else{

        my %unique_seqs;
        my %positions;

        my $upos = 0; # keeping a separate counter for already covered positions

        if ($file =~ /\.gz$/){
            open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
        }
        elsif ($file =~ /\.bam$/){
            open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
        }
        else{
            open (IN,$file) or die "Unable to read from $file: $!\n";
        }

        my $outfile = $file;
        $outfile =~ s/\.gz$//;
        $outfile =~ s/\.sam$//;
        $outfile =~ s/\.bam$//;
        $outfile =~ s/\.txt$//;

        if ($bam == 1){
            $outfile =~ s/$/.deduplicated.bam/;
        }
        elsif ($bam == 2){
            $outfile =~ s/$/.deduplicated.sam.gz/;
        }
        else{
            $outfile =~ s/$/.deduplicated.sam/;
        }

        if ($bam == 1){
            open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
        }
        elsif($bam == 2){ ### no Samtools found on system. Using GZIP compression instead
            open (OUT,"| gzip -c - > $outfile") or die "Failed to write to $outfile: $!\n";
        }
        else{
	        open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
        }

        my $count = 0;
        my $unique_seqs = 0;
        my $removed = 0;

        while (<IN>){

            if ($count == 0){
                if ($_ =~ /^Bismark version:/){
                    warn "The file appears to be in the custom Bismark and not SAM format. Please see option --vanilla!\n";
                    sleep (2);
                    print_helpfile();
                    exit;
                }
            }

            ### if this was a SAM file we ignore header lines
            unless ($vanilla){
                if (/^\@\w{2}\t/){
                    print "skipping header line:\t$_";
                    print OUT "$_"; # Printing the header lines again into the de-duplicated file
                    next;
                }
            }

            ++$count;
            my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths

            my ($strand,$chr,$start,$end,$cigar);
            my $line1;

            # BAM/SAM format
            ($strand,$chr,$start,$cigar) = (split (/\t/))[1,2,3,5]; # we are assigning the FLAG value to $strand

            ### SAM single-end
            if ($single){

                if ($strand == 0 ){
                    ### read aligned to the forward strand. No action needed
                }
                elsif ($strand == 16){
                    ### read is on reverse strand
                    $start -= 1; # only need to adjust this once

                    # for InDel free matches we can simply use the M number in the CIGAR string
                    if ($cigar =~ /^(\d+)M$/){ # linear match
                        $start += $1;
                    }
                    else{
                        # parsing CIGAR string
                        my @len = split (/\D+/,$cigar); # storing the length per operation
                        my @ops = split (/\d+/,$cigar); # storing the operation
                        shift @ops; # remove the empty first element
                        die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

                        # warn "CIGAR string; $cigar\n";
                        ### determining end position of a read
                        foreach my $index(0..$#len){
                            if ($ops[$index] eq 'M'){  # standard matching bases
                                $start += $len[$index];
                                # warn "Operation is 'M', adding $len[$index] bp\n";
                            }
                            elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                # warn "Operation is 'I', next\n";
                            }
                            elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                #  warn "Operation is 'D',adding $len[$index] bp\n";
                                $start += $len[$index];
                            }
                            elsif($ops[$index] eq 'S'){ # soft-clipped bases do not affect the end position
                                # warn "Operation is 'S', next\n";
                            }
                            elsif($ops[$index] eq 'N'){ # skipped regions (e.g. splice junctions) do affect the end position
                                # warn "Operation is 'N',adding $len[$index] bp\n";
                                $start += $len[$index];
                            }
                            else{
                                die "SE BAM: Found CIGAR operations other than M, I, S, N or D: '$ops[$index]'. Not allowed at the moment\n";
                            }
                        }
                    }
                }
                $composite = join (":",$strand,$chr,$start);
            }
            elsif($paired){

                ### storing the current line
                $line1 = $_;
                
                if($hic){
                    ## Added this section 30 09 2021
                    # warn "Hi-C mode:\n";
                    
                    ### reading in the next line
                    my $line2 = <IN>;
                    chomp $line1;
                    chomp $line2;
                    
                    # warn "R1: $line1\n";
                    # warn "R2: $line2\n~~~~~~~~~~~~~~~~~\n";
                    
                    # For Hi-C mode it should be enough to work with the chromosome and start positions of both R1 and R2
                    my ($chr_r1,$start_r1) = (split (/\t/,$line1))[2,3];
                    my ($chr_r2,$start_r2) = (split (/\t/,$line2))[2,3];

                    # print "$chr_r1\t$start_r1\n$chr_r2\t$start_r2\n"; sleep(1);
                    
                    # Hi-C data essentially behaves like 2 separate single-end alignments, so we
                    # should be fine to just score chr1:start1:chr2:start2:UMI. Since we don't
                    # know whether R1 and R2 are aligned in a directional fashion, we need to test
                    # both combinations.
                
                    ### Here we take only the mapping position into consideration
                    my $composite1 = join (":",$chr_r1,$start_r1,$chr_r2,$start_r2);
                    my $composite2 = join (":",$chr_r2,$start_r2,$chr_r1,$start_r1);

                    # print ("$composite1\n$composite2\n\n      :::::\n\n"); sleep(1);

                    if ( (exists $unique_seqs{$composite1}) or (exists $unique_seqs{$composite2}) ){
                        ++$removed;
                        # warn "Exactly this read combination was present already!\n"; sleep(1);
                        
                        # If a read was found  at this position already, we record both R1/R2 combinations (and devide this number by 2 for reporting)
                        unless (exists $positions{$composite1}){
                            $positions{$composite1}++;
                            $positions{$composite2}++;
                        }
                    }
                    else{
                        print OUT "$line1\n"; # printing first  line of Hi-C output
                        print OUT "$line2\n";  # printing second line of Hi-C output
                        # print "$line1\n$line2\n"; sleep(1);

                        # If neither read combination was found so far, it should be enough to record a single one, e.g. composite1
                        $unique_seqs{$composite1}++;
                    }

                    
                }
                else{
            
                    my $flag = (split /\t/,$line1)[1];

                    # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
                    if ($flag == 99){ # 99 is paired-end, properly paired, read first in pair, mate on reverse strand

                        ### reading in the next line
                        $_ = <IN>;
                        # the only thing we need is the end position
                        ($end,my $cigar_2) = (split (/\t/))[3,5];

                        $end -= 1; # only need to adjust this once

                        # for InDel free matches we can simply use the M number in the CIGAR string
                        if ($cigar_2 =~ /^(\d+)M$/){ # linear match
                            $end += $1;
                        }
                        else{
                            # parsing CIGAR string
                            my @len = split (/\D+/,$cigar_2); # storing the length per operation
                            my @ops = split (/\d+/,$cigar_2); # storing the operation
                            shift @ops; # remove the empty first element
                            die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);

                            # warn "CIGAR string; $cigar_2\n";
                            ### determining end position of the read
                            foreach my $index(0..$#len){
                                if ($ops[$index] eq 'M'){  # standard matching bases
                                    $end += $len[$index];
                                    # warn "Operation is 'M', adding $len[$index] bp\n";
                                }
                                elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                    # warn "Operation is 'I', next\n";
                                }
                                elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                    #  warn "Operation is 'D',adding $len[$index] bp\n";
                                    $end += $len[$index];
                                }
                                elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                    # warn "Operation is 'S'\n";
                                }
                                elsif($ops[$index] eq 'N'){ # skipped regions (e.g. splice junctions) do affect the end position
                                    #  warn "Operation is 'N',adding $len[$index] bp\n";
                                    $end += $len[$index];
                                }
                                else{
                                    die "PE BAM: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
                                }
                            }
                        }
                    }
                    elsif($flag == 83){
                        # Flag 83 is paired-end, properly paired, read reverse strand, first in pair
                        # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

                        $end = $start - 1; # need to adjust this only once

                        # for InDel free matches we can simply use the M number in the CIGAR string
                        if ($cigar =~ /^(\d+)M$/){ # linear match
                            $end += $1;
                        }
                        else{
                            # parsing CIGAR string
                            my @len = split (/\D+/,$cigar); # storing the length per operation
                            my @ops = split (/\d+/,$cigar); # storing the operation
                            shift @ops; # remove the empty first element
                            die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);

                            # warn "CIGAR string; $cigar\n";
                            ### determining end position of the read
                            foreach my $index(0..$#len){
                                if ($ops[$index] eq 'M'){  # standard matching bases
                                    $end += $len[$index];
                                    # warn "Operation is 'M', adding $len[$index] bp\n";
                                }
                                elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                    # warn "Operation is 'I', next\n";
                                }
                                elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                    # warn "Operation is 'D',adding $len[$index] bp\n";
                                    $end += $len[$index];
                                }
                                elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                    # warn "Operation is 'S'\n";
                                }
                                elsif($ops[$index] eq 'N'){ # skipped regions such as splice junctions do affect the end position
                                    # warn "Operation is 'N',adding $len[$index] bp\n";
                                    $end += $len[$index];
                                }
                                else{
                                    die "PE BAM: Found CIGAR operations other than M, I, D, S or N: '$ops[$index]'. Not allowed at the moment\n";
                                }
                            }
                        }

                        ### reading in the next line
                        $_ = <IN>;
                        # the only thing we need is the start position
                        ($start) = (split (/\t/))[3];
                    }
                    else{
                        die "So far unhandled FLAG value: $flag. Please update UMI-BAM\n\n";
                    }
                    $composite = join (":",$strand,$chr,$start,$end);
                }
            }
            else{
                die "Input must be single or paired-end\n";
            }
          

            unless ($hic){ # this has already been carried out at this step
                if (exists $unique_seqs{$composite}){
                    ++$removed;
                    unless (exists $positions{$composite}){
                        $positions{$composite}++;
                    }
                }
                else{
                    if ($paired){
                        print OUT "$line1"; # printing first paired-end line for SAM output
                    }
                    print OUT "$_"; # printing single-end SAM alignment or second paired-end line
                    $unique_seqs{$composite}++;
                }
            }
        }

        my $percentage;
        my $percentage_leftover;
        my $leftover = $count - $removed;

        unless ($count == 0){
            $percentage = sprintf("%.2f",$removed/$count*100);
            $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
        }
        else{
            $percentage = 'N/A';
            $percentage_leftover = 'N/A';
        }

        warn "\nTotal number of alignments analysed in $file:\t$count\n";
        warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
        if ($hic){
            warn "Duplicated Hi-C alignments were found at: \t",(scalar keys %positions)/2," different positional combinations\n\n";
        }
        else{
            warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
        }
        warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

       
        print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
        print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
        if ($hic){
            print REPORT "Duplicated Hi-C alignments were found at: \t",(scalar keys %positions)/2," different positional combinations\n\n";
        }
        else{
            print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
        }
        print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
    }

    close OUT or warn "Failed to close output filehandle: $!\n";
    close REPORT or warn "Failed to close report filehandle: $!\n";

}


sub deduplicate_barcoded_umi{

    my $file = shift;
    warn "Running in >>> UMI-mode <<< (no mismatches in UMI tolerated)\n\n"; sleep(1);
    my %unique_seqs;
    my %positions;

    if ($file =~ /\.gz$/){
    	open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
    }
    elsif ($file =~ /\.bam$/){
	    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
    	open (IN,$file) or die "Unable to read from $file: $!\n";
    }

    my $outfile = $file;
    $outfile =~ s/\.gz$//;
    $outfile =~ s/\.sam$//;
    $outfile =~ s/\.bam$//;
    $outfile =~ s/\.txt$//;

    if ($bam == 1){
	$outfile =~ s/$/.UMI_deduplicated.bam/;
    }
    else{
	$outfile =~ s/$/.UMI_deduplicated.sam/;
    }

    if ($bam == 1){
	    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
    }
    else{
	    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
    }

    my $count = 0;
    my $unmapped = 0;
    my $unique_seqs = 0;
    my $removed = 0;

    while (<IN>){

        ### if this was a SAM file we ignore header lines
        if (/^\@\w{2}\t/){
      #         warn "skipping SAM header line:\t$_";
            print OUT; # Printing the header lines again into the de-duplicated file
            next;
        }

        my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
        ### in this UMI mode we also store the read barcode (UMI) as additional means of assisting the deduplication
        ### in effect the $composite string looks like this (separated by ':'):

        ### FLAG:chromosome:start->{barcode}

        my $end;
        my $line1;

        # SAM format
        my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand

        if ($single){
            if($strand & 0x4){
            ++$unmapped;
            # warn "Sequence unmapped -> skipping\n";
            next;
            }
        }

	    ++$count;

        my $umi_seq;

        if ($double_umi){
            $id =~ /:R1:(\w+):R2:(\w+):.*$/; # adapting for Tom's barcodes, possibly subject to change
            my $umi1 = $1;
            my $umi2 = $2;
            $umi_seq = $umi1.$umi2;
        }
        else{
            $id =~ /.*:(.+)$/;
            $umi_seq = $1;
        }

        if ($paired){
            # warn "paired-end mode\n"
        }
        # warn "$umi_seq\n"; sleep(1);

        if ($umi_seq){
            if ($umi_seq =~ /[^GATCNgatcn\+]/){
                # warn "UMI was: $umi_seq\nLine was: $_\n";
                die "Failed to extract a UMI that looks like a DNA sequence: $umi_seq (last element of each read ID needs to be the UMI sequence, e.g. ':CATGAT'\n\n";
            }
        }
        else{
            die "Failed to extract a UMI from the read ID (last element of each read ID needs to be the UMI sequence, e.g. ':CATGAT'\n\n";
        }

        ### SAM single-end
        if ($single){

            if ($strand == 0 ){
            ### read aligned to the forward strand. No action needed
            }
            elsif ($strand == 16){
            ### read is on reverse strand

            ### ignoring reverse reads for the moment

            $start -= 1; # only need to adjust this once

            # for InDel free matches we can simply use the M number in the CIGAR string
            if ($cigar =~ /^(\d+)M$/){ # linear match
                $start += $1;
            }
            else{
                # parsing CIGAR string
                my @len = split (/\D+/,$cigar); # storing the length per operation
                my @ops = split (/\d+/,$cigar); # storing the operation
                shift @ops; # remove the empty first element
                die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

                # warn "CIGAR string; $cigar\n";
                ### determining end position of a read
                foreach my $index(0..$#len){
                if ($ops[$index] eq 'M'){  # standard matching bases
                    $start += $len[$index];
                    # warn "Operation is 'M', adding $len[$index] bp\n";
                }
                elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                    # warn "Operation is 'I', next\n";
                }
                elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                    #  warn "Operation is 'D', adding $len[$index] bp\n";
                    $start += $len[$index];
                }
                elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                    # warn "Operation is 'S' (SE BAM (UMI mode))\n";
                }
                elsif ($ops[$index] eq 'N'){  # splice junctions are similar to deletions and do count towards the end position
                    # warn "Operation is 'N', adding $len[$index] bp\n";
                    $start += $len[$index];
                }
                else{
                    die "SE BAM (UMI mode): Found CIGAR operations other than M, I, S, N or D: '$ops[$index]'. Not allowed at the moment\n";
                }
                }
            }
            }

            ### Here we take the barcode sequence into consideration
            $composite = join (":",$strand,$chr,$start,$umi_seq);
            # warn "$composite\n"; sleep(1);
        }
        elsif($paired){

            ### storing the current line
            $line1 = $_;
            
            if($hic){
                ## Added this section 30 09 2021
                # warn "Hi-C mode:\n";
                
                ### reading in the next line
                my $line2 = <IN>;
                chomp $line1;
                chomp $line2;
                
                # warn "R1: $line1\n";
                # warn "R2: $line2\n~~~~~~~~~~~~~~~~~\n";
                
                # For Hi-C mode it should be enough to work with the chromosome and start positions of both R1 and R2
                my ($chr_r1,$start_r1) = (split (/\t/,$line1))[2,3];
                my ($chr_r2,$start_r2) = (split (/\t/,$line2))[2,3];

                # print "$chr_r1\t$start_r1\n$chr_r2\t$start_r2\n"; sleep(1);
                
                # Hi-C data essentially behaves like 2 separate single-end alignments, so we
                # should be fine to just score chr1:start1:chr2:start2:UMI. Since we don't
                # know whether R1 and R2 are aligned in a directional fashion, we need to test
                # both combinations.
            
               
                ### Here we take the barcode sequence into consideration
                my $composite1 = join (":",$chr_r1,$start_r1,$chr_r2,$start_r2,$umi_seq);
                my $composite2 = join (":",$chr_r2,$start_r2,$chr_r1,$start_r1,$umi_seq);
                
                my $comp_pos1 = join (":",$chr_r1,$start_r1,$chr_r2,$start_r2);
                my $comp_pos2 = join (":",$chr_r2,$start_r2,$chr_r1,$start_r1);
                # print ("$composite1\n$composite2\n\n      :::::\n\n"); sleep(1);

                if (exists $unique_seqs{$composite1} ){
                    ++$removed;
                    # warn "Exactly this read combination was present already\n";
                    
                    # If a read was found  at this position already, we record both R1/R2 combinations (and devide this number by 2 for reporting)
                    unless (exists $positions{$comp_pos1}){
                        $positions{$comp_pos1}++;
                        $positions{$comp_pos2}++;
                    }
                }
                elsif(exists $unique_seqs{$composite2}){
                    ++$removed;
                    # warn "Exactly this read combination was present already\n";

                    # If a read was found  at this position already, we record both R1/R2 combinations (and devide this number by 2 for reporting)
                    unless (exists $positions{$comp_pos1}){
                        $positions{$comp_pos1}++;
                        $positions{$comp_pos2}++;
                    }
                }
                else{
                    print OUT "$line1\n"; # printing first  line of Hi-C output
                    print OUT "$line2\n";  # printing second line of Hi-C output
                    # print "$line1\n$line2\n"; sleep(1);
                    
                    # If neither read combination was found so far, it should be enough to record a single one, e.g. composite1
                    $unique_seqs{$composite1}++;
                }

                # Optional detailed UMI report
                if ($detail){
                    push @{$umi_reporting{$umi_seq}},"${chr_r1}:${start_r1}:::${chr_r2}:${start_r2}";
                }
            }
            else{
                my $flag = (split /\t/,$line1)[1];
            
                # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
                if ($flag == 99){ # 99 is paired-end, properly paired, read first in pair, mate on reverse strand

                    ### reading in the next line
                    $_ = <IN>;
                    # the only thing we need is the end position
                    ($end,my $cigar_2) = (split (/\t/))[3,5];

                    $end -= 1; # only need to adjust this once

                    # for InDel free matches we can simply use the M number in the CIGAR string
                    if ($cigar_2 =~ /^(\d+)M$/){ # linear match
                        $end += $1;
                    }
                    else{
                        # parsing CIGAR string
                        my @len = split (/\D+/,$cigar_2); # storing the length per operation
                        my @ops = split (/\d+/,$cigar_2); # storing the operation
                        shift @ops; # remove the empty first element
                        die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);

                        # warn "CIGAR string; $cigar_2\n";
                        ### determining end position of the read
                        foreach my $index(0..$#len){
                            if ($ops[$index] eq 'M'){  # standard matching bases
                                $end += $len[$index];
                                # warn "Operation is 'M', adding $len[$index] bp\n";
                            }
                            elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                # warn "Operation is 'I', next\n";
                            }
                            elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                #  warn "Operation is 'D',adding $len[$index] bp\n";
                                $end += $len[$index];
                            }
                            elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                # warn "Operation is 'S' (PE BAM (UMI mode))\n";
                            }
                            elsif($ops[$index] eq 'N'){ # skipped regions (e.g. splice junctions) do affect the end position
                                            #  warn "Operation is 'N',adding $len[$index] bp\n";
                                            $end += $len[$index];
                                        }
                            else{
                                die "PE BAM (UMI mode): Found CIGAR operations other than M, I, S, N or D: '$ops[$index]'. Not allowed at the moment\n";
                            }
                        }
                    }
                    $composite = join (":",$strand,$chr,$start,$end,$umi_seq);
                }
                elsif($flag == 83){
                    # Flag 83 is paired-end, properly paired, read reverse strand, first in pair
                    # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

                    $end = $start - 1; # need to adjust this only once

                    # for InDel free matches we can simply use the M number in the CIGAR string
                    if ($cigar =~ /^(\d+)M$/){ # linear match
                        $end += $1;
                    }
                    else{
                        # parsing CIGAR string
                        my @len = split (/\D+/,$cigar); # storing the length per operation
                        my @ops = split (/\d+/,$cigar); # storing the operation
                        shift @ops; # remove the empty first element
                        die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);

                        # warn "CIGAR string; $cigar\n";
                        ### determining end position of the read
                        foreach my $index(0..$#len){
                            if ($ops[$index] eq 'M'){  # standard matching bases
                                $end += $len[$index];
                                # warn "Operation is 'M', adding $len[$index] bp\n";
                            }
                            elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                # warn "Operation is 'I', next\n";
                            }
                            elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                # warn "Operation is 'D',adding $len[$index] bp\n";
                                $end += $len[$index];
                            }
                            elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                # warn "Operation is 'S' (PE BAM (UMI mode))\n";
                            }
                            elsif($ops[$index] eq 'N'){ # skipped regions such as splice junctions do affect the end position
                                            # warn "Operation is 'N',adding $len[$index] bp\n";
                                            $end += $len[$index];
                                        }
                            else{
                                die "PE BAM (UMI mode): Found CIGAR operations other than M, I, N, S or D: '$ops[$index]'. Not allowed at the moment\n";
                            }
                        }
                    }

                    ### reading in the next line
                    $_ = <IN>;
                    # the only thing we need is the start position
                    ($start) = (split (/\t/))[3];

                    ### Here we take the barcode sequence into consideration
                    $composite = join (":",$strand,$chr,$start,$end,$umi_seq);
                }
                ### TODO: Need to add support for unmapped FLAGs! 07 11 2016
                else{
                    die "So far unhandled FLAG: $flag. Please update UMI-Bam\n\n";
                }
            }
        }
        else{
            die "Input must be single or paired-end\n";
        }
        #warn "$composite\n";
        unless ($hic){ # we have already carried this out for Hi-C data
            if (exists $unique_seqs{$composite}){
                ++$removed;
                unless (exists $positions{$composite}){
                    $positions{$composite}++;
                }
            }
            else{
                if ($paired){
                print OUT $line1; # printing first paired-end line for SAM output
                }
                print OUT; # printing single-end SAM alignment or second paired-end line
                $unique_seqs{$composite}++;
            }
        }
    }

    my $percentage;
    my $percentage_leftover;
    my $leftover = $count - $removed;

    unless ($count == 0){
        $percentage = sprintf("%.2f",$removed/$count*100);
        $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
    }
    else{
        $percentage = 'N/A';
        $percentage_leftover = 'N/A';
   }


    warn "\nTotal number of alignments analysed in $file:\t$count\n";
    warn "Total number of sequences were unaligned:\t$unmapped\n";
    warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    if ($hic){
        warn "Duplicated Hi-C alignments were found at: \t",(scalar keys %positions)/2," different positional combinations\n\n";
    }
    else{
        warn "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    }
    warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

    print REPORT "\nTotal number of alignments analysed in $file:\t$count\n";
    print REPORT "Total number of sequences were unaligned:\t$unmapped\n";
    print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    if ($hic){
        print REPORT "Duplicated Hi-C alignments were found at: \t",(scalar keys %positions)/2," different positional combinations\n\n";
    }
    else{
        print REPORT "Duplicated alignments were found at:\t",scalar keys %positions," different position(s)\n\n";
    }
    print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";
}

sub bam_isEmpty{

    # Dies if the input file yields no data at all; otherwise returns 1.
    #
    # Argument: path of the alignment file. Names ending in '.bam' are streamed
    # through "$samtools_path view"; anything else is read directly.

    my $file = shift;

    my $fh;
    if ($file =~ /\.bam$/){
        open ($fh,"$samtools_path view $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
        # 3-argument open: the file name is never interpreted as an open mode or a command
        open ($fh,'<',$file) or die "Unable to read from $file: $!\n";
    }

    # One line is enough. Testing definedness (not truth) so that a file whose only
    # content is the string '0' is still recognised as containing data.
    my $first_line = <$fh>;

    # When we stop reading a pipe early the child may exit non-zero; close() then
    # returns false with $! unset. Only report genuine system errors.
    unless (close $fh){
        warn "$!\n" if $!;
    }

    unless (defined $first_line){
        die "\n### File appears to be empty, terminating deduplication process. Please make sure the input file has not been truncated. ###\n\n";
    }

    return 1;
}

sub bam_isTruncated{

    # Inspects the first lines produced by "$samtools_path view" (stderr merged into
    # stdout) and dies if Samtools reports a missing EOF marker or a truncated file.
    #
    # Argument: path of the BAM file to check.

    my $file = shift;
    warn "Checking file >>$file<< for signs of file truncation...\n";

    open (TRUNC,"$samtools_path view 2>&1 $file |") or die "Unable to read from BAM file $file: $!\n"; # 2>&1 redirects stderr to stdout, so we should be able to capture problems

    my $count = 0;
    while (<TRUNC>){
        chomp;
        ++$count;
        # errors tend to start with a [], I have seen e.g.:
        # [W::bam_hdr_read] EOF marker is absent. The input is probably truncated.
        if ($_ =~ /^\[/){
            # This has to be an alternation. The former character class [EOF|truncated]
            # matched any single one of those letters, so practically every bracketed
            # Samtools message was treated as a truncation error.
            if ($_ =~ /EOF|truncated/){
                die "Captured error message: '$_'\n\n[ERROR] The file appears to be truncated, please ensure that there were no errors while copying the file!!! Exiting...\n\n";
            }
        }
        last if ($count == 10); 	# this should be enough
    }
    close TRUNC or warn "$!\n";
}


sub print_helpfile{

    # Prints the usage/help text to STDOUT. Takes no arguments; reads the
    # file-level variable $last_modified for the closing line.

    print "\n",'='x111,"\n";
    print "\nThis script is supposed to remove alignments to the same position in the genome from \nboth single and paired-end BAM files, which can arise by e.g. excessive PCR amplification. If sequences align\nto the same genomic position but on different strands they will be scored individually.\n\nIn the default mode, the first alignment to a given position will be used\n(as the alignments are not ordered in any way this is also near enough random).\n\n";
    print "For single-end alignments only the start coordinate of a read will be used for deduplication.\n\n";
    print "For paired-end alignments the start-coordinate of the first read and the end coordinate of the second\nread will be used for deduplication.\n\n";
    print "UmiBam accepts BAM files with CIGAR operations M (match), D (deletion), I (insertion), N (splice-junction) and S (soft-clipping).\n\n";
    print "*** Please note that for paired-end BAM files the deduplication script expects Read1 and Read2 to\nfollow each other in consecutive lines! If the file has been sorted by position make sure that you resort it\nby read name first (e.g. using samtools sort -n)  ***\n\n";

    print '='x111,"\n\n";
    print ">>> USAGE: UmiBam [options] filename(s) <<<\n\n";

    print "-s/--single\t\tdeduplicate single-end BAM files (default format: SAM)\n";
    print "-p/--paired\t\tdeduplicate paired-end BAM files (default format: SAM)\n\n";
    # NOTE(review): no 'vanilla' entry is visible in the GetOptions call at the top of
    # the file -- confirm whether this option is still supported before advertising it.
    print "--vanilla\t\tThe input file is in the old custom Bismark format and not in SAM format\n\n";
    print "--umi\t\t\tIn addition to chromosome, start position and orientation this will also take a potential UMI into\n                        consideration while deduplicating. The barcode needs to be the last element of the read ID and separated\n                        by a ':', e.g.: MISEQ:14:000000000-A55D0:1:1101:18024:2858_1:N:0:CTCCT\n\n";
    print "--double_umi\t\tIf the file was double-barcoded this mode extracts both Read 1 and Read 2 barcodes for the UMI\n\t\t\tdeduplication. In its current implementation the two UMIs are expected to be in a format like this:\n\t\t\tHWI-D00436:267:C71A4ANXX:5:1102:1531:82511_1:N:0:_TCTCACGG:R1:CCAACCTA:R2:TATGGGGT:F1:CAGT:F2:CAGT.\n\t\t\tThe barcodes following R1: and R2: are being used, here CCAACCTA and TATGGGGT.\n\n";

    print "--hic\t\t\tAssumes that consecutive reads in a BAM file are paired Hi-C reads, e.g. from HiCUP. Also sets --paired.\n";
    print "\t\t\tDeduplication is based on the chromosome and starting position of both R1 and R2, in combinations\n";
    print "\t\t\tchr_R1:start_R1:chr_R2:start_R2 and chr_R2:start_R2:chr_R1:start_R1. If --umi is selected as well,\n\t\t\tthe UMI sequence is used in addition to that (e.g. chr_R1:start_R1:chr_R2:start_R2:UMI)\n\n";

    print "--detailed_UMI_report\tIn Hi-C mode, this will print an additional tab-delimited text report listing ALL\n\t\t\tfound UMIs in descending order, with R1 and R2 locations in the following format:\n\t\t\tUMI // count // R1 chrom // R1 pos // R2 chrom // R2 pos\n\n";

    print "--mm/--mismatches <INT>\tNumber of mismatches tolerated in the UMI. If a sequence has the same edit (Hamming) distance to\n\t\t\tseveral different UMIs the read will be regarded as a duplicate and discarded. Currently allowed\n\t\t\tmaximum of --mm is 6. Default: 0.\n\n";
    print "--bam\t\t\tThe output will be written out in BAM format instead of the default SAM format. This script will\n\t\t\tattempt to use the path to Samtools that was specified with '--samtools_path', or, if it hasn't\n\t\t\tbeen specified, attempt to find Samtools in the PATH. If no installation of Samtools can be found,\n\t\t\tthe SAM output will be compressed with GZIP instead (yielding a .sam.gz output file)\n\n";
    print "--samtools_path\t\tThe path to your Samtools installation, e.g. /home/user/samtools/. Does not need to be specified\n\t\t\texplicitly if Samtools is in the PATH already\n\n";
    print "--version\t\tPrint version information and exit\n";

    print '='x111,"\n\n";

    print "This script was last modified on $last_modified\n\n";
}


sub test_positional_sorting{

    # Sanity check for paired-end input: Read 1 and Read 2 of a pair must follow each
    # other on consecutive lines. Dies if the header declares coordinate sorting or if
    # two consecutive records (within the first ~100000) carry different read IDs.
    #
    # Argument: path of a SAM file, a gzipped SAM file, or a BAM file (BAM requires
    # $samtools_path to be set).

    my $filename = shift;

    print "\nNow testing Bismark result file $filename for positional sorting (which would be bad...)\t";
    sleep(1);

    if ($filename =~ /\.gz$/) {
        open (TEST,"gunzip -c $filename |") or die "Can't open gzipped file $filename: $!\n";
    }
    elsif ($filename =~ /bam$/ ||  isBam($filename) ){ ### this would allow to read BAM files that do not end in *.bam
        if ($samtools_path){
            open (TEST,"$samtools_path view -h $filename |") or die "Can't open BAM file $filename: $!\n";
        }
        else{
            die "Sorry couldn't find an installation of Samtools. Either specifiy an alternative path using the option '--samtools_path /your/path/', or use a SAM file instead\n\n";
        }
    }
    else {
        # 3-argument open: the file name is never interpreted as an open mode or a command
        open (TEST,'<',$filename) or die "Can't open file $filename: $!\n";
    }

    my $count = 0;

    while (<TEST>) {
        if (/^\@/) {
            # The sort order is declared by the SO: tag of the @HD header line. There is
            # no '@SO' header record, so the former test for /^\@SO/ could never fire.
            if (/^\@HD\t(?:.*\t)?SO:coordinate(?:\t|$)/) {
                die "SAM/BAM header line '$_' indicates that the Bismark aligment file has been sorted by chromosomal positions which is is incompatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
            }
            next;
        }
        $count++;

        last if ($count > 100000); # else we test the first 100000 sequences if they start with the same read ID

        my ($id_1) = (split (/\t/));

        ### reading the next line which should be read 2
        my $line_2 = <TEST>;
        last unless (defined $line_2); # odd number of records: nothing left to compare against
        my ($id_2) = (split (/\t/,$line_2));
        last unless ($id_2);
        ++$count;

        ### ids are the same
        next if ($id_1 eq $id_2);

        ### in previous versions of Bismark we appended /1 and /2 to the read IDs for easier eyeballing which read is which. These tags need to be removed first
        my $id_1_trunc = $id_1;
        $id_1_trunc =~ s/\/1$//;
        $id_1_trunc =~ s/_\d:N:.*$//; # truncating to 1:N:0
        my $id_2_trunc = $id_2;
        $id_2_trunc =~ s/\/2$//;
        $id_2_trunc =~ s/_\d:N:.*$//; # truncating to 3:N:0

        unless ($id_1_trunc eq $id_2_trunc){
            die "\nThe IDs of Read 1 ($id_1) and Read 2 ($id_2) are not the same. This might be a result of sorting the paired-end SAM/BAM files by chromosomal position which is not compatible with correct methylation extraction. Please use an unsorted file instead (e.g. use samtools sort -n)\n\n";
        }
    }

    # Release the handle (and reap the gunzip/samtools child). The return value is
    # deliberately ignored: closing a pipe before the child has finished writing
    # reports a non-zero exit status, which is expected here and not an error.
    close TEST;

    ### If it hasen't died so far then it seems the file is in the correct Bismark format (read 1 and read 2 of a pair directly following each other)
    warn "...passed!\n";
    sleep(1);

}

sub isBam{

    # Returns 1 if the file, when streamed through 'gunzip -c', starts with the
    # characters 'BAM', and 0 otherwise. This catches BAM files that do not carry a
    # *.bam extension (e.g. as produced by Galaxy).

    my $filename = shift;

    open (DISGUISE,"gunzip -c $filename |") or die "Failed to open filehandle DISGUISE for $filename\n\n";

    ### only the very first line of the decompressed stream is inspected
    my $first_line = <DISGUISE>;
    # warn "BAM in disguise: $first_line\n\n";

    my $looks_like_bam = 0;
    if ($first_line and $first_line =~ /^BAM/){
        $looks_like_bam = 1;
    }

    close (DISGUISE) or warn "Had trouble closing filehandle BAM in disguise: $!\n";
    return $looks_like_bam;
}


sub get_edit_distance {

    # Position-wise comparison of two sequences.
    #
    # Arguments: ($seq, $umi_seq)
    # Returns:   length($umi_seq) minus the number of positions at which both
    #            sequences carry the same base. An 'N' in $umi_seq never counts as a
    #            match, and positions of $umi_seq beyond the end of $seq are not
    #            compared, so both add to the returned distance.

    my ($seq,$umi_seq) = @_;

    my $umi_length = length($umi_seq);
    my $seq_length = length($seq);

    # only positions present in both sequences can be compared
    my $comparable = $umi_length < $seq_length ? $umi_length : $seq_length;

    my $identical = 0;
    foreach my $pos (0..$comparable-1) {
        my $umi_base = substr($umi_seq,$pos,1);
        next if ($umi_base eq 'N');   # ambiguous base, not scored as a match
        ++$identical if ($umi_base eq substr($seq,$pos,1));
    }

    return $umi_length - $identical;
}


sub deduplicate_barcoded_umi_with_mismatches{

    my ($file,$mm) = @_;

    my %unique_seqs;
    my %positions;
      
    if ($file =~ /\.gz$/){
	    open (IN,"gunzip -c $file |") or die "Unable to read from gzipped file $file: $!\n";
    }
    elsif ($file =~ /\.bam$/){
	    open (IN,"$samtools_path view -h $file |") or die "Unable to read from BAM file $file: $!\n";
    }
    else{
	    open (IN,$file) or die "Unable to read from $file: $!\n";
    }

    my $outfile = $file;
    $outfile =~ s/\.gz$//;
    $outfile =~ s/\.sam$//;
    $outfile =~ s/\.bam$//;
    $outfile =~ s/\.txt$//;

    if ($bam == 1){
	    $outfile =~ s/$/.UMI_${mm}mm_deduplicated.bam/;
    }
    else{
	    $outfile =~ s/$/.UMI_${mm}mm_deduplicated.sam/;
    }

    if ($bam == 1){
	    open (OUT,"| $samtools_path view -bSh 2>/dev/null - > $outfile") or die "Failed to write to $outfile: $!\n";
    }
    else{
	    open (OUT,'>',$outfile) or die "Unable to write to $outfile: $!\n";
    }
    warn "UMI deduplication mode allowing $mm mismatches\n\n"; sleep(3);

    my $count = 0;
    my $unique_seqs = 0;
    my $removed = 0;
    my @locations; # needed for Hi-C fragment sorting

    while (<IN>){

        ### if this was a SAM file we ignore header lines
        if (/^\@\w{2}\t/){
            # warn "skipping SAM header line:\t$_";
            print OUT; # Printing the header lines again into the de-duplicated file
            next;
        }

        my $composite; # storing positional data. For single end data we are only using the start coordinate since the end might have been trimmed to different lengths
        ### in this UMI mode we also store the read barcode (UMI) as additional means of assisting the deduplication
        ### in effect the $composite string looks like this (separated by ':'):

        ### FLAG:chromosome:start->{barcode}

        my $end;
        my $line1;

        # SAM format
        my ($id,$strand,$chr,$start,$cigar) = (split (/\t/))[0,1,2,3,5]; # we are assigning the FLAG value to $strand

        my $umi_seq;

        if ($double_umi){
            $id =~ /:R1:(\w+):R2:(\w+):.*$/; # adapting for Tom's barcodes
            my $umi1 = $1;
            my $umi2 = $2;
            $umi_seq = $umi1.$umi2;
        }
        else{
            $id =~ /.*:(.+)$/; # taking the last element after the :
            $umi_seq = $1;
        }
        # warn "$umi_seq\n"; 	 sleep(1);

        if ($umi_seq){
            if ($umi_seq =~ /[^GATCNgatcn\+]/){
                die "Failed to extract a UMI that looks like a DNA sequence: $umi (last element of each read ID needs to be the UMI sequence, e.g. ':CATGAT'\n\n";
            }
        }
        else{
            die "Failed to extract a UMI from the read ID (last element of each read ID needs to be the UMI sequence, e.g. ':CATGAT'\n\n";
        }

        ### SAM single-end
        if ($single){

            if ($strand == 0 ){
                ### read aligned to the forward strand. No action needed
            }
            elsif ($strand == 16){
                ### read is on reverse strand

                ### ignoring reverse reads for the moment

                $start -= 1; # only need to adjust this once

                # for InDel free matches we can simply use the M number in the CIGAR string
                if ($cigar =~ /^(\d+)M$/){ # linear match
                    $start += $1;
                }
                else{
                    # parsing CIGAR string
                    my @len = split (/\D+/,$cigar); # storing the length per operation
                    my @ops = split (/\d+/,$cigar); # storing the operation
                    shift @ops; # remove the empty first element
                    die "CIGAR string contained a non-matching number of lengths and operations\n" unless (scalar @len == scalar @ops);

                    # warn "CIGAR string; $cigar\n";
                    ### determining end position of a read
                    foreach my $index(0..$#len){
                        if ($ops[$index] eq 'M'){  # standard matching bases
                            $start += $len[$index];
                            # warn "Operation is 'M', adding $len[$index] bp\n";
                        }
                        elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                            # warn "Operation is 'I', next\n";
                        }
                        elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                            #  warn "Operation is 'D', adding $len[$index] bp\n";
                            $start += $len[$index];
                        }
                        elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                            # warn "Operation is 'S'\n";
                        }
                        elsif ($ops[$index] eq 'N'){  # splice junctions are similar to deletions and do count towards the end position
                            # warn "Operation is 'N', adding $len[$index] bp\n";
                            $start += $len[$index];
                        }
                        else{
                            die "SE BAM (mismatch_mode): Found CIGAR operations other than M, I, S, N or D: '$ops[$index]'. Not allowed at the moment\n";
                        }
                    }
                }
            }

            ### Here we take the barcode sequence into consideration
            $composite = join (":",$strand,$chr,$start);
            # warn "$composite\n"; sleep(1);

            $count++;
            if (exists $positions{$composite}->{$umi_seq}){
            # warn "Position $composite with UMI $umi_seq already existed - skipping...\n";
            ++$removed;
            }
            else{
            $positions{$composite}->{$umi_seq}->{alignment} = $_;
            # warn "Position $composite with UMI $umi_seq is new - adding...\n";
            }

        }
        elsif($paired){

            ### storing the current line
            $line1 = $_;
            if($hic){
                ## Added this section 30 09 2021
                # warn "Hi-C mode:\n";
                
                ### reading in the next line
                my $line2 = <IN>;
                chomp $line1;
                chomp $line2;
                
                # warn "R1: $line1\n";
                # warn "R2: $line2\n~~~~~~~~~~~~~~~~~\n";
                
                # For Hi-C mode it should be enough to work with the chromosome and start positions of both R1 and R2
                my ($chr_r1,$start_r1) = (split (/\t/,$line1))[2,3];
                my ($chr_r2,$start_r2) = (split (/\t/,$line2))[2,3];

                # print "$chr_r1\t$start_r1\n$chr_r2\t$start_r2\n"; sleep(1);
                
                # Hi-C data essentially behaves like 2 separate single-end alignments, so we
                # should be fine to just score chr1:start1:chr2:start2:UMI. Since we don't
                # know whether R1 and R2 are aligned in a directional fashion, we need to test
                # both combinations.
            
                ### Here we take the barcode sequence into consideration
                # Let's try out a sorting approach, so that we always use alignments in the same order
                my $composite1 = join (":",$chr_r1,$start_r1);
                my $composite2 = join (":",$chr_r2,$start_r2);
                # print "$chr_r1\t$start_r1\n$chr_r2\t$start_r2\n"; sleep(1);
                push @locations,$composite1;
                push @locations,$composite2;
                
                #print @locations;
                # location combinations should always appear in the same order
                @locations = sort (@locations);
                #print @locations;
            
                foreach (@locations){
                    # warn "$_\n";
                    $composite .= "$_:";
                }
                $composite =~ s/:$//;
                @locations = (); # resetting
                # warn ">>$composite<<\n"; sleep(1);
                # print ("$composite1\n$composite2\n\n      :::::\n\n"); sleep(1);
                
                
                $count++;
                if (exists $positions{$composite}->{$umi_seq}){
                    # warn "Position $composite with UMI $umi_seq already existed - skipping...\n";
                    ++$removed;
                }
                else{
                    # Hi-C; PAIRED-END
                    $positions{$composite}->{$umi_seq}->{alignment_1} = "$line1\n";
                    $positions{$composite}->{$umi_seq}->{alignment_2} = "$line2\n";
                }
                
                # Optional detailed UMI report
                if ($detail){
                    push @{$umi_reporting{$umi_seq}},"${composite1}:::$composite2";
                }

            }
            else{

                my $flag = (split /\t/,$line1)[1];

                # if the read aligns in forward orientation we can certainly use the start position of read 1, and only need to work out the end position of read 2
                if ($flag == 99){ # 99 is paired-end, properly paired, read first in pair, mate on reverse strand

                    ### reading in the next line
                    $_ = <IN>;
                    # the only thing we need is the end position
                    ($end,my $cigar_2) = (split (/\t/))[3,5];

                    $end -= 1; # only need to adjust this once

                    # for InDel free matches we can simply use the M number in the CIGAR string
                    if ($cigar_2 =~ /^(\d+)M$/){ # linear match
                        $end += $1;
                    }
                    else{
                        # parsing CIGAR string
                        my @len = split (/\D+/,$cigar_2); # storing the length per operation
                        my @ops = split (/\d+/,$cigar_2); # storing the operation
                        shift @ops; # remove the empty first element
                        die "CIGAR string contained a non-matching number of lengths and operations ($cigar_2)\n" unless (scalar @len == scalar @ops);

                        # warn "CIGAR string; $cigar_2\n";
                        ### determining end position of the read
                        foreach my $index(0..$#len){
                            if ($ops[$index] eq 'M'){  # standard matching bases
                                $end += $len[$index];
                                # warn "Operation is 'M', adding $len[$index] bp\n";
                            }
                            elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                # warn "Operation is 'I', next\n";
                            }
                            elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                #  warn "Operation is 'D',adding $len[$index] bp\n";
                                $end += $len[$index];
                            }
                            elsif($ops[$index] eq 'N'){ # skipped regions (e.g. splice junctions) do affect the end position
                                            #  warn "Operation is 'N',adding $len[$index] bp\n";
                                            $end += $len[$index];
                            }
                            elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                # warn "Operation is 'S'\n";
                            }
                            else{
                                die "PE BAM (mismatch mode): Found CIGAR operations other than M, I, S, N or D: '$ops[$index]'. Not allowed at the moment\n";
                            }
                        }
                    }
                }
                elsif($flag == 83){
                    # Flag 83 is paired-end, properly paired, read reverse strand, first in pair
                    # else read 1 aligns in reverse orientation and we need to work out the end of the fragment first, and use the start of the next line

                    $end = $start - 1; # need to adjust this only once

                    # for InDel free matches we can simply use the M number in the CIGAR string
                    if ($cigar =~ /^(\d+)M$/){ # linear match
                        $end += $1;
                    }
                    else{
                        # parsing CIGAR string
                        my @len = split (/\D+/,$cigar); # storing the length per operation
                        my @ops = split (/\d+/,$cigar); # storing the operation
                        shift @ops; # remove the empty first element
                        die "CIGAR string contained a non-matching number of lengths and operations ($cigar)\n" unless (scalar @len == scalar @ops);

                        # warn "CIGAR string; $cigar\n";
                        ### determining end position of the read
                        foreach my $index(0..$#len){
                            if ($ops[$index] eq 'M'){  # standard matching bases
                                $end += $len[$index];
                                # warn "Operation is 'M', adding $len[$index] bp\n";
                            }
                            elsif($ops[$index] eq 'I'){ # insertions do not affect the end position
                                # warn "Operation is 'I', next\n";
                            }
                            elsif($ops[$index] eq 'D'){ # deletions do affect the end position
                                # warn "Operation is 'D',adding $len[$index] bp\n";
                                $end += $len[$index];
                            }
                            elsif($ops[$index] eq 'S'){ # soft-clipped bases do NOT affect the end position
                                                # warn "Operation is 'S'\n";
                            }
                            elsif($ops[$index] eq 'N'){ # skipped regions such as splice junctions do affect the end position
                                            # warn "Operation is 'N',adding $len[$index] bp\n";
                                            $end += $len[$index];
                                        }
                            else{
                                die "PE BAM (mismatch mode): Found CIGAR operations other than M, I or D: '$ops[$index]'. Not allowed at the moment\n";
                            }
                        }
                    }

                    ### reading in the next line
                    $_ = <IN>;
                    # the only thing we need is the start position
                    ($start) = (split (/\t/))[3];
                }
                else{
                    die "So far unhandled FLAG: $flag. Please update UMI-Bam\n\n";
                }

                ### Here we take the barcode sequence into consideration
                $composite = join (":",$strand,$chr,$start,$end);

                $count++;
                if (exists $positions{$composite}->{$umi_seq}){
                    # warn "Position $composite with UMI $umi_seq already existed - skipping...\n";
                    ++$removed;
                }
                else{
                    # SAM PAIRED-END
                    $positions{$composite}->{$umi_seq}->{alignment_1} = $line1;
                    $positions{$composite}->{$umi_seq}->{alignment_2} = $_;
                    # warn "Position $composite with UMI $umi_seq is new - adding...\n";
                }
            }
        }
        else{
            die "Input must be single or paired-end\n";
        }
    }


    ### PRINTING RESULTS
    
    ### Now going through all stored positions and printing out sequences with unique UMIs, or UMIs with an edit distance > $mm
    warn "Now printing out alignments with unique UMIs (UMIs closer than or equal to an edit distance of >>$mm<< are considered duplicates)\n";

  POS:    foreach my $pos (keys %positions){
      my @umis; # storing UMIs that are considered unique for this given position
      # warn "$pos\n";

      ### Looping through all sequences to see if they are very different to other sequences for this position (using a hemming distance, mismatches only, no indels)
        UMI:	foreach my $umi_seq (keys %{$positions{$pos}}){
            # warn "Current UMI:  $umi_seq\n";
            unless (@umis){
                # warn "Adding first UMI to the unique UMI array\n";
                push @umis, $umi_seq;
                next UMI;
            }
            # sleep(1);
            my $umi_too_close = 0;

            foreach my $seq(@umis){
                my $edit_dist = get_edit_distance($seq,$umi_seq);
                # warn "UMI-seq:      $umi_seq\nPrevious-seq: $seq\nEdit distance: $edit_dist\n";
                if ($edit_dist <= $mm){
                    # warn "Sequence seems too closely related to already existing sequence(s). Chucking...\n";
                    $umi_too_close++;
                    last; # one fail is quite enough
                }
            }

            if ($umi_too_close){
                ++$removed;
            }
            else{
                # warn "Adding UMI to the unique UMI array\n";
                push @umis, $umi_seq;
            }
        }

        ### Printing
        if ($paired){
            foreach my $uniq_umi(@umis){
                # warn "printing alignment for UMI: $uniq_umi\n";
                print OUT $positions{$pos}->{$uniq_umi}->{alignment_1}; # SAM read 1
                print OUT $positions{$pos}->{$uniq_umi}->{alignment_2}; # SAM read 2
                # warn "$positions{$pos}->{$uniq_umi}->{alignment_1}"; 
                # warn "$positions{$pos}->{$uniq_umi}->{alignment_2}\n"; sleep(1);
            }
        }
        else{ # single-end
            foreach my $uniq_umi(@umis){
                # warn "printing alignment for UMI: $uniq_umi\n";
                print OUT $positions{$pos}->{$uniq_umi}->{alignment};
            }
        }
        # warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";sleep(1);
    }

    my $percentage;
    my $percentage_leftover;
    my $leftover = $count - $removed;

    unless ($count == 0){
	    $percentage = sprintf("%.2f",$removed/$count*100);
	    $percentage_leftover = sprintf("%.2f",$leftover/$count*100);
    }
    else{
	    $percentage = 'N/A';
	    $percentage_leftover = 'N/A';
    }

    warn "\nUMIs closer than, or equal to, an edit distance of >>$mm<< were considered duplicates:\n";
    warn "="x84,"\n";
    warn "Total number of alignments analysed in $file:\t$count\n";
    warn "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    warn "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

    print REPORT "UMIs closer than, or equal to, an edit distance of >>$mm<< were considered duplicates:\n";
    print REPORT "="x84,"\n";
    print REPORT "Total number of alignments analysed in $file:\t$count\n";
    print REPORT "Total number duplicated alignments removed:\t$removed ($percentage%)\n";
    print REPORT "Total count of deduplicated leftover sequences: $leftover ($percentage_leftover% of total)\n\n";

    # close REPORT;
    # close OUT;
    close IN;
}

### Writes the optional detailed UMI report (--detailed_UMI_report) as a tab-delimited file.
###
### Reads the file-level hash %umi_reporting, which maps each UMI sequence to a list of
### entries of the form "chrR1:posR1:::chrR2:posR2" (one entry per read pair seen with that UMI).
### One line is written per entry: UMI, number of entries for that UMI, R1 chrom, R1 pos, R2 chrom, R2 pos.
### UMIs are listed from most to least frequent; ties are broken alphabetically so that the
### report is reproducible between runs.
###
### Parameter: $detail_report - path of the report file to (over)write
### Dies if the report file cannot be opened or closed.
sub detailed_umi_reporting{
    my ($detail_report) = @_;
    warn "Now printing detailed UMI report\n";

    # Splits "chrom:pos" on the LAST colon only. Reference sequence names may themselves
    # contain colons (e.g. HLA contigs), so a plain split on ":" would truncate the
    # chromosome name and report a piece of it as the position.
    my $split_coords = sub {
        my ($coords) = @_;
        return ('','') unless (defined $coords);
        my $last_colon = rindex($coords,':');
        return ($coords,'') if ($last_colon < 0); # no position present
        return (substr($coords,0,$last_colon), substr($coords,$last_colon+1));
    };

    # lexical filehandle, so we do not clobber (or get clobbered by) a global DETAIL handle
    open (my $detail_fh,">",$detail_report) or die "Failed to write to report $detail_report: $!";
    print {$detail_fh} "UMI\tcount\tR1 chrom\tR1 pos\tR2 chrom\tR2 pos\n";

    # most frequent UMIs first; the string comparison makes the order of ties deterministic
    my @sorted_umis = sort {
        scalar @{$umi_reporting{$b}} <=> scalar @{$umi_reporting{$a}}
            or
        $a cmp $b
    } keys %umi_reporting;

    foreach my $um (@sorted_umis){
        my $occurrences = scalar @{$umi_reporting{$um}}; # same value for every line of this UMI

        foreach my $entry (@{$umi_reporting{$um}}){
            # R1 and R2 coordinates are separated by ":::"
            my ($r1coords,$r2coords) = split (/:::/,$entry,2);
            my ($r1_chr,$r1_pos) = $split_coords->($r1coords);
            my ($r2_chr,$r2_pos) = $split_coords->($r2coords);
            print {$detail_fh} join ("\t",$um,$occurrences,$r1_chr,$r1_pos,$r2_chr,$r2_pos),"\n";
        }
    }

    # buffered write errors only surface on close, so this needs to be checked
    close $detail_fh or die "Failed to close report $detail_report: $!";
    warn "Finished printing detailed UMI report. Enjoy.\n\n";
}