fastaptamer_cluster

#!/usr/bin/env perl

## Last Modified March 10th, 2015 13:53 CDT by Christopher Bottoms

## Citation:
## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke. 
## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of 
## Combinatorial Selections." Molecular Therapy -- Nucleic Acids. 2015.
## DOI: 10.1038/mtna.2015.4

## Distributed under GNU General Public License v3

use Getopt::Long;   ## Core Perl module for command line options/arguments

my $start = time;           ## Starts program execution timer
my $infile;                 ## Variable for input file
my $outfile;                ## Variable for output file
my $defined_edit_distance;  ## Variable (integer) for user-defined edit distance
my $help;                   ## true/false variable for help screen
my $quiet;                  ## true/false to suppress standard output
my $filter = 0;				## Variable to define a read filter for entries
my $max_clusters = 9999999999999999; ## Maximum number of clusters
my $version;                ## true/false variable for version screen

                                           ## Take command line arguments for...
GetOptions (    "infile=s" => \$infile,    ## ... input file
                "outfile=s" => \$outfile,  ## ... output file
                "distance=i" => \$defined_edit_distance,  ## ... edit distance
                "filter=f" => \$filter,    ## ... read filter
                "cluster_max=i" => \$max_clusters, ## max number of clusters
                "help" => \$help,          ## ... help screen
                "version" => \$version,    ## ... version screen
                "quiet" => \$quiet);       ## ... supress standard output

if ($help){         ## Prints help screen if $help returns as true
print <<"HELP";
	
--------------------------------------------------------------------------------
                              FASTAptamer-Cluster
--------------------------------------------------------------------------------

Usage: fastaptamer_cluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-q] [-v]

    [-h]            = Help screen.
    [-i INFILE]     = Input file from FASTAptamer-Count. REQUIRED.
    [-o OUTFILE]    = Output file, FASTA format. REQUIRED.
    [-d]            = Edit distance for clustering sequences. REQUIRED.
    [-f]            = Read filter. Only sequences with total reads greater than
                      the value supplied will be clustered.
    [-c]            = Maximum number of clusters to find.
    [-q]            = Quiet mode.  Suppresses standard output of file I/O, numb-
                      er of clusters, cluster size and execution time.
    [-v]            = Display version.

FASTAptamer-Cluster uses the Levenshtein algorithm to cluster together sequences
based on a user-defined edit distance.  The most abundant and unclustered seque-
nce is used as the "seed sequence" for which edit distance is calculated from.  
Output is FASTA with the following information on the identifier line for each 
sequence entry:

    >Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed
    SEQUENCE

To prevent clustering of sequences not highly sampled (and improve execution ti-
me), invoke the read filter and enter a number.  Only sequences with total reads
greater than the number entered will be clustered. 

Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file. 

PLEASE NOTE: This is a computationally intense program that can take multiple h-
ours to finish depending on the size and complexity of your population. Utilize
the read filter [-f] and/or the maximum number of clusters [-c] to improve ex-
ecution time. 

HELP
exit;
}

if (defined $version){                     ## Print version screen if -v is true
    print <<"VERSION";
	
FASTAptamer v1.0.14
	
VERSION
exit;
}

##########################################	
## Open input file or exit with warning  #
##########################################

open (INPUT, '<', $infile) or die 
"\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n";

##########################################
## Open output file or exit with warning #
##########################################

open (OUTPUT, '>', $outfile) or die 
"\nCould not open output file or no input file was specified.\See help documentation [-h], README, or User's Guide for program usage.\n";

###############################################
## Exit with warning if no distance specified #
###############################################

unless ($defined_edit_distance) { 
die "\nNo edit distance specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }

my @sequence_and_info;   ## Array to contain each FASTA entry individually
my $entries;             ## Variable to keep track of the number of entries

$/= ">";   ## Change default input record separator to read FASTA formatted file

my $Count_Record = qr{
                       (              # Beginning of capture
                         \d+            # Sequence number
                         (?:\(\d+\))?   # Optional label for uniqueness
                         -              # literal dash
                         \d+            # Read counts
                         -              # literal dash
                         \d+\.?\d*      # Counts per million (with optional decimals)
                         \n             # Newline
                         \w+            # sequence represented by "word" characters
                       )              # End of capture
                   }xms;    ## Regex for FASTAptamer-Count format

while (<INPUT>){                             ## Reads through entire input file
    if ($_ =~ /$Count_Record/){    ## Regex for FASTAptamer-Count format
        my @read_count_test = split /-/, $1; ## Splits entry by dashes, loads array
        if ($read_count_test[1] > $filter){     ## Tests for read count filter
            push @sequence_and_info, $1;         ## Add each matched entry to array
            $entries++;                          ## Increase entry count by 1
        }
    }
}

close INPUT;  ## Closes input file

unless ($quiet){  ## Unless -q option is invoked, print this warning and summary
	print "\n**************************************************";
    print "\nClustering is a computationally intense process.\n";
    print "Please be patient as clustering can take several\n";
    print "hours for a single file.  For faster clustering\n";
    print "use the read filter [-f] to avoid clustering of \n";
    print "sequences not highly sampled and/or use the max \n";
    print "clusters option [-c] to limit clustering to the \n";
    print "number of clusters that you are interested in. \n";
    print "**************************************************";
    print "\n\nTotal number of sequences to cluster: $entries.\n";
    print "\nCluster\tUnique Sequences\tReads\tRPM\n";
}

my $iterator = get_cluster_iterator(@sequence_and_info);

while( $iterator->() ) {1} ## Simply repeat until iterator runs out (i.e. 
                           ##  until all of the sequences have been processed)

###############################################
##   SUBROUTINE THAT BEGINS TO PROCESS FILE   #
## One iteration generates a single cluster   #
###############################################

sub get_cluster_iterator {
    my @remaining_sequences = @_;

    my $current_cluster = 1;  ## This variable defines what cluster we're working on 

    return sub {
        my ( $top_entry, @entires ) = @remaining_sequences; ## Takes "seed sequence" and remaining entries
        return if ! $top_entry; ## Return false value if no more entries left
        my @keepers; ## Array to store unclustered entries
        my @current_seq_info = split /\n/, $top_entry;
        ## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence

        my $cluster_rank = 1; ## Defines that seed sequence is ranked #1 in its cluster

        print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\n$current_seq_info[1]\n";
        ## Prints current seed sequence in FASTAptamer-Cluster format

        my @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line
        my $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size
        my $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM

        for (@entires) { ## for each entry left in array send to calculate distance
            my @comparison_seq_info = split /\n/, $_; ## Split entry into metrics and sequence
            my @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line
            my $comparison_seq_reads = $comparison_seq_metrics[1]; ## Take reads to tally cluster size
            my $comparison_seq_rpm = $comparison_seq_metrics[2]; ## Take RPM to tally cluster size
            my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] );
            ## sends comparison sequence to compare against current seed sequence, returns distances
            if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined
                push @keepers, $_; ## Add to array for comparison in next iteration
            }
            elsif ( $distance <= $defined_edit_distance ) { ## If distance is less than or equal to defined
                $cluster_rank++; ## Increment the cluster rank
                $current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally
                $current_cluster_rpm += $comparison_seq_rpm; ## Add RPM to cluster RPM tally
                print OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\n";
                print OUTPUT "$comparison_seq_info[1]\n"; ## Print entry in output file
            }
        }

        unless ($quiet) { print "$current_cluster\t$cluster_rank\t$current_cluster_reads\t$current_cluster_rpm\n"; }
        ## Display cluster number, number of unique sequences, reads and RPM
        $current_cluster++; ## Increment cluster number prior to next cluster

        # Quit clustering if the maximum clusters have been found
        return if $current_cluster > $max_clusters;

        # Store unclustered sequences for use in next iteration
        @remaining_sequences = @keepers;

        # Sucessful iteration, so returning a true value
        return 1;
      }
}

my $duration = time - $start; ## Calculates how much time has elapsed since start

unless ($quiet) { ## Displays summary report unless -q is invoked
    print "\nInput file: \"$infile\".\n";
    print "Output file: \"$outfile\".\n";
    print "Execution time: $duration s.\n"; 
}

#############################################################################
## The subroutines below calculates the Levenshtein edit distance for the   #
## current "seed" sequence and the comparison sequence that are sent to it  #
#############################################################################


sub levenshtein {
	my ($s1, $s2) = @_;
	my ($len1, $len2) = (length $s1, length $s2);
	
	return $len2 if ($len1 == 0);
	return $len1 if ($len2 == 0);
	
	my %mat;
	
	for (my $i = 0; $i <= $len1; ++$i){
		for (my $j = 0; $j <= $len2; ++$j){
			$mat{$i}{$j} = 0;
			$mat{0}{$j} = $j;
		}
		$mat{$i}{0} = $i;
	}
	
	my @ar1 = split(//, $s1);
	my @ar2 = split(//, $s2);

	for (my $i = 1; $i <= $len1; ++$i){
		for (my $j = 1; $j <= $len2; ++$j){
			my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
			$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
			$mat{$i}{$j-1} + 1,
			$mat{$i-1}{$j-1} + $cost]);
		}
	}
    return $mat{$len1}{$len2};
}

sub min
{
    my @list = @{$_[0]};
    my $min = $list[0];

    foreach my $i (@list)
    {
        $min = $i if ($i < $min);
    }

    return $min;
}