-
Notifications
You must be signed in to change notification settings - Fork 1
/
fastaptamer_cluster
executable file
·274 lines (216 loc) · 11.6 KB
/
fastaptamer_cluster
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/env perl
## Last Modified March 10th, 2015 13:53 CDT by Christopher Bottoms
## Citation:
## Khalid K. Alam, Jonathan L. Chang & Donald H. Burke.
## "FASTAptamer: A Bioinformatic Toolkit for High-Throughput Sequence Analysis of
## Combinatorial Selections." Molecular Therapy -- Nucleic Acids. 2015.
## DOI: 10.1038/mtna.2015.4
## Distributed under GNU General Public License v3
use Getopt::Long; ## Core Perl module for command line options/arguments
my $start = time; ## Starts program execution timer
my $infile; ## Variable for input file
my $outfile; ## Variable for output file
my $defined_edit_distance; ## Variable (integer) for user-defined edit distance
my $help; ## true/false variable for help screen
my $quiet; ## true/false to suppress standard output
my $filter = 0; ## Variable to define a read filter for entries
my $max_clusters = 9999999999999999; ## Maximum number of clusters
my $version; ## true/false variable for version screen
## Take command line arguments for...
GetOptions ( "infile=s" => \$infile, ## ... input file
"outfile=s" => \$outfile, ## ... output file
"distance=i" => \$defined_edit_distance, ## ... edit distance
"filter=f" => \$filter, ## ... read filter
"cluster_max=i" => \$max_clusters, ## max number of clusters
"help" => \$help, ## ... help screen
"version" => \$version, ## ... version screen
"quiet" => \$quiet); ## ... supress standard output
if ($help){ ## Prints help screen if $help returns as true
print <<"HELP";
--------------------------------------------------------------------------------
FASTAptamer-Cluster
--------------------------------------------------------------------------------
Usage: fastaptamer_cluster [-h] [-i INFILE] [-o OUTFILE] [-d #] [-f #] [-q] [-v]
[-h] = Help screen.
[-i INFILE] = Input file from FASTAptamer-Count. REQUIRED.
[-o OUTFILE] = Output file, FASTA format. REQUIRED.
[-d] = Edit distance for clustering sequences. REQUIRED.
[-f] = Read filter. Only sequences with total reads greater than
the value supplied will be clustered.
[-c] = Maximum number of clusters to find.
[-q] = Quiet mode. Suppresses standard output of file I/O, numb-
er of clusters, cluster size and execution time.
[-v] = Display version.
FASTAptamer-Cluster uses the Levenshtein algorithm to cluster together sequences
based on a user-defined edit distance. The most abundant and unclustered seque-
nce is used as the "seed sequence" for which edit distance is calculated from.
Output is FASTA with the following information on the identifier line for each
sequence entry:
>Rank-Reads-RPM-Cluster#-RankWithinCluster-EditDistanceFromSeed
SEQUENCE
To prevent clustering of sequences not highly sampled (and improve execution ti-
me), invoke the read filter and enter a number. Only sequences with total reads
greater than the number entered will be clustered.
Input for FASTAptamer-Cluster MUST come from a FASTAptamer-Count output file.
PLEASE NOTE: This is a computationally intense program that can take multiple h-
ours to finish depending on the size and complexity of your population. Utilize
the read filter [-f] and/or the maximum number of clusters [-c] to improve ex-
ecution time.
HELP
exit;
}
if (defined $version){ ## Print version screen if -v is true
print <<"VERSION";
FASTAptamer v1.0.14
VERSION
exit;
}
##########################################
## Open input file or exit with warning #
##########################################
open (INPUT, '<', $infile) or die
"\nCould not open input file or no input file was specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n";
##########################################
## Open output file or exit with warning #
##########################################
open (OUTPUT, '>', $outfile) or die
"\nCould not open output file or no input file was specified.\See help documentation [-h], README, or User's Guide for program usage.\n";
###############################################
## Exit with warning if no distance specified #
###############################################
unless ($defined_edit_distance) {
die "\nNo edit distance specified.\nSee help documentation [-h], README, or User's Guide for program usage.\n"; }
my @sequence_and_info; ## Array to contain each FASTA entry individually
my $entries; ## Variable to keep track of the number of entries
$/= ">"; ## Change default input record separator to read FASTA formatted file
my $Count_Record = qr{
( # Beginning of capture
\d+ # Sequence number
(?:\(\d+\))? # Optional label for uniqueness
- # literal dash
\d+ # Read counts
- # literal dash
\d+\.?\d* # Counts per million (with optional decimals)
\n # Newline
\w+ # sequence represented by "word" characters
) # End of capture
}xms; ## Regex for FASTAptamer-Count format
while (<INPUT>){ ## Reads through entire input file
if ($_ =~ /$Count_Record/){ ## Regex for FASTAptamer-Count format
my @read_count_test = split /-/, $1; ## Splits entry by dashes, loads array
if ($read_count_test[1] > $filter){ ## Tests for read count filter
push @sequence_and_info, $1; ## Add each matched entry to array
$entries++; ## Increase entry count by 1
}
}
}
close INPUT; ## Closes input file
unless ($quiet){ ## Unless -q option is invoked, print this warning and summary
print "\n**************************************************";
print "\nClustering is a computationally intense process.\n";
print "Please be patient as clustering can take several\n";
print "hours for a single file. For faster clustering\n";
print "use the read filter [-f] to avoid clustering of \n";
print "sequences not highly sampled and/or use the max \n";
print "clusters option [-c] to limit clustering to the \n";
print "number of clusters that you are interested in. \n";
print "**************************************************";
print "\n\nTotal number of sequences to cluster: $entries.\n";
print "\nCluster\tUnique Sequences\tReads\tRPM\n";
}
my $iterator = get_cluster_iterator(@sequence_and_info);
while( $iterator->() ) {1} ## Simply repeat until iterator runs out (i.e.
## until all of the sequences have been processed)
###############################################
## SUBROUTINE THAT BEGINS TO PROCESS FILE #
## One iteration generates a single cluster #
###############################################
sub get_cluster_iterator {
my @remaining_sequences = @_;
my $current_cluster = 1; ## This variable defines what cluster we're working on
return sub {
my ( $top_entry, @entires ) = @remaining_sequences; ## Takes "seed sequence" and remaining entries
return if ! $top_entry; ## Return false value if no more entries left
my @keepers; ## Array to store unclustered entries
my @current_seq_info = split /\n/, $top_entry;
## Splits entry into metrics (reads, rank, RPM) and the "seed" sequence
my $cluster_rank = 1; ## Defines that seed sequence is ranked #1 in its cluster
print OUTPUT ">$current_seq_info[0]-$current_cluster-$cluster_rank-0\n$current_seq_info[1]\n";
## Prints current seed sequence in FASTAptamer-Cluster format
my @current_seq_metrics = split /-/, $current_seq_info[0]; ## Split sequence identifier line
my $current_cluster_reads = $current_seq_metrics[1]; ## Take reads to tally cluster size
my $current_cluster_rpm = $current_seq_metrics[2]; ## Take RPM to tally cluster size in RPM
for (@entires) { ## for each entry left in array send to calculate distance
my @comparison_seq_info = split /\n/, $_; ## Split entry into metrics and sequence
my @comparison_seq_metrics = split /-/, $comparison_seq_info[0]; ## Split identifier line
my $comparison_seq_reads = $comparison_seq_metrics[1]; ## Take reads to tally cluster size
my $comparison_seq_rpm = $comparison_seq_metrics[2]; ## Take RPM to tally cluster size
my $distance = levenshtein( $current_seq_info[1], $comparison_seq_info[1] );
## sends comparison sequence to compare against current seed sequence, returns distances
if ( $distance > $defined_edit_distance ) { ## If distance is greater than defined
push @keepers, $_; ## Add to array for comparison in next iteration
}
elsif ( $distance <= $defined_edit_distance ) { ## If distance is less than or equal to defined
$cluster_rank++; ## Increment the cluster rank
$current_cluster_reads += $comparison_seq_reads; ## Add reads to cluster reads tally
$current_cluster_rpm += $comparison_seq_rpm; ## Add RPM to cluster RPM tally
print OUTPUT ">$comparison_seq_info[0]-$current_cluster-$cluster_rank-$distance\n";
print OUTPUT "$comparison_seq_info[1]\n"; ## Print entry in output file
}
}
unless ($quiet) { print "$current_cluster\t$cluster_rank\t$current_cluster_reads\t$current_cluster_rpm\n"; }
## Display cluster number, number of unique sequences, reads and RPM
$current_cluster++; ## Increment cluster number prior to next cluster
# Quit clustering if the maximum clusters have been found
return if $current_cluster > $max_clusters;
# Store unclustered sequences for use in next iteration
@remaining_sequences = @keepers;
# Sucessful iteration, so returning a true value
return 1;
}
}
my $duration = time - $start; ## Calculates how much time has elapsed since start
unless ($quiet) { ## Displays summary report unless -q is invoked
print "\nInput file: \"$infile\".\n";
print "Output file: \"$outfile\".\n";
print "Execution time: $duration s.\n";
}
#############################################################################
## The subroutines below calculates the Levenshtein edit distance for the #
## current "seed" sequence and the comparison sequence that are sent to it #
#############################################################################
sub levenshtein {
my ($s1, $s2) = @_;
my ($len1, $len2) = (length $s1, length $s2);
return $len2 if ($len1 == 0);
return $len1 if ($len2 == 0);
my %mat;
for (my $i = 0; $i <= $len1; ++$i){
for (my $j = 0; $j <= $len2; ++$j){
$mat{$i}{$j} = 0;
$mat{0}{$j} = $j;
}
$mat{$i}{0} = $i;
}
my @ar1 = split(//, $s1);
my @ar2 = split(//, $s2);
for (my $i = 1; $i <= $len1; ++$i){
for (my $j = 1; $j <= $len2; ++$j){
my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
$mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
$mat{$i}{$j-1} + 1,
$mat{$i-1}{$j-1} + $cost]);
}
}
return $mat{$len1}{$len2};
}
sub min
{
my @list = @{$_[0]};
my $min = $list[0];
foreach my $i (@list)
{
$min = $i if ($i < $min);
}
return $min;
}