/
biosequence.h
687 lines (543 loc) · 28.7 KB
/
biosequence.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
// biosequence code using STL
// ihh@lanl.gov, 1999/04/28
#ifndef BIOSEQUENCE_INCLUDED
#define BIOSEQUENCE_INCLUDED
#include <stdio.h>
#include <iostream>
#include <map>
#include <list>
#include <vector>
#include <algorithm>
#include "util/sstring.h"
#include "util/macros.h"
#include "util/score.h"
#include "util/phonebook.h"
#include "util/rnd.h"
#include "util/map_keys.h"
#define Alphabet_nonsense_char '?'
#define UNINITIALIZED_ALPHABET_NAME "Uninitialized"
// a Biosequence is a text representation of a sequence
//
typedef sstring Biosequence;
// a Digitized_biosequence is a numerical representation of a sequence, related to a Biosequence by an Alphabet (see below)
// NB implementation isn't quite there (will throw an exception for negative (=ambiguous) symbols)
//
class Digitized_biosequence : public vector<int>
{
public:
Digitized_biosequence() : vector<int>() { }
Digitized_biosequence (const vector<int>& intvec) : vector<int> (intvec) { }
Digitized_biosequence (size_type sz, int i = -1) : vector<int> (sz, i) { }
int null_score (const vector<int>& null_model) const;
int stuttered_null_score (const vector<int>& null_model, const int stutter_sc) const;
void count_symbol_frequencies (vector<double>& composition) const;
// the "context" methods a Digitized_biosequence where every site
// encodes the last 'context_order' residues of the parent sequence
// ...a hacky way of doing higher-order Markov chains. probably best to ignore these methods.
Digitized_biosequence make_context_dsq (int alphabet_size, int context_order) const;
Digitized_biosequence make_reverse_context_dsq (int alphabet_size, int context_order) const;
static Digitized_biosequence context_to_nmer (int context, int alphabet_size, int context_order);
};
// a Weight_profile is an ungapped profile in probability space
// it should probably have all the same methods as Score_profile (see below), but it is more rarely used
//
typedef pair <int, Prob> Symbol_weight;
typedef map <int, Prob> Symbol_weight_map;
typedef Symbol_weight_map::const_iterator Symbol_weight_iterator;
class Weight_profile : public vector <Symbol_weight_map>
{
public:
Weight_profile () {}
Weight_profile (const Digitized_biosequence& dsq);
Weight_profile (int len, const Symbol_weight_map& weight_map) : vector<Symbol_weight_map> (len, weight_map) {}
static int weight_map_consensus (const Symbol_weight_map& w); // returns most heavily weighted symbol in a Symbol_score_map
int consensus (int pos) const { return weight_map_consensus ((*this)[pos]); }
Digitized_biosequence consensus_dsq() const;
void normalise(); // for each residue, re-scales symbol weights so that probability sum is 1
void count_symbol_frequencies (vector<double>& composition) const;
};
// a Vector_weight_profile is an ungapped probability profile like a Weight_profile,
// but stored as a vector
typedef vector<vector<Prob> > Vector_weight_profile;
// a Score_profile is an ungapped profile in score space
//
typedef pair <int, Score> Symbol_score;
typedef map <int, Score> Symbol_score_map;
typedef Symbol_score_map::const_iterator Symbol_score_iterator;
class Score_profile : public vector <Symbol_score_map>
{
public:
Score_profile () {}
Score_profile (const Digitized_biosequence& dsq);
Score_profile (const Weight_profile& profile);
Score_profile (int len, const Symbol_score_map& score_map) : vector<Symbol_score_map> (len, score_map) {}
// Should really make Symbol_score_map a proper class and put this class's static methods there (ditto Symbol_weight_map)
//
static int score_map_consensus (const Symbol_score_map& s); // returns highest scoring symbol in a Symbol_score_map
int consensus (int pos) const { return score_map_consensus ((*this)[pos]); }
static int score_map_sample (const Symbol_score_map& s, double kT = 1); // samples from a Symbol_score_map
int sample (int pos, double kT = 1) const { return score_map_sample ((*this)[pos]); }
static int score_map_score (const Symbol_score_map& ssm, const vector<int>& null_model);
int null_score (const vector<int>& null_model) const;
void normalise(); // for each residue, re-scales symbol scores so that probability sum is 1
Digitized_biosequence consensus_dsq() const;
Digitized_biosequence sample_dsq (double kT = 1) const;
Score_profile evolve (const array2d<int>& substitution_matrix) const; // evolves the sequence (substitutions only)
vector<int> null_scores_by_residue (const vector<int>& null_model) const; // returns null score for each residue in the sequence
int inner_product (const Score_profile& prof) const; // score of product_{columns} sum_{residues in column} P(residue|this) . P(residue|prof)
Weight_profile score2weight() const; // turns this Score_profile into a Weight_profile
void count_symbol_frequencies (vector<double>& composition) const; // count symbol frequencies
};
// Alphabet class
//
class Alphabet
{
// every symbol in the alphabet corresponds to a non-negative integer 0,1,2...(size-1)
// negative integers represent ambiguous symbols, e.g. "N" "Y" "R" for DNA, "X" for proteins, and wildcards "*"
// these ambiguities can be represented by a weighted sum over the real symbols in the alphabet
// e.g. "Y" = 0.5 * "C" + 0.5 * "T" (NB this is a probability distribution)
// "*" = "A" + "C" + "G" + "T" (NB this is not a probability distribution)
// such weighted sums are represented by Symbol_weight_map's (in probability space) or Symbol_score_map's (in score space)
// these representations can be resolved by calling int2weight() or int2score()
// in fact there are a million trillion zillion different conversion functions we have to define, and some are as yet unused & unimplemented.
//
protected:
int alphabet_size; // number of nondegenerate symbols
// maps indexed by the string representation of an alphabet token
map<sstring,int> tok2int;
// arrays indexed by the character representation of an alphabet token
vector<int> char2int_table; // a value of (alphabet_size) indicates unknown
vector<double> char_aff_table; // table of char affinities for this alphabet; used to match sequences to alphabets
// arrays indexed by the integer representation of an alphabet token
vector<int> nondegen_comp; // complements for nondegenerate chars
vector<int> degen_comp; // complements for degenerate chars
vector<sstring> tok; // tokens for nondegenerate symbols (a token can be multiple chars)
sstring chars_lc; // nondegenerate chars, lower case
sstring chars_uc; // nondegenerate chars, upper case
sstring degen_lc; // degenerate chars, lower case
sstring display_lc; // what to output for nondegenerate symbols with no chars (used by hidden alphabets)
vector<Symbol_weight_map> nondegen_weight; // nondegenerate weight mappings
vector<Symbol_score_map> nondegen_score; // nondegenerate score mappings
vector<Symbol_weight_map> degen_weight; // degenerate weight mappings
vector<Symbol_score_map> degen_score; // degenerate score mappings
int unknown_int; // "N" for DNA, "X" for protein
bool warn_randomise; // flag saying whether to warn when randomising degenerate chars; set to 0 after first warning
bool warn_unknown; // flag for unknown chars warning
public:
// name
sstring name; // name of alphabet, e.g. "DNA", "Protein"
// case-sensitivity flag
bool case_sensitive;
// multiplier for counts, to turn fractional degenerate counts into integers. 12 for DNA/RNA, 20 for protein
// (explanation: ambiguated counts for DNA can be 1/2 (e.g. "r", "y"), 1/3 (e.g. "h", "v"), 1/4 ("n")
// lowest common denominator of (1/2, 1/3, 1/4) is 12)
unsigned int degenerate_lcd;
// Felsenstein wildcards
Symbol_score_map wild_ssm;
Symbol_weight_map wild_swm;
// builder methods
void reset (const char* name, int size);
void init_tokens (const vector<sstring>& tokens); // nondegenerate tokens
void init_chars (const char* chars, const char* comp = ""); // nondegenerate chars
void init_degen_chars (const char* chars); // degenerate chars
void init_degen_complement();
void init_display (const sstring& chars);
void set_degen_char (char degen_char, const Symbol_weight_map& p);
void set_degen_char (char degen_char, const char* possibilities);
void set_wildcard (char wildcard_char);
void set_unknown (char unknown_char);
// method to initialize a "hidden" alphabet from a "visible" alphabet
// suppose the visible alphabet has A [nondegenerate] symbols, and the hidden_classes variable is equal to C.
// then the hidden alphabet will have C*A symbols,
// with each symbol of the visible alphabet representing a C-fold degenerate symbol in the hidden alphabet.
// Symbol N in the visible alphabet is a degenerate char for symbols {N,A+N,2*A+N...(C-1)*A+N} in the hidden alphabet.
void init_hidden (const Alphabet& alph, int hidden_classes = 1);
void init_hidden (const Alphabet& base_alphabet, const vector<sstring>& hidden_class_labels);
// constructor
Alphabet (const char* name = UNINITIALIZED_ALPHABET_NAME, int size = 0);
// methods
int size() const { return alphabet_size; }
// chars() & chars_toupper() methods return C strings containing nondegenerate symbols (e.g. ACGT for DNA)
const char* nondegenerate_chars() const { return chars_lc.c_str(); }
const char* nondegenerate_chars_toupper() const { return chars_uc.c_str(); }
const char* degenerate_chars() const { return degen_lc.c_str(); }
// tokens() method is an accessor for tok vector
const vector<sstring>& tokens() const { return tok; }
// basic query/conversion methods
inline bool contains_strict (char c) const // forbids degenerate chars
{
const int i = char2int_table[c];
return i >= 0 && i < alphabet_size;
}
inline bool contains_deg (char c) const // allows degenerate chars
{
const int i = char2int_table[c];
return i > - (int) degen_lc.size() && i < alphabet_size;
}
inline int char2int_strict (char c) const // randomises degenerate/unknown chars
{
int i = char2int_table[c];
if (i >= alphabet_size)
{
if (warn_unknown) CLOGERR << "Randomising unknown characters\n";
((Alphabet*)this)->warn_unknown = 0; // cast away const
i = alphabet_size > 1 ? Rnd::rnd_int (alphabet_size) : 0; // don't call Rnd unless necessary
}
else if (i < 0) // degenerate char
{
const Symbol_weight_map& deg_map = int2weight(i);
const vector<int> degen_opt = map_keys (deg_map);
const vector<Prob> degen_prob = map_values (deg_map);
i = degen_opt.size() > 1 ? degen_opt[Rnd::choose(degen_prob)] : degen_opt[0]; // don't call Rnd unless necessary
if (degen_opt.size() > 1)
{
if (warn_randomise) CLOGERR << "Randomising degenerate characters\n";
((Alphabet*)this)->warn_randomise = 0; // cast away const
}
}
return i;
}
inline int char2int_deg (char c) const // converts unknown chars to (unknown_int)
{
int i = char2int_table[c];
if (i >= alphabet_size)
{
i = unknown_int;
if (i >= alphabet_size)
{
i = alphabet_size > 1 ? Rnd::rnd_int (alphabet_size) : 0; // don't call Rnd unless necessary
CLOGERR << "I don't think '" << c << "' matches alphabet " << name << "; randomizing to '" << int2char(i) << "'\n";
}
else
CLOGERR << "I don't think '" << c << "' matches alphabet " << name << "; setting to '" << int2char(i) << "'\n";
}
return i;
}
inline char int2char (int i) const
{
if (i >= 0)
{
if (i >= alphabet_size) return Alphabet_nonsense_char;
if (i >= (int) chars_lc.size())
return i >= (int) display_lc.size() ? Alphabet_nonsense_char : display_lc[i];
return chars_lc[i];
}
if (-i-1 >= (int) degen_lc.size()) return Alphabet_nonsense_char;
return degen_lc[-i-1];
}
inline char int2char_uc (int i) const
{
int c = int2char (i);
if (c >= 'a' && c <= 'z') c += 'A' - 'a';
return c;
}
inline char unknown_char() const { return int2char (unknown_int); }
inline int unknown_symbol_index() const { return unknown_int; }
inline bool contains_tok (const sstring& token) const
{
return tok2int.find (token) != tok2int.end();
}
inline int token2int (const sstring& token) const {
map<sstring,int>::const_iterator tok2int_iter = tok2int.find (token);
return tok2int_iter == tok2int.end() ? unknown_int : tok2int_iter->second;
}
inline const sstring& int2token (int i) const {
if (i < 0 || i >= alphabet_size)
THROWEXPR ("Bad alphabet symbol index in int2token");
if (i >= (int) tok.size())
THROWEXPR ("In int2token: tokens uninitialized");
return tok[i];
}
// fancier conversion methods
inline const Symbol_weight_map& int2weight (int i) const { return i >= 0 ? nondegen_weight[i] : degen_weight[-i-1]; }
inline const Symbol_weight_map& char2weight (char c) const { return int2weight (char2int_deg (c)); }
int weight2int (const Symbol_weight_map& w) const;
char weight2char (const Symbol_weight_map& w) const;
inline const Symbol_score_map& int2score (int i) const { return i >= 0 ? nondegen_score[i] : degen_score[-i-1]; }
inline const Symbol_score_map& char2score (char c) const { return int2score (char2int_deg (c)); }
int score2int (const Symbol_score_map& s) const;
char score2char (const Symbol_score_map& s) const;
// Whole-sequence conversions.
// Each of these methods returns the second argument as a reference
// (not really DART's most up-to-date C++ coding practise, if such a standard exists)
Digitized_biosequence& seq2dsq (const Biosequence& seq, Digitized_biosequence& dsq) const;
Biosequence& dsq2seq (const Digitized_biosequence& dsq, Biosequence& seq) const;
Weight_profile& seq2weight (const Biosequence& seq, Weight_profile& prof) const;
Biosequence& weight2seq (const Weight_profile& prof, Biosequence& seq) const;
Score_profile& seq2score (const Biosequence& seq, Score_profile& prof) const;
Biosequence& score2seq (const Score_profile& prof, Biosequence& seq) const;
Weight_profile& dsq2weight (const Digitized_biosequence& dsq, Weight_profile& prof) const;
Digitized_biosequence& weight2dsq (const Weight_profile& prof, Digitized_biosequence& dsq) const; // no implementation yet
Score_profile& dsq2score (const Digitized_biosequence& dsq, Score_profile& prof) const;
Digitized_biosequence& score2dsq (const Score_profile& prof, Digitized_biosequence& dsq) const; // no implementation yet
// sequence conversion methods that dynamically allocate & return a new sequence
Digitized_biosequence new_seq2dsq (const Biosequence& seq) const;
Biosequence new_dsq2seq (const Digitized_biosequence& dsq) const;
Weight_profile new_seq2weight (const Biosequence& seq) const;
Biosequence new_weight2seq (const Weight_profile& prof) const;
Score_profile new_seq2score (const Biosequence& seq) const;
Biosequence new_score2seq (const Score_profile& prof) const;
Weight_profile new_dsq2weight (const Digitized_biosequence& dsq) const;
Digitized_biosequence new_weight2dsq (const Weight_profile& prof) const; // no implementation yet
Score_profile new_dsq2score (const Digitized_biosequence& dsq) const;
Digitized_biosequence new_score2dsq (const Score_profile& prof) const; // no implementation yet
// wildcard stuff
bool all_wild (const Score_profile& prof) const; // returns TRUE if Score_profile contains nothing but wildcards
bool all_wild (const Weight_profile& prof) const; // returns TRUE if Weight_profile contains nothing but wildcards
Symbol_weight_map flat_weight_map (double weight) const; // "*" (weight = 1) or "N" (DNA, weight = .25) or "X" (protein, weight = .05)
Symbol_score_map flat_score_map (int score) const; // like flat_weight_map() but in score space
// complement stuff
bool has_complement() const { return nondegen_comp.size() != 0; }
int complement (int sym) const;
Symbol_weight_map complement_weight (const Symbol_weight_map& m) const;
Symbol_score_map complement_score (const Symbol_score_map& m) const;
char complement_char (char c) const;
Weight_profile revcomp_weight_profile (const Weight_profile& p) const; // NB does a _reverse_ complement
Score_profile revcomp_score_profile (const Score_profile& p) const;
Digitized_biosequence revcomp_digitized_sequence (const Digitized_biosequence& dsq) const;
Biosequence revcomp_sequence (const Biosequence& s) const; // preserves case
// nmer stuff
int nmer_token (int word_len, const Digitized_biosequence::const_iterator iter);
int nmer_token (const char* nmer);
// regular expression helper
sstring degenerate_seq_pattern (const Biosequence& seq) const; // converts IUPAC degenerate sequence to regexp, e.g. "n" -> "[acgt]" for DNA
// method to return the affinity of an alphabet for a particular character
// used to figure out which alphabet corresponds to an anonymous input sequence (typically "is it DNA or Protein?")
inline double char_affinity (char c) const { return char_aff_table[c]; }
};
struct Empty_alphabet : public Alphabet
{
Empty_alphabet() : Alphabet ("", 0) { }
};
struct Simple_alphabet : public Alphabet
{
Simple_alphabet (const char* name, const char* chars);
};
struct DNA_alphabet_type : public Alphabet
{
DNA_alphabet_type();
};
struct RNA_alphabet_type : public Alphabet
{
RNA_alphabet_type();
};
struct Protein_alphabet_type : public Alphabet
{
Protein_alphabet_type();
};
struct Roman_alphabet_type : public Alphabet
{
Roman_alphabet_type();
};
struct Text_alphabet_type : public Alphabet
{
Text_alphabet_type();
};
extern DNA_alphabet_type DNA_alphabet;
extern RNA_alphabet_type RNA_alphabet;
extern Protein_alphabet_type Protein_alphabet;
extern Text_alphabet_type Text_alphabet;
extern Roman_alphabet_type Roman_alphabet;
extern Simple_alphabet Dummy_alphabet; // only has one symbol
// A Metascore is an array of scores associated with each residue in a sequence,
// representing splice site predictions, masking, or any other secondary scoring source.
typedef vector<Score> Metascore;
typedef vector<Prob> Metaprob; // probability-space version of a Metascore
// A Meta_profile is a base class for Named_profile, holding Score_profile & Metascore vector.
// (Metascores can be used to store scores associated with sequences, like splice site predictions or masks)
struct Meta_profile
{
Score_profile prof_sc; // sequence in score profile form
vector<Metascore> meta_sc; // metascores
};
// Profile_flags_enum, for specifying type conversions and the like
struct Profile_flags_enum
{
enum Profile_flags { NONE = 0, SEQ = 1, SCORE = 2, WEIGHT = 4, DSQ = 8, ALL = 15 };
};
// (name, start, end) tuple
struct NSE
{
// data
sstring seqname;
int start;
int end;
// constructors
NSE();
NSE (const sstring& name, int start, int end);
};
// A Named_profile holds name, "cruft" (i.e. the text following the name in FASTA files),
// sequence data (in text, digitized, score and/or probability form) and Metascores.
//
//
// No attempt is made to maintain consistency between alternative forms of sequence -- that is the caller's job.
// Sometimes, when non-IUPAC degenerate score profiles are used, it may not even be possible to interconvert.
//
// ...yes I *know* this is messy. it keeps me awake at nights. maybe one day i'll fix it. (chyeah)
//
struct Named_profile : Meta_profile, Profile_flags_enum
{
sstring name; // name
sstring cruft; // "cruft" (text following name in FASTA definition line)
Biosequence seq; // sequence in text form
Weight_profile prof_w; // sequence in probability profile form
Digitized_biosequence dsq; // sequence in digitised form
// regexps for FASTA-format input
static Regexp name_regexp;
static Regexp quick_name_regexp;
// regexp for Stockholm "NAME/START-END"
static Regexp nse_regexp;
// methods
// clear method
void clear();
// FASTA IO
void write_FASTA (ostream& out, int column_width = 50) const;
void read_FASTA (istream& in);
// metascore IO
void write_metaprob (int meta_idx, ostream& out, int column_width = 50) const;
void read_metaprob (int meta_idx, istream& in, bool expect_name = 1);
// method to check a file for FASTA form (initial ">")
static bool detect_FASTA (istream& in);
// quick FASTA output wrapper
void dump() const { write_FASTA (cout, 50); }
// all-purpose conversion method
void seq_update (const Alphabet& a, Profile_flags convert = ALL);
// conversion methods
void seq2score (const Alphabet& a) { a.seq2score (seq, prof_sc); }
void seq2weight (const Alphabet& a) { a.seq2weight (seq, prof_w); }
void seq2dsq (const Alphabet& a) { a.seq2dsq (seq, dsq); }
void score2seq (const Alphabet& a) { a.score2seq (prof_sc, seq); }
void score2weight() { prof_w = prof_sc.score2weight(); }
void score2dsq (const Alphabet& a); // no implementation yet
void weight2seq (const Alphabet& a) { a.weight2seq (prof_w, seq); }
void weight2score() { prof_sc = Score_profile (prof_w); }
void weight2dsq (const Alphabet& a); // no implementation yet
void dsq2seq (const Alphabet& a) { a.dsq2seq (dsq, seq); }
void dsq2score (const Alphabet& a) { a.dsq2score (dsq, prof_sc); }
void dsq2weight (const Alphabet& a); // no implementation yet
// consensus conversion methods
void consensus_score2seq (const Alphabet& a) { a.dsq2seq (prof_sc.consensus_dsq(), seq); }
void consensus_weight2seq (const Alphabet& a) { a.dsq2seq (prof_w.consensus_dsq(), seq); }
void consensus_score2dsq (const Alphabet& a) { dsq = prof_sc.consensus_dsq(); }
void consensus_weight2dsq (const Alphabet& a) { dsq = prof_w.consensus_dsq(); }
// reverse complement method
Named_profile revcomp (const Alphabet& a, const char* name_suffix = "", const char* cruft_suffix = "") const;
// metascore reset & update methods
void reset_metascores (int n_metascores, Score default_sc = 0);
void add_metascores (int metascore_idx, const Metascore& delta);
// size method
int size() const; // does a simple consistency check between profile lengths
// subseq extraction method
Named_profile subseq (int start, int len) const;
// Stockholm "NAME/START-END" parsing method
bool name_is_NSE() const;
NSE parse_name() const;
// the following method, assert_profiles_consistent(), checks the various alternate profile sizes for self-consistency.
// this is not a perfect check but it's close enough for National Lab work.
// each profile is checked if its length is nonzero (as with size()) OR if its test flag is set.
void assert_profiles_consistent (Profile_flags test = ALL) const;
};
// A Sequence_database is a very basic container for Named_profile's.
// NB it is generally not a great idea to instantiate a Sequence_database anywhere except
// at the top level of your program (i.e. don't give objects their own Sequence_database's)
// due to the ill-thought-out way that Alignments point to Score_profiles
// (the Alignment's pointers can be invalidated if the Sequence_database is copied).
// A general DART principle is that Sequence_database's are considered persistent,
// and iterators pointing to the Named_profile's inside will remain valid.
// This is why Sequence_database inherits from list<>, rather than e.g. vector<>
// (which tends to invalidate its iterators whenever a resizing is necessary).
//
struct Sequence_database : list<Named_profile>, Profile_flags_enum
{
// virtual destructor
virtual ~Sequence_database() { }
// methods
void write_FASTA (ostream& out, int column_width = 50) const;
void read_FASTA (istream& in);
void write_metaprob (int meta_idx, ostream& out, int column_width = 50) const;
void read_metaprob (int meta_idx, istream& in);
void to_lower();
void to_upper();
void seqs2scores (const Alphabet& a);
void seqs2weights (const Alphabet& a);
void seqs2dsqs (const Alphabet& a);
void seqs_update (const Alphabet& a, Profile_flags convert = ALL);
void scores2seqs (const Alphabet& a);
void scores2weights();
void scores2dsqs(); // no implementation yet
void weights2seqs (const Alphabet& a);
void weights2scores();
void weights2dsqs(); // no implementation yet
void dsqs2seqs (const Alphabet& a); // no implementation yet
void dsqs2scores(); // no implementation yet
void dsqs2weights(); // no implementation yet
void consensus_scores2seqs (const Alphabet& a);
void consensus_weights2seqs (const Alphabet& a);
void consensus_scores2dsqs (const Alphabet& a); // no implementation yet
void consensus_weights2dsqs (const Alphabet& a); // no implementation yet
void clear_seqs() { for_contents (list<Named_profile>, *this, prof) (*prof).seq.clear(); }
void clear_scores() { for_contents (list<Named_profile>, *this, prof) (*prof).prof_sc.clear(); }
void clear_weights() { for_contents (list<Named_profile>, *this, prof) (*prof).prof_w.clear(); }
void clear_dsqs() { for_contents (list<Named_profile>, *this, prof) (*prof).dsq.clear(); }
void reset_metascores (int n_metascores, Score default_sc = 0)
{ for_contents (list<Named_profile>, *this, prof) (*prof).reset_metascores (n_metascores, default_sc); }
Sequence_database revcomp (const Alphabet& a, const char* name_suffix = "", const char* cruft_suffix = "") const;
bool matches_alphabet (const Alphabet& alphabet, double threshold = .95) const; // returns TRUE if fraction of symbols represented by alphabet is greater than threshold
double alphabet_similarity (const Alphabet& alphabet) const; // each nondegenerate symbol score 1, each degenerate scores 0.5
const Alphabet& detect_alphabet() const; // returns DNA_alphabet, RNA_alphabet, Protein_alphabet or Text_alphabet, or dies
void count_symbol_frequencies (vector<double>& composition) const; // requires Digitized_biosequence's to be pre-computed by e.g. calling seqs2dsqs()
vector<Prob> get_null_model (int alphabet_size, double pseudocount = 0.) const;
Prob get_null_extend (double pseudocount = 0.) const; // optimal extension prob. for null model
double mean_length() const;
int total_residues() const;
// helper to append an entire Sequence_database
// returns iterator pointing to first new sequence
virtual iterator append_sequence_database (const Sequence_database& seq_db);
};
// a Sequence_database_index is a by-name indexing of a Sequence_database
//
struct Sequence_database_index
{
vector<Named_profile*> profile;
vector<sstring> name;
Phonebook name2profile_index;
Sequence_database_index() { }
Sequence_database_index (Sequence_database& seq_db) { update (seq_db); }
void update (Sequence_database& seq_db); // rebuilds profile, name & name2profile_index
int size() const { return profile.size(); }
void assert_names_unique (bool test_NSE_conflict = TRUE) const;
Named_profile* name2profile (const sstring& name) const;
};
// FASTA_sequence_database is a convenient class for reading in FASTA files
class FASTA_sequence_database : public Sequence_database
{
// data
private:
const Alphabet* my_alphabet;
public:
Sequence_database_index index;
// accessors
const Alphabet& alphabet() const { return *my_alphabet; }
int size() const { return index.size(); } // override size method for speed (vector::size() is constant)
const Named_profile& get_seq (int i) const { return *index.profile[i]; }
// constructors
FASTA_sequence_database (const char* filename = 0,
const Alphabet* alphabet = 0,
Profile_flags convert = ALL); // calls read_from_file()
FASTA_sequence_database (const Sequence_database& db,
const Alphabet* alphabet = 0);
// I/O
void read_FASTA (istream& in); // calls superclass method & then updates index, but does not call seqs_update()
void read_from_file (const char* filename,
const Alphabet* alphabet = 0,
Profile_flags convert = ALL); // calls seqs_update()
// helpers for updating index, alphabet
void update_index() { index.update (*this); }
void update_alphabet (const Alphabet* alphabet = 0);
// helper for manually adding sequences
void add_seq (const char* name, const char* seq, Profile_flags convert = ALL); // calls seq_update() on new sequence
// helper to append an entire Sequence_database (overrides Sequence_database method)
// returns iterator pointing to first new sequence
iterator append_sequence_database (const Sequence_database& seq_db);
};
#endif