// src/seq/biosequence.h

// biosequence code using STL
// ihh@lanl.gov, 1999/04/28

#ifndef BIOSEQUENCE_INCLUDED
#define BIOSEQUENCE_INCLUDED

#include <stdio.h>
#include <iostream>
#include <map>
#include <list>
#include <vector>
#include <algorithm>
#include "util/sstring.h"
#include "util/macros.h"
#include "util/score.h"
#include "util/phonebook.h"
#include "util/rnd.h"
#include "util/map_keys.h"

// Character that Alphabet::int2char() returns for a symbol index it cannot map to a character
#define Alphabet_nonsense_char '?'

// Default name argument of the Alphabet constructor
#define UNINITIALIZED_ALPHABET_NAME "Uninitialized"

// a Biosequence is a text representation of a sequence
//
typedef sstring Biosequence;

// A Digitized_biosequence is the numerical form of a sequence: a vector of integer symbol indices,
// related to the text form (Biosequence) through an Alphabet.
// NB implementation isn't quite there (will throw an exception for negative (=ambiguous) symbols)
//
class Digitized_biosequence : public vector<int>
{
public:
  // empty sequence
  Digitized_biosequence()
    : vector<int>()
  { }

  // copy of an existing vector of symbol indices
  Digitized_biosequence (const vector<int>& intvec)
    : vector<int> (intvec)
  { }

  // sequence of length sz, every position set to i (default -1)
  Digitized_biosequence (size_type sz, int i = -1)
    : vector<int> (sz, i)
  { }

  // scoring against a null model (implementations live outside this header)
  int  null_score (const vector<int>& null_model) const;
  int  stuttered_null_score (const vector<int>& null_model, const int stutter_sc) const;

  // symbol counting; results are delivered through 'composition'
  void count_symbol_frequencies (vector<double>& composition) const;

  // The "context" methods return a Digitized_biosequence where every site
  // encodes the last 'context_order' residues of the parent sequence
  // ...a hacky way of doing higher-order Markov chains. Probably best to ignore these methods.
  Digitized_biosequence make_context_dsq (int alphabet_size, int context_order) const;
  Digitized_biosequence make_reverse_context_dsq (int alphabet_size, int context_order) const;
  static Digitized_biosequence context_to_nmer (int context, int alphabet_size, int context_order);
};

// A Weight_profile is an ungapped profile in probability space: one Symbol_weight_map per position.
// It should probably have all the same methods as Score_profile, but it is more rarely used.
//
typedef pair <int, Prob>                   Symbol_weight;           // (symbol, weight)
typedef map <int, Prob>                    Symbol_weight_map;       // symbol -> weight
typedef Symbol_weight_map::const_iterator  Symbol_weight_iterator;

class Weight_profile : public vector <Symbol_weight_map>
{
 public:
  // empty profile
  Weight_profile ()
  { }

  // profile built from a digitized sequence (defined outside this header)
  Weight_profile (const Digitized_biosequence& dsq);

  // profile of 'len' positions, each holding a copy of 'weight_map'
  Weight_profile (int len, const Symbol_weight_map& weight_map)
    : vector<Symbol_weight_map> (len, weight_map)
  { }

  // returns most heavily weighted symbol in a Symbol_weight_map
  static int weight_map_consensus (const Symbol_weight_map& w);

  // consensus symbol of the column at 'pos' (no bounds check)
  int consensus (int pos) const
  {
    const Symbol_weight_map& column = (*this)[pos];
    return weight_map_consensus (column);
  }

  Digitized_biosequence consensus_dsq() const;

  // for each residue, re-scales symbol weights so that probability sum is 1
  void normalise();

  void count_symbol_frequencies (vector<double>& composition) const;
};

// a Vector_weight_profile is an ungapped probability profile like a Weight_profile,
// but stored as a vector of vectors of Prob rather than a vector of symbol->Prob maps
// (presumably outer index = position, inner index = symbol; confirm against callers)
typedef vector<vector<Prob> > Vector_weight_profile;

// A Score_profile is an ungapped profile in score space: one Symbol_score_map per position.
//
typedef pair <int, Score>                 Symbol_score;           // (symbol, score)
typedef map <int, Score>                  Symbol_score_map;       // symbol -> score
typedef Symbol_score_map::const_iterator  Symbol_score_iterator;

class Score_profile : public vector <Symbol_score_map>
{
 public:
  // constructors
  Score_profile () {}
  Score_profile (const Digitized_biosequence& dsq);
  Score_profile (const Weight_profile& profile);
  // profile of 'len' positions, each holding a copy of 'score_map'
  Score_profile (int len, const Symbol_score_map& score_map) : vector<Symbol_score_map> (len, score_map) {}

  // Should really make Symbol_score_map a proper class and put this class's static methods there (ditto Symbol_weight_map)
  //
  static int     score_map_consensus (const Symbol_score_map& s);                // returns highest scoring symbol in a Symbol_score_map
  // consensus symbol of the column at 'pos' (no bounds check)
  int            consensus (int pos) const { return score_map_consensus ((*this)[pos]); }

  static int     score_map_sample (const Symbol_score_map& s, double kT = 1);    // samples from a Symbol_score_map
  // Samples a symbol from the column at 'pos' (no bounds check).
  // kT is forwarded to score_map_sample(); it used to be dropped here, so callers always
  // got the default kT = 1 regardless of what they passed.
  int            sample (int pos, double kT = 1) const { return score_map_sample ((*this)[pos], kT); }

  static int     score_map_score (const Symbol_score_map& ssm, const vector<int>& null_model);
  int            null_score (const vector<int>& null_model) const;

  void           normalise();               // for each residue, re-scales symbol scores so that probability sum is 1

  Digitized_biosequence consensus_dsq() const;
  Digitized_biosequence sample_dsq (double kT = 1) const;

  Score_profile  evolve (const array2d<int>& substitution_matrix) const;            // evolves the sequence (substitutions only)
  vector<int>    null_scores_by_residue (const vector<int>& null_model) const;      // returns null score for each residue in the sequence

  int            inner_product (const Score_profile& prof) const;    // score of product_{columns} sum_{residues in column} P(residue|this) . P(residue|prof)

  Weight_profile score2weight() const;         // turns this Score_profile into a Weight_profile

  void           count_symbol_frequencies (vector<double>& composition) const;       // count symbol frequencies
};

// Alphabet class
//
// Converts between the character/token, integer, weight-map and score-map
// representations of sequence symbols.
//
class Alphabet
{
  // every symbol in the alphabet corresponds to a non-negative integer 0,1,2...(size-1)
  // negative integers represent ambiguous symbols, e.g. "N" "Y" "R" for DNA, "X" for proteins, and wildcards "*"
  // these ambiguities can be represented by a weighted sum over the real symbols in the alphabet
  //  e.g. "Y" = 0.5 * "C" + 0.5 * "T" (NB this is a probability distribution)
  //       "*" = "A" + "C" + "G" + "T" (NB this is not a probability distribution)
  // such weighted sums are represented by Symbol_weight_map's (in probability space) or Symbol_score_map's (in score space)
  //  these representations can be resolved by calling int2weight() or int2score()
  //  in fact there are a million trillion zillion different conversion functions we have to define, and some are as yet unused & unimplemented.
  //
 protected:
  int alphabet_size;  // number of nondegenerate symbols

  // maps indexed by the string representation of an alphabet token
  map<sstring,int> tok2int;

  // arrays indexed by the character representation of an alphabet token.
  // The inline accessors below index these through static_cast<unsigned char>, so that a char
  // with the high bit set cannot turn into a negative index where plain char is signed.
  // NOTE(review): this assumes each table holds one entry per unsigned char value -- confirm against reset().
  vector<int> char2int_table;     // a value of (alphabet_size) indicates unknown
  vector<double> char_aff_table;  // table of char affinities for this alphabet; used to match sequences to alphabets

  // arrays indexed by the integer representation of an alphabet token
  vector<int> nondegen_comp;      // complements for nondegenerate chars
  vector<int> degen_comp;         // complements for degenerate chars

  vector<sstring> tok;  // tokens for nondegenerate symbols (a token can be multiple chars)
  sstring chars_lc;  // nondegenerate chars, lower case
  sstring chars_uc;  // nondegenerate chars, upper case
  sstring degen_lc;  // degenerate chars, lower case
  sstring display_lc;  // what to output for nondegenerate symbols with no chars (used by hidden alphabets)

  vector<Symbol_weight_map> nondegen_weight;  // nondegenerate weight mappings
  vector<Symbol_score_map>  nondegen_score;   // nondegenerate score mappings
  vector<Symbol_weight_map> degen_weight;     // degenerate weight mappings
  vector<Symbol_score_map>  degen_score;      // degenerate score mappings

  int unknown_int;  // "N" for DNA, "X" for protein

  // warn-once flags. They are mutable because the const conversion method char2int_strict()
  // clears them after the first warning.
  mutable bool warn_randomise;  // flag saying whether to warn when randomising degenerate chars; cleared after first warning
  mutable bool warn_unknown;    // flag for unknown chars warning

 public:
  // name
  sstring name;  // name of alphabet, e.g. "DNA", "Protein"

  // case-sensitivity flag
  bool case_sensitive;

  // multiplier for counts, to turn fractional degenerate counts into integers. 12 for DNA/RNA, 20 for protein
  //   (explanation: ambiguated counts for DNA can be 1/2 (e.g. "r", "y"), 1/3 (e.g. "h", "v"), 1/4 ("n")
  //    lowest common denominator of (1/2, 1/3, 1/4) is 12)
  unsigned int degenerate_lcd;

  // Felsenstein wildcards
  Symbol_score_map wild_ssm;
  Symbol_weight_map wild_swm;

  // builder methods
  void reset (const char* name, int size);
  void init_tokens (const vector<sstring>& tokens);  // nondegenerate tokens
  void init_chars (const char* chars, const char* comp = "");  // nondegenerate chars
  void init_degen_chars (const char* chars);  // degenerate chars
  void init_degen_complement();
  void init_display (const sstring& chars);
  void set_degen_char (char degen_char, const Symbol_weight_map& p);
  void set_degen_char (char degen_char, const char* possibilities);
  void set_wildcard (char wildcard_char);
  void set_unknown (char unknown_char);

  // method to initialize a "hidden" alphabet from a "visible" alphabet
  // suppose the visible alphabet has A [nondegenerate] symbols, and the hidden_classes variable is equal to C.
  // then the hidden alphabet will have C*A symbols,
  // with each symbol of the visible alphabet representing a C-fold degenerate symbol in the hidden alphabet.
  // Symbol N in the visible alphabet is a degenerate char for symbols {N,A+N,2*A+N...(C-1)*A+N} in the hidden alphabet.
  void init_hidden (const Alphabet& alph, int hidden_classes = 1);
  void init_hidden (const Alphabet& base_alphabet, const vector<sstring>& hidden_class_labels);

  // constructor
  Alphabet (const char* name = UNINITIALIZED_ALPHABET_NAME, int size = 0);

  // methods
  int size() const { return alphabet_size; }

  // nondegenerate_chars() & nondegenerate_chars_toupper() return C strings containing the nondegenerate symbols (e.g. ACGT for DNA);
  // degenerate_chars() returns the degenerate chars in lower case.
  // The returned pointers refer to strings held by this object.
  const char* nondegenerate_chars() const { return chars_lc.c_str(); }
  const char* nondegenerate_chars_toupper() const { return chars_uc.c_str(); }
  const char* degenerate_chars() const { return degen_lc.c_str(); }

  // tokens() method is an accessor for tok vector
  const vector<sstring>& tokens() const { return tok; }

  // basic query/conversion methods

  // TRUE if c maps to a nondegenerate symbol index (0 <= index < alphabet_size)
  inline bool contains_strict (char c) const  // forbids degenerate chars
  {
    const int i = char2int_table[static_cast<unsigned char> (c)];
    return i >= 0 && i < alphabet_size;
  }

  // TRUE if c maps to a nondegenerate or degenerate symbol index.
  // NOTE(review): int2char() accepts degenerate indices down to -degen_lc.size() inclusive,
  // whereas the strict '>' here rejects that last index -- confirm whether this is intentional.
  inline bool contains_deg (char c) const  // allows degenerate chars
  {
    const int i = char2int_table[static_cast<unsigned char> (c)];
    return i > - (int) degen_lc.size() && i < alphabet_size;
  }

  // Converts c to a nondegenerate symbol index.
  // Unknown chars are replaced by a uniformly random symbol; degenerate chars by a symbol
  // drawn from their weight map. Each kind of replacement logs a warning the first time only.
  inline int char2int_strict (char c) const  // randomises degenerate/unknown chars
  {
    int i = char2int_table[static_cast<unsigned char> (c)];
    if (i >= alphabet_size)  // unknown char
      {
        if (warn_unknown) CLOGERR << "Randomising unknown characters\n";
        warn_unknown = false;
        i = alphabet_size > 1 ? Rnd::rnd_int (alphabet_size) : 0;  // don't call Rnd unless necessary
      }
    else if (i < 0)  // degenerate char
      {
        const Symbol_weight_map& deg_map = int2weight(i);
        const vector<int> degen_opt = map_keys (deg_map);
        const vector<Prob> degen_prob = map_values (deg_map);
        i = degen_opt.size() > 1 ? degen_opt[Rnd::choose(degen_prob)] : degen_opt[0];  // don't call Rnd unless necessary
        if (degen_opt.size() > 1)
          {
            if (warn_randomise) CLOGERR << "Randomising degenerate characters\n";
            warn_randomise = false;
          }
      }
    return i;
  }

  // Converts c to a symbol index, keeping degenerate indices as they are.
  // Unknown chars become unknown_int, or a random symbol if unknown_int is itself
  // not a valid index (>= alphabet_size); a message is logged in both cases.
  inline int char2int_deg (char c) const  // converts unknown chars to (unknown_int)
  {
    int i = char2int_table[static_cast<unsigned char> (c)];
    if (i >= alphabet_size)
      {
        i = unknown_int;
        if (i >= alphabet_size)
          {
            i = alphabet_size > 1 ? Rnd::rnd_int (alphabet_size) : 0;  // don't call Rnd unless necessary
            CLOGERR << "I don't think '" << c << "' matches alphabet " << name << "; randomizing to '" << int2char(i) << "'\n";
          }
        else
          CLOGERR << "I don't think '" << c << "' matches alphabet " << name << "; setting to '" << int2char(i) << "'\n";
      }
    return i;
  }

  // Converts a symbol index to its lower-case char.
  // Returns Alphabet_nonsense_char for indices with no char representation.
  inline char int2char (int i) const
  {
    if (i >= 0)
      {
        if (i >= alphabet_size) return Alphabet_nonsense_char;
        if (i >= (int) chars_lc.size())  // symbol has no char of its own: fall back to display_lc
          return i >= (int) display_lc.size() ? Alphabet_nonsense_char : display_lc[i];
        return chars_lc[i];
      }
    // negative index i refers to degenerate char number (-i-1)
    if (-i-1 >= (int) degen_lc.size()) return Alphabet_nonsense_char;
    return degen_lc[-i-1];
  }

  // As int2char(), but 'a'..'z' are mapped to 'A'..'Z'
  inline char int2char_uc (int i) const
  {
    int c = int2char (i);
    if (c >= 'a' && c <= 'z') c += 'A' - 'a';
    return c;
  }

  inline char unknown_char() const { return int2char (unknown_int); }
  inline int unknown_symbol_index() const { return unknown_int; }

  // TRUE if 'token' has an entry in tok2int
  inline bool contains_tok (const sstring& token) const
  {
    return tok2int.find (token) != tok2int.end();
  }

  // Symbol index for 'token', or unknown_int if the token has no entry in tok2int
  inline int token2int (const sstring& token) const {
    map<sstring,int>::const_iterator tok2int_iter = tok2int.find (token);
    return tok2int_iter == tok2int.end() ? unknown_int : tok2int_iter->second;
  }

  // Token for nondegenerate symbol index i.
  // Raises an error via THROWEXPR if i is out of range or no token is stored for it.
  inline const sstring& int2token (int i) const {
    if (i < 0 || i >= alphabet_size)
      THROWEXPR ("Bad alphabet symbol index in int2token");
    if (i >= (int) tok.size())
      THROWEXPR ("In int2token: tokens uninitialized");
    return tok[i];
  }

  // fancier conversion methods
  // (int2weight() and int2score() do no bounds checking; negative i selects degenerate entry (-i-1))

  inline const Symbol_weight_map& int2weight (int i) const { return i >= 0 ? nondegen_weight[i] : degen_weight[-i-1]; }
  inline const Symbol_weight_map& char2weight (char c) const { return int2weight (char2int_deg (c)); }

  int weight2int (const Symbol_weight_map& w) const;
  char weight2char (const Symbol_weight_map& w) const;

  inline const Symbol_score_map& int2score (int i) const { return i >= 0 ? nondegen_score[i] : degen_score[-i-1]; }
  inline const Symbol_score_map& char2score (char c) const { return int2score (char2int_deg (c)); }

  int score2int (const Symbol_score_map& s) const;
  char score2char (const Symbol_score_map& s) const;

  // Whole-sequence conversions.
  // Each of these methods returns the second argument as a reference
  // (not really DART's most up-to-date C++ coding practise, if such a standard exists)

  Digitized_biosequence&  seq2dsq  (const Biosequence& seq,           Digitized_biosequence& dsq) const;
  Biosequence&            dsq2seq  (const Digitized_biosequence& dsq, Biosequence& seq)           const;

  Weight_profile&         seq2weight  (const Biosequence& seq,     Weight_profile& prof) const;
  Biosequence&            weight2seq  (const Weight_profile& prof, Biosequence& seq)     const;

  Score_profile&          seq2score  (const Biosequence& seq,    Score_profile& prof) const;
  Biosequence&            score2seq  (const Score_profile& prof, Biosequence& seq) const;

  Weight_profile&         dsq2weight  (const Digitized_biosequence& dsq, Weight_profile& prof) const;
  Digitized_biosequence&  weight2dsq  (const Weight_profile& prof,       Digitized_biosequence& dsq) const;   // no implementation yet

  Score_profile&          dsq2score  (const Digitized_biosequence& dsq,  Score_profile& prof) const;
  Digitized_biosequence&  score2dsq  (const Score_profile& prof,         Digitized_biosequence& dsq) const;   // no implementation yet

  // sequence conversion methods that return a new sequence by value

  Digitized_biosequence  new_seq2dsq  (const Biosequence& seq) const;
  Biosequence            new_dsq2seq  (const Digitized_biosequence& dsq) const;

  Weight_profile         new_seq2weight  (const Biosequence& seq) const;
  Biosequence            new_weight2seq  (const Weight_profile& prof)     const;

  Score_profile          new_seq2score  (const Biosequence& seq) const;
  Biosequence            new_score2seq  (const Score_profile& prof) const;

  Weight_profile         new_dsq2weight  (const Digitized_biosequence& dsq) const;
  Digitized_biosequence  new_weight2dsq  (const Weight_profile& prof) const;   // no implementation yet

  Score_profile          new_dsq2score  (const Digitized_biosequence& dsq) const;
  Digitized_biosequence  new_score2dsq  (const Score_profile& prof) const;  // no implementation yet

  // wildcard stuff
  bool all_wild (const Score_profile& prof) const;     // returns TRUE if Score_profile contains nothing but wildcards
  bool all_wild (const Weight_profile& prof) const;    // returns TRUE if Weight_profile contains nothing but wildcards

  Symbol_weight_map       flat_weight_map (double weight) const;      // "*" (weight = 1) or "N" (DNA, weight = .25) or "X" (protein, weight = .05)
  Symbol_score_map        flat_score_map (int score) const;           // like flat_weight_map() but in score space

  // complement stuff
  bool                    has_complement() const { return nondegen_comp.size() != 0; }
  int                     complement (int sym) const;

  Symbol_weight_map       complement_weight (const Symbol_weight_map& m) const;
  Symbol_score_map        complement_score (const Symbol_score_map& m) const;
  char                    complement_char (char c) const;

  Weight_profile          revcomp_weight_profile (const Weight_profile& p) const;    // NB does a _reverse_ complement
  Score_profile           revcomp_score_profile (const Score_profile& p) const;
  Digitized_biosequence   revcomp_digitized_sequence (const Digitized_biosequence& dsq) const;
  Biosequence             revcomp_sequence (const Biosequence& s) const;             // preserves case

  // nmer stuff
  int nmer_token (int word_len, const Digitized_biosequence::const_iterator iter);
  int nmer_token (const char* nmer);

  // regular expression helper
  sstring degenerate_seq_pattern (const Biosequence& seq) const;  // converts IUPAC degenerate sequence to regexp, e.g. "n" -> "[acgt]" for DNA

  // method to return the affinity of an alphabet for a particular character
  // used to figure out which alphabet corresponds to an anonymous input sequence (typically "is it DNA or Protein?")
  inline double char_affinity (char c) const { return char_aff_table[static_cast<unsigned char> (c)]; }
};

// Alphabet constructed with an empty name and size 0
struct Empty_alphabet : Alphabet
{
  Empty_alphabet()
    : Alphabet ("", 0)
  { }
};

// Alphabet constructed from a name and a C string of chars
// (constructor defined outside this header; presumably 'chars' lists the nondegenerate symbols)
struct Simple_alphabet : Alphabet
{
  Simple_alphabet (const char* name,
                   const char* chars);
};

// Alphabet subclass configured entirely by its default constructor (defined outside this header);
// going by the name, presumably the DNA alphabet
struct DNA_alphabet_type : Alphabet
{
  DNA_alphabet_type ();
};

// Alphabet subclass configured entirely by its default constructor (defined outside this header);
// going by the name, presumably the RNA alphabet
struct RNA_alphabet_type : Alphabet
{
  RNA_alphabet_type ();
};

// Alphabet subclass configured entirely by its default constructor (defined outside this header);
// going by the name, presumably the protein alphabet
struct Protein_alphabet_type : Alphabet
{
  Protein_alphabet_type ();
};

// Alphabet subclass configured entirely by its default constructor (defined outside this header);
// going by the name, presumably an alphabet of Roman letters
struct Roman_alphabet_type : Alphabet
{
  Roman_alphabet_type ();
};

// Alphabet subclass configured entirely by its default constructor (defined outside this header);
// going by the name, presumably an alphabet for general text
struct Text_alphabet_type : Alphabet
{
  Text_alphabet_type ();
};

// Global alphabet instances.
// These are declarations only; the objects themselves are defined in another translation unit.
extern DNA_alphabet_type DNA_alphabet;
extern RNA_alphabet_type RNA_alphabet;
extern Protein_alphabet_type Protein_alphabet;
extern Text_alphabet_type Text_alphabet;
extern Roman_alphabet_type Roman_alphabet;
extern Simple_alphabet Dummy_alphabet;  // only has one symbol

// A Metascore is an array of scores associated with each residue in a sequence,
// representing splice site predictions, masking, or any other secondary scoring source.
// (Score and Prob are not declared in this header; presumably they come from util/score.h.)

typedef vector<Score> Metascore;
typedef vector<Prob>  Metaprob;   // probability-space version of a Metascore

// A Meta_profile is a base class for Named_profile, holding Score_profile & Metascore vector.
// (Metascores can be used to store scores associated with sequences, like splice site predictions or masks)
// Plain aggregate: both members are public and no consistency between them is enforced here.
struct Meta_profile
{
  Score_profile         prof_sc;  // sequence in score profile form
  vector<Metascore>     meta_sc;  // metascores (a vector of Metascore's, so several can be stored per sequence)
};

// Profile_flags_enum wraps the Profile_flags bit flags, used for specifying type conversions and the like.
// Each sequence representation has its own bit; ALL is the union of the four.
struct Profile_flags_enum
{
  enum Profile_flags
    {
      NONE   = 0,
      SEQ    = 1 << 0,
      SCORE  = 1 << 1,
      WEIGHT = 1 << 2,
      DSQ    = 1 << 3,
      ALL    = SEQ | SCORE | WEIGHT | DSQ
    };
};

// (name, start, end) tuple
// Returned by Named_profile::parse_name(), which handles Stockholm "NAME/START-END" names.
struct NSE
{
  // data
  sstring seqname;  // sequence name
  int     start;    // start coordinate (indexing convention not visible in this header)
  int     end;      // end coordinate (indexing convention not visible in this header)
  // constructors (defined outside this header)
  NSE();
  NSE (const sstring& name, int start, int end);
};

// A Named_profile holds name, "cruft" (i.e. the text following the name in FASTA files),
// sequence data (in text, digitized, score and/or probability form) and Metascores.
// 
//
// No attempt is made to maintain consistency between alternative forms of sequence -- that is the caller's job.
// Sometimes, when non-IUPAC degenerate score profiles are used, it may not even be possible to interconvert.
//
// ...yes I *know* this is messy. it keeps me awake at nights. maybe one day i'll fix it. (chyeah)
//
struct Named_profile : Meta_profile, Profile_flags_enum
{
  sstring               name;     // name
  sstring               cruft;    // "cruft" (text following name in FASTA definition line)
  Biosequence           seq;      // sequence in text form
  Weight_profile        prof_w;   // sequence in probability profile form
  Digitized_biosequence dsq;      // sequence in digitised form

  // regexps for FASTA-format input
  static Regexp name_regexp;
  static Regexp quick_name_regexp;

  // regexp for Stockholm "NAME/START-END"
  static Regexp nse_regexp;

  // methods
  // clear method
  void clear();

  // FASTA IO
  void write_FASTA (ostream& out, int column_width = 50) const;
  void read_FASTA (istream& in);

  // metascore IO
  void write_metaprob (int meta_idx, ostream& out, int column_width = 50) const;
  void read_metaprob (int meta_idx, istream& in, bool expect_name = 1);

  // method to check a file for FASTA form (initial ">")
  static bool detect_FASTA (istream& in);

  // quick FASTA output wrapper
  void dump() const { write_FASTA (cout, 50); }

  // all-purpose conversion method
  void seq_update (const Alphabet& a, Profile_flags convert = ALL);

  // conversion methods
  void seq2score (const Alphabet& a) { a.seq2score (seq, prof_sc); }
  void seq2weight (const Alphabet& a) { a.seq2weight (seq, prof_w); }
  void seq2dsq (const Alphabet& a) { a.seq2dsq (seq, dsq); }

  void score2seq (const Alphabet& a) { a.score2seq (prof_sc, seq); }
  void score2weight() { prof_w = prof_sc.score2weight(); }
  void score2dsq (const Alphabet& a);    // no implementation yet

  void weight2seq (const Alphabet& a) { a.weight2seq (prof_w, seq); }
  void weight2score() { prof_sc = Score_profile (prof_w); }
  void weight2dsq (const Alphabet& a);   // no implementation yet

  void dsq2seq (const Alphabet& a) { a.dsq2seq (dsq, seq); }
  void dsq2score (const Alphabet& a) { a.dsq2score (dsq, prof_sc); }
  void dsq2weight (const Alphabet& a);   // no implementation yet

  // consensus conversion methods
  void consensus_score2seq (const Alphabet& a) { a.dsq2seq (prof_sc.consensus_dsq(), seq); }
  void consensus_weight2seq (const Alphabet& a) { a.dsq2seq (prof_w.consensus_dsq(), seq); }

  void consensus_score2dsq (const Alphabet& a) { dsq = prof_sc.consensus_dsq(); }
  void consensus_weight2dsq (const Alphabet& a) { dsq = prof_w.consensus_dsq(); }

  // reverse complement method
  Named_profile revcomp (const Alphabet& a, const char* name_suffix = "", const char* cruft_suffix = "") const;

  // metascore reset & update methods
  void reset_metascores (int n_metascores, Score default_sc = 0);
  void add_metascores (int metascore_idx, const Metascore& delta);

  // size method
  int size() const;  // does a simple consistency check between profile lengths

  // subseq extraction method
  Named_profile subseq (int start, int len) const;

  // Stockholm "NAME/START-END" parsing method
  bool name_is_NSE() const;
  NSE parse_name() const;

  // the following method, assert_profiles_consistent(), checks the various alternate profile sizes for self-consistency.
  // this is not a perfect check but it's close enough for National Lab work.
  // each profile is checked if its length is nonzero (as with size()) OR if its test flag is set.
  void assert_profiles_consistent (Profile_flags test = ALL) const;
};


// A Sequence_database is a very basic container for Named_profile's.
// NB it is generally not a great idea to instantiate a Sequence_database anywhere except
// at the top level of your program (i.e. don't give objects their own Sequence_database's)
// due to the ill-thought-out way that Alignments point to Score_profiles
// (the Alignment's pointers can be invalidated if the Sequence_database is copied).
// A general DART principle is that Sequence_database's are considered persistent,
// and iterators pointing to the Named_profile's inside will remain valid.
// This is why Sequence_database inherits from list<>, rather than e.g. vector<>
// (which tends to invalidate its iterators whenever a resizing is necessary).
//
struct Sequence_database : list<Named_profile>, Profile_flags_enum
{
  // virtual destructor
  virtual ~Sequence_database()
  { }

  // FASTA IO
  void write_FASTA (ostream& out, int column_width = 50) const;
  void read_FASTA (istream& in);

  // metascore IO
  void write_metaprob (int meta_idx, ostream& out, int column_width = 50) const;
  void read_metaprob (int meta_idx, istream& in);

  // case conversion
  void to_lower();
  void to_upper();

  // whole-database conversions from text form
  void seqs2scores (const Alphabet& a);
  void seqs2weights (const Alphabet& a);
  void seqs2dsqs (const Alphabet& a);

  void seqs_update (const Alphabet& a, Profile_flags convert = ALL);

  // whole-database conversions from score profiles
  void scores2seqs (const Alphabet& a);
  void scores2weights();
  void scores2dsqs();      // no implementation yet

  // whole-database conversions from weight profiles
  void weights2seqs (const Alphabet& a);
  void weights2scores();
  void weights2dsqs();     // no implementation yet

  // whole-database conversions from digitised sequences
  void dsqs2seqs (const Alphabet& a);   // no implementation yet
  void dsqs2scores();                   // no implementation yet
  void dsqs2weights();                  // no implementation yet

  // consensus conversions
  void consensus_scores2seqs (const Alphabet& a);
  void consensus_weights2seqs (const Alphabet& a);

  void consensus_scores2dsqs (const Alphabet& a);       // no implementation yet
  void consensus_weights2dsqs (const Alphabet& a);      // no implementation yet

  // clear one representation in every Named_profile
  void clear_seqs()
  { for_contents (list<Named_profile>, *this, prof) prof->seq.clear(); }
  void clear_scores()
  { for_contents (list<Named_profile>, *this, prof) prof->prof_sc.clear(); }
  void clear_weights()
  { for_contents (list<Named_profile>, *this, prof) prof->prof_w.clear(); }
  void clear_dsqs()
  { for_contents (list<Named_profile>, *this, prof) prof->dsq.clear(); }

  // reset the metascores of every Named_profile
  void reset_metascores (int n_metascores, Score default_sc = 0)
  { for_contents (list<Named_profile>, *this, prof) prof->reset_metascores (n_metascores, default_sc); }

  Sequence_database revcomp (const Alphabet& a, const char* name_suffix = "", const char* cruft_suffix = "") const;

  // alphabet detection
  bool            matches_alphabet (const Alphabet& alphabet, double threshold = .95) const;   // returns TRUE if fraction of symbols represented by alphabet is greater than threshold
  double alphabet_similarity (const Alphabet& alphabet) const;  // each nondegenerate symbol score 1, each degenerate scores 0.5
  const Alphabet& detect_alphabet() const;  // returns DNA_alphabet, RNA_alphabet, Protein_alphabet or Text_alphabet, or dies

  // composition & length statistics
  void count_symbol_frequencies (vector<double>& composition) const;   // requires Digitized_biosequence's to be pre-computed by e.g. calling seqs2dsqs()
  vector<Prob> get_null_model (int alphabet_size, double pseudocount = 0.) const;
  Prob get_null_extend (double pseudocount = 0.) const;  // optimal extension prob. for null model
  double mean_length() const;
  int total_residues() const;

  // helper to append an entire Sequence_database
  // returns iterator pointing to first new sequence
  virtual iterator append_sequence_database (const Sequence_database& seq_db);
};

// a Sequence_database_index is a by-name indexing of a Sequence_database
// It stores raw pointers to the database's Named_profile's and does not own them.
//
struct Sequence_database_index
{
  vector<Named_profile*> profile;             // pointers to the indexed Named_profile's
  vector<sstring>        name;                // names (rebuilt by update())
  Phonebook              name2profile_index;  // name lookup table (rebuilt by update())

  // empty index
  Sequence_database_index()
  { }

  // index built straight away from seq_db
  Sequence_database_index (Sequence_database& seq_db)
  { update (seq_db); }

  void update (Sequence_database& seq_db);  // rebuilds profile, name & name2profile_index

  // number of indexed profiles
  int  size() const
  { return (int) profile.size(); }

  void assert_names_unique (bool test_NSE_conflict = TRUE) const;

  Named_profile* name2profile (const sstring& name) const;
};

// FASTA_sequence_database is a convenient class for reading in FASTA files
// It couples a Sequence_database with an alphabet pointer and a by-name index.

class FASTA_sequence_database : public Sequence_database
{
  // data
private:
  const Alphabet* my_alphabet;
public:
  Sequence_database_index index;

  // accessors

  // the alphabet (dereferences my_alphabet without a null check)
  const Alphabet& alphabet() const
  { return *my_alphabet; }

  // override size method for speed (vector::size() is constant)
  int size() const
  { return index.size(); }

  // i'th indexed sequence (no bounds check)
  const Named_profile& get_seq (int i) const
  {
    const Named_profile* entry = index.profile[i];
    return *entry;
  }

  // constructors
  FASTA_sequence_database (const char* filename = 0,
                           const Alphabet* alphabet = 0,
                           Profile_flags convert = ALL);   // calls read_from_file()

  FASTA_sequence_database (const Sequence_database& db,
                           const Alphabet* alphabet = 0);

  // I/O
  void read_FASTA (istream& in);  // calls superclass method & then updates index, but does not call seqs_update()
  void read_from_file (const char* filename,
                       const Alphabet* alphabet = 0,
                       Profile_flags convert = ALL);   // calls seqs_update()

  // helpers for updating index, alphabet
  void update_index()
  { index.update (*this); }
  void update_alphabet (const Alphabet* alphabet = 0);

  // helper for manually adding sequences
  void add_seq (const char* name, const char* seq, Profile_flags convert = ALL);  // calls seq_update() on new sequence

  // helper to append an entire Sequence_database (overrides Sequence_database method)
  // returns iterator pointing to first new sequence
  iterator append_sequence_database (const Sequence_database& seq_db);
};

#endif