Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
2553 lines (2238 sloc) 84.7 KB
/* Implements the standard digitized alphabets for biosequences.
*
* 1. ESL_ALPHABET object for digital alphabets.
* 2. Digitized sequences (ESL_DSQ *).
* 3. Other routines in the API.
* 4. Unit tests.
* 5. Test driver.
* 6. Examples.
*/
#include "esl_config.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#ifdef HAVE_STRINGS_H
#include <strings.h> /* POSIX strcasecmp() */
#endif
#include "easel.h"
#include "esl_alphabet.h"
/*****************************************************************
* 1. The ESL_ALPHABET object
*****************************************************************/
static ESL_ALPHABET *create_rna(void);
static ESL_ALPHABET *create_dna(void);
static ESL_ALPHABET *create_amino(void);
static ESL_ALPHABET *create_coins(void);
static ESL_ALPHABET *create_dice(void);
static int set_complementarity(ESL_ALPHABET *a);
/* Function: esl_alphabet_Create()
* Synopsis: Create alphabet of a standard type.
*
* Purpose: Creates one of the three standard bio alphabets:
* <eslDNA>, <eslRNA>, or <eslAMINO>, and returns
* a pointer to it.
*
* Args: type - <eslDNA>, <eslRNA>, or <eslAMINO>.
*
* Returns: pointer to the new alphabet.
*
* Throws: <NULL> if any allocation or initialization fails.
*/
ESL_ALPHABET *
esl_alphabet_Create(int type)
{
ESL_ALPHABET *a = NULL;
switch(type) {
case eslRNA: a = create_rna(); break;
case eslDNA: a = create_dna(); break;
case eslAMINO: a = create_amino(); break;
case eslCOINS: a = create_coins(); break;
case eslDICE: a = create_dice(); break;
default: esl_fatal("bad alphabet type: unrecognized"); // violation: must be a code error, not user.
}
return a;
}
/* Function: esl_alphabet_CreateCustom()
* Synopsis: Create a custom alphabet.
*
* Purpose: Creates a customized biosequence alphabet,
* and returns a ptr to it. The alphabet type is set
* to <eslNONSTANDARD>.
*
* <alphabet> is the internal alphabet string;
* <K> is the size of the base alphabet;
* <Kp> is the total size of the alphabet string.
*
* In the alphabet string, residues <0..K-1> are the base alphabet;
* residue <K> is the canonical gap (indel) symbol;
* residues <K+1..Kp-4> are additional degeneracy symbols (possibly 0 of them);
* residue <Kp-3> is an "any" symbol (such as N or X);
* residue <Kp-2> is a "nonresidue" symbol (such as *);
* and residue <Kp-1> is a "missing data" gap symbol.
*
* The two gap symbols, the nonresidue, and the "any"
* symbol are mandatory even for nonstandard alphabets, so
* <Kp> $\geq$ <K+4>.
*
* Args: alphabet - internal alphabet; example "ACGT-RYMKSWHBVDN*~"
* K - base size; example 4
* Kp - total size, including gap, degeneracies; example 18
*
* Returns: pointer to new <ESL_ALPHABET> structure.
*
* Throws: <NULL> if any allocation or initialization fails.
*/
ESL_ALPHABET *
esl_alphabet_CreateCustom(const char *alphabet, int K, int Kp)
{
ESL_ALPHABET *a = NULL;
int c,x,y;
int status;
/* Argument checks.
*/
if (strlen(alphabet) != Kp) ESL_XEXCEPTION(eslEINVAL, "alphabet length != Kp");
if (Kp < K+4) ESL_XEXCEPTION(eslEINVAL, "Kp too small in alphabet");
/* Allocation/init, level 1.
*/
ESL_ALLOC(a, sizeof(ESL_ALPHABET));
a->sym = NULL;
a->degen = NULL;
a->ndegen = NULL;
a->complement = NULL;
/* Allocation/init, level 2.
*/
ESL_ALLOC(a->sym, sizeof(char) * (Kp+1));
ESL_ALLOC(a->ndegen, sizeof(int) * Kp);
ESL_ALLOC(a->degen, sizeof(char *) * Kp);
a->degen[0] = NULL;
/* Allocation/init, level 3.
*/
ESL_ALLOC(a->degen[0], sizeof(char) * (Kp*K));
for (x = 1; x < Kp; x++)
a->degen[x] = a->degen[0]+(K*x);
/* Initialize the internal alphabet:
*/
a->type = eslNONSTANDARD;
a->K = K;
a->Kp = Kp;
strcpy(a->sym, alphabet);
/* Initialize the input map, mapping ASCII seq chars to digital codes,
* and eslDSQ_ILLEGAL for everything else.
*/
for (c = 0; c < 128; c++) a->inmap[c] = eslDSQ_ILLEGAL;
for (x = 0; x < a->Kp; x++) a->inmap[(int) a->sym[x]] = x;
/* Initialize the degeneracy map:
* Base alphabet (first K syms) are automatically
* mapped uniquely; (Kp-3) is assumed to be
* the "any" character; other degen chars (K+1..Kp-4) are
* unset; gap, nonresidue, missing character are unmapped (ndegen=0)
*/
for (x = 0; x < a->Kp; x++) /* clear everything */
{
a->ndegen[x] = 0;
for (y = 0; y < a->K; y++) a->degen[x][y] = 0;
}
for (x = 0; x < a->K; x++) /* base alphabet */
{
a->ndegen[x] = 1;
a->degen[x][x] = 1;
}
/* "any" character */
a->ndegen[Kp-3] = K;
for (x = 0; x < a->K; x++) a->degen[Kp-3][x] = 1;
return a;
ERROR:
esl_alphabet_Destroy(a);
return NULL;
}
/* create_rna():
* Creates a standard RNA alphabet.
*/
static ESL_ALPHABET *
create_rna(void)
{
ESL_ALPHABET *a = NULL;
int status;
/* Create the fundamental alphabet
*/
if ((a = esl_alphabet_CreateCustom("ACGU-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
a->type = eslRNA;
/* Add desired synonyms in the input map.
*/
esl_alphabet_SetEquiv(a, 'T', 'U'); /* read T as a U */
esl_alphabet_SetEquiv(a, 'X', 'N'); /* read X as an N (many seq maskers use X) */
esl_alphabet_SetEquiv(a, 'I', 'A'); /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
/* Define degenerate symbols.
*/
esl_alphabet_SetDegeneracy(a, 'R', "AG");
esl_alphabet_SetDegeneracy(a, 'Y', "CU");
esl_alphabet_SetDegeneracy(a, 'M', "AC");
esl_alphabet_SetDegeneracy(a, 'K', "GU");
esl_alphabet_SetDegeneracy(a, 'S', "CG");
esl_alphabet_SetDegeneracy(a, 'W', "AU");
esl_alphabet_SetDegeneracy(a, 'H', "ACU");
esl_alphabet_SetDegeneracy(a, 'B', "CGU");
esl_alphabet_SetDegeneracy(a, 'V', "ACG");
esl_alphabet_SetDegeneracy(a, 'D', "AGU");
if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
return a;
ERROR:
esl_alphabet_Destroy(a);
return NULL;
}
/* create_dna():
* creates and returns a standard DNA alphabet.
*/
static ESL_ALPHABET *
create_dna(void)
{
ESL_ALPHABET *a = NULL;
int status;
/* Create the fundamental alphabet.
*/
if ((a = esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 18)) == NULL) return NULL;
a->type = eslDNA;
/* Add desired synonyms in the input map.
*/
esl_alphabet_SetEquiv(a, 'U', 'T'); /* read U as a T */
esl_alphabet_SetEquiv(a, 'X', 'N'); /* read X as an N (many seq maskers use X) */
esl_alphabet_SetEquiv(a, 'I', 'A'); /* Inosine is a deaminated Adenosine, appears in some RNACentral sequences */
esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
/* Define IUBMB degenerate symbols other than the N.
*/
esl_alphabet_SetDegeneracy(a, 'R', "AG");
esl_alphabet_SetDegeneracy(a, 'Y', "CT");
esl_alphabet_SetDegeneracy(a, 'M', "AC");
esl_alphabet_SetDegeneracy(a, 'K', "GT");
esl_alphabet_SetDegeneracy(a, 'S', "CG");
esl_alphabet_SetDegeneracy(a, 'W', "AT");
esl_alphabet_SetDegeneracy(a, 'H', "ACT");
esl_alphabet_SetDegeneracy(a, 'B', "CGT");
esl_alphabet_SetDegeneracy(a, 'V', "ACG");
esl_alphabet_SetDegeneracy(a, 'D', "AGT");
if ( (status = set_complementarity(a)) != eslOK) goto ERROR;
return a;
ERROR:
esl_alphabet_Destroy(a);
return NULL;
}
/* create_amino():
* Creates a new standard amino acid alphabet.
*/
static ESL_ALPHABET *
create_amino(void)
{
ESL_ALPHABET *a = NULL;
/* Create the internal alphabet
*/
if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX*~", 20, 29)) == NULL) return NULL;
a->type = eslAMINO;
/* Add desired synonyms in the input map.
*/
esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
/* Define IUPAC degenerate symbols other than the X.
*/
esl_alphabet_SetDegeneracy(a, 'B', "ND");
esl_alphabet_SetDegeneracy(a, 'J', "IL");
esl_alphabet_SetDegeneracy(a, 'Z', "QE");
/* Define unusual residues as one-to-one degeneracies.
*/
esl_alphabet_SetDegeneracy(a, 'U', "C"); /* selenocysteine is scored as cysteine */
esl_alphabet_SetDegeneracy(a, 'O', "K"); /* pyrrolysine is scored as lysine */
return a;
}
/* create_coins():
* Creates a toy alphabet for coin examples
*/
static ESL_ALPHABET *
create_coins(void)
{
ESL_ALPHABET *a = NULL;
/* Create the internal alphabet
*/
if ((a = esl_alphabet_CreateCustom("HT-X*~", 2, 6)) == NULL) return NULL;
a->type = eslCOINS;
/* Add desired synonyms in the input map.
*/
esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
/* There are no degeneracies in the coin alphabet. */
return a;
}
/* create_dice():
* Creates a toy alphabet for dice examples
*/
static ESL_ALPHABET *
create_dice(void)
{
ESL_ALPHABET *a = NULL;
/* Create the internal alphabet
*/
if ((a = esl_alphabet_CreateCustom("123456-X*~", 6, 10)) == NULL) return NULL;
a->type = eslCOINS;
/* Add desired synonyms in the input map.
*/
esl_alphabet_SetEquiv(a, '_', '-'); /* allow _ as a gap too */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap too */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input */
/* There are no degeneracies in the dice alphabet. */
return a;
}
/* set_complementarity()
* Builds the "complement" lookup table for DNA, RNA alphabets.
*
* Throws <eslEINVAL> if the alphabet isn't <eslDNA> or <eslRNA>.
*/
static int
set_complementarity(ESL_ALPHABET *a)
{
int status;
if (a->type != eslRNA && a->type != eslDNA)
ESL_EXCEPTION(eslEINVAL, "alphabet isn't nucleic: no complementarity to set");
/* We will assume that Kp=18 and sym="ACGT-RYMKSWHBVDN*~" (or RNA equiv).
* Bug #h108 happened because routine fell out of sync w/ a change in alphabet.
* Don't let that happen again.
*/
ESL_DASSERT1(( a->Kp == 18 ));
ESL_DASSERT1(( a->sym[17] == '~' ));
ESL_ALLOC(a->complement, sizeof(ESL_DSQ) * a->Kp);
a->complement[0] = 3; /* A->T */
a->complement[1] = 2; /* C->G */
a->complement[2] = 1; /* G->C */
a->complement[3] = 0; /* T->A */
a->complement[4] = 4; /* - - */
a->complement[5] = 6; /* R->Y */
a->complement[6] = 5; /* Y->R */
a->complement[7] = 8; /* M->K */
a->complement[8] = 7; /* K->M */
a->complement[9] = 9; /* S S */
a->complement[10]= 10; /* W W */
a->complement[11]= 14; /* H->D */
a->complement[12]= 13; /* B->V */
a->complement[13]= 12; /* V->B */
a->complement[14]= 11; /* D->H */
a->complement[15]= 15; /* N N */
a->complement[16]= 16; /* * * */
a->complement[17]= 17; /* ~ ~ */
return eslOK;
ERROR:
return status;
}
/* Function: esl_alphabet_SetEquiv()
* Synopsis: Define an equivalent symbol.
*
* Purpose: Maps an additional input alphabetic symbol <sym> to
* an internal alphabet symbol <c>; for example,
* we might map T to U for an RNA alphabet, so that we
* allow for reading input DNA sequences.
*
* Args: sym - symbol to allow in the input alphabet; 'T' for example
* c - symbol to map <sym> to in the internal alphabet; 'U' for example
*
* Returns: <eslOK> on success.
*
* Throws: <eslEINVAL> if <c> is not in the internal alphabet, or if <sym> is.
*/
int
esl_alphabet_SetEquiv(ESL_ALPHABET *a, char sym, char c)
{
char *sp = NULL;
ESL_DSQ x;
/* Contract checks */
if ((sp = strchr(a->sym, sym)) != NULL)
ESL_EXCEPTION(eslEINVAL, "symbol %c is already in internal alphabet, can't equivalence it", sym);
if ((sp = strchr(a->sym, c)) == NULL)
ESL_EXCEPTION(eslEINVAL, "char %c not in the alphabet, can't map to it", c);
x = sp - a->sym;
a->inmap[(int) sym] = x;
return eslOK;
}
/* Function: esl_alphabet_SetCaseInsensitive()
* Synopsis: Make an alphabet's input map case-insensitive.
*
* Purpose: Given a custom alphabet <a>, with all equivalences set,
* make the input map case-insensitive: for every
* letter that is mapped in either lower or upper
* case, map the other case to the same internal
* residue.
*
* For the standard alphabets, this is done automatically.
*
* Args: a - alphabet to make case-insensitive.
*
* Returns: <eslOK> on success.
*
* Throws: <eslECORRUPT> if any lower/uppercase symbol pairs
* are already both mapped to different symbols.
*/
int
esl_alphabet_SetCaseInsensitive(ESL_ALPHABET *a)
{
int lc, uc;
for (lc = 'a'; lc <= 'z'; lc++)
{
uc = toupper(lc);
if (esl_abc_CIsValid(a, lc) && ! esl_abc_CIsValid(a, uc)) a->inmap[uc] = a->inmap[lc];
else if (esl_abc_CIsValid(a, uc) && ! esl_abc_CIsValid(a, lc)) a->inmap[lc] = a->inmap[uc];
else if (esl_abc_CIsValid(a, lc) && esl_abc_CIsValid(a, uc) && a->inmap[uc] != a->inmap[lc])
ESL_EXCEPTION(eslECORRUPT, "symbols %c and %c map differently already (%c vs. %c)",
lc, uc, a->inmap[lc], a->inmap[uc]);
}
return eslOK;
}
/* Function: esl_alphabet_SetDegeneracy()
* Synopsis: Define degenerate symbol in custom alphabet.
*
* Purpose: Given an alphabet under construction,
* define the degenerate character <c> to mean
* any of the characters in the string <ds>.
*
* <c> must exist in the digital alphabet, as
* one of the optional degenerate residues (<K+1>..<Kp-3>).
* All the characters in the <ds> string must exist
* in the canonical alphabet (<0>..<K-1>).
*
* You may not redefine the mandatory all-degenerate character
* (typically <N> or <X>; <Kp-3> in the digital alphabet).
* It is defined automatically in all alphabets.
*
* Args: a - an alphabet under construction.
* c - degenerate character code; example: 'R'
* ds - string of base characters for c; example: "AG"
*
* Returns: <eslOK> on success.
*
* Throws: <eslEINVAL> if <c> or <ds> arguments aren't valid.
*/
int
esl_alphabet_SetDegeneracy(ESL_ALPHABET *a, char c, char *ds)
{
char *sp;
ESL_DSQ x,y;
if ((sp = strchr(a->sym, c)) == NULL)
ESL_EXCEPTION(eslEINVAL, "no such degenerate character");
x = sp - a->sym;
/* A degenerate character must have code K+1..Kp-4.
* Kp-3, the all-degenerate character, is automatically
* created, and can't be remapped.
*/
if (x == a->Kp-3)
ESL_EXCEPTION(eslEINVAL, "can't redefine all-degenerate char %c", c);
if (x < a->K+1 || x >= a->Kp-2)
ESL_EXCEPTION(eslEINVAL, "char %c isn't in expected position in alphabet", c);
while (*ds != '\0') {
if ((sp = strchr(a->sym, *ds)) == NULL) ESL_EXCEPTION(eslEINVAL, "no such base character");
y = sp - a->sym;
if (! esl_abc_XIsCanonical(a, y)) ESL_EXCEPTION(eslEINVAL, "can't map degeneracy to noncanonical character");
a->degen[x][y] = 1;
a->ndegen[x]++;
ds++;
}
return eslOK;
}
/* Function: esl_alphabet_SetIgnored()
* Synopsis: Define a set of characters to be ignored in input.
*
* Purpose: Given an alphabet <a> (either standard or custom), define
* all the characters in string <ignoredchars> to be
* unmapped: valid, but ignored when converting input text.
*
* By default, the standard alphabets do not define any
* ignored characters.
*
* The most common ignored characters would be space, tab,
* and digits, to skip silently over whitespace and
* sequence coordinates when parsing loosely-defined
* sequence file formats.
*
* Args: a - alphabet to modify
* ignoredchars - string listing characters to ignore; i.e. " \t"
*
* Returns: <eslOK> on success.
*/
int
esl_alphabet_SetIgnored(ESL_ALPHABET *a, const char *ignoredchars)
{
int i;
for (i = 0; ignoredchars[i] != '\0'; i++) a->inmap[(int)ignoredchars[i]] = eslDSQ_IGNORED;
return eslOK;
}
/* Function: esl_alphabet_Sizeof()
* Synopsis: Returns size of an alphabet object, in bytes.
*
* Purpose: Returns the size of alphabet <a> object, in bytes.
*/
size_t
esl_alphabet_Sizeof(ESL_ALPHABET *a)
{
size_t n = 0;
n += sizeof(ESL_ALPHABET);
n += sizeof(char) * a->Kp; /* a->sym */
n += sizeof(char *) * a->Kp; /* a->degen */
n += sizeof(char) * (a->Kp * a->K); /* a->degen[][] */
n += sizeof(int) * a->Kp; /* a->ndegen */
if (a->complement) n += sizeof(ESL_DSQ) * a->Kp; /* a->complement */
return n;
}
/* Function: esl_alphabet_Destroy()
* Synopsis: Frees an alphabet object.
*
* Purpose: Free's an <ESL_ALPHABET> structure.
*
* Args: a - the <ESL_ALPHABET> to free.
*
* Returns: (void).
*/
void
esl_alphabet_Destroy(ESL_ALPHABET *a)
{
if (a)
{
if (a->sym) free(a->sym);
if (a->ndegen) free(a->ndegen);
if (a->degen)
{
if (a->degen[0]) free(a->degen[0]);
free(a->degen);
}
if (a->complement) free(a->complement);
free(a);
}
}
/*--------------- end, ESL_ALPHABET object ----------------------*/
/*****************************************************************
* 2. Digitized sequences (ESL_DSQ *)
*****************************************************************/
/* Design note: SRE, Mon Sep 18 09:11:41 2006
*
* An ESL_DSQ is considered to a special string type, equivalent to
* <char *>, and is not considered to be an Easel "object". Thus it
* does not have a standard object API. Rather, the caller deals with
* an ESL_DSQ directly: allocate for <(L+2)*sizeof(ESL_DSQ)> to leave
* room for sentinels at <0> and <L+1>.
*
* Additionally, an ESL_DSQ is considered to be "trusted"
* data: we're 'guaranteed' that anything in an ESL_DSQ is a valid
* symbol, so we don't need to error-check. Anything else is a programming
* error.
*/
/* Function: esl_abc_CreateDsq()
* Synopsis: Digitizes a sequence into new space.
*
* Purpose: Given an alphabet <a> and an ASCII sequence <seq>,
* digitize the sequence into newly allocated space, and
* return a pointer to that space in <ret_dsq>.
*
* Args: a - internal alphabet
* seq - text sequence to be digitized
* ret_dsq - RETURN: the new digital sequence
*
* Returns: <eslOK> on success, and <ret_dsq> contains the digitized
* sequence; caller is responsible for free'ing this
* memory. Returns <eslEINVAL> if <seq> contains
* one or more characters that are not in the input map of
* alphabet <a>. If this happens, <ret_dsq> is still valid upon
* return: invalid characters are replaced by full ambiguities
* (typically X or N).
*
* Throws: <eslEMEM> on allocation failure.
*
* Xref: STL11/63
*/
int
esl_abc_CreateDsq(const ESL_ALPHABET *a, const char *seq, ESL_DSQ **ret_dsq)
{
ESL_DSQ *dsq = NULL;
int status;
int64_t L;
L = strlen(seq);
ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (L+2));
status = esl_abc_Digitize(a, seq, dsq);
if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
return status;
ERROR:
if (dsq != NULL) free(dsq);
if (ret_dsq != NULL) *ret_dsq = NULL;
return status;
}
/* Function: esl_abc_Digitize()
* Synopsis: Digitizes a sequence into existing space.
*
* Purpose: Given an alphabet <a> and a nul-terminated ASCII sequence
* <seq>, digitize the sequence and put it in <dsq>. Caller
* provides space in <dsq> allocated for at least <L+2>
* <ESL_DSQ> residues, where <L> is the length of <seq>.
*
* Args: a - internal alphabet
* seq - text sequence to be digitized (\0-terminated)
* dsq - RETURN: the new digital sequence (caller allocates,
* at least <(L+2) * sizeof(ESL_DSQ)>).
*
* Returns: <eslOK> on success.
* Returns <eslEINVAL> if <seq> contains one or more characters
* that are not recognized in the alphabet <a>. (This is classed
* as a normal error, because the <seq> may be untrusted user input.)
* If this happens, the digital sequence <dsq> is still valid upon
* return; invalid ASCII characters are replaced by ambiguities
* (X or N).
*/
int
esl_abc_Digitize(const ESL_ALPHABET *a, const char *seq, ESL_DSQ *dsq)
{
int status;
int64_t i; /* position in seq */
int64_t j; /* position in dsq */
ESL_DSQ x;
status = eslOK;
dsq[0] = eslDSQ_SENTINEL;
for (i = 0, j = 1; seq[i] != '\0'; i++)
{
x = a->inmap[(int) seq[i]];
if (esl_abc_XIsValid(a, x)) dsq[j] = x;
else if (x == eslDSQ_IGNORED) continue;
else {
status = eslEINVAL;
dsq[j] = esl_abc_XGetUnknown(a);
}
j++;
}
dsq[j] = eslDSQ_SENTINEL;
return status;
}
/* Function: esl_abc_Textize()
* Synopsis: Convert digital sequence to text.
*
* Purpose: Make an ASCII sequence <seq> by converting a digital
* sequence <dsq> of length <L> back to text, according to
* the digital alphabet <a>.
*
* Caller provides space in <seq> allocated for at least
* <L+1> bytes (<(L+1) * sizeof(char)>).
*
* Args: a - internal alphabet
* dsq - digital sequence to be converted (1..L)
* L - length of dsq
* seq - RETURN: the new text sequence (caller allocated
* space, at least <(L+1) * sizeof(char)>).
*
* Returns: <eslOK> on success.
*/
int
esl_abc_Textize(const ESL_ALPHABET *a, const ESL_DSQ *dsq, int64_t L, char *seq)
{
int64_t i;
for (i = 0; i < L; i++)
seq[i] = a->sym[dsq[i+1]];
seq[i] = '\0';
return eslOK;
}
/* Function: esl_abc_TextizeN()
* Synopsis: Convert subsequence from digital to text.
*
* Purpose: Similar in semantics to <strncpy()>, this procedure takes
* a window of <L> residues in a digitized sequence
* starting at the residue pointed to by <dptr>,
* converts them to ASCII text representation, and
* copies them into the buffer <buf>.
*
* <buf> must be at least <L> residues long; <L+1>, if the
* caller needs to NUL-terminate it.
*
* If a sentinel byte is encountered in the digitized
* sequence before <L> residues have been copied, <buf> is
* NUL-terminated there. Otherwise, like <strncpy()>, <buf>
* will not be NUL-terminated.
*
* Note that because digital sequences are indexed <1..N>,
* not <0..N-1>, the caller must be careful about
* off-by-one errors in <dptr>. For example, to copy from
* the first residue of a digital sequence <dsq>, you must
* pass <dptr=dsq+1>, not <dptr=dsq>. The text in <buf>
* on the other hand is a normal C string indexed <0..L-1>.
*
* Args: a - reference to an internal alphabet
* dptr - ptr to starting residue in a digital sequence
* L - number of residues to convert and copy
* buf - text buffer to store the <L> converted residues in
*
* Returns: <eslOK> on success.
*/
int
esl_abc_TextizeN(const ESL_ALPHABET *a, const ESL_DSQ *dptr, int64_t L, char *buf)
{
int64_t i;
for (i = 0; i < L; i++)
{
if (dptr[i] == eslDSQ_SENTINEL)
{
buf[i] = '\0';
return eslOK;
}
buf[i] = a->sym[dptr[i]];
}
return eslOK;
}
/* Function: esl_abc_dsqcpy()
*
* Purpose: Given a digital sequence <dsq> of length <L>,
* make a copy of it in <dcopy>. Caller provides
* storage in <dcopy> for at least <L+2> <ESL_DSQ>
* residues.
*
* Returns: <eslOK> on success.
*/
int
esl_abc_dsqcpy(const ESL_DSQ *dsq, int64_t L, ESL_DSQ *dcopy)
{
memcpy(dcopy, dsq, sizeof(ESL_DSQ) * (L+2));
return eslOK;
}
/* Function: esl_abc_dsqdup()
* Synopsis: Duplicate a digital sequence.
*
* Purpose: Like <esl_strdup()>, but for digitized sequences:
* make a duplicate of <dsq> and leave it in <ret_dup>.
* Caller can pass the string length <L> if it's known, saving
* some overhead; else pass <-1> and the length will be
* determined for you.
*
* Tolerates <dsq> being <NULL>; in which case, returns
* <eslOK> with <*ret_dup> set to <NULL>.
*
* Args: dsq - digital sequence to duplicate (w/ sentinels at 0,L+1)
* L - length of dsq in residues, if known; -1 if unknown
* ret_dup - RETURN: allocated duplicate of <dsq>, which caller will
* free.
*
* Returns: <eslOK> on success, and leaves a pointer in <ret_dup>.
*
* Throws: <eslEMEM> on allocation failure.
*
* Xref: STL11/48
*/
int
esl_abc_dsqdup(const ESL_DSQ *dsq, int64_t L, ESL_DSQ **ret_dup)
{
int status;
ESL_DSQ *new = NULL;
if (ret_dup == NULL) return eslOK; /* no-op. */
*ret_dup = NULL;
if (dsq == NULL) return eslOK;
if (L < 0) L = esl_abc_dsqlen(dsq);
ESL_ALLOC(new, sizeof(ESL_DSQ) * (L+2));
memcpy(new, dsq, sizeof(ESL_DSQ) * (L+2));
*ret_dup = new;
return eslOK;
ERROR:
if (new != NULL) free(new);
if (ret_dup != NULL) *ret_dup = NULL;
return status;
}
/* Function: esl_abc_dsqcat()
* Synopsis: Concatenate and map input chars to a digital sequence.
*
* Purpose: Append the contents of string or memory line <s> of
* length <n> to a digital sequence, after digitizing
* each input character in <s> according to an Easel
* <inmap>. The destination sequence and its length
* are passed by reference, <*dsq> and <*L>, so that
* the sequence may be reallocated and the length updated
* upon return.
*
* The input map <inmap> may map characters to
* <eslDSQ_IGNORED> or <eslDSQ_ILLEGAL>, but not to <eslDSQ_EOL>,
* <eslDSQ_EOD>, or <eslDSQ_SENTINEL> codes. <inmap[0]> is
* special, and must be set to the code for the 'unknown'
* residue (such as 'X' for proteins, 'N' for DNA) that
* will be used to replace any invalid <eslDSQ_ILLEGAL>
* characters.
*
* If <*dsq> is properly terminated digital sequence and
* the caller doesn't know its length, <*L> may be passed
* as -1. Providing the length when it's known saves an
* <esl_abc_dsqlen()> call. If <*dsq> is unterminated, <*L>
* is mandatory. Essentially the same goes for <*s>, which
* may be a NUL-terminated string (pass <n=-1> if length unknown),
* or a memory line (<n> is mandatory).
*
* <*dsq> may also be <NULL>, in which case it is allocated
* and initialized here.
*
* Caller should provide an <s> that is expected to be
* essentially all appendable to <*dsq> except for a small
* number of chars that map to <eslDSQ_IGNORE>, like an
* input sequence data line from a file, for example. We're
* going to reallocate <*dsq> to size <*L+n>; if <n> is an
* entire large buffer or file, this reallocation will be
* inefficient.
*
* Args: inmap - an Easel input map, inmap[0..127];
* inmap[0] is special, set to the 'unknown' character
* to replace invalid input chars.
* dsq - reference to the current digital seq to append to
* (with sentinel bytes at 0,L+1); may be <NULL>.
* Upon return, this will probably have
* been reallocated, and it will contain the original
* <dsq> with <s> digitized and appended.
* L - reference to the current length of <dsq> in residues;
* may be <-1> if unknown and if <*dsq> is a properly
* terminated digital sequence. Upon return, <L> is set to
* the new length of <dsq>, after <s> is appended.
* s - ASCII text sequence to append. May
* contain ignored text characters (flagged with
* <eslDSQ_IGNORED> in the input map of alphabet <abc>).
* n - Length of <s> in characters, if known; or <-1> if
* unknown and if <s> is a NUL-terminated string.
*
* Returns: <eslOK> on success; <*dsq> contains the result of digitizing
* and appending <s> to the original <*dsq>; and <*L> contains
* the new length of the <dsq> result in residues.
*
* If any of the characters in <s> are illegal in the
* alphabet <abc>, these characters are digitized as
* unknown residues (using <inmap[0]>) and
* concatenation/digitization proceeds to completion, but
* the function returns <eslEINVAL>. The caller might then
* want to call <esl_abc_ValidateSeq()> on <s> if it wants
* to figure out where digitization goes awry and get a
* more informative error report. This is a normal error,
* because the string <s> might be user input.
*
* Throws: <eslEMEM> on allocation or reallocation failure;
* <eslEINCONCEIVABLE> on coding error.
*
* Xref: SRE:STL11/48; SRE:J7/145.
*
* Note: This closely parallels a text mode version, <esl_strmapcat()>.
*/
int
esl_abc_dsqcat(const ESL_DSQ *inmap, ESL_DSQ **dsq, int64_t *L, const char *s, esl_pos_t n)
{
int status = eslOK;
if (*L < 0) *L = ((*dsq) ? esl_abc_dsqlen(*dsq) : 0);
if ( n < 0) n = ( (s) ? strlen(s) : 0);
if (n == 0) { goto ERROR; } /* that'll return eslOK, leaving *dest untouched, and *ldest its length */
if (*dsq == NULL) { /* an entirely new dsq is allocated *and* initialized with left sentinel. */
ESL_ALLOC(*dsq, sizeof(ESL_DSQ) * (n+2));
(*dsq)[0] = eslDSQ_SENTINEL;
} else /* else, existing dsq is just reallocated; leftmost sentinel already in place. */
ESL_REALLOC(*dsq, sizeof(ESL_DSQ) * (*L+n+2)); /* most we'll need */
return esl_abc_dsqcat_noalloc(inmap, *dsq, L, s, n);
ERROR:
return status;
}
/* Function: esl_abc_dsqcat_noalloc()
* Synopsis: Version of esl_abc_dsqcat() that does no reallocation.
*
* Purpose: Same as <esl_abc_dsqcat()>, but with no reallocation of
* <dsq>. The pointer to the destination string <dsq> is
* passed by value not by reference, because it will not
* be reallocated or moved. Caller has already allocated
* at least <*L + n + 2> bytes in <dsq>. <*L> and <n> are
* not optional; caller must know (and provide) the lengths
* of both the old string and the new source.
*
* Note: This version was needed in selex format parsing, where
* we need to prepend and append some number of gaps on
* each new line of each block of input; allocating once
* then adding the gaps and the sequence seemed most efficient.
*/
int
esl_abc_dsqcat_noalloc(const ESL_DSQ *inmap, ESL_DSQ *dsq, int64_t *L, const char *s, esl_pos_t n)
{
int64_t xpos;
esl_pos_t cpos;
ESL_DSQ x;
int status = eslOK;
/* Watch these coords. Start in the 0..n-1 text string at 0;
* start in the 1..L dsq at L+1, overwriting its terminal
* sentinel byte.
*/
for (xpos = *L+1, cpos = 0; cpos < n; cpos++)
{
if (! isascii(s[cpos])) { dsq[xpos++] = inmap[0]; status = eslEINVAL; continue; }
x = inmap[(int) s[cpos]];
if (x <= 127) dsq[xpos++] = x;
else switch (x) {
case eslDSQ_SENTINEL: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_SENTINEL"); break;
case eslDSQ_ILLEGAL: dsq[xpos++] = inmap[0]; status = eslEINVAL; break;
case eslDSQ_IGNORED: break;
case eslDSQ_EOL: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOL"); break;
case eslDSQ_EOD: ESL_EXCEPTION(eslEINCONCEIVABLE, "input char mapped to eslDSQ_EOD"); break;
default: ESL_EXCEPTION(eslEINCONCEIVABLE, "bad inmap, no such ESL_DSQ code"); break;
}
}
dsq[xpos] = eslDSQ_SENTINEL;
*L = xpos-1;
return status;
}
/* Function: esl_abc_dsqlen()
* Synopsis: Returns the length of a digital sequence.
*
* Purpose: Returns the length of digitized sequence <dsq> in
* positions (including gaps, if any). The <dsq> must be
* properly terminated by a sentinel byte
* (<eslDSQ_SENTINEL>).
*/
int64_t
esl_abc_dsqlen(const ESL_DSQ *dsq)
{
int64_t n = 0;
while (dsq[n+1] != eslDSQ_SENTINEL) n++;
return n;
}
/* Function: esl_abc_dsqrlen()
* Synopsis: Returns the number of residues in a digital seq.
*
* Purpose: Returns the unaligned length of digitized sequence
* <dsq>, in residues, not counting any gaps or
* missing data symbols.
*/
int64_t
esl_abc_dsqrlen(const ESL_ALPHABET *abc, const ESL_DSQ *dsq)
{
int64_t n = 0;
int64_t i;
for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
if (esl_abc_XIsResidue(abc, dsq[i])) n++;
return n;
}
/* Function: esl_abc_CDealign()
* Synopsis: Dealigns a text string, using a reference digital aseq.
*
* Purpose: Dealigns <s> in place by removing characters aligned to
* gaps (or missing data symbols) in the reference digital
* aligned sequence <ref_ax>. Gaps and missing data symbols
* in <ref_ax> are defined by its digital alphabet <abc>.
*
* <s> is typically going to be some kind of textual
* annotation string (secondary structure, consensus, or
* surface accessibility).
*
* Be supercareful of off-by-one errors here! The <ref_ax>
* is a digital sequence that is indexed <1..L>. The
* annotation string <s> is assumed to be <0..L-1> (a
* normal C string), off by one with respect to <ref_ax>.
* In a sequence object, ss annotation is actually stored
* <1..L> -- so if you're going to <esl_abc_CDealign()> a
* <sq->ss>, pass <sq->ss+1> as the argument <s>.
*
* Returns: Returns <eslOK> on success; optionally returns the number
* of characters in the dealigned <s> in <*opt_rlen>.
*
* Throws: (no abnormal error conditions)
*/
int
esl_abc_CDealign(const ESL_ALPHABET *abc, char *s, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
{
int64_t apos;
int64_t n = 0;
if (s == NULL) return eslOK;
for (n=0, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
s[n++] = s[apos-1]; /* apos-1 because we assume s was 0..alen-1, whereas ref_ax was 1..alen */
s[n] = '\0';
if (opt_rlen != NULL) *opt_rlen = n;
return eslOK;
}
/* Function: esl_abc_XDealign()
* Synopsis: Dealigns a digital string, using a reference digital aseq.
*
* Purpose: Dealigns <x> in place by removing characters aligned to
* gaps (or missing data) in the reference digital aligned
* sequence <ref_ax>. Gaps and missing data symbols in
* <ref_ax> are defined by its digital alphabet <abc>.
*
* Returns: Returns <eslOK> on success; optionally returns the number
* of characters in the dealigned <x> in <*opt_rlen>.
*
* Throws: (no abnormal error conditions)
*/
int
esl_abc_XDealign(const ESL_ALPHABET *abc, ESL_DSQ *x, const ESL_DSQ *ref_ax, int64_t *opt_rlen)
{
int64_t apos;
int64_t n = 0;
if (x == NULL) return eslOK;
x[0] = eslDSQ_SENTINEL;
for (n=1, apos=1; ref_ax[apos] != eslDSQ_SENTINEL; apos++)
if (! esl_abc_XIsGap(abc, ref_ax[apos]) && ! esl_abc_XIsMissing(abc, ref_ax[apos]) )
x[n++] = x[apos];
x[n] = eslDSQ_SENTINEL;
if (opt_rlen != NULL) *opt_rlen = n-1;
return eslOK;
}
/* Function: esl_abc_ConvertDegen2X()
* Synopsis: Convert all degenerate residues to X or N.
*
* Purpose: Convert all the degenerate residue codes in digital
* sequence <dsq> to the code for the maximally degenerate
* "unknown residue" code, as specified in digital alphabet
* <abc>. (For example, X for protein, N for nucleic acid.)
*
* This comes in handy when you're dealing with some piece
* of software that can't deal with standard residue codes,
* and you want to massage your sequences into a form that
* can be accepted. For example, WU-BLAST can't deal with O
* (pyrrolysine) residues, but UniProt has O codes.
*
* Returns: <eslOK> on success.
*
* Throws: (no abnormal error conditions)
*/
int
esl_abc_ConvertDegen2X(const ESL_ALPHABET *abc, ESL_DSQ *dsq)
{
int64_t i;
for (i = 1; dsq[i] != eslDSQ_SENTINEL; i++)
if (esl_abc_XIsDegenerate(abc, dsq[i]))
dsq[i] = esl_abc_XGetUnknown(abc);
return eslOK;
}
/* Function: esl_abc_revcomp()
* Synopsis: Reverse complement a digital sequence
* Incept: SRE, Wed Feb 10 11:54:48 2016 [JB251 BOS-MCO]
*
* Purpose: Reverse complement <dsq>, in place, according to
* its digital alphabet <abc>.
*
* Args: abc - digital alphabet
* dsq - digital sequence, 1..n
* n - length of <dsq>
*
* Returns: <eslOK> on success.
*
* Throws: <eslEINCOMPAT> if alphabet <abc> can't be reverse complemented
*/
int
esl_abc_revcomp(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int n)
{
ESL_DSQ x;
int pos;
if (abc->complement == NULL)
ESL_EXCEPTION(eslEINCOMPAT, "tried to reverse complement using an alphabet that doesn't have one");
for (pos = 1; pos <= n/2; pos++)
{
x = abc->complement[dsq[n-pos+1]];
dsq[n-pos+1] = abc->complement[dsq[pos]];
dsq[pos] = x;
}
if (n%2) dsq[pos] = abc->complement[dsq[pos]];
return eslOK;
}
/*-------------- end, digital sequences (ESL_DSQ) ---------------*/
/*****************************************************************
* 3. Other routines in the API.
*****************************************************************/
/* Function: esl_abc_ValidateType()
* Synopsis: Check that an alphabet is known and valid
* Incept: SRE, Thu Feb 11 15:48:23 2016
*
* Purpose: Returns <eslOK> if <type> is a valid and known Easel
* alphabet type code.
*
* Used to validate "user" input, where we're parsing a
* file format that has stored an Easel alphabet code.
*
* Returns <eslFAIL> for the special <eslUNKNOWN> "unset"
* value, even though that is a valid code, because it's
* not an alphabet, so shouldn't show up in a file.
*/
int
esl_abc_ValidateType(int type)
{
if (type <= 0 || type > eslNONSTANDARD) return eslFAIL;
else return eslOK;
}
/* Function: esl_abc_GuessAlphabet()
* Synopsis: Guess alphabet type from residue composition.
*
* Purpose: Guess the alphabet type from a residue composition.
* The input <ct[0..25]> array contains observed counts of
* the letters A..Z, case-insensitive.
*
* The composition <ct> must contain more than 10 residues.
*
* If it contains $\geq$98\% ACGTN and all four of the
* residues ACGT occur, call it <eslDNA>. (Analogously for
* ACGUN, ACGU: call <eslRNA>.)
*
* If it contains any amino-specific residue (EFIJLPQZ),
* call it <eslAMINO>.
*
* If it consists of $\geq$98\% canonical aa residues or X,
* and at least 15 of the different 20 aa residues occur,
* and the number of residues that are canonical aa/degenerate
* nucleic (DHKMRSVWY) is greater than the number of canonicals
* for both amino and nucleic (ACG); then call it <eslAMINO>.
*
* As a special case, if it consists entirely of N's, and
* we have >2000 residues, call it <eslDNA>. This is a
* special case that deals with genome sequence assemblies
* that lead with a swath of N's.
*
* We aim to be very conservative, essentially never making
* a false call; we err towards calling <eslUNKNOWN> if
* unsure. Our test is to classify every individual
* sequence in NCBI NR and NT (or equiv large messy
* sequence database) with no false positives, only correct
* calls or <eslUNKNOWN>.
*
* Returns: <eslOK> on success, and <*ret_type> is set to
* <eslAMINO>, <eslRNA>, or <eslDNA>.
*
* Returns <eslENOALPHABET> if unable to determine the
* alphabet type; in this case, <*ret_type> is set to
* <eslUNKNOWN>.
*
* Notes: As of Jan 2011:
* nr 10M seqs : 6999 unknown, 0 misclassified
* Pfam full 13M seqs : 7930 unknown, 0 misclassified
* Pfam seed 500K seqs: 366 unknown, 0 misclassified
* trembl 14M seqs : 7748 unknown, 0 misclassified
*
* nt 10M seqs : 35620 unknown, 0 misclassified
* Rfam full 3M seqs : 8146 unknown, 0 misclassified
* Rfam seed 27K seqs : 49 unknown, 0 misclassified
*
* xref: esl_alphabet_example3 collects per-sequence classification
* 2012/0201-easel-guess-alphabet
* J1/62; 2007-0517-easel-guess-alphabet
*/
int
esl_abc_GuessAlphabet(const int64_t *ct, int *ret_type)
{
int type = eslUNKNOWN;
char aaonly[] = "EFIJLOPQZ";
char allcanon[] = "ACG";
char aacanon[] = "DHKMRSVWY";
int64_t n1, n2, n3, nn, nt, nu, nx, n; /* n's are counts */
int x1, x2, x3, xn, xt, xu; /* x's are how many different residues are represented */
int i, x;
x1 = x2 = x3 = xn = xt = xu = 0;
n1 = n2 = n3 = n = 0;
for (i = 0; i < 26; i++) n += ct[i];
for (i = 0; aaonly[i] != '\0'; i++) { x = ct[aaonly[i] - 'A']; if (x > 0) { n1 += x; x1++; } }
for (i = 0; allcanon[i] != '\0'; i++) { x = ct[allcanon[i] - 'A']; if (x > 0) { n2 += x; x2++; } }
for (i = 0; aacanon[i] != '\0'; i++) { x = ct[aacanon[i] - 'A']; if (x > 0) { n3 += x; x3++; } }
nt = ct['T' - 'A']; xt = (nt ? 1 : 0);
nu = ct['U' - 'A']; xu = (nu ? 1 : 0);
nx = ct['X' - 'A'];
nn = ct['N' - 'A']; xn = (nn ? 1 : 0);
if (n <= 10) type = eslUNKNOWN; // small sample, don't guess
else if (n > 2000 && nn == n) type = eslDNA; // special case of many N's leading a genome assembly
else if (n1 > 0) type = eslAMINO; // contains giveaway, aa-only chars
else if (n-(n2+nt+nn) <= 0.02*n && x2+xt == 4) type = eslDNA; // nearly all DNA canon (or N), all four seen
else if (n-(n2+nu+nn) <= 0.02*n && x2+xu == 4) type = eslRNA; // nearly all RNA canon (or N), all four seen
else if (n-(n1+n2+n3+nn+nt+nx) <= 0.02*n && n3>n2 && x1+x2+x3+xn+xt >= 15) type = eslAMINO; // nearly all aa canon (or X); more aa canon than ambig; all 20 seen
*ret_type = type;
if (type == eslUNKNOWN) return eslENOALPHABET;
else return eslOK;
}
/* Function: esl_abc_Match()
* Synopsis: Returns the probability that two symbols match.
*
* Purpose: Given two digital symbols <x> and <y> in alphabet
* <abc>, calculate and return the probability that
* <x> and <y> match, taking degenerate residue codes
* into account.
*
* If <p> residue probability vector is NULL, the
* calculation is a simple average. For example, for DNA,
* R/A gives 0.5, C/N gives 0.25, N/R gives 0.25, R/R gives
* 0.5.
*
* If <p> residue probability vector is non-NULL, it gives
* a 0..K-1 array of background frequencies, and the
* returned match probability is an expectation (weighted
* average) given those residue frequencies.
*
* <x> and <y> should only be residue codes. Any other
* comparison, including comparisons involving gap or
* missing data characters, or even comparisons involving
* illegal digital codes, returns 0.0.
*
* Note that comparison of residues from "identical"
* sequences (even a self-comparison) will not result in an
* identity of 1.0, if the sequence(s) contain degenerate
* residue codes.
*
* Args: abc - digtal alphabet to use
* x,y - two symbols to compare
* p - NULL, or background probabilities of the
* canonical residues in this alphabet [0..K-1]
*
* Returns: the probability of an identity (match) between
* residues <x> and <y>.
*/
double
esl_abc_Match(const ESL_ALPHABET *abc, ESL_DSQ x, ESL_DSQ y, double *p)
{
int i;
double prob;
double sx, sy;
/* Easy cases */
if (esl_abc_XIsCanonical(abc, x) && esl_abc_XIsCanonical(abc, y))
{
if (x==y) return 1.0; else return 0.0;
}
if ( ! esl_abc_XIsResidue(abc, x) || ! esl_abc_XIsResidue(abc, x)) return 0.0;
/* Else, we have at least one degenerate residue, so calc an average or expectation.
*/
if (p != NULL)
{
prob = sx = sy = 0.;
for (i = 0; i < abc->K; i++)
{
if (abc->degen[(int)x][i]) sx += p[i];
if (abc->degen[(int)y][i]) sy += p[i];
if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += p[i] * p[i];
}
prob = prob / (sx*sy);
}
else
{
double uniformp = 1. / (double) abc->K;
prob = sx = sy = 0.;
for (i = 0; i < abc->K; i++)
{
if (abc->degen[(int)x][i]) sx += uniformp;
if (abc->degen[(int)y][i]) sy += uniformp;
if (abc->degen[(int)x][i] && abc->degen[(int)y][i]) prob += uniformp * uniformp;
}
prob = prob / (sx*sy);
}
return prob;
}
/* Function: esl_abc_IAvgScore()
* Synopsis: Returns average score for degenerate residue.
*
* Purpose: Given a residue code <x> in alphabet <a>, and an array of
* integer scores <sc> for the residues in the base
* alphabet, calculate and return the average score
* (rounded to nearest integer).
*
* <x> would usually be a degeneracy code, but it
* may also be a canonical residue. It must not
* be a gap, missing data, or illegal symbol; if it
* is, these functions return a score of 0 without
* raising an error.
*
* <esl_abc_FAvgScore()> and <esl_abc_DAvgScore()> do the
* same, but for float and double scores instead of integers
* (and for real-valued scores, no rounding is done).
*
* Args: a - digital alphabet to use
* x - a symbol to score
* sc - score vector for canonical residues [0..K-1]
*
* Returns: average score for symbol <x>
*/
int
esl_abc_IAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc)
{
float result = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) result += (float) sc[i];
result /= (float) a->ndegen[(int) x];
if (result < 0) return (int) (result - 0.5);
else return (int) (result + 0.5);
}
float
esl_abc_FAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc)
{
float result = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0.;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) result += sc[i];
result /= (float) a->ndegen[(int) x];
return result;
}
double
esl_abc_DAvgScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc)
{
double result = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0.;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) result += sc[i];
result /= (double) a->ndegen[(int) x];
return result;
}
/* Function: esl_abc_IExpectScore()
* Synopsis: Returns expected score for degenerate residue.
*
* Purpose: Given a residue code <x> in alphabet <a>, an
* array of integer scores <sc> for the residues in the base
* alphabet, and background frequencies <p> for the
* occurrence frequencies of the residues in the base
* alphabet, calculate and return the expected score
* (weighted by the occurrence frequencies <p>).
*
* <x> would usually be a degeneracy code, but it
* may also be a canonical residue. It must not
* be a gap, missing data, or illegal symbol; if it
* is, these functions return a score of 0 without
* raising an error.
*
* <esl_abc_FExpectScore()> and <esl_abc_DExpectScore()> do the
* same, but for float and double scores instead of integers
* (for real-valued scores, no rounding is done).
*
* Args: a - digital alphabet to use
* x - a symbol to score
* sc - score vector for canonical residues [0..K-1]
* p - background prob's of canonicals [0..K-1]
*
* Returns: average score for symbol <x>
*/
int
esl_abc_IExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const int *sc, const float *p)
{
float result = 0.;
float denom = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) {
result += (float) sc[i] * p[i];
denom += p[i];
}
result /= denom;
if (result < 0) return (int) (result - 0.5);
else return (int) (result + 0.5);
}
float
esl_abc_FExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const float *sc, const float *p)
{
float result = 0.;
float denom = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0.;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) {
result += sc[i] * p[i];
denom += p[i];
}
result /= denom;
return result;
}
double
esl_abc_DExpectScore(const ESL_ALPHABET *a, ESL_DSQ x, const double *sc, const double *p)
{
double result = 0.;
double denom = 0.;
int i;
if (! esl_abc_XIsResidue(a, x)) return 0.;
for (i = 0; i < a->K; i++)
if (a->degen[(int) x][i]) {
result += sc[i] * p[i];
denom += p[i];
}
result /= denom;
return result;
}
/* Function: esl_abc_IAvgScVec()
* Synopsis: Fill out score vector with average degenerate scores.
*
* Purpose: Given an alphabet <a> and a score vector <sc> of length
* <a->Kp> that contains integer scores for the base
* alphabet (<0..a->K-1>), fill out the rest of the score
* vector, calculating average scores for
* degenerate residues using <esl_abc_IAvgScore()>.
*
* The score, if any, for a gap character <K>, the
* nonresidue <Kp-2>, and the missing data character <Kp-1>
* are untouched by this function. Only the degenerate
* scores <K+1..Kp-3> are filled in.
*
* <esl_abc_FAvgScVec()> and <esl_abc_DAvgScVec()> do
* the same, but for score vectors of floats or doubles,
* respectively.
*
* Returns: <eslOK> on success.
*/
int
esl_abc_IAvgScVec(const ESL_ALPHABET *a, int *sc)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_IAvgScore(a, x, sc);
return eslOK;
}
int
esl_abc_FAvgScVec(const ESL_ALPHABET *a, float *sc)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_FAvgScore(a, x, sc);
return eslOK;
}
int
esl_abc_DAvgScVec(const ESL_ALPHABET *a, double *sc)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_DAvgScore(a, x, sc);
return eslOK;
}
/* Function: esl_abc_IExpectScVec()
* Synopsis: Fill out score vector with average expected scores.
*
* Purpose: Given an alphabet <a>, a score vector <sc> of length
* <a->Kp> that contains integer scores for the base
* alphabet (<0..a->K-1>), and residue occurrence probabilities
* <p[0..a->K-1]>; fill in the scores for the
* degenerate residues <K+1..Kp-3> using <esl_abc_IExpectScore()>.
*
* The score, if any, for a gap character <K>, the
* nonresidue <Kp-2>, and the missing data character <Kp-1>
* are untouched by this function. Only the degenerate
* scores <K+1..Kp-3> are filled in.
*
* <esl_abc_FExpectScVec()> and <esl_abc_DExpectScVec()> do
* the same, but for score vectors of floats or doubles,
* respectively. The probabilities <p> are floats for the
* integer and float versions, and doubles for the double
* version.
*
* Returns: <eslOK> on success.
*/
int
esl_abc_IExpectScVec(const ESL_ALPHABET *a, int *sc, const float *p)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_IExpectScore(a, x, sc, p);
return eslOK;
}
int
esl_abc_FExpectScVec(const ESL_ALPHABET *a, float *sc, const float *p)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_FExpectScore(a, x, sc, p);
return eslOK;
}
int
esl_abc_DExpectScVec(const ESL_ALPHABET *a, double *sc, const double *p)
{
ESL_DSQ x;
for (x = a->K+1; x <= a->Kp-3; x++)
sc[x] = esl_abc_DExpectScore(a, x, sc, p);
return eslOK;
}
/* Function: esl_abc_FCount()
* Synopsis: Count a degenerate symbol into a count vector.
*
* Purpose: Count a possibly degenerate digital symbol <x> (0..Kp-1)
* into a counts array <ct> for base symbols (0..K-1).
* Assign the symbol a weight of <wt> (often just 1.0).
* The count weight of a degenerate symbol is divided equally
* across the possible base symbols.
*
* <x> can be a gap; if so, <ct> must be allocated 0..K,
* not 0..K-1. If <x> is a missing data symbol, or a nonresidue
* data symbol, nothing is done.
*
* <esl_abc_DCount()> does the same, but for double-precision
* count vectors and weights.
*
* Returns: <eslOK> on success.
*/
int
esl_abc_FCount(const ESL_ALPHABET *abc, float *ct, ESL_DSQ x, float wt)
{
ESL_DSQ y;
if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
ct[x] += wt;
else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
return eslOK;
else
for (y = 0; y < abc->K; y++) {
if (abc->degen[x][y])
ct[y] += wt / (float) abc->ndegen[x];
}
return eslOK;
}
int
esl_abc_DCount(const ESL_ALPHABET *abc, double *ct, ESL_DSQ x, double wt)
{
ESL_DSQ y;
if (esl_abc_XIsCanonical(abc, x) || esl_abc_XIsGap(abc, x))
ct[x] += wt;
else if (esl_abc_XIsMissing(abc, x) || esl_abc_XIsNonresidue(abc, x))
return eslOK;
else
for (y = 0; y < abc->K; y++) {
if (abc->degen[x][y])
ct[y] += wt / (double) abc->ndegen[x];
}
return eslOK;
}
/* Function: esl_abc_EncodeType()
* Synopsis: Convert descriptive string to alphabet type code.
*
* Purpose: Convert a string like "amino" or "DNA" to the
* corresponding Easel internal alphabet type code
* such as <eslAMINO> or <eslDNA>; return the code.
*
* Returns: the code, such as <eslAMINO>; if <type> isn't
* recognized, returns <eslUNKNOWN>.
*/
int
esl_abc_EncodeType(char *type)
{
if (strcasecmp(type, "amino") == 0) return eslAMINO;
else if (strcasecmp(type, "rna") == 0) return eslRNA;
else if (strcasecmp(type, "dna") == 0) return eslDNA;
else if (strcasecmp(type, "coins") == 0) return eslCOINS;
else if (strcasecmp(type, "dice") == 0) return eslDICE;
else if (strcasecmp(type, "custom")== 0) return eslNONSTANDARD;
else return eslUNKNOWN;
}
/* Function: esl_abc_DecodeType()
* Synopsis: Returns descriptive string for alphabet type code.
*
* Purpose: For diagnostics and other output: given an internal
* alphabet code <type> (<eslRNA>, for example), return
* pointer to an internal string ("RNA", for example).
*/
char *
esl_abc_DecodeType(int type)
{
switch (type) {
case eslUNKNOWN: return "unknown";
case eslRNA: return "RNA";
case eslDNA: return "DNA";
case eslAMINO: return "amino";
case eslCOINS: return "coins";
case eslDICE: return "dice";
case eslNONSTANDARD: return "custom";
default: break;
}
esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "no such alphabet type code %d\n", type);
return NULL;
}
/* Function: esl_abc_ValidateSeq()
* Synopsis: Assure that a text sequence can be digitized.
*
* Purpose: Check that sequence <seq> of length <L> can be digitized
* without error; all its symbols are valid in alphabet
* <a>. If so, return <eslOK>. If not, return <eslEINVAL>.
*
* If <a> is <NULL>, we still validate that at least the
* <seq> consists only of ASCII characters.
*
* <errbuf> is either passed as <NULL>, or a pointer to an
* error string buffer allocated by the caller for
* <eslERRBUFSIZE> characters. If <errbuf> is non-NULL, and
* the sequence is invalid, an error message is placed in
* <errbuf>.
*
* Args: a - digital alphabet (or NULL, if unavailable)
* seq - sequence to validate [0..L-1]; NUL-termination unnecessary
* L - length of <seq>
* errbuf - NULL, or ptr to <eslERRBUFSIZE> chars of allocated space
* for an error message.
*
* Returns: <eslOK> if <seq> is valid; <eslEINVAL> if not.
*
* Throws: (no abnormal error conditions).
*/
int
esl_abc_ValidateSeq(const ESL_ALPHABET *a, const char *seq, int64_t L, char *errbuf)
{
int status;
int64_t i;
int64_t firstpos = -1;
int64_t nbad = 0;
if (errbuf) *errbuf = 0;
if (a) // If we have digital alphabet <a>, it has an <inmap> we can check against
{
for (i = 0; i < L; i++) {
if (! esl_abc_CIsValid(a, seq[i])) {
if (firstpos == -1) firstpos = i;
nbad++;
}
}
}
else // Else, at least validate that the text string is an ASCII text string
{
for (i = 0; i < L; i++) {
if (! isascii(seq[i])) {
if (firstpos == -1) firstpos = i;
nbad++;
}
}
}
if (nbad == 1) ESL_XFAIL(eslEINVAL, errbuf, "invalid char %c at pos %" PRId64, seq[firstpos], firstpos+1);
else if (nbad > 1) ESL_XFAIL(eslEINVAL, errbuf, "%" PRId64 " invalid chars (including %c at pos %" PRId64 ")", nbad, seq[firstpos], firstpos+1);
return eslOK;
ERROR:
return status;
}
/*---------------- end, other API functions ---------------------*/
/*****************************************************************
* 4. Unit tests.
*****************************************************************/
#ifdef eslALPHABET_TESTDRIVE
#include "esl_vectorops.h"
static int
utest_Create(void)
{
char msg[] = "esl_alphabet_Create() unit test failed";
int types[] = { eslDNA, eslRNA, eslAMINO, eslCOINS, eslDICE };
int Karr[] = { 4, 4, 20, 2, 6 };
int Kparr[] = { 18, 18, 29, 6, 10 };
int i;
ESL_ALPHABET *a;
ESL_DSQ x;
for (i = 0; i < 3; i++)
{
if ((a = esl_alphabet_Create(types[i])) == NULL) esl_fatal(msg);
if (a->type != types[i]) esl_fatal(msg);
if (a->K != Karr[i]) esl_fatal(msg);
if (a->Kp != Kparr[i]) esl_fatal(msg);
if (strlen(a->sym) != a->Kp) esl_fatal(msg);
x = esl_abc_XGetGap(a);
if (x != a->K) esl_fatal(msg);
if (a->ndegen[x] != 0) esl_fatal(msg);
x = esl_abc_XGetUnknown(a);
if (x != a->Kp-3) esl_fatal(msg);
if (a->ndegen[x] != a->K) esl_fatal(msg);
x = esl_abc_XGetNonresidue(a);
if (x != a->Kp-2) esl_fatal(msg);
if (a->ndegen[x] != 0) esl_fatal(msg);
x = esl_abc_XGetMissing(a);
if (x != a->Kp-1) esl_fatal(msg);
if (a->ndegen[x] != 0) esl_fatal(msg);
esl_alphabet_Destroy(a);
}
/* Thrown errors
*/
#ifdef eslTEST_THROWING
if (esl_alphabet_Create(-1) != NULL) esl_fatal(msg);
if (esl_alphabet_Create(eslUNKNOWN) != NULL) esl_fatal(msg);
if (esl_alphabet_Create(eslNONSTANDARD) != NULL) esl_fatal(msg);
#endif
return eslOK;
}
static int
utest_CreateCustom(void)
{
char msg[] = "esl_alphabet_CreateCustom() unit test failed";
ESL_ALPHABET *a;
char *testseq = "AaU-~Z";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 0, 15, 20, 26, 23, eslDSQ_SENTINEL };
ESL_DSQ *dsq;
if ((a = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZX*~", 20, 27)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetEquiv(a, 'O', 'K') != eslOK) esl_fatal(msg); /* read pyrrolysine O as lysine K */
if (esl_alphabet_SetEquiv(a, 'U', 'S') != eslOK) esl_fatal(msg); /* read selenocys U as serine S */
if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg); /* allow lower case input */
if (esl_alphabet_SetDegeneracy(a, 'Z', "QE") != eslOK) esl_fatal(msg);
if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
#ifdef eslTEST_THROWING
if (esl_alphabet_CreateCustom("ACGT-RYMKSWHBVDN*~", 4, 21) != NULL) esl_fatal(msg); /* Kp mismatches string length */
#endif
return eslOK;
}
static int
utest_SetEquiv(void)
{
char msg[] = "esl_alphabet_SetEquiv() unit test failed";
ESL_ALPHABET *a;
char *testseq = "a1&";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 4, 7, eslDSQ_SENTINEL };
ESL_DSQ *dsq;
if ((a = esl_alphabet_CreateCustom("ACGT-N*~", 4, 8)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetEquiv(a, 'a', 'A') != eslOK) esl_fatal(msg);
if (esl_alphabet_SetEquiv(a, '1', '-') != eslOK) esl_fatal(msg);
if (esl_alphabet_SetEquiv(a, '&', '~') != eslOK) esl_fatal(msg);
#ifdef eslTEST_THROWING
if (esl_alphabet_SetEquiv(a, 'G', 'C') != eslEINVAL) esl_fatal(msg); /* sym is already in internal alphabet */
if (esl_alphabet_SetEquiv(a, '2', 'V') != eslEINVAL) esl_fatal(msg); /* c is not in internal alphabet */
#endif
if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
return eslOK;
}
static int
utest_SetCaseInsensitive(void)
{
char msg[] = "esl_alphabet_SetCaseInsensitive() unit test failed";
ESL_ALPHABET *a;
char *testseq = "ACGT";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
ESL_DSQ *dsq;
if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg);
if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
#ifdef TEST_THROWING
if ((a = esl_alphabet_CreateCustom("acgt-n*~", 4, 8)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetEquiv(a, 'A', 'c') != eslOK) esl_fatal(msg); /* now input A maps to internal c */
if (esl_alphabet_SetCaseInsensitive(a) != eslECORRUPT) esl_fatal(msg); /* and this fails, can't remap A */
esl_alphabet_Destroy(a);
#endif
return eslOK;
}
static int
utest_SetDegeneracy(void)
{
char msg[] = "esl_alphabet_SetDegeneracy() unit test failed";
ESL_ALPHABET *a;
char *testseq = "yrn";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 6, 5, 7, eslDSQ_SENTINEL };
ESL_DSQ *dsq;
ESL_DSQ x;
if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetDegeneracy(a, 'R', "AG") != eslOK) esl_fatal(msg);
if (esl_alphabet_SetDegeneracy(a, 'Y', "CT") != eslOK) esl_fatal(msg);
if (esl_alphabet_SetCaseInsensitive(a) != eslOK) esl_fatal(msg);
if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (strlen(testseq)+2)) != 0) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'a'); if (a->ndegen[x] != 1) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'r'); if (a->ndegen[x] != 2) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'y'); if (a->ndegen[x] != 2) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'n'); if (a->ndegen[x] != 4) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
#ifdef TEST_THROWING
if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
if (esl_abc_SetDegeneracy(a, 'z', "AC") != eslEINVAL) esl_fatal(msg); /* can't map char not in alphabet */
if (esl_abc_SetDegeneracy(a, 'N', "ACGT") != eslEINVAL) esl_fatal(msg); /* can't remap N */
if (esl_abc_SetDegeneracy(a, 'A', "GT") != eslEINVAL) esl_fatal(msg); /* can't map a nondegen character */
if (esl_abc_SetDegeneracy(a, '-', "GT") != eslEINVAL) esl_fatal(msg); /* ... or a gap... */
if (esl_abc_SetDegeneracy(a, '*', "GT") != eslEINVAL) esl_fatal(msg); /* ... or nonresidues... */
if (esl_abc_SetDegeneracy(a, '~', "GT") != eslEINVAL) esl_fatal(msg); /* ... or missing data. */
if (esl_abc_SetDegeneracy(a, 'R', "XY") != eslEINVAL) esl_fatal(msg); /* can't map to unknown chars... */
if (esl_abc_SetDegeneracy(a, 'R', "YN") != eslEINVAL) esl_fatal(msg); /* ... nor to noncanonical chars... */
esl_alphabet_Destroy(a);
#endif
return eslOK;
}
static int
utest_SetIgnored(void)
{
char msg[] = "esl_alphabet_SetIgnored() unit test failed";
ESL_ALPHABET *a;
char *testseq = "y \trn";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 6, 5, 15, eslDSQ_SENTINEL };
int L = 5;
ESL_DSQ *dsq;
if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetIgnored(a, " \t") != eslOK) esl_fatal(msg);
if (esl_abc_CreateDsq(a, testseq, &dsq) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * L) != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
return eslOK;
}
static int
utest_Destroy(void)
{
char msg[] = "esl_alphabet_Destroy() unit test failed";
ESL_ALPHABET *a;
if ((a = esl_alphabet_CreateCustom("ACGT-RYN*~", 4, 10)) == NULL) esl_fatal(msg);
esl_alphabet_Destroy(a);
esl_alphabet_Destroy(NULL); /* should be robust against NULL pointers */
return eslOK;
}
static int
utest_CreateDsq(void)
{
char msg[] = "esl_abc_CreateDsq() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "ACDEF";
char badseq[] = "1@%34";
ESL_DSQ *dsq;
ESL_DSQ x;
if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
free(dsq);
if (esl_abc_CreateDsq(a, badseq, &dsq) != eslEINVAL) esl_fatal(msg);
x = esl_abc_XGetUnknown(a);
if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
free(dsq);
if (esl_abc_CreateDsq(a, goodseq, NULL) != eslOK) esl_fatal(msg);
esl_alphabet_Destroy(a);
return eslOK;
}
static int
utest_Digitize(void)
{
char msg[] = "esl_abc_Digitize() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "ACDEF";
char badseq[] = "1@%34";
ESL_DSQ *dsq;
ESL_DSQ x;
int status;
ESL_ALLOC(dsq, sizeof(ESL_DSQ) * (strlen(goodseq)+2));
if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
esl_abc_Digitize(a, goodseq, dsq);
if (dsq[1] != 0 || dsq[2] != 1) esl_fatal(msg); /* spot check */
esl_abc_Digitize(a, badseq, dsq);
x = esl_abc_XGetUnknown(a);
if (dsq[1] != x || dsq[2] != x) esl_fatal(msg); /* bad chars all X's now, upon failure */
free(dsq);
esl_alphabet_Destroy(a);
return eslOK;
ERROR:
esl_fatal(msg);
return status;
}
static int
utest_Textize(void)
{
char msg[] = "esl_abc_Textize() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "acdef";
char *newseq;
ESL_DSQ *dsq;
int L;
int status;
L = strlen(goodseq);
ESL_ALLOC(newseq, sizeof(char) * (L+1));
if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
if (esl_abc_Textize(a, dsq, L, newseq ) != eslOK) esl_fatal(msg);
if (strcmp(newseq, "ACDEF") != 0) esl_fatal(msg);
free(dsq);
free(newseq);
esl_alphabet_Destroy(a);
return eslOK;
ERROR:
esl_fatal(msg);
return status;
}
static int
utest_TextizeN(void)
{
char msg[] = "esl_abc_TextizeN() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "acdefrynacdef";
ESL_DSQ *dsq;
ESL_DSQ *dptr;
int W;
if ((a = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
dptr = dsq+6; /* points to "r", residue 6 */
W = 5; /* copy/convert 5 residues "rynac" */
if (esl_abc_TextizeN(a, dptr, W, goodseq) != eslOK) esl_fatal(msg);
if (strcmp(goodseq, "RYNACrynacdef") != 0) esl_fatal(msg);
/* test a case where we hit eslDSQ_SENTINEL, and nul-terminate */
dptr = dsq+10; /* points to "c", residue 10 */
W = 20; /* copy/convert remaining residues "cdef" */
if (esl_abc_TextizeN(a, dptr, W, goodseq) != eslOK) esl_fatal(msg);
if (strcmp(goodseq, "CDEF") != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
return eslOK;
}
static int
utest_dsqdup(void)
{
char msg[] = "esl_abc_dsqdup() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "ACGt";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, eslDSQ_SENTINEL };
ESL_DSQ *d1, *d2;
int L;
L = strlen(goodseq);
if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
if (esl_abc_CreateDsq(a, goodseq, &d1) != eslOK) esl_fatal(msg);
if (esl_abc_dsqdup(d1, -1, &d2) != eslOK) esl_fatal(msg);
if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0) esl_fatal(msg);
free(d2);
if (esl_abc_dsqdup(d1, L, &d2) != eslOK) esl_fatal(msg);
if (memcmp(d2, expect, sizeof(ESL_DSQ) * (L+2)) != 0) esl_fatal(msg);
free(d2);
free(d1);
esl_alphabet_Destroy(a);
return eslOK;
}
static int
utest_dsqcat(void)
{
char msg[] = "esl_abc_dsqcat() unit test failed";
ESL_ALPHABET *a;
char goodseq[] = "ACGt";
char addseq[] = "RYM KN";
char badseq[] = "RYM K&";
ESL_DSQ expect[] = { eslDSQ_SENTINEL, 0, 1, 2, 3, 5, 6, 7, 8, 15, eslDSQ_SENTINEL };
ESL_DSQ *dsq;
int64_t L1;
esl_pos_t L2;
if ((a = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
a->inmap[0] = esl_abc_XGetUnknown(a);
a->inmap[' '] = eslDSQ_IGNORED;
L1 = strlen(goodseq);
L2 = strlen(addseq);
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2) != eslOK) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
free(dsq);
L1 = -1;
L2 = -1;
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, L2) != eslOK) esl_fatal(msg);
if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
free(dsq);
L1 = 0;
dsq = NULL;
if (esl_abc_dsqcat(a->inmap, &dsq, &L1, goodseq, -1) != eslOK) esl_fatal(msg);
if (esl_abc_dsqcat(a->inmap, &dsq, &L1, addseq, -1) != eslOK) esl_fatal(msg);
if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
free(dsq);
L1 = -1;
L2 = strlen(badseq);
if (esl_abc_CreateDsq(a, goodseq, &dsq) != eslOK) esl_fatal(msg);
if (esl_abc_dsqcat(a->inmap, &dsq, &L1, badseq, L2) != eslEINVAL) esl_fatal(msg);
if (L1 != esl_abc_dsqlen(dsq)) esl_fatal(msg);
if (memcmp(dsq, expect, sizeof(ESL_DSQ) * (L1+2)) != 0) esl_fatal(msg);
free(dsq);
esl_alphabet_Destroy(a);
return eslOK;
}
/* dsqlen unit test goes here */
/* dsqrlen unit test goes here */
/* utest_Match goes here */
/* This serves to unit test multiple functions:
* esl_abc_IAvgScore()
* esl_abc_IExpectScore()
*/
static int
degeneracy_integer_scores(void)
{
char *msg = "degeneracy_integer_scores unit test failed";
ESL_ALPHABET *a;
ESL_DSQ x;
float p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
int sc[] = { -1, -6, 6, 1};
int val;
a = esl_alphabet_Create(eslDNA);
x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
val = esl_abc_IAvgScore(a, x, sc);
/* average of -1,-6,6,1 = 0 */
if (val != 0) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
val = esl_abc_IExpectScore(a, x, sc, p);
/* expectation of -1,-6 given p = 0.4,0.1 = -2 */
if (val != -2) esl_fatal(msg);
esl_alphabet_Destroy(a);
return eslOK;
}
/* This serves to unit test multiple functions:
* esl_abc_FAvgScore()
* esl_abc_FExpectScore()
*/
static int
degeneracy_float_scores(void)
{
char *msg = "degeneracy_float_scores unit test failed";
ESL_ALPHABET *a;
ESL_DSQ x;
float p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
float sc[] = { -1., -6., 6., 1.};
float val;
a = esl_alphabet_Create(eslRNA);
x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
val = esl_abc_FAvgScore(a, x, sc);
/* average of -1,-6,6,1 = 0 */
if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
val = esl_abc_FExpectScore(a, x, sc, p);
/* expectation of -1,-6 given p = 0.4,0.1 = -2 */
if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
esl_alphabet_Destroy(a);
return eslOK;
}
/* This serves to unit test multiple functions:
* esl_abc_DAvgScore()
* esl_abc_DExpectScore()
*/
static int
degeneracy_double_scores(void)
{
char *msg = "degeneracy_double_scores unit test failed";
ESL_ALPHABET *a;
ESL_DSQ x;
double p[] = {0.4, 0.1, 0.1, 0.4}; /* A/T biased background */
double sc[] = { -1., -6., 6., 1.};
double val;
a = esl_alphabet_Create(eslRNA);
x = esl_abc_DigitizeSymbol(a, 'N'); /* any: A/C/G/T */
val = esl_abc_DAvgScore(a, x, sc);
/* average of -1,-6,6,1 = 0 */
if (fabs(val - 0.) > 0.0001) esl_fatal(msg);
x = esl_abc_DigitizeSymbol(a, 'M'); /* M = A/C */
val = esl_abc_DExpectScore(a, x, sc, p);
/* expectation of -1,-6 given p = 0.4,0.1 = -2 */
if (fabs(val + 2.) > 0.0001) esl_fatal(msg);
esl_alphabet_Destroy(a);
return eslOK;
}
/* utest_IAvgScVec */
/* utest_FAvgScVec */
/* utest_DAvgScVec */
/* utest_IExpectScVec */
/* utest_FExpectScVec */
/* utest_DExpectScVec */
static int
utest_FCount(void)
{
char *msg = "FCount unit test failure";
ESL_ALPHABET *a = NULL;
ESL_DSQ x;
char *teststring = "X~-Z.UAX";
char *s;
int status;
/* 0.1 from 2 X's; U -> +1 C; A -> +1 A; Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
/* A C D E F G H I K L M N P Q R S T V W Y - */
float expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
float *vec;
a = esl_alphabet_Create(eslAMINO);
ESL_ALLOC(vec, sizeof(float) * (a->K+1)); /* include gap char for this test */
esl_vec_FSet(vec, a->K+1, 0.);
for (s = teststring; *s != '\0'; s++)
{
x = esl_abc_DigitizeSymbol(a, *s);
if (esl_abc_FCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
}
if (esl_vec_FCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
esl_alphabet_Destroy(a);
free(vec);
return eslOK;
ERROR:
esl_fatal("allocation failed");
return status;
}
static int
utest_DCount(void)
{
char *msg = "DCount unit test failure";
ESL_ALPHABET *a = NULL;
ESL_DSQ x;
char *teststring = "X~-Z.UAX";
char *s;
int status;
/* 0.1 from 2 X's; U -> +1 C; A -> +1 A; Z-> +0.5 Q,E; ~ ignored; .,- -> +2 gaps */
/* A C D E F G H I K L M N P Q R S T V W Y - */
double expect[21] = { 1.1, 1.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.6, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 2.0 };
double *vec;
a = esl_alphabet_Create(eslAMINO);
ESL_ALLOC(vec, sizeof(double) * (a->K+1)); /* include gap char for this test */
esl_vec_DSet(vec, a->K+1, 0.);
for (s = teststring; *s != '\0'; s++)
{
x = esl_abc_DigitizeSymbol(a, *s);
if (esl_abc_DCount(a, vec, x, 1.0) != eslOK) esl_fatal(msg);
}
if (esl_vec_DCompare(vec, expect, a->K+1, 0.0001) != eslOK) esl_fatal(msg);
esl_alphabet_Destroy(a);
free(vec);
return eslOK;
ERROR:
esl_fatal("allocation failed");
return status;
}
#endif /* eslALPHABET_TESTDRIVE*/
/*-------------------- end, unit tests --------------------------*/
/*****************************************************************
* 5. Test driver.
*****************************************************************/
/* gcc -g -Wall -std=gnu99 -I. -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c esl_vectorops.c easel.c -lm
* gcc -g -Wall -std=gnu99 -I. -L. -o esl_alphabet_utest -DeslALPHABET_TESTDRIVE esl_alphabet.c -leasel
* ./test
* valgrind ./test
*/
#ifdef eslALPHABET_TESTDRIVE
static int basic_examples(void);
#include "easel.h"
#include "esl_alphabet.h"
int
main(void)
{
utest_Create();
utest_CreateCustom();
utest_SetEquiv();
utest_SetCaseInsensitive();
utest_SetDegeneracy();
utest_SetIgnored();
utest_Destroy();
utest_CreateDsq();
utest_Digitize();
utest_Textize();
utest_TextizeN();
utest_dsqdup();
utest_dsqcat();
utest_FCount();
utest_DCount();
basic_examples();
degeneracy_integer_scores();
degeneracy_float_scores();
degeneracy_double_scores();
return eslOK;
}
static int
basic_examples(void)
{
char *msg = "basic alphabet example tests failed";
ESL_ALPHABET *a1, *a2;
char dnaseq[] = "GARYtcN";
char aaseq[] = "EFILqzU";
int L;
ESL_DSQ *dsq, *dsq2;
int i;
/* Example 1.
* Create a DNA alphabet; digitize a DNA sequence.
*/
if ((a1 = esl_alphabet_Create(eslDNA)) == NULL) esl_fatal(msg);
L = strlen(dnaseq);
if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
if (esl_abc_Digitize(a1, dnaseq, dsq) != eslOK) esl_fatal(msg);
if (esl_abc_dsqlen(dsq) != L) esl_fatal(msg);
esl_alphabet_Destroy(a1);
/* Example 2.
* Create an RNA alphabet; digitize the same DNA sequence;
* make sure it is equal to the dsq above (so T=U were
* correctly synonymous on input).
*/
if ((a2 = esl_alphabet_Create(eslRNA)) == NULL) esl_fatal(msg);
if ((dsq2 = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal(msg);
if (esl_abc_Digitize(a2, dnaseq, dsq2) != eslOK) esl_fatal(msg);
for (i = 1; i <= L; i++)
if (dsq[i] != dsq2[i]) esl_fatal(msg);
esl_alphabet_Destroy(a2);
/* Example 3.
* Create an amino alphabet; digitize a protein sequence,
* while reusing memory already allocated in dsq.
*/
if ((a1 = esl_alphabet_Create(eslAMINO)) == NULL) esl_fatal(msg);
if (esl_abc_Digitize(a1, aaseq, dsq) != eslOK) esl_fatal(msg);
/* Example 4.
* Create a custom alphabet almost the same as the amino
* acid alphabet; digitize the same protein seq, reusing
* memory in dsq2; check that seqs are identical.
*/
if ((a2 = esl_alphabet_CreateCustom("ACDEFGHIKLMNPQRSTVWY-BJZOUX~", 20, 28)) == NULL) esl_fatal(msg);
if (esl_alphabet_SetCaseInsensitive(a2) != eslOK) esl_fatal(msg); /* allow lower case input */
if (esl_alphabet_SetDegeneracy(a2, 'Z', "QE") != eslOK) esl_fatal(msg);
if (esl_abc_Digitize(a2, aaseq, dsq2) != eslOK) esl_fatal(msg);
for (i = 1; i <= L; i++)
if (dsq[i] != dsq2[i]) esl_fatal(msg);
/* clean up.
*/
esl_alphabet_Destroy(a1);
esl_alphabet_Destroy(a2);
free(dsq);
free(dsq2);
return eslOK;
}
#endif /*eslALPHABET_TESTDRIVE*/
/*****************************************************************
* 6. Examples.
*****************************************************************/
/* gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE esl_alphabet.c easel.c
*/
#ifdef eslALPHABET_EXAMPLE
/*::cexcerpt::alphabet_example::begin::*/
#include "easel.h"
#include "esl_alphabet.h"
int main(void)
{
ESL_ALPHABET *a;
char dnaseq[] = "GARYTC";
int L = 6;
ESL_DSQ *dsq;
a = esl_alphabet_Create(eslDNA);
if ((dsq = malloc(sizeof(ESL_DSQ) * (L+2))) == NULL) esl_fatal("malloc failed");
if (esl_abc_Digitize(a, dnaseq, dsq) != eslOK) esl_fatal("failed to digitize the sequence");
free(dsq);
esl_alphabet_Destroy(a);
return 0;
}
/*::cexcerpt::alphabet_example::end::*/
#endif /*eslALPHABET_EXAMPLE*/
/* gcc -g -Wall -o example -I. -DeslALPHABET_EXAMPLE2 esl_alphabet.c easel.c
*/
#ifdef eslALPHABET_EXAMPLE2
/*::cexcerpt::alphabet_example2::begin::*/
#include "easel.h"
#include "esl_alphabet.h"
int main(void)
{
ESL_ALPHABET *a;
/* 1. Create the base alphabet structure. */
a = esl_alphabet_CreateCustom("ACDEFGHIKLMNOPQRSTUVWY-BJZX~", 22, 28);
/* 2. Set your equivalences in the input map. */
esl_alphabet_SetEquiv(a, '.', '-'); /* allow . as a gap character too */
/* 3. After all synonyms are set, (optionally) make map case-insensitive. */
esl_alphabet_SetCaseInsensitive(a); /* allow lower case input too */
/* 4. Define your optional degeneracy codes in the alphabet, one at a time.
* The 'any' character X was automatically set up. */
esl_alphabet_SetDegeneracy(a, 'B', "DN"); /* read B as {D|N} */
esl_alphabet_SetDegeneracy(a, 'J', "IL"); /* read B as {I|L} */
esl_alphabet_SetDegeneracy(a, 'Z', "QE"); /* read Z as {Q|E} */
/* 5. (do your stuff) */
/* 6. Remember to free it when you're done with it. */
esl_alphabet_Destroy(a);
return 0;
}
/*::cexcerpt::alphabet_example2::end::*/
#endif /*eslALPHABET_EXAMPLE2*/
#ifdef eslALPHABET_EXAMPLE3
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_sq.h"
#include "esl_sqio.h"
int
main(int argc, char **argv)
{
ESL_SQ *sq = esl_sq_Create();
ESL_SQFILE *sqfp;
int format = eslSQFILE_UNKNOWN;
char *seqfile = argv[1];
int type;
int status;
status = esl_sqfile_Open(seqfile, format, NULL, &sqfp);
if (status == eslENOTFOUND) esl_fatal("No such file.");
else if (status == eslEFORMAT) esl_fatal("Format couldn't be determined.");
else if (status != eslOK) esl_fatal("Open failed, code %d.", status);
while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
{
esl_sq_GuessAlphabet(sq, &type);
printf("%-25s %s\n", sq->name, esl_abc_DecodeType(type));
esl_sq_Reuse(sq);
}
if (status == eslEFORMAT) esl_fatal("Parse failed\n %s", esl_sqfile_GetErrorBuf(sqfp));
else if (status != eslEOF) esl_fatal("Unexpected read error %d", status);
esl_sq_Destroy(sq);
esl_sqfile_Close(sqfp);
return 0;
}
#endif /*eslALPHABET_EXAMPLE3*/
You can’t perform that action at this time.