Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1579 lines (1385 sloc) 53 KB
/* Pairwise identities, distances, and distance matrices.
*
* Contents:
* 1. Pairwise distances for aligned text sequences.
* 2. Pairwise distances for aligned digital seqs.
* 3. Distance matrices for aligned text sequences.
* 4. Distance matrices for aligned digital sequences.
* 5. Average pairwise identity for multiple alignments.
* 6. Private (static) functions.
* 7. Unit tests.
* 8. Test driver.
* 9. Example.
*/
#include "esl_config.h"
#include <ctype.h>
#include <string.h>
#include <math.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_dmatrix.h"
#include "esl_random.h"
#include "esl_distance.h"
/* Forward declaration of our static functions.
*/
static int jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance);
/*****************************************************************
* 1. Pairwise distances for aligned text sequences.
*****************************************************************/
/* Function: esl_dst_CPairId()
* Synopsis: Pairwise identity of two aligned text strings.
* Incept: SRE, Mon Apr 17 20:06:07 2006 [St. Louis]
*
* Purpose: Calculates pairwise fractional identity between two
* aligned character strings <asq1> and <asq2>.
* Return this distance in <opt_pid>; return the
* number of identities counted in <opt_nid>; and
* return the denominator <MIN(len1,len2)> in
* <opt_n>.
*
* Alphabetic symbols <[a-zA-Z]> are compared
* case-insensitively for identity. Any nonalphabetic
* character is assumed to be a gap symbol.
*
* This simple comparison rule is unaware of synonyms and
* degeneracies in biological alphabets. For a more
* sophisticated and biosequence-aware comparison, use
* digitized sequences and the <esl_dst_XPairId()> function
* instead. Note that currently <esl_dst_XPairId()> does
* not correctly handle degeneracies, but is set up to.
*
* Args: asq1 - aligned character string 1
* asq2 - aligned character string 2
* opt_pid - optRETURN: pairwise identity, 0<=x<=1
* opt_nid - optRETURN: # of identities
* opt_n - optRETURN: denominator MIN(len1,len2)
*
* Returns: <eslOK> on success. <opt_pid>, <opt_nid>, <opt_n>
* contain the answers (for whichever were passed non-NULL).
*
* Throws: <eslEINVAL> if the strings are different lengths
* (not aligned).
*/
int
esl_dst_CPairId(const char *asq1, const char *asq2,
double *opt_pid, int *opt_nid, int *opt_n)
{
int status;
int idents; /* total identical positions */
int len1, len2; /* lengths of seqs */
int i; /* position in aligned seqs */
idents = len1 = len2 = 0;
for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
{
if (isalpha(asq1[i])) len1++;
if (isalpha(asq2[i])) len2++;
if (isalpha(asq1[i]) && isalpha(asq2[i])
&& toupper(asq1[i]) == toupper(asq2[i]))
idents++;
}
if (asq1[i] != '\0' || asq2[i] != '\0')
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
if (opt_pid != NULL) *opt_pid = ( len1==0 ? 0. : (double) idents / (double) ESL_MIN(len1,len2));
if (opt_nid != NULL) *opt_nid = idents;
if (opt_n != NULL) *opt_n = len1;
return eslOK;
ERROR:
if (opt_pid != NULL) *opt_pid = 0.;
if (opt_nid != NULL) *opt_nid = 0;
if (opt_n != NULL) *opt_n = 0;
return status;
}
/* Function: esl_dst_CPairMatch()
* Synopsis: Pairwise matches of two aligned text strings.
* Incept: ER, Wed Oct 29 09:02:35 EDT 2014 [janelia]
*
* Purpose: Calculates pairwise fractional matches between two
* aligned character strings <asq1> and <asq2>.
* Return this distance in <opt_pmatch>; return the
* number of matches counted in <opt_nmatch>; and
* return the denominator <alen - double_gaps> in
* <opt_n>.
*
* Alphabetic symbols <[a-zA-Z]> are compared
* case-insensitively for identity. Any nonalphabetic
* character is assumed to be a gap symbol.
*
* This simple comparison rule is unaware of synonyms and
* degeneracies in biological alphabets. For a more
* sophisticated and biosequence-aware comparison, use
* digitized sequences and the <esl_dst_XPairmatch()> function
* instead. Note that currently <esl_dst_XPairMatch()> does
* not correctly handle degeneracies, but is set up to.
*
* Args: asq1 - aligned character string 1
* asq2 - aligned character string 2
* opt_pmatch - optRETURN: pairwise matches, 0<=x<=1
* opt_nmatch - optRETURN: # of matches
* opt_n - optRETURN: denominator alen - double_gaps
*
* Returns: <eslOK> on success. <opt_pmatch>, <opt_nmatch>, <opt_n>
* contain the answers (for whichever were passed non-NULL).
*
* Throws: <eslEINVAL> if the strings are different lengths
* (not aligned).
*/
int
esl_dst_CPairMatch(const char *asq1, const char *asq2,
double *opt_pmatch, int *opt_nmatch, int *opt_n)
{
int status;
int match; /* total matched positions */
int len; /* length of alignment (no double gaps) */
int i; /* position in aligned seqs */
match = len = 0;
for (i = 0; asq1[i] != '\0' && asq2[i] != '\0'; i++)
{
if (isalpha(asq1[i]) || isalpha(asq2[i])) len++;
if (isalpha(asq1[i]) && isalpha(asq2[i])) match++;
}
if (asq1[i] != '\0' || asq2[i] != '\0')
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
if (opt_pmatch != NULL) *opt_pmatch = ( len==0 ? 0. : (double)match / (double)len);
if (opt_nmatch != NULL) *opt_nmatch = match;
if (opt_n != NULL) *opt_n = len;
return eslOK;
ERROR:
if (opt_pmatch != NULL) *opt_pmatch = 0.;
if (opt_nmatch != NULL) *opt_nmatch = 0;
if (opt_n != NULL) *opt_n = 0;
return status;
}
/* Function: esl_dst_CJukesCantor()
* Synopsis: Jukes-Cantor distance for two aligned strings.
* Incept: SRE, Tue Apr 18 14:00:37 2006 [St. Louis]
*
* Purpose: Calculate the generalized Jukes-Cantor distance between
* two aligned character strings <as1> and <as2>, in
* substitutions/site, for an alphabet of <K> residues
* (<K=4> for nucleic acid, <K=20> for proteins). The
* maximum likelihood estimate for the distance is
* optionally returned in <opt_distance>. The large-sample
* variance for the distance estimate is
* optionally returned in <opt_variance>.
*
* Alphabetic symbols <[a-zA-Z]> are compared
* case-insensitively to count the number of identities
* (<n1>) and mismatches (<n2>>). Any nonalphabetic
* character is assumed to be a gap symbol, and aligned
* columns containing gap symbols are ignored. The
* fractional difference <D> used to calculate the
* Jukes/Cantor distance is <n2/n1+n2>.
*
* Args: K - size of the alphabet (4 or 20)
* as1 - 1st aligned seq, 0..L-1, \0-terminated
* as2 - 2nd aligned seq, 0..L-1, \0-terminated
* opt_distance - optRETURN: ML estimate of distance d
* opt_variance - optRETURN: large-sample variance of d
*
* Returns: <eslOK> on success.
*
* Infinite distances are possible, in which case distance
* and variance are both <HUGE_VAL>. Caller has to deal
* with this case as it sees fit, perhaps by enforcing
* an arbitrary maximum distance.
*
* Throws: <eslEINVAL> if the two strings aren't the same length (and
* thus can't have been properly aligned).
* <eslEDIVZERO> if no aligned residues were counted.
* On either failure, distance and variance are both returned
* as <HUGE_VAL>.
*/
int
esl_dst_CJukesCantor(int K, const char *as1, const char *as2,
double *opt_distance, double *opt_variance)
{
int status;
int n1, n2; /* number of observed identities, substitutions */
int i; /* position in aligned seqs */
/* 1. Count identities, mismatches.
*/
n1 = n2 = 0;
for (i = 0; as1[i] != '\0' && as2[i] != '\0'; i++)
{
if (isalpha(as1[i]) && isalpha(as2[i]))
{
if (toupper(as1[i]) == toupper(as2[i])) n1++; else n2++;
}
}
if (as1[i] != '\0' || as2[i] != '\0')
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
return jukescantor(n1, n2, K, opt_distance, opt_variance); /* can throw eslEDIVZERO */
ERROR:
if (opt_distance != NULL) *opt_distance = HUGE_VAL;
if (opt_variance != NULL) *opt_variance = HUGE_VAL;
return status;
}
/*------- end, pairwise distances for aligned text seqs ---------*/
/*****************************************************************
* 2. Pairwise distances for aligned digitized sequences.
*****************************************************************/
/* Function: esl_dst_XPairId()
* Synopsis: Pairwise identity of two aligned digital seqs.
* Incept: SRE, Tue Apr 18 09:24:05 2006 [St. Louis]
*
* Purpose: Digital version of <esl_dst_CPairId()>: <adsq1> and
* <adsq2> are digitized aligned sequences, in alphabet
* <abc>. Otherwise, same as <esl_dst_CPairId()> except
* that only canonical residues are counted and checked for
* identity, while <esl_dst_CPairId()> (which has no
* alphabet) counts and checks identity of all alphanumeric
* characters.
*
* This function does not use <esl_abc_Match()> to handle
* degeneracies but it is set up to do so. Doing that would
* require that <opt_nid> be changed to a float or double,
* or its meaning be changed to be the number of canonical
* identities.
*
* Args: abc - digital alphabet in use
* ax1 - aligned digital seq 1
* ax2 - aligned digital seq 2
* opt_pid - optRETURN: pairwise identity, 0<=x<=1
* opt_nid - optRETURN: # of identities
* opt_n - optRETURN: denominator MIN(len1,len2)
*
* Returns: <eslOK> on success. <opt_distance>, <opt_nid>, <opt_n>
* contain the answers, for any of these that were passed
* non-<NULL> pointers.
*
* Throws: <eslEINVAL> if the strings are different lengths (not aligned).
*/
int
esl_dst_XPairId(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
double *opt_distance, int *opt_nid, int *opt_n)
{
int status;
int idents; /* total identical positions */
int len1, len2; /* lengths of seqs */
int i; /* position in aligned seqs */
idents = len1 = len2 = 0;
for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
{
if (esl_abc_XIsCanonical(abc, ax1[i])) len1++;
if (esl_abc_XIsCanonical(abc, ax2[i])) len2++;
if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])
&& ax1[i] == ax2[i])
idents++;
}
if (len2 < len1) len1 = len2;
if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
if (opt_distance != NULL) *opt_distance = ( len1==0 ? 0. : (double) idents / (double) len1 );
if (opt_nid != NULL) *opt_nid = idents;
if (opt_n != NULL) *opt_n = len1;
return eslOK;
ERROR:
if (opt_distance != NULL) *opt_distance = 0.;
if (opt_nid != NULL) *opt_nid = 0;
if (opt_n != NULL) *opt_n = 0;
return status;
}
/* Function: esl_dst_XPairMatch()
* Synopsis: Pairwise matches of two aligned digital seqs.
* Incept: ER, Wed Oct 29 09:09:07 EDT 2014 [janelia]
*
* Purpose: Digital version of <esl_dst_CPairMatch()>: <adsq1> and
* <adsq2> are digitized aligned sequences, in alphabet
* <abc>. Otherwise, same as <esl_dst_CPairId()> except
* that only canonical residues are counted and checked for
* identity, while <esl_dst_CPairId()> (which has no
* alphabet) counts and checks identity of all alphanumeric
* characters.
*
* Args: abc - digital alphabet in use
* ax1 - aligned digital seq 1
* ax2 - aligned digital seq 2
* opt_pmatch - optRETURN: pairwise matches, 0<=x<=1
* opt_nmatch - optRETURN: # of maches
* opt_n - optRETURN: denominator alen-double_gaps
*
* Returns: <eslOK> on success. <opt_distance>, <opt_nmatch>, <opt_n>
* contain the answers, for any of these that were passed
* non-<NULL> pointers.
*
* Throws: <eslEINVAL> if the strings are different lengths (not aligned).
*/
int
esl_dst_XPairMatch(const ESL_ALPHABET *abc, const ESL_DSQ *ax1, const ESL_DSQ *ax2,
double *opt_distance, int *opt_nmatch, int *opt_n)
{
int status;
int match; /* total matched positions */
int len; /* length of alignment (no double gaps) */
int i; /* position in aligned seqs */
match = len = 0;
for (i = 1; ax1[i] != eslDSQ_SENTINEL && ax2[i] != eslDSQ_SENTINEL; i++)
{
if (esl_abc_XIsCanonical(abc, ax1[i]) || esl_abc_XIsCanonical(abc, ax2[i])) len ++;
if (esl_abc_XIsCanonical(abc, ax1[i]) && esl_abc_XIsCanonical(abc, ax2[i])) match++;
}
if (ax1[i] != eslDSQ_SENTINEL || ax2[i] != eslDSQ_SENTINEL)
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
if (opt_distance != NULL) *opt_distance = ( len==0 ? 0. : (double)match / (double)len );
if (opt_nmatch != NULL) *opt_nmatch = match;
if (opt_n != NULL) *opt_n = len;
return eslOK;
ERROR:
if (opt_distance != NULL) *opt_distance = 0.;
if (opt_nmatch != NULL) *opt_nmatch = 0;
if (opt_n != NULL) *opt_n = 0;
return status;
}
/* Function: esl_dst_XJukesCantor()
* Synopsis: Jukes-Cantor distance for two aligned digitized seqs.
* Incept: SRE, Tue Apr 18 15:26:51 2006 [St. Louis]
*
* Purpose: Calculate the generalized Jukes-Cantor distance between two
* aligned digital strings <ax> and <ay>, in substitutions/site,
* using alphabet <abc> to evaluate identities and differences.
* The maximum likelihood estimate for the distance is optionally returned in
* <opt_distance>. The large-sample variance for the distance
* estimate is optionally returned in <opt_variance>.
*
* Identical to <esl_dst_CJukesCantor()>, except that it takes
* digital sequences instead of character strings.
*
* Args: abc - bioalphabet to use for comparisons
* ax - 1st digital aligned seq
* ay - 2nd digital aligned seq
* opt_distance - optRETURN: ML estimate of distance d
* opt_variance - optRETURN: large-sample variance of d
*
* Returns: <eslOK> on success. As in <esl_dst_CJukesCantor()>, the
* distance and variance may be infinite, in which case they
* are returned as <HUGE_VAL>.
*
* Throws: <eslEINVAL> if the two strings aren't the same length (and
* thus can't have been properly aligned).
* <eslEDIVZERO> if no aligned residues were counted.
* On either failure, the distance and variance are set
* to <HUGE_VAL>.
*/
int
esl_dst_XJukesCantor(const ESL_ALPHABET *abc, const ESL_DSQ *ax, const ESL_DSQ *ay,
double *opt_distance, double *opt_variance)
{
int status;
int n1, n2; /* number of observed identities, substitutions */
int i; /* position in aligned seqs */
n1 = n2 = 0;
for (i = 1; ax[i] != eslDSQ_SENTINEL && ay[i] != eslDSQ_SENTINEL; i++)
{
if (esl_abc_XIsCanonical(abc, ax[i]) && esl_abc_XIsCanonical(abc, ay[i]))
{
if (ax[i] == ay[i]) n1++;
else n2++;
}
}
if (ax[i] != eslDSQ_SENTINEL || ay[i] != eslDSQ_SENTINEL)
ESL_XEXCEPTION(eslEINVAL, "strings not same length, not aligned");
return jukescantor(n1, n2, abc->K, opt_distance, opt_variance);
ERROR:
if (opt_distance != NULL) *opt_distance = HUGE_VAL;
if (opt_variance != NULL) *opt_variance = HUGE_VAL;
return status;
}
/*---------- end pairwise distances, digital seqs --------------*/
/*****************************************************************
* 3. Distance matrices for aligned text sequences.
*****************************************************************/
/* Function: esl_dst_CPairIdMx()
* Synopsis: NxN identity matrix for N aligned text sequences.
* Incept: SRE, Thu Apr 27 08:46:08 2006 [New York]
*
* Purpose: Given a multiple sequence alignment <as>, consisting
* of <N> aligned character strings; calculate
* a symmetric fractional pairwise identity matrix by $N(N-1)/2$
* calls to <esl_dst_CPairId()>, and return it in
* <ret_D>.
*
* Args: as - aligned seqs (all same length), [0..N-1]
* N - # of aligned sequences
* ret_S - RETURN: symmetric fractional identity matrix
*
* Returns: <eslOK> on success, and <ret_S> contains the fractional
* identity matrix. Caller free's <S> with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if a seq has a different
* length than others. On failure, <ret_D> is returned <NULL>
* and state of inputs is unchanged.
*
* <eslEMEM> on allocation failure.
*/
int
esl_dst_CPairIdMx(char **as, int N, ESL_DMATRIX **ret_S)
{
ESL_DMATRIX *S = NULL;
int status;
int i,j;
if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
for (i = 0; i < N; i++)
{
S->mx[i][i] = 1.;
for (j = i+1; j < N; j++)
{
status = esl_dst_CPairId(as[i], as[j], &(S->mx[i][j]), NULL, NULL);
if (status != eslOK)
ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
S->mx[j][i] = S->mx[i][j];
}
}
if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
return eslOK;
ERROR:
if (S != NULL) esl_dmatrix_Destroy(S);
if (ret_S != NULL) *ret_S = NULL;
return status;
}
/* Function: esl_dst_CDiffMx()
* Synopsis: NxN difference matrix for N aligned text sequences.
* Incept: SRE, Fri Apr 28 06:27:20 2006 [New York]
*
* Purpose: Same as <esl_dst_CPairIdMx()>, but calculates
* the fractional difference <d=1-s> instead of the
* fractional identity <s> for each pair.
*
* Args: as - aligned seqs (all same length), [0..N-1]
* N - # of aligned sequences
* ret_D - RETURN: symmetric fractional difference matrix
*
* Returns: <eslOK> on success, and <ret_D> contains the
* fractional difference matrix. Caller free's <D> with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if any seq has a different
* length than others. On failure, <ret_D> is returned <NULL>
* and state of inputs is unchanged.
*/
int
esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D)
{
ESL_DMATRIX *D = NULL;
int status;
int i,j;
status = esl_dst_CPairIdMx(as, N, &D);
if (status != eslOK) goto ERROR;
for (i = 0; i < N; i++)
{
D->mx[i][i] = 0.;
for (j = i+1; j < N; j++)
{
D->mx[i][j] = 1. - D->mx[i][j];
D->mx[j][i] = D->mx[i][j];
}
}
if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
if (ret_D != NULL) *ret_D = NULL;
return status;
}
/* Function: esl_dst_CJukesCantorMx()
* Synopsis: NxN Jukes/Cantor distance matrix for N aligned text seqs.
* Incept: SRE, Tue Apr 18 16:00:16 2006 [St. Louis]
*
* Purpose: Given a multiple sequence alignment <aseq>, consisting of
* <nseq> aligned character sequences in an alphabet of
* <K> letters (usually 4 for DNA, 20 for protein);
* calculate a symmetric Jukes/Cantor pairwise distance
* matrix for all sequence pairs, and optionally return the distance
* matrix in <ret_D>, and optionally return a symmetric matrix of the
* large-sample variances for those ML distance estimates
* in <ret_V>.
*
* Infinite distances (and variances) are possible; they
* are represented as <HUGE_VAL> in <D> and <V>. Caller must
* be prepared to deal with them as appropriate.
*
* Args: K - size of the alphabet (usually 4 or 20)
* aseq - aligned sequences [0.nseq-1][0..L-1]
* nseq - number of aseqs
* opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
* opt_V - optRETURN: matrix of variances.
*
* Returns: <eslOK> on success. <D> and <V> contain the
* distance matrix (and variances); caller frees these with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if any pair of sequences have differing lengths
* (and thus cannot have been properly aligned).
* <eslEDIVZERO> if some pair of sequences had no aligned
* residues. On failure, <D> and <V> are both returned <NULL>
* and state of inputs is unchanged.
*
* <eslEMEM> on allocation failure.
*/
int
esl_dst_CJukesCantorMx(int K, char **aseq, int nseq,
ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
int status;
ESL_DMATRIX *D = NULL;
ESL_DMATRIX *V = NULL;
int i,j;
if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
for (i = 0; i < nseq; i++)
{
D->mx[i][i] = 0.;
V->mx[i][i] = 0.;
for (j = i+1; j < nseq; j++)
{
status = esl_dst_CJukesCantor(K, aseq[i], aseq[j],
&(D->mx[i][j]), &(V->mx[i][j]));
if (status != eslOK)
ESL_XEXCEPTION(status, "J/C calculation failed at seqs %d,%d", i,j);
D->mx[j][i] = D->mx[i][j];
V->mx[j][i] = V->mx[i][j];
}
}
if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
if (V != NULL) esl_dmatrix_Destroy(V);
if (opt_D != NULL) *opt_D = NULL;
if (opt_V != NULL) *opt_V = NULL;
return status;
}
/*----------- end, distance matrices for aligned text seqs ---------*/
/*****************************************************************
* 4. Distance matrices for aligned digital sequences.
*****************************************************************/
/* Function: esl_dst_XPairIdMx()
* Synopsis: NxN identity matrix for N aligned digital seqs.
* Incept: SRE, Thu Apr 27 09:08:11 2006 [New York]
*
* Purpose: Given a digitized multiple sequence alignment <ax>, consisting
* of <N> aligned digital sequences in alphabet <abc>; calculate
* a symmetric pairwise fractional identity matrix by $N(N-1)/2$
* calls to <esl_dst_XPairId()>, and return it in <ret_S>.
*
* Args: abc - digital alphabet in use
* ax - aligned dsq's, [0..N-1][1..alen]
* N - number of aligned sequences
* ret_S - RETURN: NxN matrix of fractional identities
*
* Returns: <eslOK> on success, and <ret_S> contains the distance
* matrix. Caller is obligated to free <S> with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if a seq has a different
* length than others. On failure, <ret_S> is returned <NULL>
* and state of inputs is unchanged.
*
* <eslEMEM> on allocation failure.
*/
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
int status;
ESL_DMATRIX *S = NULL;
int i,j;
if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }
for (i = 0; i < N; i++)
{
S->mx[i][i] = 1.;
for (j = i+1; j < N; j++)
{
status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
if (status != eslOK)
ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
S->mx[j][i] = S->mx[i][j];
}
}
if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
return eslOK;
ERROR:
if (S != NULL) esl_dmatrix_Destroy(S);
if (ret_S != NULL) *ret_S = NULL;
return status;
}
/* Function: esl_dst_XDiffMx()
* Synopsis: NxN difference matrix for N aligned digital seqs.
* Incept: SRE, Fri Apr 28 06:37:29 2006 [New York]
*
* Purpose: Same as <esl_dst_XPairIdMx()>, but calculates fractional
* difference <1-s> instead of fractional identity <s> for
* each pair.
*
* Args: abc - digital alphabet in use
* ax - aligned dsq's, [0..N-1][1..alen]
* N - number of aligned sequences
* ret_D - RETURN: NxN matrix of fractional differences
*
* Returns: <eslOK> on success, and <ret_D> contains the difference
* matrix; caller is obligated to free <D> with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if a seq has a different
* length than others. On failure, <ret_D> is returned <NULL>
* and state of inputs is unchanged.
*/
int
esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D)
{
int status;
ESL_DMATRIX *D = NULL;
int i,j;
status = esl_dst_XPairIdMx(abc, ax, N, &D);
if (status != eslOK) goto ERROR;
for (i = 0; i < N; i++)
{
D->mx[i][i] = 0.;
for (j = i+1; j < N; j++)
{
D->mx[i][j] = 1. - D->mx[i][j];
D->mx[j][i] = D->mx[i][j];
}
}
if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
if (ret_D != NULL) *ret_D = NULL;
return status;
}
/* Function: esl_dst_XJukesCantorMx()
* Synopsis: NxN Jukes/Cantor distance matrix for N aligned digital seqs.
* Incept: SRE, Thu Apr 27 08:38:08 2006 [New York City]
*
* Purpose: Given a digitized multiple sequence alignment <ax>,
* consisting of <nseq> aligned digital sequences in
* bioalphabet <abc>, calculate a symmetric Jukes/Cantor
* pairwise distance matrix for all sequence pairs;
* optionally return the distance matrix in <ret_D> and
* a matrix of the large-sample variances for those ML distance
* estimates in <ret_V>.
*
* Infinite distances (and variances) are possible. They
* are represented as <HUGE_VAL> in <D> and <V>. Caller must
* be prepared to deal with them as appropriate.
*
* Args: abc - bioalphabet for <aseq>
* ax - aligned digital sequences [0.nseq-1][1..L]
* nseq - number of aseqs
* opt_D - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
* opt_V - optRETURN: matrix of variances.
*
* Returns: <eslOK> on success. <D> (and optionally <V>) contain the
* distance matrix (and variances). Caller frees these with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEINVAL> if any pair of sequences have differing lengths
* (and thus cannot have been properly aligned).
* <eslEDIVZERO> if some pair of sequences had no aligned
* residues. On failure, <D> and <V> are both returned <NULL>
* and state of inputs is unchanged.
*
* <eslEMEM> on allocation failure.
*/
int
esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq,
ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
ESL_DMATRIX *D = NULL;
ESL_DMATRIX *V = NULL;
int status;
int i,j;
if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
for (i = 0; i < nseq; i++)
{
D->mx[i][i] = 0.;
V->mx[i][i] = 0.;
for (j = i+1; j < nseq; j++)
{
status = esl_dst_XJukesCantor(abc, ax[i], ax[j],
&(D->mx[i][j]), &(V->mx[i][j]));
if (status != eslOK)
ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);
D->mx[j][i] = D->mx[i][j];
V->mx[j][i] = V->mx[i][j];
}
}
if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
if (V != NULL) esl_dmatrix_Destroy(V);
if (opt_D != NULL) *opt_D = NULL;
if (opt_V != NULL) *opt_V = NULL;
return status;
}
/*------- end, distance matrices for digital alignments ---------*/
/*****************************************************************
* 5. Average pairwise identity for multiple alignments
*****************************************************************/
/* Function: esl_dst_CAverageId()
* Synopsis: Calculate avg identity for multiple alignment
* Incept: SRE, Fri May 18 15:02:38 2007 [Janelia]
*
* Purpose: Calculates the average pairwise fractional identity in
* a multiple sequence alignment <as>, consisting of <N>
* aligned character sequences of identical length.
*
* If an exhaustive calculation would require more than
* <max_comparisons> pairwise comparisons, then instead of
* looking at all pairs, calculate the average over a
* stochastic sample of <max_comparisons> random pairs.
* This allows the routine to work efficiently even on very
* deep MSAs.
*
* Each fractional pairwise identity (range $[0..$ pid $..1]$
* is calculated using <esl_dst_CPairId()>.
*
* Returns: <eslOK> on success, and <*ret_id> contains the average
* fractional identity.
*
* Throws: <eslEMEM> on allocation failure.
* <eslEINVAL> if any of the aligned sequence pairs aren't
* of the same length.
* In either case, <*ret_id> is set to 0.
*/
int
esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
{
int status;
double id;
double sum = 0.;
int i,j,n;
if (N <= 1) { *ret_id = 1.; return eslOK; }
*ret_id = 0.;
/* Is nseq small enough that we can average over all pairwise comparisons? */
if ((N * (N-1) / 2) <= max_comparisons)
{
for (i = 0; i < N; i++)
for (j = i+1; j < N; j++)
{
if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
sum += id;
}
sum /= (double) (N * (N-1) / 2);
}
/* If nseq is large, calculate average over a stochastic sample. */
else
{
ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
for (n = 0; n < max_comparisons; n++)
{
do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
sum += id;
}
sum /= (double) max_comparisons;
esl_randomness_Destroy(r);
}
*ret_id = sum;
return eslOK;
}
/* Function: esl_dst_CAverageMatch()
* Synopsis: Calculate avg matches for multiple alignment
* Incept: ER, Wed Oct 29 09:25:09 EDT 2014 [Janelia]
*
* Purpose: Calculates the average pairwise fractional matches in
* a multiple sequence alignment <as>, consisting of <N>
* aligned character sequences of identical length.
*
* If an exhaustive calculation would require more than
* <max_comparisons> pairwise comparisons, then instead of
* looking at all pairs, calculate the average over a
* stochastic sample of <max_comparisons> random pairs.
* This allows the routine to work efficiently even on very
* deep MSAs.
*
* Each fractional pairwise matches (range $[0..$ pid $..1]$
* is calculated using <esl_dst_CPairMatch()>.
*
* Returns: <eslOK> on success, and <*ret_match> contains the average
* fractional matches.
*
* Throws: <eslEMEM> on allocation failure.
* <eslEINVAL> if any of the aligned sequence pairs aren't
* of the same length.
* In either case, <*ret_match> is set to 0.
*/
int
esl_dst_CAverageMatch(char **as, int N, int max_comparisons, double *ret_match)
{
int status;
double match;
double sum = 0.;
int i,j,n;
if (N <= 1) { *ret_match = 1.; return eslOK; }
*ret_match = 0.;
/* Is nseq small enough that we can average over all pairwise comparisons? */
if ((N * (N-1) / 2) <= max_comparisons)
{
for (i = 0; i < N; i++)
for (j = i+1; j < N; j++)
{
if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
sum += match;
}
sum /= (double) (N * (N-1) / 2);
}
/* If nseq is large, calculate average over a stochastic sample. */
else
{
ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
for (n = 0; n < max_comparisons; n++)
{
do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
if ((status = esl_dst_CPairMatch(as[i], as[j], &match, NULL, NULL)) != eslOK) return status;
sum += match;
}
sum /= (double) max_comparisons;
esl_randomness_Destroy(r);
}
*ret_match = sum;
return eslOK;
}
/* Function: esl_dst_XAverageId()
* Synopsis: Calculate avg identity for digital MSA
* Incept: SRE, Fri May 18 15:19:14 2007 [Janelia]
*
* Purpose: Calculates the average pairwise fractional identity in
* a digital multiple sequence alignment <ax>, consisting of <N>
* aligned digital sequences of identical length.
*
* If an exhaustive calculation would require more than
* <max_comparisons> pairwise comparisons, then instead of
* looking at all pairs, calculate the average over a
* stochastic sample of <max_comparisons> random pairs.
* This allows the routine to work efficiently even on very
* deep MSAs.
*
* Each fractional pairwise identity (range $[0..$ pid $..1]$
* is calculated using <esl_dst_XPairId()>.
*
* Returns: <eslOK> on success, and <*ret_id> contains the average
* fractional identity.
*
* Throws: <eslEMEM> on allocation failure.
* <eslEINVAL> if any of the aligned sequence pairs aren't
* of the same length.
* In either case, <*ret_id> is set to 0.
*/
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
int status;
double id;
double sum = 0.;
int i,j,n;
if (N <= 1) { *ret_id = 1.; return eslOK; }
*ret_id = 0.;
/* Is N small enough that we can average over all pairwise comparisons?
watch out for numerical overflow in this: Pfam N's easily overflow when squared
*/
if (N <= max_comparisons &&
N <= sqrt(2. * max_comparisons) &&
(N * (N-1) / 2) <= max_comparisons)
{
for (i = 0; i < N; i++)
for (j = i+1; j < N; j++)
{
if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
sum += id;
}
sum /= (double) (N * (N-1) / 2);
}
/* If nseq is large, calculate average over a stochastic sample. */
else
{
ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
for (n = 0; n < max_comparisons; n++)
{
do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) return status;
sum += id;
}
sum /= (double) max_comparisons;
esl_randomness_Destroy(r);
}
*ret_id = sum;
return eslOK;
}
/* Function: esl_dst_XAverageMatch()
* Synopsis: Calculate avg matches for digital MSA
* Incept: ER, ed Oct 29 09:29:05 EDT 2014 [Janelia]
*
* Purpose: Calculates the average pairwise fractional matches in
* a digital multiple sequence alignment <ax>, consisting of <N>
* aligned digital sequences of identical length.
*
* If an exhaustive calculation would require more than
* <max_comparisons> pairwise comparisons, then instead of
* looking at all pairs, calculate the average over a
* stochastic sample of <max_comparisons> random pairs.
* This allows the routine to work efficiently even on very
* deep MSAs.
*
* Each fractional pairwise matches (range $[0..$ pid $..1]$
* is calculated using <esl_dst_XPairMatch()>.
*
* Returns: <eslOK> on success, and <*ret_match> contains the average
* fractional identity.
*
* Throws: <eslEMEM> on allocation failure.
* <eslEINVAL> if any of the aligned sequence pairs aren't
* of the same length.
* In either case, <*ret_match> is set to 0.
*/
int
esl_dst_XAverageMatch(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_match)
{
int status;
double match;
double sum = 0.;
int i,j,n;
if (N <= 1) { *ret_match = 1.; return eslOK; }
*ret_match = 0.;
/* Is N small enough that we can average over all pairwise comparisons?
watch out for numerical overflow in this: Pfam N's easily overflow when squared
*/
if (N <= max_comparisons &&
N <= sqrt(2. * max_comparisons) &&
(N * (N-1) / 2) <= max_comparisons)
{
for (i = 0; i < N; i++)
for (j = i+1; j < N; j++)
{
if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
sum += match;
}
sum /= (double) (N * (N-1) / 2);
}
/* If nseq is large, calculate average over a stochastic sample. */
else
{
ESL_RANDOMNESS *r = esl_randomness_Create(42); /* fixed seed, suppress stochastic variation */
for (n = 0; n < max_comparisons; n++)
{
do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
if ((status = esl_dst_XPairMatch(abc, ax[i], ax[j], &match, NULL, NULL)) != eslOK) return status;
sum += match;
}
sum /= (double) max_comparisons;
esl_randomness_Destroy(r);
}
*ret_match = sum;
return eslOK;
}
/*****************************************************************
* 6. Private (static) functions
*****************************************************************/
/* jukescantor()
*
* The generalized Jukes/Cantor distance calculation.
* Given <n1> identities and <n2> differences, for a
* base alphabet size of <alphabet_size> (4 or 20);
* calculate J/C distance in substitutions/site and
* return it in <ret_distance>; calculate large-sample
* variance and return it in <ret_variance>.
*
* Returns <eslEDIVZERO> if there are no data (<n1+n2=0>).
*/
static int
jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance)
{
int status;
double D, K, N;
double x;
double distance, variance;
ESL_DASSERT1( (n1 >= 0) );
ESL_DASSERT1( (n2 >= 0) );
ESL_DASSERT1( (alphabet_size >= 0) );
if (n1+n2 == 0) { status = eslEDIVZERO; goto ERROR; }
K = (double) alphabet_size;
D = (double) n2 / (double) (n1+n2);
N = (double) (n1+n2);
x = 1. - D * K/(K-1.);
if (x <= 0.)
{
distance = HUGE_VAL;
variance = HUGE_VAL;
}
else
{
distance = -log(x) * K/(K-1);
variance = exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N;
}
if (opt_distance != NULL) *opt_distance = distance;
if (opt_variance != NULL) *opt_variance = variance;
return eslOK;
ERROR:
if (opt_distance != NULL) *opt_distance = HUGE_VAL;
if (opt_variance != NULL) *opt_variance = HUGE_VAL;
return status;
}
/*--------------- end of private functions ----------------------*/
/*****************************************************************
* 7. Unit tests.
*****************************************************************/
#ifdef eslDISTANCE_TESTDRIVE
/* Each unit test is given an alignment with certain known
* properties:
* seqs 0,1 are identical
* seqs 0,2 are completely different
* seqs 3..N are random
* The alignment may contain gaps, so don't assume that the
* # of compared residues == alignment length. The alignment
* contains only canonical residues, because one of our tests
* is that C and X functions give the same results.
*/
static int
utest_CPairId(char **as, int N)
{
double pid;
int nid;
int nres;
int L;
int i,j;
/* Self comparison gives identity = 1. */
L = strlen(as[0]);
if (esl_dst_CPairId(as[0], as[0], &pid, &nid, &nres) != eslOK) abort();
if (pid != 1.0 || nid != L || nres > L) abort();
/* So does 0,1 comparison */
if (esl_dst_CPairId(as[0], as[1], &pid, &nid, &nres) != eslOK) abort();
if (pid != 1.0 || nid != L || nres > L) abort();
/* 0,2 comparison gives 0.0, 0 */
if (esl_dst_CPairId(as[0], as[2], &pid, &nid, &nres) != eslOK) abort();
if (pid != 0.0 || nid != 0 || nres > L) abort();
/* remaining comparisons shouldn't fail */
for (i = 3; i < N; i++)
for (j = i; j < N; j++)
{
if (esl_dst_CPairId(as[i], as[j], &pid, &nid, &nres) != eslOK) abort();
if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort();
}
/* API should accept NULL for return values */
if (esl_dst_CPairId(as[0], as[0], NULL, NULL, NULL) != eslOK) abort();
return eslOK;
}
static int
utest_CJukesCantor(int K, char **as, int N)
{
double d, V;
int i,j;
/* Self comparison gives distance = 0. */
if (esl_dst_CJukesCantor(K, as[0], as[0], &d, &V) != eslOK) abort();
if (d != 0.0) abort();
/* So does 0,1 comparison */
if (esl_dst_CJukesCantor(K, as[0], as[1], &d, &V) != eslOK) abort();
if (d != 0.0) abort();
/* 0,2 comparison gives infinite distance (HUGE_VAL) */
if (esl_dst_CJukesCantor(K, as[0], as[2], &d, &V) != eslOK) abort();
if (d != HUGE_VAL) abort();
/* remaining comparisons shouldn't fail */
for (i = 3; i < N; i++)
for (j = i; j < N; j++)
if (esl_dst_CJukesCantor(K, as[i], as[j], &d, &V) != eslOK) abort();
/* API should accept NULL for return values */
if (esl_dst_CJukesCantor(K, as[0], as[0], NULL, NULL) != eslOK) abort();
return eslOK;
}
static int
utest_XPairId(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
double pid, pid2;
int nid, nid2;
int nres, nres2;
int dL, L;
int i,j;
/* Self comparison gives identity = 1. */
dL = esl_abc_dsqlen(ax[0]);
L = strlen(as[0]);
if (dL != L) abort();
if (esl_dst_XPairId(abc, ax[0], ax[0], &pid, &nid, &nres) != eslOK) abort();
if (pid != 1.0 || nid != L || nres > dL) abort();
/* So does 0,1 comparison */
if (esl_dst_XPairId(abc, ax[0], ax[1], &pid, &nid, &nres) != eslOK) abort();
if (pid != 1.0 || nid != L || nres > L) abort();
/* 0,2 comparison gives 0.0, 0 */
if (esl_dst_XPairId(abc, ax[0], ax[2], &pid, &nid, &nres) != eslOK) abort();
if (pid != 0.0 || nid != 0 || nres > L) abort();
/* remaining comparisons shouldn't fail, and should be identical to text mode */
for (i = 3; i < N; i++)
for (j = i; j < N; j++)
{
if (esl_dst_XPairId(abc, ax[i], ax[j], &pid, &nid, &nres) != eslOK) abort();
if (esl_dst_CPairId(as[i], as[j], &pid2, &nid2, &nres2) != eslOK) abort();
if (pid < 0. || pid > 1. || nid < 0 || nid > L || nres > L) abort();
if (pid != pid2 || nid != nid2 || nres != nres2) abort();
}
/* API should accept NULL for return values */
if (esl_dst_XPairId(abc, ax[0], ax[0], NULL, NULL, NULL) != eslOK) abort();
return eslOK;
}
static int
utest_XJukesCantor(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
double d, V;
int i,j;
/* Self comparison gives distance = 0. */
if (esl_dst_XJukesCantor(abc, ax[0], ax[0], &d, &V) != eslOK) abort();
if (d != 0.0) abort();
/* So does 0,1 comparison */
if (esl_dst_XJukesCantor(abc, ax[0], ax[1], &d, &V) != eslOK) abort();
if (d != 0.0) abort();
/* 0,2 comparison gives infinite distance (HUGE_VAL) */
if (esl_dst_XJukesCantor(abc, ax[0], ax[2], &d, &V) != eslOK) abort();
if (d != HUGE_VAL) abort();
/* remaining comparisons shouldn't fail */
for (i = 3; i < N; i++)
for (j = i; j < N; j++)
if (esl_dst_XJukesCantor(abc, ax[i], ax[j], &d, &V) != eslOK) abort();
/* API should accept NULL for return values */
if (esl_dst_XJukesCantor(abc, ax[0], ax[0], NULL, NULL) != eslOK) abort();
return eslOK;
}
static int
utest_CPairIdMx(char **as, int N)
{
ESL_DMATRIX *S;
int i,j;
double pid;
if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort();
for (i = 0; i < N; i++)
if (S->mx[i][i] != 1.0) abort();
pid = 0.;
for (i = 3; i < N; i++)
for (j = i+1; j < N; j++)
pid += S->mx[i][j];
pid /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
if (pid < 0.15 || pid > 0.35) abort(); /* should be 0.25 */
esl_dmatrix_Destroy(S);
return eslOK;
}
static int
utest_CDiffMx(char **as, int N)
{
ESL_DMATRIX *D;
int i,j;
double diff;
if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort();
for (i = 0; i < N; i++)
if (D->mx[i][i] != 0.0) abort();
diff = 0.;
for (i = 3; i < N; i++)
for (j = i+1; j < N; j++)
diff += D->mx[i][j];
diff /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
if (diff < 0.65 || diff > 0.85) abort(); /* should be 0.75 */
esl_dmatrix_Destroy(D);
return eslOK;
}
static int
utest_CJukesCantorMx(int K, char **as, int N)
{
ESL_DMATRIX *D, *V;
/* just a crash test */
if (esl_dst_CJukesCantorMx(K, as, N, &D, &V) != eslOK) abort();
esl_dmatrix_Destroy(D);
esl_dmatrix_Destroy(V);
return eslOK;
}
static int
utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
ESL_DMATRIX *S, *S2;
int i, j;
if (esl_dst_XPairIdMx(abc, ax, N, &S) != eslOK) abort();
if (esl_dst_CPairIdMx(as, N, &S2) != eslOK) abort();
for (i = 0; i < N; i++)
for (j = i; j < N; j++)
if (fabs(S->mx[i][j] - S2->mx[j][i]) > 0.01) abort();
esl_dmatrix_Destroy(S);
esl_dmatrix_Destroy(S2);
return eslOK;
}
static int
utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
ESL_DMATRIX *D, *D2;
int i, j;
if (esl_dst_XDiffMx(abc, ax, N, &D) != eslOK) abort();
if (esl_dst_CDiffMx(as, N, &D2) != eslOK) abort();
for (i = 0; i < N; i++)
for (j = i; j < N; j++)
if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
esl_dmatrix_Destroy(D);
esl_dmatrix_Destroy(D2);
return eslOK;
}
static int
utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
ESL_DMATRIX *D, *D2, *V, *V2;
int i, j;
if (esl_dst_XJukesCantorMx(abc, ax, N, &D, &V) != eslOK) abort();
if (esl_dst_CJukesCantorMx(abc->K, as, N, &D2, &V2) != eslOK) abort();
for (i = 0; i < N; i++)
for (j = i; j < N; j++)
{
if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
if (fabs(V->mx[i][j] - V2->mx[j][i]) > 0.01) abort();
}
esl_dmatrix_Destroy(D);
esl_dmatrix_Destroy(D2);
esl_dmatrix_Destroy(V);
esl_dmatrix_Destroy(V2);
return eslOK;
}
#endif /* eslDISTANCE_TESTDRIVE */
/*------------------ end of unit tests --------------------------*/
/*****************************************************************
* 8. Test driver.
*****************************************************************/
#ifdef eslDISTANCE_TESTDRIVE
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_arr2.h"
#include "esl_dmatrix.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_randomseq.h"
#include "esl_distance.h"
static ESL_OPTIONS options[] = {
/* name type def env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
{ "-N", eslARG_INT, "10", NULL,"n>3", NULL, NULL, NULL, "number of iid seqs in alignment",0},
{ "-L", eslARG_INT, "50", NULL,"n>0", NULL, NULL, NULL, "length of seqs in alignment", 0},
{ "--seed", eslARG_INT, "42", NULL,"n>=0",NULL, NULL, NULL, "random # seed", 0},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "Usage: ./testdrive-distance [-options]";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = NULL;
ESL_RANDOMNESS *r = NULL;
char **as = NULL; /* aligned character seqs (random, iid) */
int N,L; /* # of seqs, and their aligned lengths */
int seed;
int i,j;
int status;
double p[4]; /// ACGT probabilities
ESL_DSQ **ax = NULL; // digitized alignment
ESL_ALPHABET *abc = NULL;
/* Process command line
*/
go = esl_getopts_Create(options);
esl_opt_ProcessCmdline(go, argc, argv);
esl_opt_VerifyConfig(go);
if (esl_opt_GetBoolean(go, "-h") == TRUE) {
puts(usage);
puts("\n where options are:");
esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
return 0;
}
L = esl_opt_GetInteger(go, "-L");
N = esl_opt_GetInteger(go, "-N");
seed = esl_opt_GetInteger(go, "--seed");
if (esl_opt_ArgNumber(go) != 0) {
puts("Incorrect number of command line arguments.");
puts(usage);
return 1;
}
esl_getopts_Destroy(go);
/* Create a random DNA alignment;
* force it to obey the conventions of the unit tests:
* 0,1 are identical
* 0,2 are completely dissimilar
*/
r = esl_randomness_Create(seed);
for (i = 0; i < 4; i++) p[i] = 0.25;
ESL_ALLOC(as, sizeof(char *) * N);
for (i = 0; i < N; i++)
ESL_ALLOC(as[i], sizeof(char) * (L+1));
esl_rsq_IID(r, "ACGT", p, 4, L, as[0]);
strcpy(as[1], as[0]);
esl_rsq_IID(r, "ACGT", p, 4, L, as[2]);
for (j = 0; j < L; j++)
while (as[2][j] == as[0][j])
as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)];
for (i = 3; i < N; i++)
esl_rsq_IID(r, "ACGT", p, 4, L, as[i]);
abc = esl_alphabet_Create(eslDNA);
ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N);
for (i = 0; i < N; i++)
esl_abc_CreateDsq(abc, as[i], &(ax[i]));
/* Unit tests
*/
if (utest_CPairId(as, N) != eslOK) return eslFAIL;
if (utest_CJukesCantor(4, as, N) != eslOK) return eslFAIL;
if (utest_XPairId(abc, as, ax, N) != eslOK) return eslFAIL;
if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL;
if (utest_CPairIdMx(as, N) != eslOK) return eslFAIL;
if (utest_CDiffMx(as, N) != eslOK) return eslFAIL;
if (utest_CJukesCantorMx(4, as, N) != eslOK) return eslFAIL;
if (utest_XPairIdMx(abc, as, ax, N) != eslOK) return eslFAIL;
if (utest_XDiffMx(abc, as, ax, N) != eslOK) return eslFAIL;
if (utest_XJukesCantorMx(abc, as, ax, N) != eslOK) return eslFAIL;
esl_randomness_Destroy(r);
esl_alphabet_Destroy(abc);
esl_arr2_Destroy((void **) as, N);
esl_arr2_Destroy((void **) ax, N);
return eslOK;
ERROR:
return eslFAIL;
}
#endif /*eslDISTANCE_TESTDRIVE*/
/*****************************************************************
* 9. Example.
*****************************************************************/
#ifdef eslDISTANCE_EXAMPLE
/*::cexcerpt::distance_example::begin::*/
/* gcc -g -Wall -o example -I. -DeslDISTANCE_EXAMPLE esl_distance.c\
esl_dmatrix.c esl_msa.c easel.c -lm
./example <msa file>
*/
#include "easel.h"
#include "esl_distance.h"
#include "esl_dmatrix.h"
#include "esl_msa.h"
int main(int argc, char **argv)
{
ESL_MSAFILE *afp;
ESL_MSA *msa;
ESL_DMATRIX *P;
int i,j;
double min, avg, max;
int status;
if ((status = esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
esl_msafile_OpenFailure(afp, status);
if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
esl_msafile_ReadFailure(afp, status);
esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P);
min = 1.0;
max = 0.0;
avg = 0.0;
for (i = 0; i < msa->nseq; i++)
for (j = i+1; j < msa->nseq; j++)
{
avg += P->mx[i][j];
if (P->mx[i][j] < min) min = P->mx[i][j];
if (P->mx[i][j] > max) max = P->mx[i][j];
}
avg /= (double) (msa->nseq * (msa->nseq-1) / 2);
printf("Average pairwise %% id: %.1f%%\n", avg * 100.);
printf("Minimum pairwise %% id: %.1f%%\n", min * 100.);
printf("Maximum pairwise %% id: %.1f%%\n", max * 100.);
esl_dmatrix_Destroy(P);
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
return 0;
}
/*::cexcerpt::distance_example::end::*/
#endif /*eslDISTANCE_EXAMPLE*/
You can’t perform that action at this time.