Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
2189 lines (1908 sloc) 72.1 KB
/* Phylogenetic trees.
*
* Contents:
* 1. The ESL_TREE object.
* 2. Newick format i/o
* 3. Tree comparison algorithms.
* 4. Clustering algorithms for distance-based tree construction.
* 5. Generating simulated trees.
* 6. Unit tests.
* 7. Test driver.
* 8. Examples.
*/
#include "esl_config.h"
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "easel.h"
#include "esl_arr2.h"
#include "esl_dmatrix.h"
#include "esl_random.h"
#include "esl_stack.h"
#include "esl_vectorops.h"
#include "esl_tree.h"
/*****************************************************************
*# 1. The ESL_TREE object.
*****************************************************************/
/* Function: esl_tree_Create()
*
* Purpose: Allocate an empty tree structure for <ntaxa> taxa
* and return a pointer to it. <ntaxa> must be $\geq 2$.
*
* Args: <ntaxa> - number of taxa
*
* Returns: pointer to the new <ESL_TREE> object; caller frees
* this with <esl_tree_Destroy()>.
*
* Throws: <NULL> if allocation fails.
*/
ESL_TREE *
esl_tree_Create(int ntaxa)
{
ESL_TREE *T = NULL;
int i;
int status;
/* Contract verification */
ESL_DASSERT1((ntaxa >= 2));
/* 1st allocation round */
ESL_ALLOC(T, sizeof(ESL_TREE));
T->parent = NULL;
T->left = NULL;
T->right = NULL;
T->ld = NULL;
T->rd = NULL;
/* Optional info starts NULL
*/
T->taxaparent = NULL;
T->cladesize = NULL;
T->taxonlabel = NULL;
T->nodelabel = NULL;
/* Additive trees are assumed by default, as opposed to linkage trees */
T->is_linkage_tree = FALSE;
/* Tree output options default to PHYLIP style
*/
T->show_unrooted = FALSE;
T->show_node_labels = TRUE;
T->show_root_branchlength = FALSE;
T->show_branchlengths = TRUE;
T->show_quoted_labels = FALSE;
T->show_numeric_taxonlabels = TRUE;
/* 2nd allocation round */
T->N = ntaxa;
ESL_ALLOC(T->parent, sizeof(int) * (ntaxa-1));
ESL_ALLOC(T->left, sizeof(int) * (ntaxa-1));
ESL_ALLOC(T->right, sizeof(int) * (ntaxa-1));
ESL_ALLOC(T->ld, sizeof(double) * (ntaxa-1));
ESL_ALLOC(T->rd, sizeof(double) * (ntaxa-1));
for (i = 0; i < ntaxa-1; i++)
{
T->parent[i] = 0;
T->left[i ] = 0;
T->right[i] = 0;
T->ld[i] = 0.;
T->rd[i] = 0.;
}
T->nalloc = ntaxa;
return T;
ERROR:
esl_tree_Destroy(T);
return NULL;
}
/* Function: esl_tree_CreateGrowable()
*
* Purpose: Allocate a growable tree structure for an initial
* allocation of <nalloc> taxa, and return a pointer to it.
* <nalloc> must be $\geq 2$.
*
* Args: <nalloc> - initial allocation size for number of taxa
*
* Returns: pointer to a new growable <ESL_TREE> object; caller frees
* this with <esl_tree_Destroy()>.
*
* Throws: <NULL> if allocation fails.
*/
ESL_TREE *
esl_tree_CreateGrowable(int nalloc)
{
ESL_TREE *T = esl_tree_Create(nalloc);
if (T == NULL) return NULL;
T->N = 0;
return T;
}
/* Function: esl_tree_CreateFromString()
*
* Purpose: A convenience for making small test cases in the test
* suites: given the contents of a Newick file as a
* single string <s>, convert it to an <ESL_TREE>.
*
* Returns: a pointer to the new <ESL_TREE> on success.
*
* Throws: <NULL> if it fails to obtain, open, or read the
* temporary file that it puts the string <s> in.
*/
ESL_TREE *
esl_tree_CreateFromString(char *s)
{
char tmpfile[16] = "esltmpXXXXXX";
FILE *fp = NULL;
ESL_TREE *T = NULL;
if (esl_tmpfile(tmpfile, &fp) != eslOK) goto ERROR;
fputs(s, fp);
rewind(fp);
if (esl_tree_ReadNewick(fp, NULL, &T) != eslOK) goto ERROR;
fclose(fp);
return T;
ERROR:
if (fp != NULL) fclose(fp);
if (T != NULL) esl_tree_Destroy(T);
return NULL;
}
/* Function: esl_tree_Grow()
*
* Purpose: Given a tree <T>, make sure it can hold one more taxon;
* reallocate internally if necessary by doubling the
* number of taxa it is currently allocated to hold.
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation failure. In this case,
* the data in the tree are unaffected.
*/
int
esl_tree_Grow(ESL_TREE *T)
{
void *tmp;
int nnew;
int status;
int i;
if (T->N < T->nalloc) return eslOK; /* do we have room for next taxon? */
nnew = T->nalloc * 2;
/* There are N-1 interior nodes, so arrays of info for
* interior nodes are allocated for (nnew-1), whereas
* arrays of info for the N taxa are allocated (nnew).
*/
ESL_RALLOC(T->parent, tmp, sizeof(int) * (nnew-1));
ESL_RALLOC(T->left, tmp, sizeof(int) * (nnew-1));
ESL_RALLOC(T->right, tmp, sizeof(int) * (nnew-1));
ESL_RALLOC(T->ld, tmp, sizeof(double) * (nnew-1));
ESL_RALLOC(T->rd, tmp, sizeof(double) * (nnew-1));
/* 0..N-2 were already initialized or used.
* Initialize newly alloced space N-1..nnew-2.
*/
for (i = T->nalloc-1; i < nnew-1; i++)
{
T->parent[i] = 0;
T->left[i ] = 0;
T->right[i] = 0;
T->ld[i] = 0.;
T->rd[i] = 0.;
}
if (T->taxaparent != NULL)
{
ESL_RALLOC(T->taxaparent, tmp, sizeof(int) * nnew);
for (i = T->nalloc; i < nnew; i++) T->taxaparent[i] = 0;
}
if (T->cladesize != NULL)
{
ESL_RALLOC(T->cladesize, tmp, sizeof(int) * nnew);
for (i = T->nalloc; i < nnew; i++) T->cladesize[i] = 0;
}
if (T->taxonlabel != NULL)
{
ESL_RALLOC(T->taxonlabel, tmp, sizeof(char *) * nnew);
for (i = T->nalloc; i < nnew; i++) T->taxonlabel[i] = NULL;
}
if (T->nodelabel != NULL)
{
ESL_RALLOC(T->nodelabel, tmp, sizeof(char *) * (nnew-1));
for (i = T->nalloc-1; i < nnew-1; i++) T->nodelabel[i] = NULL;
}
T->nalloc = nnew;
return eslOK;
ERROR:
return eslEMEM;
}
/* Function: esl_tree_SetTaxaParents()
*
* Purpose: Constructs the <T->taxaparent[]> array in the tree
* structure <T>, by an O(N) traversal of the tree.
* Upon return, <T->taxaparent[i]> is the index
* of the internal node that taxon <i> is a child of.
*
* Args: T - the tree structure to map
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on internal allocation error. In this case, the tree is
* returned unchanged.
*
* Xref: STL11/63
*/
int
esl_tree_SetTaxaParents(ESL_TREE *T)
{
int i;
int status;
if (T->taxaparent != NULL) return eslOK; // map already exists.
ESL_ALLOC(T->taxaparent, sizeof(int) * T->N);
for (i = 0; i < T->N; i++) T->taxaparent[i] = 0; // solely to calm static analyzers.
for (i = 0; i < T->N-1; i++) /* traversal order doesn't matter */
{
if (T->left[i] <= 0) T->taxaparent[-(T->left[i])] = i;
if (T->right[i] <= 0) T->taxaparent[-(T->right[i])] = i;
}
return eslOK;
ERROR:
if (T->taxaparent != NULL) { free(T->taxaparent); T->taxaparent = NULL; }
return status;
}
/* Function: esl_tree_SetCladesizes()
*
* Purpose: Constructs the <T->cladesize[]> array in tree structure
* <T>. Upon successful return, <T->cladesize[i]> is the
* number of taxa contained by the clade rooted at node <i>
* in the tree. For example, <T->cladesize[0]> is $N$ by
* definition, because 0 is the root of the tree.
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation error; in this case, the
* original <T> is unmodified.
*/
int
esl_tree_SetCladesizes(ESL_TREE *T)
{
int i;
int status;
if (T->cladesize != NULL) return eslOK; /* already set. */
ESL_ALLOC(T->cladesize, sizeof(int) * (T->N-1));
esl_vec_ISet(T->cladesize, T->N-1, 0);
for (i = T->N-2; i >= 0; i--)
{ /* taxon: ...else... internal node: */
if (T->left[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->left[i]];
if (T->right[i] <= 0) T->cladesize[i]++; else T->cladesize[i] += T->cladesize[T->right[i]];
}
return eslOK;
ERROR:
if (T->cladesize != NULL) { free(T->cladesize); T->cladesize = NULL; }
return status;
}
/* Function: esl_tree_SetTaxonlabels()
*
* Purpose: Given an array of taxon names <names[0..N-1]> with the
* same order and number as the taxa in tree <T>, make a
* copy of those names in <T>. For example, <names> might
* be the sequence names in a multiple alignment,
* <msa->sqname>.
*
* If the tree already had taxon names assigned to it, they
* are replaced.
*
* As a special case, if the <names> argument is passed as
* <NULL>, then the taxon labels are set to a string
* corresponding to their internal index; that is, taxon 0
* is labeled "0".
*
* Returns: <eslOK> on success, and internal state of <T>
* (specifically, <T->taxonlabel[]>) now contains a copy
* of the taxa names.
*
* Throws: <eslEMEM> on allocation failure. <T->taxonlabel[]> will be
* <NULL> (even if it was already set).
*/
int
esl_tree_SetTaxonlabels(ESL_TREE *T, char **names)
{
int i;
int status;
if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->N);
ESL_ALLOC(T->taxonlabel, sizeof(char *) * T->nalloc);
for (i = 0; i < T->nalloc; i++) T->taxonlabel[i] = NULL;
if (names != NULL)
{
for (i = 0; i < T->N; i++)
if ((status = esl_strdup(names[i], -1, &(T->taxonlabel[i]))) != eslOK) goto ERROR;
}
else
{
for (i = 0; i < T->N; i++)
{
ESL_ALLOC(T->taxonlabel[i], sizeof(char) * 32); /* enough width for any conceivable int */
snprintf(T->taxonlabel[i], 32, "%d", i);
}
}
return eslOK;
ERROR:
if (T->taxonlabel != NULL) esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
return status;
}
/* Function: esl_tree_RenumberNodes()
* Synopsis: Assure nodes are numbered in preorder.
*
* Purpose: Given a tree <T> whose internal nodes might be numbered in
* any order, with the sole requirement that node 0 is the
* root; renumber the internal nodes (if necessary) to be in Easel's
* convention of preorder traversal. No other aspect of <T> is
* altered (including its allocation size).
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation failure.
*
* Xref: STL11/77
*/
int
esl_tree_RenumberNodes(ESL_TREE *T)
{
ESL_TREE *T2 = NULL;
ESL_STACK *vs = NULL;
int *map = NULL;
int v,new;
int status;
int needs_rearranging = FALSE;
/* Pass 1. Preorder traverse of T by its children links;
* construct map[old] -> new.
*/
ESL_ALLOC(map, sizeof(int) * (T->N-1));
if (( vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
new = 0;
while (esl_stack_IPop(vs, &v) == eslOK)
{
if (v != new) needs_rearranging = TRUE;
map[v] = new++;
if (T->right[v] > 0 && esl_stack_IPush(vs, T->right[v]) != eslOK) { status = eslEMEM; goto ERROR; };
if (T->left[v] > 0 && esl_stack_IPush(vs, T->left[v]) != eslOK) { status = eslEMEM; goto ERROR; };
}
if (! needs_rearranging) { status = eslOK; goto ERROR; } /* not an error; just cleaning up & returning eslOK. */
/* Pass 2. Construct the guts of correctly numbered new T2.
* (traversal order doesn't matter here)
*/
if (( T2 = esl_tree_Create(T->nalloc)) == NULL) { status = eslEMEM; goto ERROR; };
if (T->nodelabel != NULL) {
ESL_ALLOC(T2->nodelabel, sizeof(char *) * (T2->nalloc-1));
for (v = 0; v < T2->nalloc-1; v++) T2->nodelabel[v] = NULL;
}
if (T->taxaparent != NULL) {
ESL_ALLOC(T2->taxaparent, sizeof(int) * (T2->nalloc));
for (v = 0; v < T2->nalloc; v++) T2->taxaparent[v] = 0;
}
for (v = 0; v < T->N-1; v++)
{
T2->parent[map[v]] = map[T->parent[v]];
if (T->left[v] > 0) T2->left[map[v]] = map[T->left[v]]; /* internal nodes renumbered... */
else T2->left[map[v]] = T->left[v]; /* ...taxon indices unchanged */
if (T->right[v] > 0) T2->right[map[v]] = map[T->right[v]];
else T2->right[map[v]] = T->right[v];
T2->ld[map[v]] = T->ld[v];
T2->rd[map[v]] = T->rd[v];
if (T->taxaparent != NULL) {
if (T->left[v] <= 0) T2->taxaparent[-(T->left[v])] = map[v];
if (T->right[v] <= 0) T2->taxaparent[-(T->right[v])] = map[v];
}
if (T->nodelabel != NULL)
esl_strdup(T->nodelabel[v], -1, &(T2->nodelabel[map[v]]));
}
/* Finally, swap the new guts of T2 with the old guts of T;
* destroy T2 and return. T is now renumbered.
*/
ESL_SWAP(T->parent, T2->parent, int *);
ESL_SWAP(T->left, T2->left, int *);
ESL_SWAP(T->right, T2->right, int *);
ESL_SWAP(T->ld, T2->ld, double *);
ESL_SWAP(T->rd, T2->rd, double *);
ESL_SWAP(T->taxaparent, T2->taxaparent, int *);
ESL_SWAP(T->nodelabel, T2->nodelabel, char **);
free(map);
esl_stack_Destroy(vs);
esl_tree_Destroy(T2);
return eslOK;
ERROR:
if (map != NULL) free(map);
if (vs != NULL) esl_stack_Destroy(vs);
if (T2 != NULL) esl_tree_Destroy(T2);
return status;
}
/* Function: esl_tree_VerifyUltrametric()
*
* Purpose: Verify that tree <T> is ultrametric.
*
* Returns: <eslOK> if so; <eslFAIL> if not.
*
* Throws: <eslEMEM> on an allocation failure.
*/
int
esl_tree_VerifyUltrametric(ESL_TREE *T)
{
double *d = NULL; /* Distance from root for each OTU */
int status;
int i, child, parent;
/* First, calculate distance from root to each taxon.
* (This chunk of code might be useful to put on its own someday.)
*/
ESL_ALLOC(d, sizeof(double) * T->N);
if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
for (i = 0; i < T->N; i++)
{
d[i] = 0.0;
parent = T->taxaparent[i];
if (T->left[parent] == -i) d[i] += T->ld[parent];
else if (T->right[parent] == -i) d[i] += T->rd[parent];
else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
while (parent != 0) /* upwards to the root */
{
child = parent;
parent = T->parent[child];
if (T->left[parent] == child) d[i] += T->ld[parent];
else if (T->right[parent] == child) d[i] += T->rd[parent];
else ESL_XEXCEPTION(eslEINCONCEIVABLE, "oops");
}
}
/* In an ultrametric tree, all those distances must be equal.
*/
for (i = 1; i < T->N; i++)
if ((status = esl_DCompare(d[0], d[i], 0.0001)) != eslOK) break;
free(d);
return status;
ERROR:
if (d != NULL) free(d);
return status;
}
/* Function: esl_tree_Validate()
*
* Purpose: Validates the integrity of the data structure in <T>.
* Returns <eslOK> if the internal data in <T> are
* consistent and valid. Returns <eslFAIL> if not,
* and if a non-<NULL> message buffer <errbuf> has been
* provided by the caller, an informative message is
* left in <errbuf> describing the reason for the
* failure.
*
* Args: T - tree structure to validate
* errbuf - NULL, or a buffer of at least p7_ERRBUFSIZE
* chars to contain an error message upon
* any validation failure.
*/
int
esl_tree_Validate(ESL_TREE *T, char *errbuf)
{
int N;
int i, c;
int shouldbe;
int status;
if (errbuf != NULL) *errbuf = 0;
N = T->N; /* just to save writing T->N so many times below */
if (N < 2) ESL_XFAIL(eslFAIL, errbuf, "number of taxa is less than 2");
if (T->parent[0] != 0) ESL_XFAIL(eslFAIL, errbuf, "parent of root 0 should be set to 0");
if (T->nalloc < N) ESL_XFAIL(eslFAIL, errbuf, "number of taxa N is less than allocation");
/* Verify preorder tree numbering.
*/
for (i = 0; i < N-1; i++)
{
if (T->left[i] > 0 && T->left[i] < i)
ESL_XFAIL(eslFAIL, errbuf, "l child of node %d not in preorder", i);
if (T->right[i] > 0 && T->right[i] < i)
ESL_XFAIL(eslFAIL, errbuf, "r child of node %d not in preorder", i);
}
/* Range checks on values. */
for (i = 0; i < N-1; i++)
{
if (T->parent[i] < 0 || T->parent[i] > N-2)
ESL_XFAIL(eslFAIL, errbuf, "parent idx of node %d invalid", i);
if (T->left[i] < -(N-1) || T->left[i] > N-2)
ESL_XFAIL(eslFAIL, errbuf, "left child idx of node %d invalid", i);
if (T->right[i] < -(N-1) || T->right[i] > N-2)
ESL_XFAIL(eslFAIL, errbuf, "right child idx of node %d invalid", i);
if (T->ld[i] < 0.)
ESL_XFAIL(eslFAIL, errbuf, "negative l branch length at node %d", i);
if (T->rd[i] < 0.)
ESL_XFAIL(eslFAIL, errbuf, "negative r branch length at node %d", i);
if (T->cladesize != NULL && (T->cladesize[i] < 0 || T->cladesize[i] > N))
ESL_XFAIL(eslFAIL, errbuf, "invalid cladesize at node %d", i);
}
for (c = 0; c < N; c++)
if (T->taxaparent != NULL && (T->taxaparent[c] < 0 || T->taxaparent[c] > N-2))
ESL_XFAIL(eslFAIL, errbuf, "invalid taxaparent at node %d", c);
/* more sophisticated integrity checks on parent-child relations in
nodes ...*/
for (i = 1; i < T->N-1; i++)
if (T->left[T->parent[i]] != i && T->right[T->parent[i]] != i)
ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at internal node %d\n", i);
/* ...and between terminal nodes and taxa.
*/
if (T->taxaparent != NULL)
for (c = 0; c < T->N; c++)
if (T->left[T->taxaparent[c]] != -c && T->right[T->taxaparent[c]] != -c)
ESL_XFAIL(eslFAIL, errbuf, "parent/child link discrepancy at taxon %d\n", c);
/* check on cladesizes */
if (T->cladesize != NULL)
for (i = 0; i < T->N-1; i++)
{
shouldbe = 0;
if (T->left[i] > 0) shouldbe += T->cladesize[T->left[i]]; else shouldbe++;
if (T->right[i] > 0) shouldbe += T->cladesize[T->right[i]]; else shouldbe++;
if (shouldbe != T->cladesize[i])
ESL_XFAIL(eslFAIL, errbuf, "incorrect cladesize at node %d", i);
}
return eslOK;
ERROR:
return status;
}
/* Function: esl_tree_Destroy()
*
* Purpose: Frees an <ESL_TREE> object.
*/
void
esl_tree_Destroy(ESL_TREE *T)
{
if (T == NULL) return;
esl_free(T->parent);
esl_free(T->left);
esl_free(T->right);
esl_free(T->ld);
esl_free(T->rd);
esl_free(T->taxaparent);
esl_free(T->cladesize);
esl_arr2_Destroy((void **) T->taxonlabel, T->nalloc);
esl_arr2_Destroy((void **) T->nodelabel, T->nalloc-1);
free(T);
return;
}
/*----------------- end, ESL_TREE object -----------------------*/
/*****************************************************************
*# 2. Newick format i/o
*****************************************************************/
/* newick_validate_unquoted():
* Returns <eslOK> if we can represent <label> as an unquoted label
* in Newick format. (Spaces are ok, but will be converted to
* _ on output.)
*/
static int
newick_validate_unquoted(char *label)
{
char *sptr;
for (sptr = label; *sptr != '\0'; sptr++)
{
if (! isprint(*sptr)) return eslFAIL;
if (strchr("()[]':;,", *sptr) != NULL) return eslFAIL;
}
return eslOK;
}
/* newick_validate_quoted():
* Returns <eslOK> if we can represent <label> as a
* quoted label in Newick format. (Single quotes will
* be converted to '' on output.)
*/
static int
newick_validate_quoted(char *label)
{
char *sptr;
for (sptr = label; *sptr != '\0'; sptr++)
{
if (! isprint(*sptr)) return eslFAIL;
}
return eslOK;
}
/* newick_write_unquoted():
* Prints <label> to <fp> as an unquoted Newick label.
*/
static int
newick_write_unquoted(FILE *fp, char *label)
{
char *sptr;
for (sptr = label; *sptr != '\0'; sptr++)
{
if (*sptr == ' ') { if (fputc('_', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
else { if (fputc(*sptr, fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
}
return eslOK;
}
/* newick_write_quoted():
* Prints <label> to <fp> as a quoted Newick label.
*/
static int
newick_write_quoted(FILE *fp, char *label)
{
char *sptr;
if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
for (sptr = label; *sptr != '\0'; sptr++)
{
if (*sptr == '\'') { if (fprintf(fp, "''") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
else { if (fputc(*sptr, fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
}
if (fputc('\'', fp) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
return eslOK;
}
/* newick_write_taxonlabel():
* Print the label for taxon <v> to stream <fp>.
* Tries to print label as an unquoted label, then
* as a quoted label, (then fails).
* If label isn't available, does nothing.
* If label contains invalid characters, throws <eslECORRUPT>.
*/
static int
newick_write_taxonlabel(FILE *fp, ESL_TREE *T, int v)
{
int status;
if (T->taxonlabel == NULL || T->taxonlabel[v] == NULL)
{
if (T->show_numeric_taxonlabels && fprintf(fp, "%d", v) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
return eslOK;
}
if (! T->show_quoted_labels && newick_validate_unquoted(T->taxonlabel[v]) == eslOK)
status = newick_write_unquoted(fp, T->taxonlabel[v]);
else if (newick_validate_quoted(T->taxonlabel[v]) == eslOK)
status = newick_write_quoted(fp, T->taxonlabel[v]);
else
ESL_EXCEPTION(eslECORRUPT, "bad taxon label");
return status;
}
/* newick_write_nodelabel():
* Print the label for internal node <v> to stream <fp>.
* Tries to print label as an unquoted label, then
* as a quoted label.
* If label isn't available, does nothing.
* If tree's options say not to print node labels, does nothing.
* If label contains invalid characters, throws <eslECORRUPT>.
*/
static int
newick_write_nodelabel(FILE *fp, ESL_TREE *T, int v)
{
int status;
if (T->nodelabel == NULL) return eslOK;
if (T->nodelabel[v] == NULL) return eslOK;
if (T->show_node_labels != TRUE) return eslOK;
if (! T->show_quoted_labels && newick_validate_unquoted(T->nodelabel[v]) == eslOK)
status = newick_write_unquoted(fp, T->nodelabel[v]);
else if (newick_validate_quoted(T->nodelabel[v]) == eslOK)
status = newick_write_quoted(fp, T->nodelabel[v]);
else
ESL_EXCEPTION(eslECORRUPT, "bad node label\n");
return status;
}
/* newick_write_branchlength()
* Writes the branch length *to* <v>.
* If <v> is negative, it's a leaf; if <v> is positive, it's an internal node.
* You can't pass the root node 0 to this. 0 always means taxon 0.
* There is no branch to the root node.
*/
static int
newick_write_branchlength(FILE *fp, ESL_TREE *T, int v)
{
double branchlength;
if (! T->show_branchlengths) return eslOK;
if (T->taxaparent == NULL) ESL_EXCEPTION(eslEINVAL, "T must have taxaparent");
if (v <= 0) /* leaf */
{
if (T->left [T->taxaparent[-v]] == v) branchlength = T->ld[T->taxaparent[-v]];
else if (T->right[T->taxaparent[-v]] == v) branchlength = T->rd[T->taxaparent[-v]];
else ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
}
else /* internal node */
{
if (T->left [T->parent[v]] == v) branchlength = T->ld[T->parent[v]];
else if (T->right[T->parent[v]] == v) branchlength = T->rd[T->parent[v]];
else ESL_EXCEPTION(eslECORRUPT, "Can't find branch length");
}
if (fprintf(fp, ":%f", branchlength) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "newick tree write failed");
return eslOK;
}
/* Function: esl_tree_WriteNewick()
*
* Purpose: Writes tree <T> to stream <fp> in Newick format.
*
* Certain options are set in <T> to control output style,
* as follows.
*
* If <T->show_unrooted> is <TRUE>, <T> is printed as an
* unrooted tree starting with a trifurcation, a la PHYLIP
* format (default=<FALSE>).
*
* If <T->show_node_labels> is <TRUE>, then labels are
* shown for internal nodes, if any are available
* (default=<TRUE>).
*
* If <T->show_branchlengths> is <TRUE>, then branch
* lengths are shown, as opposed to just printing a labeled
* topology (default=<TRUE>).
*
* If <T->show_root_branchlength> is also <TRUE>, then a
* 0.0 branchlength is shown to the root node, a la Hein's
* TreeAlign Newick format (default=<FALSE>).
*
* If <T->show_quoted_labels> is <TRUE>, then all labels
* are shown in Newick's quoted format, as opposed to only
* using quoted labels where necessary (default=<FALSE>).
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation error.
* <eslEWRITE> on any system write error.
* <eslEINCONCEIVABLE> on internal error.
*
* Xref: SRE:STL11/74
*/
int
esl_tree_WriteNewick(FILE *fp, ESL_TREE *T)
{
ESL_STACK *vs = NULL;
ESL_STACK *cs = NULL;
int v;
char c;
int status;
if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; }
if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; }
if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
/* Initialization.
* Push a trifurcation (swallowing the right internal node) if unrooted;
* else push the first bifurcation.
*
* When we push a trifurcation, the branch lengths will come out fine
* on output, if the tree followed the correct convention of having
* a T->rd[0] = 0.0.
*/
if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
if (T->show_unrooted && T->right[0] > 0)
{
v = T->right[0];
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->left[v])) != eslOK) goto ERROR;
}
else
{
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->right[0])) != eslOK) goto ERROR;
}
if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->left[0])) != eslOK) goto ERROR;
/* Main iteration. Pop off stacks 'til they're empty.
*/
while ((status = esl_stack_CPop(cs, &c)) == eslOK)
{
if (c == ',') { /* comma doesn't have a v stacked with it */
if (fputc(',', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
continue;
}
if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
switch (c) {
case 'x': /* a subtree, which could be a node or a taxon: */
if (v > 0) /* internal node 1..N-2*/
{
if (fputc('(', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
if ((status = esl_stack_CPush(cs, ')')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, v)) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->right[v])) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, ',')) != eslOK) goto ERROR;
if ((status = esl_stack_CPush(cs, 'x')) != eslOK) goto ERROR;
if ((status = esl_stack_IPush(vs, T->left[v])) != eslOK) goto ERROR;
}
else /* taxon -(N-1)..0 */
{ /* -v below to convert taxon code to 0..N-1 */
if ((status = newick_write_taxonlabel (fp, T, -v)) != eslOK) goto ERROR;
if ((status = newick_write_branchlength(fp, T, v)) != eslOK) goto ERROR;
}
break;
case ')': /* closing an internal node. v > 0 is a node code. */
if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
if ((status = newick_write_nodelabel (fp, T, v)) != eslOK) goto ERROR;
if ((status = newick_write_branchlength(fp, T, v)) != eslOK) goto ERROR;
break;
default:
ESL_XEXCEPTION(eslEINCONCEIVABLE, "bad state code");
break;
}
}
/* Termination
*/
if (fputc(')', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
if ((status = newick_write_nodelabel(fp, T, 0)) != eslOK) goto ERROR;
if (T->show_branchlengths && T->show_root_branchlength)
{ if (fprintf(fp, ":0.0") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed"); }
if (fputc(';', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "newick tree write failed");
esl_stack_Destroy(vs);
esl_stack_Destroy(cs);
return eslOK;
ERROR:
if (vs != NULL) esl_stack_Destroy(vs);
if (cs != NULL) esl_stack_Destroy(cs);
return status;
}
/* newick_advance_buffer()
*
* Advance the read buffer by one character; reload it
* if we reach the end. <eslOK> on success, and <eslEOF>
* if the read fails.
*/
static int
newick_advance_buffer(FILE *fp, char *buf, int *pos, int *nc)
{
(*pos)++;
if (*pos == *nc)
{
*nc = fread(buf, sizeof(char), 4096, fp);
if (*nc == 0) return eslEOF;
*pos = 0;
}
return eslOK;
}
/* newick_skip_whitespace()
*
* Given the 4k input buffer <buf>, which currently contains
* <*nc> total characters and is positioned at position <*pos>,
* move <*pos> to be at the first nonwhitespace character,
* skipping any Newick comments ([...]) encountered. Read
* new data from the stream <fp> into <buf> as needed.
*
* Return <eslOK> on success. <*pos> is reset to point at a
* non-whitespace input character. <*nc> may be reset and the contents
* of <buf> altered if a new block was read from <fp> into <buf>.
*
* Returns <eslEOF> if end-of-file is reached in <fp> before a data
* character is found, or if a read error occurs.
*/
static int
newick_skip_whitespace(FILE *fp, char *buf, int *pos, int *nc)
{
int commentlevel = 0;
while (commentlevel > 0 || isspace(buf[*pos]) || buf[*pos] == '[')
{
if (buf[*pos] == '[') commentlevel++;
if (buf[*pos] == ']') commentlevel--;
if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) return eslEOF;
}
return eslOK;
}
/* newick_parse_quoted_label()
*
* On entry, buf[pos] == '\'': the opening single quote.
* On exit, buf[pos] is positioned at the next data character following the closing
* single quote; possibly the ':' for a branch length;
* and <ret_label> points to a newly allocated, NUL-terminated string
* containing the label that was read (possibly the empty string).
* Returns eslOK on success.
*
* Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
*/
static int
newick_parse_quoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
{
char *label = NULL;
void *tmp;
int n = 0;
int nalloc = 0;
int status;
nalloc = 32;
ESL_ALLOC(label, sizeof(char) * nalloc);
/* advance past the opening ' */
if (buf[*pos] != '\'') { status = eslEFORMAT; goto ERROR; }
if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
/* skip leading whitespace (\n and comments forbidden in quoted label) */
while (buf[*pos] == '\t' || buf[*pos] == ' ')
if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
/* Read the label */
while (1) {
if (buf[*pos] == '\'') { /* watch out for escaped single quotes, '' */
if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
if (buf[*pos] != '\'') break; /* we've just moved past the last ' */
}
label[n++] = buf[*pos];
if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
nalloc *= 2;
}
}
/* backtrack over any trailing whitespace and nul-terminate. */
while (n>0 && isspace(label[n-1])) n--;
label[n] = '\0';
*ret_label = label;
return eslOK;
ERROR:
if (label != NULL) { free(label); *ret_label = NULL; }
return status;
}
/* newick_parse_unquoted_label
*
* On entry, buf[pos] == first character in the label.
* On exit, buf[pos] is positioned at the next data character following the end
* of the label -- one of "),\t\n;[:" --
* and <ret_label> points to a newly allocated, NUL-terminated string
* containing the label that was read (possibly the empty string).
* Returns eslOK on success.
*
* Returns eslEFORMAT on parse error, eslEOF if it runs out of data.
*/
static int
newick_parse_unquoted_label(FILE *fp, char *buf, int *pos, int *nc, char **ret_label)
{
char *label = NULL;
char *tmp = NULL;
int n = 0;
int nalloc = 0;
int status;
nalloc = 32;
ESL_ALLOC(label, sizeof(char) * nalloc);
while (1) {
if (strchr("(]", buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) { break; }
label[n++] = buf[*pos];
if (newick_advance_buffer(fp, buf, pos, nc) == eslEOF) { status = eslEOF; goto ERROR; }
if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
ESL_RALLOC(label, tmp, sizeof(char) * (nalloc * 2));
nalloc *= 2;
}
}
label[n] = '\0';
*ret_label = label;
return eslOK;
ERROR:
if (label != NULL) { free(label); *ret_label = NULL; }
return status;
}
/* newick_parse_branchlength
*
* On entry, buf[pos] == ':'
* On exit, buf[pos] is positioned at the next data character following the end
* of the branchlength -- one of "),\t\n;[:"
* and <ret_d> is the branch length that was read.
*
* Returns eslOK on success;
*
* Returns eslEFORMAT on parse error (including nonexistent branch lengths),
* eslEOF if it runs out of data in the file.
*/
static int
newick_parse_branchlength(FILE *fp, char *buf, int *pos, int *nc, double *ret_d)
{
char *branch = NULL;
char *tmp = NULL;
int n = 0;
int nalloc = 0;
int status;
nalloc = 32;
ESL_ALLOC(branch, sizeof(char) * nalloc);
if (buf[*pos] != ':') { status = eslEFORMAT; goto ERROR; }
if (( status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
while (1) {
if (strchr("(]", buf[*pos]) != NULL) { status = eslEFORMAT; goto ERROR; }
if (strchr(" \t\n)[':;,", buf[*pos]) != NULL) break;
branch[n++] = buf[*pos];
if ((status = newick_advance_buffer(fp, buf, pos, nc)) != eslOK) goto ERROR;
if (n == (nalloc-1)) { /* reallocate label if it fills, leave room for NUL */
ESL_RALLOC(branch, tmp, sizeof(char) * (nalloc * 2));
nalloc *= 2;
}
}
branch[n] = '\0';
*ret_d = strtod(branch, &tmp);
if (n == 0 || tmp != branch+n) { status = eslEFORMAT; goto ERROR; }
free(branch);
return eslOK;
ERROR:
if (branch != NULL) free(branch);
*ret_d = 0.;
return status;
}
/* Function: esl_tree_ReadNewick()
* Synopsis: Input a Newick format tree.
*
* Purpose: Read a Newick format tree from an open input stream <fp>.
* Return the new tree in <ret_T>.
*
* The new tree <T> will have the optional <T->taxonlabel> and
* <T->nodelabel> arrays allocated, containing names of all the
* taxa and nodes. Whenever no label appeared in the Newick file
* for a node or taxon, the label is set to the empty string.
*
* Caller may optionally provide an <errbuf> of at least
* <eslERRBUFSIZE> chars, to retrieve diagnostic information
* in case of a parsing problem; or <errbuf> may be passed as
* <NULL>.
*
* Args: fp - open input stream
* errbuf - NULL, or allocated space for >= eslERRBUFSIZE chars
* ret_T - RETURN: the new tree.
*
* Returns: Returns <eslOK> on success, and <ret_T> points
* to the new tree.
*
* Returns <eslEFORMAT> on parse errors, such as premature EOF
* or bad syntax in the Newick file. In this case, <ret_T> is
* returned NULL, and the <errbuf> (if provided> contains an
* informative error message.
*
* Throws: <eslEMEM> on memory allocation errors.
* <eslEINCONCEIVABLE> may also arise in case of internal bugs.
*
* Xref: STL11/75
*/
int
esl_tree_ReadNewick(FILE *fp, char *errbuf, ESL_TREE **ret_T)
{
ESL_TREE *T = NULL; /* the new, growing tree */
ESL_STACK *cs = NULL; /* state stack: possible states are LRX);, */
ESL_STACK *vs = NULL; /* node index stack: LRX) states are associated with node #'s */
int status;
char buf[4096]; /* 4K input buffer */
int pos,nc; /* position in buf, and number of chars in buf */
char c; /* current state */
int v; /* current node idx */
int currnode;
int currtaxon;
char *label; /* a parsed label */
double d; /* a parsed branch length */
if (errbuf != NULL) *errbuf = '\0';
if ((vs = esl_stack_ICreate()) == NULL) { status = eslEMEM; goto ERROR; };
if ((cs = esl_stack_CCreate()) == NULL) { status = eslEMEM; goto ERROR; };
/* Create the tree, initially allocated for 32 taxa.
* Allocate for taxon and node labels, too.
*/
if ((T = esl_tree_CreateGrowable(32)) == NULL) { status = eslEMEM; goto ERROR; };
ESL_ALLOC(T->taxonlabel, sizeof(char *) * 32);
ESL_ALLOC(T->nodelabel, sizeof(char *) * 31);
for (currtaxon = 0; currtaxon < 32; currtaxon++) T->taxonlabel[currtaxon] = NULL;
for (currnode = 0; currnode < 31; currnode++) T->nodelabel[currnode] = NULL;
/* Load the input buffer
*/
if ((nc = fread(buf, sizeof(char), 4096, fp)) == 0)
ESL_XFAIL(eslEFORMAT, errbuf, "file is empty.");
pos = 0;
/* Initialization:
* create the root node in the tree;
* push L,R...); onto the stacks;
* swallow the first ( in the file.
*
* A note on memory management in the growing tree:
* we are going to keep T->N set to the number of taxa
* that the tree *will* contain, given the number of nodes
* it currently *does* contain. Before we try to add
* any new node, we call the Grow() routine, which will
* check T->N against T->nalloc. This strategy works
* because nodes always get added before their children
* taxa, so the # of taxa is always <= nodes-1: that is,
* our need for reallocation is driven by new nodes,
* never by new taxa; that is, if we have enough room
* for nodes, we automatically have enough room for the
* taxa.
*/
T->parent[0] = 0;
currnode = 1;
currtaxon = 0;
T->N = 2; /* c.f. note above: T->N is the # of taxa we *would* hold, given currnode=1*/
if (esl_stack_CPush(cs, ';') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, ')') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'L') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, 0) != eslOK) { status = eslEMEM; goto ERROR; };
if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
if (buf[pos] != '(')
ESL_XFAIL(eslEFORMAT, errbuf, "file is not in Newick format.");
if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
/* Iteration.
*/
while ((status = esl_stack_CPop(cs, &c)) == eslOK)
{
if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
if (c == ',')
{
if (buf[pos] != ',')
ESL_XFAIL(eslEFORMAT, errbuf, "expected a comma, saw %c.", buf[pos]);
if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
continue;
}
else if (c == ';')
{
if (buf[pos] != ';')
ESL_XFAIL(eslEFORMAT, errbuf, "expected a semicolon, saw %c.", buf[pos]);
break; /* end of the Newick file */
}
else if (c == 'L' || c == 'R') /* c says, we expect to add a subtree next */
{
if (esl_stack_IPop(vs, &v) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* v = parent of currnode */
if (buf[pos] == '(') /* a new interior node attaches to v */
{
if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; }; /* c.f. memory management note: check that we can add new node */
T->parent[currnode] = v;
if (c == 'L') T->left[v] = currnode;
else T->right[v] = currnode;
if (esl_stack_CPush(cs, ')') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'L') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
currnode++; /* T->N == # of internal nodes/idx of next internal node */
}
else /* a taxon attaches to v */
{
if (buf[pos] == '\'') { /* a quoted label, for a new taxon attached to v*/
if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted taxon label");
} else { /* an unquoted label, for a new taxon attached to v */
if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted taxon label");
}
if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely");
d = 0.;
if (buf[pos] == ':') {
if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
}
if (c == 'L') { T->left[v] = -currtaxon; T->ld[v] = d; }
else { T->right[v] = -currtaxon; T->rd[v] = d; }
T->taxonlabel[currtaxon] = label;
currtaxon++;
}
}
else if (c == ')') /* c says, expect to close an interior node next */
{
/* get v = the interior node we're closing, naming, and setting a branch length to */
if (( status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
if (buf[pos] != ')') ESL_XFAIL(eslEFORMAT, errbuf, "Parse error: expected ) to close node #%d\n", v);
if (newick_advance_buffer(fp, buf, &pos, &nc) == eslEOF)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
if (buf[pos] == '\'') {
if ((status = newick_parse_quoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a quoted node label");
} else { /* an unquoted label, for a new taxon attached to v */
if ((status = newick_parse_unquoted_label(fp, buf, &pos, &nc, &label)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse an unquoted node label");
}
if (newick_skip_whitespace(fp, buf, &pos, &nc) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "file ended prematurely.");
d = 0.;
if (buf[pos] == ':') {
if ((status = newick_parse_branchlength(fp, buf, &pos, &nc, &d)) != eslOK)
ESL_XFAIL(eslEFORMAT, errbuf, "failed to parse a branch length");
}
if (v > 0) { /* branch length to root node is meaningless, ignore it */
if (T->left [T->parent[v]] == v) T->ld[T->parent[v]] = d;
else if (T->right[T->parent[v]] == v) T->rd[T->parent[v]] = d;
}
T->nodelabel[v] = label;
}
else if (c == 'X') /* optionally, multifurcations: if we see a comma, we have another node to deal with */
{
if ((status = esl_stack_IPop(vs, &v)) != eslOK) goto ERROR;
if (buf[pos] != ',') continue;
if (esl_tree_Grow(T) != eslOK) { status = eslEMEM; goto ERROR; }; /* c.f. memory management note: check that we can add new node */
/* v = the interior node that is multifurcated.
* What we're going to do is to create a new node y; move the existing right child of v
* to the left child of y; and connect y as the new right child of v with a branch
* length of zero. The right child of y is now open. Then, we push a X->,RX production, so the next subtree will
* be parsed as the right child of y. We can do this ad infinitum, resulting in
* a representation of a multifurcation as, for example, a (A,(B,(C,(D,E)))) binary
* subtree with zero length interior branches for a five-way multifurcation.
*
* This swapping destroys the order of the nodes: they will not be in preorder traversal.
* This is temporarily ok. We renumber later.
*/
T->left[currnode] = T->right[v];
T->ld[currnode] = T->rd[v];
T->parent[currnode] = v;
if (T->right[v] > 0)
T->parent[T->right[v]] = currnode;
T->right[v] = currnode;
T->rd[v] = 0.;
if (esl_stack_CPush(cs, 'X') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, 'R') != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_IPush(vs, currnode) != eslOK) { status = eslEMEM; goto ERROR; };
if (esl_stack_CPush(cs, ',') != eslOK) { status = eslEMEM; goto ERROR; };
currnode++;
}
T->N = currnode + 1; /* c.f. memory management note: keep T->N = # of taxa the tree *would* hold, given currnode */
}
esl_tree_RenumberNodes(T);
esl_stack_Destroy(cs);
esl_stack_Destroy(vs);
*ret_T = T;
return eslOK;
ERROR:
if (T != NULL) esl_tree_Destroy(T);
if (cs != NULL) esl_stack_Destroy(cs);
if (vs != NULL) esl_stack_Destroy(vs);
*ret_T = NULL;
return status;
}
/*-------------------- end, Newick i/o --------------------------*/
/*****************************************************************
* 3. Tree comparison algorithms
*****************************************************************/
/* Function: esl_tree_Compare()
*
* Purpose: Given two trees <T1> and <T2> for the same
* set of <N> taxa, compare the topologies of the
* two trees.
*
* The routine must be able to determine which taxa are
* equivalent in <T1> and <T2>. If <T1> and <T2> both have
* taxon labels set, then the routine compares labels.
* This is the usual case. (Therefore, the <N> labels must
* all be different, or the routine will be unable to do
* this mapping uniquely.) As a special case, if neither
* <T1> nor <T2> has taxon labels, then the indexing of
* taxa <0..N-1> is assumed to be exactly the same in the
* two trees. (And if one tree has labels and the other
* does not, an <eslEINVAL> exception is thrown.)
*
* For comparing unrooted topologies, be sure that <T1> and
* <T2> both obey the unrooted tree convention that the
* "root" is placed on the branch to taxon 0. (That is,
* <T->left[0] = 0>.)
*
* Returns: <eslOK> if tree topologies are identical. <eslFAIL>
* if they aren't.
*
* Throws: <eslEMEM> on allocation error. <eslEINVAL> if the taxa in
* the trees can't be mapped uniquely and completely to
* each other (because one tree doesn't have labels and
* one does, or because the labels aren't unique, or the
* two trees have different taxa).
*/
int
esl_tree_Compare(ESL_TREE *T1, ESL_TREE *T2)
{
int *Mg = NULL; /* the M(g) tree-mapping function for internal nodes [0..N-2] */
int *Mgt = NULL; /* the M(g) tree-mapping function for leaves (taxa), [0..N-1] */
int g, child; /* node indices for parent, children */
int a,b;
int status;
if (T1->N != T2->N) ESL_EXCEPTION(eslEINVAL, "trees don't have the same # of taxa");
/* We need taxon parent map in tree 2, but not tree 1.
*/
if ((status = esl_tree_SetTaxaParents(T2)) != eslOK) goto ERROR;
/* We're going to use the tree mapping function M(g) [Goodman79].
* In the implementation here, we split it into two, Mg[] for internal
* nodes 0..N-2 and Mgt[] for taxa 0..N-1.
*
* Mg[g] for node g in T1 is the index of the lowest node in T2
* that contains the same children taxa as the subtree
* under g in T1.
*
* For the taxa, Mgt[g] for taxon g in T1 is the index of the
* corresponding taxon in T2. If neither tree has taxon labels
* Mgt[g] = g for all g. Otherwise we have to compare labels. Right
* now, we do this by brute force, which is O(N^2); if this ever
* becomes rate limiting, replace it with a keyhash to make it O(N)
* (and in fact, the keyhash of taxon names could even become part
* of the ESL_TREE).
*/
ESL_ALLOC(Mg, sizeof(int) * (T1->N-1));
ESL_ALLOC(Mgt, sizeof(int) * (T1->N));
if (T1->taxonlabel != NULL && T2->taxonlabel != NULL)
{
esl_vec_ISet(Mgt, T1->N, -1); /* flags for "unset" */
for (a = 0; a < T1->N; a++)
{
for (b = 0; b < T1->N; b++)
if (strcmp(T1->taxonlabel[a], T2->taxonlabel[b]) == 0)
{ Mgt[a] = b; break; }
}
for (a = 0; a < T1->N; a++)
if (Mgt[a] == -1) ESL_XEXCEPTION(eslEINVAL, "couldn't map taxa");
}
else if (T1->taxonlabel == NULL && T2->taxonlabel == NULL)
{
for (a = 0; a < T1->N; a++)
Mgt[a] = a;
}
else
ESL_XEXCEPTION(eslEINVAL, "either both trees must have taxon labels, or neither");
/* Finally, we use the SDI algorithm [ZmasekEddy01] to construct
* M(g) for internal nodes, by postorder traversal of T1.
*/
for (g = T1->N-2; g >= 0; g--)
{
child = T1->left[g];
if (child <= 0) a = T2->taxaparent[Mgt[-child]];
else a = T2->parent[Mg[child]];
child = T1->right[g];
if (child <= 0) b = T2->taxaparent[Mgt[-child]];
else b = T2->parent[Mg[child]];
/* a shortcut in SDI: special case for exact tree comparison: */
if (a != b) { free(Mg); free(Mgt); return eslFAIL; }
Mg[g] = a;
}
free(Mg);
free(Mgt);
return eslOK;
ERROR:
if (Mg != NULL) free(Mg);
if (Mgt != NULL) free(Mgt);
return status;
}
/*----------------- end, tree comparison -----------------------*/
/*****************************************************************
* 4. Clustering algorithms for tree construction.
*****************************************************************/
/* cluster_engine()
*
* Implements four clustering algorithms for tree construction:
* UPGMA, WPGMA, single-linkage, and maximum-linkage. These differ
* only by the rule used to construct new distances after joining
* two clusters i,j.
*
* Input <D_original> is a symmetric distance matrix, for <D->n> taxa.
* The diagonal is all 0's, and off-diagonals are $\geq 0$. <D->n>
* must be at least two.
*
* <mode> is one of <eslUPGMA>, <eslWPGMA>, <eslSINGLE_LINKAGE>, or
* <eslCOMPLETE_LINKAGE>: a flag specifying which algorithm to use.
*
* The output is a tree structure, returned in <ret_T>.
*
* Returns <eslOK> on success.
*
* Throws <eslEMEM> on allocation failure.
*
* Complexity: O(N^2) in memory, O(N^3) in time.
*
* This function can be optimized. Memory usage is at least
* 4x more than necessary. First, we don't need to make a copy of D
* if the caller doesn't mind it being consumed. Second, D only
* needs to be lower- or upper-triangular, because it's symmetric,
* but that requires changing dmatrix module. In time,
* O(N^2 log N) if not O(N^2) should be possible, by being more
* sophisticated about identifying the minimum element;
* see Gronau and Moran (2006).
*
*/
static int
cluster_engine(ESL_DMATRIX *D_original, int mode, ESL_TREE **ret_T)
{
ESL_DMATRIX *D = NULL;
ESL_TREE *T = NULL;
double *height = NULL; /* height of internal nodes [0..N-2] */
int *idx = NULL; /* taxa or node index of row/col in D [0..N-1] */
int *nin = NULL; /* # of taxa in clade in row/col in D [0..N-1] */
int N;
int i = 0, j = 0;
int row,col;
double minD;
int status;
/* Contract checks.
*/
ESL_DASSERT1((D_original != NULL)); /* matrix exists */
ESL_DASSERT1((D_original->n == D_original->m)); /* D is NxN square */
ESL_DASSERT1((D_original->n >= 2)); /* >= 2 taxa */
#if (eslDEBUGLEVEL >=1)
for (i = 0; i < D_original->n; i++) {
assert(D_original->mx[i][i] == 0.); /* self-self d = 0 */
for (j = i+1; j < D_original->n; j++) /* D symmetric */
assert(D_original->mx[i][j] == D_original->mx[j][i]);
}
#endif
/* Allocations.
* NxN copy of the distance matrix, which we'll iteratively whittle down to 2x2;
* tree for N taxa;
*/
if ((D = esl_dmatrix_Clone(D_original)) == NULL) return eslEMEM;
if ((T = esl_tree_Create(D->n)) == NULL) return eslEMEM;
ESL_ALLOC(idx, sizeof(int) * D->n);
ESL_ALLOC(nin, sizeof(int) * D->n);
ESL_ALLOC(height, sizeof(double) * (D->n-1));
for (i = 0; i < D->n; i++) idx[i] = -i; /* assign taxa indices to row/col coords */
for (i = 0; i < D->n; i++) nin[i ] = 1; /* each cluster starts as 1 */
for (i = 0; i < D->n-1; i++) height[i] = 0.;
/* If we're doing either single linkage or complete linkage clustering,
* we will construct a "linkage tree", where ld[v], rd[v] "branch lengths"
* below node v are the linkage value for clustering node v; thus
* ld[v] == rd[v] in a linkage tree.
* For UPGMA or WPGMA, we're building an additive tree, where ld[v] and
* rd[v] are branch lengths.
*/
if (mode == eslSINGLE_LINKAGE || mode == eslCOMPLETE_LINKAGE)
T->is_linkage_tree = TRUE;
for (N = D->n; N >= 2; N--)
{
/* Find minimum in our current N x N matrix.
* (Don't init minD to -infinity; linkage trees use sparse distance matrices
* with -infinity representing unlinked.)
*/
minD = D->mx[0][1]; i = 0; j = 1; /* init with: if nothing else, try to link 0-1 */
for (row = 0; row < N; row++)
for (col = row+1; col < N; col++)
if (D->mx[row][col] < minD)
{
minD = D->mx[row][col];
i = row;
j = col;
}
/* We're joining node at row/col i with node at row/col j.
* Add node (index = N-2) to the tree at height minD/2.
*/
T->left[N-2] = idx[i];
T->right[N-2] = idx[j];
if (T->is_linkage_tree) height[N-2] = minD;
else height[N-2] = minD / 2.;
/* Set the branch lengths (additive trees) or heights (linkage trees)
*/
T->ld[N-2] = T->rd[N-2] = height[N-2];
if (! T->is_linkage_tree) {
if (idx[i] > 0) T->ld[N-2] = ESL_MAX(0., T->ld[N-2] - height[idx[i]]); // max to 0, to avoid fp roundoff giving us negative length
if (idx[j] > 0) T->rd[N-2] = ESL_MAX(0., T->rd[N-2] - height[idx[j]]);
}
/* If either node was an internal node, record parent in it.
*/
if (idx[i] > 0) T->parent[idx[i]] = N-2;
if (idx[j] > 0) T->parent[idx[j]] = N-2;
/* Now, build a new matrix by merging row i+j and col i+j.
* 1. move j to N-1 (unless it's already there)
* 2. move i to N-2 (unless it's already there)
*/
if (j != N-1)
{
for (row = 0; row < N; row++)
ESL_SWAP(D->mx[row][N-1], D->mx[row][j], double);
for (col = 0; col < N; col++)
ESL_SWAP(D->mx[N-1][col], D->mx[j][col], double);
ESL_SWAP(idx[j], idx[N-1], int);
ESL_SWAP(nin[j], nin[N-1], int);
}
if (i != N-2)
{
for (row = 0; row < N; row++)
ESL_SWAP(D->mx[row][N-2], D->mx[row][i], double);
for (col = 0; col < N; col++)
ESL_SWAP(D->mx[N-2][col], D->mx[i][col], double);
ESL_SWAP(idx[i], idx[N-2], int);
ESL_SWAP(nin[i], nin[N-2], int);
}
i = N-2;
j = N-1;
/* 3. merge i (now at N-2) with j (now at N-1)
* according to the desired clustering rule.
*/
for (col = 0; col < N; col++)
{
switch (mode) {
case eslUPGMA:
D->mx[i][col] = (nin[i] * D->mx[i][col] + nin[j] * D->mx[j][col]) / (double) (nin[i] + nin[j]);
break;
case eslWPGMA: D->mx[i][col] = (D->mx[i][col] + D->mx[j][col]) / 2.; break;
case eslSINGLE_LINKAGE: D->mx[i][col] = ESL_MIN(D->mx[i][col], D->mx[j][col]); break;
case eslCOMPLETE_LINKAGE: D->mx[i][col] = ESL_MAX(D->mx[i][col], D->mx[j][col]); break;
default: ESL_XEXCEPTION(eslEINCONCEIVABLE, "no such strategy");
}
D->mx[col][i] = D->mx[i][col];
}
/* row/col i is now the new cluster, and it corresponds to node N-2
* in the tree (remember, N is decrementing at each iteration).
* row/col j (N-1) falls away when we go back to the start of the loop
* and decrement N.
*/
nin[i] += nin[j];
idx[i] = N-2;
}
esl_dmatrix_Destroy(D);
free(height);
free(idx);
free(nin);
if (ret_T != NULL) *ret_T = T;
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
if (T != NULL) esl_tree_Destroy(T);
if (height != NULL) free(height);
if (idx != NULL) free(idx);
if (nin != NULL) free(nin);
if (ret_T != NULL) *ret_T = NULL;
return status;
}
/* Function: esl_tree_UPGMA()
*
* Purpose: Given distance matrix <D>, use the UPGMA algorithm
* to construct a tree <T>.
*
* Returns: <eslOK> on success; the tree is returned in <ret_T>,
* and must be freed by the caller with <esl_tree_Destroy()>.
*
* Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
*/
int
esl_tree_UPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
{
return cluster_engine(D, eslUPGMA, ret_T);
}
/* Function: esl_tree_WPGMA()
*
* Purpose: Given distance matrix <D>, use the WPGMA algorithm
* to construct a tree <T>.
*
* Returns: <eslOK> on success; the tree is returned in <ret_T>,
* and must be freed by the caller with <esl_tree_Destroy()>.
*
* Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
*/
int
esl_tree_WPGMA(ESL_DMATRIX *D, ESL_TREE **ret_T)
{
return cluster_engine(D, eslWPGMA, ret_T);
}
/* Function: esl_tree_SingleLinkage()
*
* Purpose: Given distance matrix <D>, construct a single-linkage
* (minimum distances) clustering tree <T>.
*
* Returns: <eslOK> on success; the tree is returned in <ret_T>,
* and must be freed by the caller with <esl_tree_Destroy()>.
*
* Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
*/
int
esl_tree_SingleLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
{
return cluster_engine(D, eslSINGLE_LINKAGE, ret_T);
}
/* Function: esl_tree_CompleteLinkage()
*
* Purpose: Given distance matrix <D>, construct a complete-linkage
* (maximum distances) clustering tree <T>.
*
* Returns: <eslOK> on success; the tree is returned in <ret_T>,
* and must be freed by the caller with <esl_tree_Destroy()>.
*
* Throws: <eslEMEM> on allocation problem, and <ret_T> is set <NULL>.
*/
int
esl_tree_CompleteLinkage(ESL_DMATRIX *D, ESL_TREE **ret_T)
{
return cluster_engine(D, eslCOMPLETE_LINKAGE, ret_T);
}
/*----------------- end, clustering algorithms ----------------*/
/*****************************************************************
* 5. Generating simulated trees
*****************************************************************/
/* Function: esl_tree_Simulate()
* Synopsis: Generate a random rooted ultrametric tree.
*
* Purpose: Generate a random rooted ultrametric tree of <N> taxa,
* using the algorithm of Kuhner and Felsenstein (1994).
*
* The branch lengths are generated by choosing <N-1>
* exponentially distributed split times, with decreasing
* expectations of $\frac{1}{2},\frac{1}{3}..\frac{1}{N}$
* as the simulation proceeds from the root. Thus the
* total expected branch length on the tree is
* $\sum_{k=2}^{N} \frac{1}{k}$.
*
* Args: r - random number source
* N - number of taxa (>= 2)
* ret_T - RETURN: sampled tree
*
* Returns: <eslOK> on success, and the new tree is allocated
* here and returned via <ret_tree>; caller is
* responsible for free'ing it.
*
* Throws: <eslEMEM> on allocation failure, in which case
* the <ret_T> is returned <NULL>.
*
* Xref: STL11/65.
*/
int
esl_tree_Simulate(ESL_RANDOMNESS *r, int N, ESL_TREE **ret_T)
{
ESL_TREE *T = NULL;
int *branchpapa = NULL;
int *branchside = NULL;
int nactive;
double d;
int node;
int bidx; /* index of an active branch */
int status;
ESL_DASSERT1( (r != NULL) );
ESL_DASSERT1( (N >= 2) );
/* Kuhner/Felsenstein uses a list of active branches,
* which we implement by tracking the index of the parent
* node (in <branchpapa>) and a 0/1 flag (in <branchside>)
* for the branch to the left vs. right child.
*/
if ((T = esl_tree_Create(N)) == NULL) { status = eslEMEM; goto ERROR; }
ESL_ALLOC(branchpapa, sizeof(int) * N);
ESL_ALLOC(branchside, sizeof(int) * N);
/* Initialize: add two branches from the root
* onto the active list, and set internal node
* counter to start at 1.
*/
branchpapa[0] = 0; branchside[0] = 0;
branchpapa[1] = 0; branchside[1] = 1;
nactive = 2;
node = 1;
/* Algorithm proceeds by iterating:
* 1. choose random time <d> from exponential(1/nactive)
* 2. choose random active branch, <bidx>
* 3. add new <node> to active branch at length d
* 4. add d to all other active branches
* 5. delete the old parent branch from the active list,
* add the two new child branches to the active list
*/
while (nactive < N)
{
d = (double) nactive * -log(esl_rnd_UniformPositive(r));
bidx = esl_rnd_Roll(r, nactive);
T->parent[node] = branchpapa[bidx];
if (branchside[bidx] == 0) {
T->left[branchpapa[bidx]] = node;
T->ld [branchpapa[bidx]] += d;
} else {
T->right[branchpapa[bidx]] = node;
T->rd [branchpapa[bidx]] += d;
}
ESL_SWAP(branchpapa[bidx], branchpapa[nactive-1], int);
ESL_SWAP(branchside[bidx], branchside[nactive-1], int);
for (bidx = 0; bidx < nactive-1; bidx++) {
if (branchside[bidx] == 0) T->ld[branchpapa[bidx]] += d;
else T->rd[branchpapa[bidx]] += d;
}
/* delete the branch at nactive-1 that we just added to;
* replace it with two new branches
*/
branchpapa[nactive-1] = node; branchside[nactive-1] = 0;
branchpapa[nactive] = node; branchside[nactive] = 1;
node++;
nactive++;
}
/* Terminate by adding the N taxa to the N active branches.
*/
d = (double) N * -log(esl_rnd_UniformPositive(r));
for (bidx = 0; bidx < N; bidx++)
{
if (branchside[bidx] == 0) {
T->left[branchpapa[bidx]] = -bidx; /* taxa indices stored as neg #'s */
T->ld [branchpapa[bidx]] += d;
} else {
T->right[branchpapa[bidx]] = -bidx;
T->rd [branchpapa[bidx]] += d;
}
}
*ret_T = T;
free(branchpapa);
free(branchside);
return eslOK;
ERROR:
if (T != NULL) esl_tree_Destroy(T);
if (branchpapa != NULL) free(branchpapa);
if (branchside != NULL) free(branchside);
*ret_T = NULL;
return status;
}
/* Function: esl_tree_ToDistanceMatrix()
* Synopsis: Obtain a pairwise distance matrix from a tree.
*
* Purpose: Given tree <T>, calculate a pairwise distance matrix
* and return it in <ret_D>.
*
* Note: Algorithm here is O(N^3). It can probably be improved.
* There ought to be a more efficient recursion that
* saves recalculating node-node distances inside the tree.
* All we do here is a brute force, upwards O(N) LCA
* search for each of the N^2 taxon pairs.
*
* Args: T - input tree
* ret_D - RETURN: the new distance matrix
*
* Returns: <eslOK> on success, and <ret_D> points to the distance
* matrix, which caller is responsible for free'ing with
* <esl_dmatrix_Destroy()>.
*
* Throws: <eslEMEM> on allocation failure, in which case
* <ret_D> is returned <NULL>.
*
* Xref: STL11/66.
*/
int
esl_tree_ToDistanceMatrix(ESL_TREE *T, ESL_DMATRIX **ret_D)
{
ESL_DMATRIX *D = NULL;
int i,j; /* a pair of taxa {0..N-1} */
int a,b; /* a pair of internal nodes {0..N-2} */
int p; /* a tmp parent index */
double d; /* ij distance */
int status;
D = esl_dmatrix_Create(T->N, T->N); /* creates a NxN square symmetric matrix; really only need triangular */
if (D == NULL) { status = eslEMEM; goto ERROR; }
if ((status = esl_tree_SetTaxaParents(T)) != eslOK) goto ERROR;
for (i = 0; i < T->N; i++)
{
D->mx[i][i] = 0.; /* by definition */
for (j = i+1; j < T->N; j++)
{
a = T->taxaparent[i];
b = T->taxaparent[j];
d = (T->left[a] == -i) ? T->ld[a] : T->rd[a];
d += (T->left[b] == -j) ? T->ld[b] : T->rd[b];
while (a != b) /* a brute force LCA algorithm */
{
if (a < b) ESL_SWAP(a, b, int);
p = T->parent[a];
d += (T->left[p] == a) ? T->ld[p] : T->rd[p];
a = p;
}
D->mx[i][j] = D->mx[j][i] = d;
}
}
*ret_D = D;
return eslOK;
ERROR:
if (D != NULL) esl_dmatrix_Destroy(D);
*ret_D = NULL;
return status;
}
/*****************************************************************
* 6. Unit tests
*****************************************************************/
#ifdef eslTREE_TESTDRIVE
static void
utest_OptionalInformation(ESL_RANDOMNESS *r, int ntaxa)
{
char *msg = "optional information fields unit test failed";
ESL_TREE *T;
if (esl_tree_Simulate(r, ntaxa, &T) != eslOK) esl_fatal(msg);
if (esl_tree_SetTaxaParents(T) != eslOK) esl_fatal(msg);
if (esl_tree_SetCladesizes(T) != eslOK) esl_fatal(msg);
if (esl_tree_Validate(T, NULL) != eslOK) esl_fatal(msg);
esl_tree_Destroy(T);
return;
}
static void
utest_WriteNewick(ESL_RANDOMNESS *r, int ntaxa)
{
char *msg = "esl_tree_WriteNewick unit test failed";
char tmpfile[32] = "esltmpXXXXXX";
FILE *fp = NULL;
ESL_TREE *T1 = NULL;
ESL_TREE *T2 = NULL;
char errbuf[eslERRBUFSIZE];
if (esl_tmpfile(tmpfile, &fp) != eslOK) esl_fatal(msg);
if (esl_tree_Simulate(r, ntaxa, &T1) != eslOK) esl_fatal(msg);
if (esl_tree_SetTaxonlabels(T1, NULL) != eslOK) esl_fatal(msg);
if (esl_tree_Validate(T1, NULL) != eslOK) esl_fatal(msg);
if (esl_tree_WriteNewick(fp, T1) != eslOK) esl_fatal(msg);
rewind(fp);
if (esl_tree_ReadNewick(fp, errbuf, &T2) != eslOK) esl_fatal(msg);
if (esl_tree_Validate(T2, NULL) != eslOK) esl_fatal(msg);
if (esl_tree_Compare(T1, T2) != eslOK) esl_fatal(msg);
fclose(fp);
esl_tree_Destroy(T1);
esl_tree_Destroy(T2);
return;
}
static void
utest_UPGMA(ESL_RANDOMNESS *r, int ntaxa)
{
char *msg = "esl_tree_UPGMA unit test failed";
ESL_TREE *T1 = NULL;
ESL_TREE *T2 = NULL;
ESL_DMATRIX *D1 = NULL;
ESL_DMATRIX *D2 = NULL;
if (esl_tree_Simulate(r, ntaxa, &T1) != eslOK) esl_fatal(msg);
if (esl_tree_ToDistanceMatrix(T1, &D1) != eslOK) esl_fatal(msg);
if (esl_tree_UPGMA(D1, &T2) != eslOK) esl_fatal(msg);
if (esl_tree_Validate(T1, NULL) != eslOK) esl_fatal(msg);
if (esl_tree_Validate(T2, NULL) != eslOK) esl_fatal(msg);
if (esl_tree_VerifyUltrametric(T1) != eslOK) esl_fatal(msg);
if (esl_tree_VerifyUltrametric(T2) != eslOK) esl_fatal(msg);
if (esl_tree_Compare(T1, T2) != eslOK) esl_fatal(msg);
if (esl_tree_ToDistanceMatrix(T1, &D2) != eslOK) esl_fatal(msg);
if (esl_dmatrix_Compare(D1, D2, 0.001) != eslOK) esl_fatal(msg);
esl_tree_Destroy(T1);
esl_tree_Destroy(T2);
esl_dmatrix_Destroy(D1);
esl_dmatrix_Destroy(D2);
return;
}
#endif /*eslTREE_TESTDRIVE*/
/*-------------------- end, unit tests -------------------------*/
/*****************************************************************
* 7. Test driver
*****************************************************************/
#ifdef eslTREE_TESTDRIVE
/*
* gcc -g -Wall -o test -L. -I. -DeslTREE_TESTDRIVE esl_tree.c -leasel -lm
* gcc -g -Wall -o test -L. -I. -DeslTEST_THROWING -DeslTREE_TESTDRIVE esl_msa.c -leasel -lm
* ./test
*/
#include "easel.h"
#include "esl_tree.h"
#include "esl_random.h"
int
main(int argc, char **argv)
{
ESL_RANDOMNESS *r = NULL;
int ntaxa;
r = esl_randomness_Create(0);
ntaxa = 20;
utest_OptionalInformation(r, ntaxa); /* SetTaxaparents(), SetCladesizes() */
utest_WriteNewick(r, ntaxa);
utest_UPGMA(r, ntaxa);
esl_randomness_Destroy(r);
return eslOK;
}
#endif /*eslTREE_TESTDRIVE*/
/*-------------------- end, test driver -------------------------*/
/*****************************************************************
* 8. Examples.
*****************************************************************/
/* The first example is an example of inferring a tree by the
* UPGMA algorithm, starting from a multiple sequence alignment.
*/
#ifdef eslTREE_EXAMPLE
/*::cexcerpt::tree_example::begin::*/
/* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
* or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE esl_tree.c -leasel -lm
* To run: ./example <MSA file>
*/
#include "easel.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_distance.h"
#include "esl_tree.h"
int main(int argc, char **argv)
{
ESL_TREE *tree = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
ESL_DMATRIX *D = NULL;
esl_msafile_Open(NULL, argv[1], NULL, eslMSAFILE_UNKNOWN, NULL, &afp);
esl_msafile_Read(afp, &msa);
esl_msafile_Close(afp);
esl_dst_CDiffMx(msa->aseq, msa->nseq, &D);
esl_tree_UPGMA(D, &tree);
esl_tree_Destroy(tree);
esl_msa_Destroy(msa);
esl_dmatrix_Destroy(D);
return eslOK;
}
/*::cexcerpt::tree_example::end::*/
#endif /*eslTREE_EXAMPLE*/
/* The second example is an example of reading in a Newick format tree.
*/
#ifdef eslTREE_EXAMPLE2
/*::cexcerpt::tree_example2::begin::*/
/* To compile: gcc -g -Wall -o example -I. -DeslTREE_EXAMPLE2 esl_tree.c esl_dmatrix.c esl_msa.c easel.c -lm
* or: gcc -g -Wall -o example -I. -L. -DeslTREE_EXAMPLE2 esl_tree.c -leasel -lm
* To run: ./example <Newick file>
*/
#include "easel.h"
#include "esl_msa.h"
#include "esl_distance.h"
#include "esl_tree.h"
int main(int argc, char **argv)
{
ESL_TREE *T;
char errbuf[eslERRBUFSIZE];
FILE *fp;
if ((fp = fopen(argv[1], "r")) == NULL) esl_fatal("Failed to open %s", argv[1]);
if (esl_tree_ReadNewick(fp, errbuf, &T) != eslOK) esl_fatal("Failed to read tree: %s", errbuf);
esl_tree_WriteNewick(stdout, T);
esl_tree_Destroy(T);
fclose(fp);
return eslOK;
}
/*::cexcerpt::tree_example2::end::*/
#endif /*eslTREE_EXAMPLE*/
You can’t perform that action at this time.