Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1896 lines (1652 sloc) 76.3 KB
/* I/O of multiple sequence alignment files in PHYLIP format(s)
*
* Contents:
* 1. API for reading/writing PHYLIP format alignment files
* 2. I/O of the interleaved variant of the format
* 3. I/O of the sequential variant of the format
* 4. Autodetection of format and its variants
* 5. Rectifying valid input/output symbols in names, seqs
* 6. Unit tests
* 7. Test driver
* 8. Example
*
* See: http://evolution.genetics.washington.edu/phylip/doc/sequence.html
*/
#include "esl_config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_mem.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_phylip.h"
#define eslMSAFILE_PHYLIP_LEGALSYMS "-ABCDEFGHIJKLMNOPQRSTUVWZYX*?."
static int phylip_interleaved_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated);
static int phylip_interleaved_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd);
static int phylip_sequential_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated);
static int phylip_sequential_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd);
static int phylip_check_interleaved (ESL_BUFFER *bf, int *ret_nblocks, int *ret_namewidth);
static int phylip_check_sequential_known (ESL_BUFFER *bf, int namewidth);
static int phylip_check_sequential_unknown(ESL_BUFFER *bf, int *ret_namewidth);
static int phylip_parse_header(ESL_BUFFER *bf, int32_t *ret_nseq, int32_t *ret_alen, char **ret_p, esl_pos_t *ret_n);
static int phylip_collate_colcodes(char *p, esl_pos_t n, char *colcodes, int ncols);
static int phylip_deduce_namewidth(char *colcodes0, int ncols0, int alen, int nres2, int *ret_namewidth);
static int phylip_rectify_input_name(char *namebuf, char *p, int n);
static int phylip_rectify_output_seq_digital(char *buf);
static int phylip_rectify_output_seq_text(char *buf);
/*****************************************************************
* 1. API for reading/writing PHYLIP format alignment files
*****************************************************************/
/* Function: esl_msafile_phylip_SetInmap()
* Synopsis: Configure input map for PHYLIP formats.
*
* Purpose: Set the <afp->inmap> for PHYLIP formats.
*
* Phylip documentation states that DNA programs accept
* 'ABCDGHKMNORSTUVWXY?-', that 'a period was previously
* allowed' and that O means a deletion. Protein programs
* accept 'ABCDEFGHIJKLMNOPQRSTUVWXYZ*?-', and while JOU
* are accepted, they are unused.
*
* So: in text mode, we accept any alphabetic character
* plus '-*?.', verbatim. '~_', which Easel would normally
* accept, are illegal. Whitespace and numbers are ignored.
*
* In digital mode, we modify the digital alphabet by
* demapping '~_' and making them illegal; '?' is mapped to
* missing data; whitespace and numbers are ignored;
* and ONLY in <eslDNA> or <eslRNA> alphabets, 'O' is
* mapped to a gap.
*
* The inconsistent mapping of 'O' poses potential
* problems. In text mode (where we don't know the
* alphabet, and thus don't know what to do with O), we
* input the O verbatim. In digital mode, in a DNA or RNA
* alphabet, we map O to a gap; in other digital alphabets,
* we use the default digital alphabet mapping of O.
*
* Xref: http://evolution.genetics.washington.edu/phylip/doc/sequence.html
*/
int
esl_msafile_phylip_SetInmap(ESL_MSAFILE *afp)
{
int sym;
if (afp->abc)
{
for (sym = 1; sym < 128; sym++) afp->inmap[sym] = afp->abc->inmap[sym];
for (sym = '0'; sym < '9'; sym++) afp->inmap[sym] = eslDSQ_IGNORED;
afp->inmap['?'] = esl_abc_XGetMissing(afp->abc);
afp->inmap['~'] = eslDSQ_ILLEGAL;
afp->inmap['_'] = eslDSQ_ILLEGAL;
afp->inmap[' '] = eslDSQ_IGNORED;
afp->inmap['\t'] = eslDSQ_IGNORED;
afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
if (afp->abc->type == eslDNA || afp->abc->type == eslRNA)
afp->inmap['O'] = esl_abc_XGetGap(afp->abc);
}
if (! afp->abc)
{
for (sym = 1; sym < 128; sym++) afp->inmap[sym] = eslDSQ_ILLEGAL;
for (sym = 'a'; sym <= 'z'; sym++) afp->inmap[sym] = sym;
for (sym = 'A'; sym <= 'Z'; sym++) afp->inmap[sym] = sym;
for (sym = '0'; sym <= '9'; sym++) afp->inmap[sym] = eslDSQ_IGNORED;
afp->inmap['-'] = '-';
afp->inmap['*'] = '*';
afp->inmap['?'] = '?';
afp->inmap['.'] = '.';
afp->inmap[' '] = eslDSQ_IGNORED;
afp->inmap['\t'] = eslDSQ_IGNORED;
afp->inmap[0] = '?';
}
return eslOK;
}
/* Function: esl_msafile_phylip_GuessAlphabet()
* Synopsis: Guess the alphabet of an open PHYLIP MSA input.
*
* Purpose: Guess the alphabet of the sequences in open
* PHYLIP format MSA file <afp>.
*
* On normal return, <*ret_type> is set to <eslDNA>,
* <eslRNA>, or <eslAMINO>, and <afp> is reset to its
* original point.
*
* Args: afp - open PHYLIP format MSA file
* *ret_type - RETURN: <eslDNA>, <eslRNA>, <eslAMINO>; or <eslUNKNOWN>.
*
* Returns: <eslOK> on success.
* <eslENOALPHABET> if autodetection fails.
*
* Throws: <eslEMEM> on allocation error.
* <eslESYS> on failures of fread() or other system calls
*/
int
esl_msafile_phylip_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
{
int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */
int alphatype = eslUNKNOWN;
int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
int nsteps = 3; /* ...there's 3 threshold[]'s we check */
int step = 0; /* ...starting on threshold[0] */
int64_t nres = 0;
char *p;
esl_pos_t n, pos;
int x;
esl_pos_t anchor;
int64_t ct[26];
int status;
for (x = 0; x < 26; x++) ct[x] = 0;
anchor = esl_buffer_GetOffset(afp->bf);
if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
/* Find the first nonblank line, which says " <nseq> <alen>" and may also have options. we ignore this header */
while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status == eslEOF) ESL_XFAIL(eslENOALPHABET, afp->errmsg, "can't determine alphabet: no alignment data found");
else if (status != eslOK) goto ERROR;
/* Read line by line, just looking for residues, not worrying about nseq/alen or sequential/interleaved
* Always skip the name field, even in continuation lines/blocks
* This may miss some residues, but it means we work on both sequential and interleaved formats
*/
while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
{
if (esl_memspn(p, n, " \t") == n) continue;
if (n < namewidth) continue;
p += namewidth;
n -= namewidth;
/* count characters into ct[] array */
for (pos = 0; pos < n; pos++)
if (isalpha(p[pos])) {
x = toupper(p[pos]) - 'A';
ct[x]++;
nres++;
}
/* try to stop early, checking after 500, 5000, and 50000 residues: */
if (step < nsteps && nres > threshold[step]) {
if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
step++;
}
}
if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
DONE:
esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
esl_buffer_RaiseAnchor(afp->bf, anchor);
*ret_type = alphatype;
return status;
ERROR:
if (anchor != -1) {
esl_buffer_SetOffset(afp->bf, anchor);
esl_buffer_RaiseAnchor(afp->bf, anchor);
}
*ret_type = eslUNKNOWN;
return status;
}
/* Function: esl_msafile_phylip_Read()
* Synopsis: Read in a PHYLIP format alignment.
*
* Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>, parsing for
* Phylip format, starting from the current point. The
* format may be either the interleaved or sequential
* variant, according to the format set in <afp->format>:
* <eslMSAFILE_PHYLIP> means interleaved, and
* <eslMSAFILE_PHYLIPS> means sequential. Create a new
* multiple alignment, and return a ptr to that alignment
* in <*ret_msa>. Caller is responsible for free'ing this
* <ESL_MSA>.
*
* Args: afp - open <ESL_MSAFILE>
* ret_msa - RETURN: newly parsed <ESL_MSA>
*
* Returns: <eslOK> on success.
*
* <eslEOF> if no (more) alignment data were found in
* <afp>, and <afp> is returned at EOF.
*
* <eslEFORMAT> on a parse error. <*ret_msa> is set to
* <NULL>. <afp> contains information sufficient for
* constructing useful diagnostic output:
* | <afp->errmsg> | user-directed error message |
* | <afp->linenumber> | line # where error was detected |
* | <afp->line> | offending line (not NUL-term) |
* | <afp->n> | length of offending line |
* | <afp->bf->filename> | name of the file |
* and <afp> is poised at the start of the following line,
* so (in principle) the caller could try to resume
* parsing.
*
* Throws: <eslEMEM> - an allocation failed.
* <eslESYS> - a system call such as fread() failed
* <eslEINCONCEIVABLE> - "impossible" corruption
*/
int
esl_msafile_phylip_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
ESL_MSA *msa = NULL;
int32_t alen_stated; /* int32_t because we're using strtoi32() to parse it from the file */
int nseq;
char *p, *tok;
esl_pos_t n, toklen;
int status;
ESL_DASSERT1( (afp->format == eslMSAFILE_PHYLIP || afp->format == eslMSAFILE_PHYLIPS) );
afp->errmsg[0] = '\0';
/* skip leading blank lines (though there shouldn't be any) */
while ( (status = esl_msafile_GetLine(afp, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status != eslOK) goto ERROR; /* includes normal EOF */
/* the first line: <nseq> <alen> */
esl_memtok(&p, &n, " \t", &tok, &toklen);
if (esl_mem_strtoi32(tok, toklen, 0, NULL, &nseq) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be <nseq> <alen>: first field isn't an integer");
if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be <nseq> <alen>: only one field found");
if (esl_mem_strtoi32(tok, toklen, 0, NULL, &alen_stated) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "first PHYLIP line should be <nseq> <alen>: second field isn't an integer");
/* believe <nseq> and allocate accordingly */
/* don't believe <alen_stated>; use it for validation, after we've parsed everything. */
if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; }
if (! afp->abc && (msa = esl_msa_Create( nseq, -1)) == NULL) { status = eslEMEM; goto ERROR; }
/* load next line, skipping any blank ones (though there shouldn't be any) */
do {
status = esl_msafile_GetLine(afp, &p, &n);
if (status == eslEOF) ESL_XFAIL(eslEFORMAT, afp->errmsg, "no alignment data following PHYLIP header");
else if (status != eslOK) goto ERROR;
} while (esl_memspn(p, n, " \t") == n); /* idiom for "blank line" */
/* hand off to interleaved vs. sequential parser for the rest */
if (afp->format == eslMSAFILE_PHYLIP) status = phylip_interleaved_Read(afp, msa, nseq, alen_stated);
else if (afp->format == eslMSAFILE_PHYLIPS) status = phylip_sequential_Read (afp, msa, nseq, alen_stated);
if (status != eslOK) goto ERROR;
if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
*ret_msa = msa;
return eslOK;
ERROR:
if (msa) esl_msa_Destroy(msa);
*ret_msa = NULL;
return status;
}
/* Function: esl_msafile_phylip_Write()
* Synopsis: Write an MSA to a stream in PHYLIP format.
*
* Purpose: Write alignment <msa> in PHYLIP format to stream <fp>.
* If <format> is <eslMSAFILE_PHYLIP>, write interleaved format;
* if <format> is <eslMSAFILE_PHYLIPS>, write sequential format.
*
* Optionally, caller may pass additional formatting
* information by passing a ptr to a valid <opt_fmtd>
* structure. <opt_fmtd->namewidth> sets the width of the
* name field. For strict PHYLIP format, this must be 10.
* <opt_fmtd->rpl> sets the number of residues per line.
* The default is 60. For either value, if it is unset or
* if <opt_fmtd> is <NULL>, the default is used.
*
* Args: fp - open output stream
* msa - alignment to write
* format - <eslMSAFILE_PHYLIP> for interleaved; <eslMSAFILE_PHYLIPS> for sequential.
* fmtd - optional: <NULL>, or additional format information
*
* Returns: <eslOK> on success.
*
* Throws: <eslEINVAL> if <format> isn't <eslMSAFILE_PHYLIP> or <eslMSAFILE_PHYLIPS>.
* <eslEMEM> on allocation failure.
* <eslEWRITE> on any system write error, such as a filled disk.
*/
int
esl_msafile_phylip_Write(FILE *fp, const ESL_MSA *msa, int format, ESL_MSAFILE_FMTDATA *opt_fmtd)
{
if (format == eslMSAFILE_PHYLIP) return phylip_interleaved_Write(fp, msa, opt_fmtd);
else if (format == eslMSAFILE_PHYLIPS) return phylip_sequential_Write (fp, msa, opt_fmtd);
else ESL_EXCEPTION(eslEINVAL, "format %s is not a PHYLIP format", esl_msafile_DecodeFormat(format));
}
/* Function: esl_msafile_phylip_CheckFileFormat()
* Synopsis: Check whether an input seems to be in PHYLIP format.
*
* Purpose: Check whether input buffer <bf> appears to be
* in a PHYLIP format, starting from the current point.
* Return <eslOK> if it is, and <eslFAIL> if it isn't.
*
* There are two main variants of the format, interleaved
* and sequential. Upon successful return, <*ret_format> is
* set to <eslMSAFILE_PHYLIP> for interleaved, or
* <eslMSAFILE_PHYLIPS> for sequential.
*
* Strict PHYLIP format has a name/identifier field width
* of exactly 10 characters, but variants of the format are
* in common use with different name widths. The
* name width is determined and returned in <*ret_namewidth>.
*
* If the input doesn't appear to be in PHYLIP format,
* return <eslFAIL>, with <*ret_format> as
* <eslMSAFILE_UNKNOWN> and <*ret_namewidth> as 0.
*
* The PHYLIP format definition is ambiguous; it is
* possible to construct pathological inputs that could be
* validly parsed to yield different data when read as
* interleaved versus sequential. (This requires names that
* use a character set that looks like sequence, among
* other unusual edge conditions.) If the format variant
* cannot be unambiguously determined, we do not attempt to
* make a guess that might result in corrupted input;
* rather, return <eslEAMBIGUOUS>.
*
* Args: bf - input buffer
* *ret_format - RETURN: format variant, <eslMSAFILE_PHYLIP>, <_PHYLIPS>, <_UNKNOWN>
* *ret_namewidth - RETURN: width of name field, in characters
*
* Returns: <eslOK> if input is in PHYLIP format; <*ret_format> is set to
* <eslMSAFILE_PHYLIP> or <eslMSAFILE_PHYLIPS>; <*ret_namewidth>
* is the name width, either the usual 10, or something nonstandard.
*
* <eslEAMBIGUOUS> if the input appears to be in a PHYLIP format but
* we can't tell which one. <*ret_format> is <eslMSAFILE_UNKNOWN>,
* <*ret_namewidth> is 0.
*
* <eslFAIL> if the input is not in either PHYLIP format;
* <*ret_format> is <eslMSAFILE_UNKNOWN>, <*ret_namewidth> is 0.
*
* Throws: (no abnormal error conditions)
*/
int
esl_msafile_phylip_CheckFileFormat(ESL_BUFFER *bf, int *ret_format, int *ret_namewidth)
{
int maybe_interleaved = FALSE; // Must worry about pathological files consistent with both formats.
int maybe_sequential = FALSE;
int nblocks; // number of interleaved blocks. if only 1, interleaved == sequential
int w1, w2; // name widths if interleaved, sequential
if ( phylip_check_interleaved(bf, &nblocks, &w1) == eslOK) { maybe_interleaved = TRUE; }
if (maybe_interleaved && nblocks == 1) { *ret_namewidth = w1; *ret_format = eslMSAFILE_PHYLIP; return eslOK; }
if ( phylip_check_sequential_known (bf, 10) == eslOK) { maybe_sequential = TRUE; w2 = 10; }
else if ( phylip_check_sequential_unknown(bf, &w2) == eslOK) { maybe_sequential = TRUE; }
if (maybe_interleaved && maybe_sequential) { *ret_namewidth = 0; *ret_format = eslMSAFILE_UNKNOWN; return eslEAMBIGUOUS; }
else if (maybe_interleaved) { *ret_namewidth = w1; *ret_format = eslMSAFILE_PHYLIP; return eslOK; }
else if (maybe_sequential) { *ret_namewidth = w2; *ret_format = eslMSAFILE_PHYLIPS; return eslOK; }
else { *ret_namewidth = 0; *ret_format = eslMSAFILE_UNKNOWN; return eslFAIL; }
}
/*-------------- end, API for PHYLIP format i/o -----------------*/
/*****************************************************************
* 2. i/o of the interleaved variant of the format
*****************************************************************/
/* Read the interleaved variant.
* header told us to expect <nseq>, <alen_stated>.
* <msa> is already allocated <nseq>, <-1>.
* In <afp>, we've loaded the first line of the alignment data.
*/
static int
phylip_interleaved_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated)
{
int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */
char *namebuf = NULL;
int nblocks = 0;
int64_t alen = 0; /* alignment length observed so far */
char *p = afp->line;
esl_pos_t n = afp->n;
int64_t block_alen; /* # of residues being added by current block */
int64_t cur_alen;
int idx;
int status;
ESL_ALLOC(namebuf, sizeof(char) * (namewidth+1));
/* p, n is now the first line of a block */
/* read the alignment data */
do {
idx = 0;
do { /* First block? store the sequence names */
if (nblocks == 0)
{
if (n < namewidth) ESL_XFAIL(eslEFORMAT, afp->errmsg, "PHYLIP line too short to find sequence name");
if (phylip_rectify_input_name(namebuf, p, namewidth) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid character(s) in sequence name");
if ( (status = esl_msa_SetSeqName(msa, idx, namebuf, -1)) != eslOK) goto ERROR;
p += namewidth;
n -= namewidth;
}
/* Append the sequence. */
cur_alen = alen;
if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &(cur_alen), p, n); }
if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), p, n); }
if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
else if (status != eslOK) goto ERROR;
/* validate # of residues added */
if (idx == 0) block_alen = cur_alen - alen;
else if (cur_alen - alen != block_alen) ESL_XFAIL(eslEFORMAT, afp->errmsg, "number of residues on line differs from previous seqs in alignment block");
/* get next line. */
idx++;
status = esl_msafile_GetLine(afp, &p, &n);
} while (status == eslOK && idx < nseq && esl_memspn(p, n, " \t") < n); /* stop block on: read error, EOF; idx == nseq; or blank line */
if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of sequences in block (saw %d, expected %d)", idx, nseq);
nblocks += 1;
alen += block_alen;
/* tolerate blank lines only at the end of a block */
while (status == eslOK && esl_memspn(p, n, " \t") == n)
status = esl_msafile_GetLine(afp, &p, &n); /* [eslEMEM,eslESYS] */
/* now status can be: read error, EOF, or OK. */
} while (status == eslOK && alen < alen_stated);
/* End of all blocks. We swallowed blank lines following last block,
* so we should be EOF; unless we're in a seqboot file, in which
* case we just read the first line of the next MSA.
*/
if (status == eslOK) esl_msafile_PutLine(afp); /* put <nseq> <alen> line back in the stream, so we will read it properly w/ next msa read */
else if (status != eslEOF) goto ERROR;
else if (alen != alen_stated) ESL_XFAIL(eslEFORMAT, afp->errmsg, "alignment length disagrees with header: header said %d, parsed %" PRId64, alen_stated, alen);
msa->nseq = nseq;
msa->alen = alen;
free(namebuf);
return eslOK;
ERROR:
msa->nseq = nseq; /* we're allocated and initialized for <nseq>: this makes sure we free everything we need to in <msa> */
if (namebuf) free(namebuf);
return status;
}
/* Write an interleaved PHYLIP file.
* Returns <eslOK> on success.
* Throws <eslEWRITE> on any system write error.
*/
static int
phylip_interleaved_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd)
{
int rpl = ( (opt_fmtd && opt_fmtd->rpl) ? opt_fmtd->rpl : 60);
int namewidth = ( (opt_fmtd && opt_fmtd->namewidth) ? opt_fmtd->namewidth : 10);
char *buf = NULL;
int idx;
int64_t apos;
int status;
ESL_ALLOC(buf, sizeof(char) * (rpl+1));
buf[rpl] = '\0';
if (fprintf(fp, " %d %" PRId64, msa->nseq, msa->alen) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed");
for (apos = 0; apos < msa->alen; apos += rpl)
{
if (fprintf(fp, "\n") < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed");
for (idx = 0; idx < msa->nseq; idx++)
{
if (msa->abc)
{
esl_abc_TextizeN(msa->abc, msa->ax[idx]+apos+1, rpl, buf);
phylip_rectify_output_seq_digital(buf);
}
if (! msa->abc)
{
strncpy(buf, msa->aseq[idx]+apos, rpl);
phylip_rectify_output_seq_text(buf);
}
if (apos == 0) { if (fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[idx], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); }
else { if (fprintf(fp, "%s\n", buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "interleaved phylip write failed"); }
}
}
free(buf);
return eslOK;
ERROR:
if (buf) free(buf);
return status;
}
/*----------------- end, interleaved variant -------------------*/
/*****************************************************************
* 3. i/o of sequential variant of the format
*****************************************************************/
/* Read the sequential variant.
* header told us to expect <nseq>, <alen_stated>.
* <msa> is already allocated <nseq>, <-1>.
* In <afp>, we've loaded the first line of the alignment data.
*/
static int
phylip_sequential_Read(ESL_MSAFILE *afp, ESL_MSA *msa, int nseq, int32_t alen_stated)
{
int namewidth = (afp->fmtd.namewidth ? afp->fmtd.namewidth : 10); /* default: strict PHYLIP, namewidth 10 */
char *namebuf = NULL;
char *p = afp->line;
esl_pos_t n = afp->n;
int idx;
int64_t alen = 0;
int status = eslOK;
ESL_ALLOC(namebuf, sizeof(char) * (namewidth+1));
for (idx = 0; idx < nseq; idx++)
{
alen = 0;
status = eslOK;
while (status == eslOK && alen < alen_stated)
{
if (alen == 0) /* First line? Store the sequence name */
{
if (n < namewidth) ESL_XFAIL(eslEFORMAT, afp->errmsg, "PHYLIP line too short to find sequence name");
if (phylip_rectify_input_name(namebuf, p, namewidth) != eslOK) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid character(s) in sequence name");
if ( (status = esl_msa_SetSeqName(msa, idx, namebuf, -1)) != eslOK) goto ERROR;
p += namewidth;
n -= namewidth;
}
if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &alen, p, n); }
if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &alen, p, n); }
if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
else if (status != eslOK) goto ERROR;
/* get next line */
status = esl_msafile_GetLine(afp, &p, &n);
} /* end looping over a seq */
/* tolerate blank lines after sequences. */
while (status == eslOK && esl_memspn(p, n, " \t") == n)
status = esl_msafile_GetLine(afp, &p, &n);
if (status == eslEOF) { if (idx < nseq-1) ESL_XFAIL(eslEFORMAT, afp->errmsg, "premature end of file: header said to expect %d sequences", nseq); }
else if (status != eslOK) goto ERROR;
else if (alen != alen_stated) ESL_XFAIL(eslEFORMAT, afp->errmsg, "aligned length of sequence disagrees with header: header says %d, parsed %" PRId64, alen_stated, alen);
} /* end looping over all seqs */
/* we should be EOF; we've swallowed all trailing blank lines.
* Exception: if we're in a seqboot file, we just read the <nseq> <alen> line of the next msa record; push it back on stream
*/
if (status == eslOK) esl_msafile_PutLine(afp);
msa->nseq = nseq;
msa->alen = alen;
free(namebuf);
return eslOK;
ERROR:
msa->nseq = nseq; /* we're allocated and initialized for <nseq>: this makes sure we free everything we need to in <msa> */
if (namebuf) free(namebuf);
return status;
}
static int
phylip_sequential_Write(FILE *fp, const ESL_MSA *msa, ESL_MSAFILE_FMTDATA *opt_fmtd)
{
int rpl = ( (opt_fmtd && opt_fmtd->rpl) ? opt_fmtd->rpl : 60);
int namewidth = ( (opt_fmtd && opt_fmtd->namewidth) ? opt_fmtd->namewidth : 10);
char *buf = NULL;
int idx;
int64_t apos;
int status;
ESL_ALLOC(buf, sizeof(char) * (rpl+1));
buf[rpl] = '\0';
fprintf(fp, " %d %" PRId64 "\n", msa->nseq, msa->alen);
for (idx = 0; idx < msa->nseq; idx++)
{
for (apos = 0; apos < msa->alen; apos += rpl)
{
if (msa->abc)
{
esl_abc_TextizeN(msa->abc, msa->ax[idx]+apos+1, rpl, buf);
phylip_rectify_output_seq_digital(buf);
}
if (! msa->abc)
{
strncpy(buf, msa->aseq[idx]+apos, rpl);
phylip_rectify_output_seq_text(buf);
}
if (apos == 0) fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[idx], buf);
else fprintf(fp, "%s\n", buf);
}
}
free(buf);
return eslOK;
ERROR:
if (buf) free(buf);
return status;
}
/*------------------ end, sequential variant --------------------*/
/*****************************************************************
* 4. Autodetection of the format and its variants
*****************************************************************/
/* return <eslOK> if input is consistent with interleaved format;
* set <*ret_namewidth> to the apparent name width;
* set <*ret_nblocks> to the number of interleaved blocks seen.
*
* If <nblocks=1>, it doesn't matter whether we read the file as
* interleaved or sequential; it will also appear to be consistent
* with sequential format, but we will parse it as interleaved.
*
* <namewidth> can be uniquely determined because we know <nseq> and
* <alen> (from the Phylip header), so we know how many characters
* should be accounted for by the alignment; the rest have to be
* the names.
*
* If the input is not consistent with interleaved Phylip format,
* return <eslFAIL>, and <*ret_namewidth> and <*ret_nblocks> are 0.
*
* upon return, restore the <bf> to its original position.
*/
static int
phylip_check_interleaved(ESL_BUFFER *bf, int *ret_nblocks, int *ret_namewidth)
{
esl_pos_t anchor = -1;
char *p;
esl_pos_t n;
int32_t nseq, alen;
char *colcodes = NULL;
char *colcodes0 = NULL;
int ncols, ncols0;
int c, idx;
int nblocks = 0;
int nres2 = 0;
int nres1;
int status;
anchor = esl_buffer_GetOffset(bf);
if (esl_buffer_SetAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; }
if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) goto ERROR;
/* read the whole file, one block at a time */
while (status == eslOK)
{
ncols = n;
ESL_REALLOC(colcodes, sizeof(char) * ncols);
for (c = 0; c < ncols; c++) colcodes[c] = '?';
/* read a block: for each line, update the colcodes[] array. */
for (idx = 0; idx < nseq; idx++)
{
if (status == eslEOF) goto ERROR;
if ((status = phylip_collate_colcodes(p, n, colcodes, ncols)) != eslOK) goto ERROR;
status = esl_buffer_GetLine(bf, &p, &n);
if (status != eslOK && status != eslEOF) goto ERROR; /* EOF is ok on last seq of single-block data */
}
/* finished a block. status may be EOF or OK.
* save the first block's colcodes[] to defer its analysis
* for all other blocks: count x columns, reject n columns, ignore . and o
*/
if (nblocks == 0)
{
colcodes0 = colcodes; ncols0 = ncols;
colcodes = NULL;
/* let's speculate that it's strictly conforming PHYLIP with a namewidth of 10
* in that case, we'll be able to stop parsing blocks when we reach the full
* alignment length
*/
for (nres1 = 0, c = 10; c < ncols0; c++)
if (colcodes0[c] != '.') nres1++; /* for this test, consider even invalid symbols to be "residues". */
}
else
{
for (c = 0; c < ncols; c++)
{
if (colcodes[c] == 'x') nres2++;
if (colcodes[c] == 'n') { status = eslFAIL; goto ERROR; } /* subsequent blocks can't contain name-like columns */
}
}
nblocks++;
/* if it's strictly conforming w/ namewidth=10, we know when we're done,
* and in that case we can tolerate trailing data (tree, whatever) in the
* input. status will be eslOK if we break this way.
*/
if (nres1+nres2 == alen) break;
/* skip blank lines until we load start of next block in <p>, <n> */
while (status == eslOK && esl_memspn(p, n, "\t") == n)
status = esl_buffer_GetLine(bf, &p, &n);
}
if (nres1+nres2 == alen) *ret_namewidth = 10;
else if ((status = phylip_deduce_namewidth(colcodes0, ncols0, alen, nres2, ret_namewidth)) != eslOK) goto ERROR;
free(colcodes);
free(colcodes0);
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
*ret_nblocks = nblocks;
return eslOK;
ERROR:
if (anchor != -1) {
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
}
if (colcodes) free(colcodes);
if (colcodes0) free(colcodes0);
*ret_namewidth = 0;
*ret_nblocks = 0;
return status;
}
/* check for a sequential format file, given a known namewidth (10, for strict PHYLIP) */
static int
phylip_check_sequential_known(ESL_BUFFER *bf, int namewidth)
{
esl_pos_t anchor = -1;
char *p;
esl_pos_t n;
int32_t nseq, alen;
int idx, nres, line, c;
int status;
anchor = esl_buffer_GetOffset(bf);
if (esl_buffer_SetAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; }
if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) goto ERROR;
for (idx = 0; idx < nseq; idx++)
{
nres = 0;
line = 0;
while (nres < alen)
{
if (status == eslEOF) goto ERROR;
c = (line == 0 ? namewidth : 0);
for ( ; c < n; c++)
if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL)
nres++;
status = esl_buffer_GetLine(bf, &p, &n);
if (status != eslOK && status != eslEOF) goto ERROR;
}
if (nres != alen) { status = eslFAIL; goto ERROR; }
/* tolerate blank spaces after individual sequences */
while (status == eslOK && esl_memspn(p, n, "\t") == n)
status = esl_buffer_GetLine(bf, &p, &n);
if (status != eslOK && status != eslEOF) goto ERROR;
}
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
return eslOK;
ERROR:
if (anchor != -1) {
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
}
return status;
}
/* phylip_check_sequential_unknown()
* Check for sequential format when namewidth may deviate from standard 10.
*
* Returns: <eslOK> if file is consistent with sequential Phylip format,
* and <*ret_namewidth> is the name width.
*
* <eslFAIL> if file is inconsistent with sequential format.
*
* Throws: <eslEMEM> on allocation failure
*/
static int
phylip_check_sequential_unknown(ESL_BUFFER *bf, int *ret_namewidth)
{
esl_pos_t anchor = -1;
int nlines = 0;
int nblocks = 0;
int32_t nseq, alen;
int *rth = NULL; // rth[i=0..L1-1] = # of legal chars, inclusive, in i..L1-1, or 0
int r; // total # of legal seq chars on line 1 of a seq
int L1; // total length of line 1, including all chars & whitespace
int b; // # of legal seq chars on remainder of lines of a seq
int a; // # of seq chars we need on line 1: alen - b
char *p; // pointer to current line in input buffer
esl_pos_t n; // length of current line in input buffer
int i, j, k;
int w; // name width we determine
int len2; // alen of subsequent seqs in file
int status;
/* Set an anchor where we are in the input, so we can rewind */
anchor = esl_buffer_GetOffset(bf);
if ( esl_buffer_SetStableAnchor(bf, anchor) != eslOK) { status = eslFAIL; goto ERROR; }
/* Pass 1: number of data lines in file / nseq = integral # of lines per sequence */
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK)
if (esl_memspn(p, n, " \t") != n) nlines++;
if (status != eslEOF) { status = eslFAIL; goto ERROR; }
nlines--; /* not counting the <nseq> <alen> header */
esl_buffer_SetOffset(bf, anchor); /* rewind */
/* pass 2: first, parse the <nseq> <alen> line */
if ((status = phylip_parse_header(bf, &nseq, &alen, &p, &n)) != eslOK) { status = eslFAIL; goto ERROR; }
if (nlines % nseq != 0) { status = eslFAIL; goto ERROR; }
nblocks = nlines / nseq;
/* ... evaluate the first line, already at point in p,n */
ESL_ALLOC(rth, sizeof(int) * n);
for (i = n-1, r = 0; i >= 0; i--)
rth[i] = ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL ? ++r : 0 );
L1 = n;
/* rth[i] = 0 if line1[i] is not a legal seq char
* else # of legal chars in line1[i..L1-1]
* r = total # of legal chars on line 1
* L1 = total length of line 1, all chars (including whitespace)
* Once we know how many seq residues we need to get on the first line,
* we can use rth[] to find the position of the first one.
*/
/* Now count b: # of legal chars on subsequent lines for 1st seq */
for (k = 1, b = 0; k < nblocks; k++)
{
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status != eslOK) { status = eslFAIL; goto ERROR; }
for (i = 0; i < n; i++)
if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) b++;
}
/* We need to find (alen - b) = a residues on the first line. */
a = alen - b;
if (a > r) { status = eslFAIL; goto ERROR; }
for (w = 0; w < L1; w++) // find the leftmost seq residue
if (rth[w] == a) break;
if (w == L1) { status = eslFAIL; goto ERROR; }
for (i = 0; i < w; i++) // make sure there's a name of some sort
if (! isspace(p[i])) break;
if ( i == w) { status = eslFAIL; goto ERROR; }
/* Now we "know" the name width is w. */
/* Check that the rest of the sequences (up to 100 of them) are consistent w/ that. */
for (j = 1; j < nseq && j < 100; j++)
{
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status != eslOK) { status = eslFAIL; goto ERROR; }
if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[w]) == NULL) { status = eslFAIL; goto ERROR; }
for (i = 0; i < w; i++)
if (! isspace(p[i])) break;
if ( i == w) { status = eslFAIL; goto ERROR; }
for (i = w, len2 = 0; i < n; i++)
if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) len2++;
for (k = 1; k < nblocks; k++)
{
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status != eslOK) { status = eslFAIL; goto ERROR; }
for (i = 0; i < n; i++)
if ( strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[i]) != NULL) len2++;
}
if (len2 != alen) { status = eslFAIL; goto ERROR; }
}
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
free(rth);
*ret_namewidth = w;
return eslOK;
ERROR:
if (anchor != -1) {
esl_buffer_SetOffset(bf, anchor);
esl_buffer_RaiseAnchor(bf, anchor);
}
free(rth);
*ret_namewidth = 0;
return status;
}
/* parse the header from a buffer (note, NOT an open MSAFILE).
* used by the format checkers (before an MSAFILE is open)
* return <eslOK> on success, <eslFAIL> on parse failure.
*
* Upon return <p>, <n> contains the first alignment data line,
* and the <bf>'s point is on the line following that.
*
*/
static int
phylip_parse_header(ESL_BUFFER *bf, int32_t *ret_nseq, int32_t *ret_alen, char **ret_p, esl_pos_t *ret_n)
{
char *p, *tok;
esl_pos_t n, toklen;
int32_t nseq, alen;
int status;
/* Find the first nonblank line, which says " <nseq> <alen>" and may also have options (which we'll ignore) */
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status == eslEOF) status = eslFAIL;
if (status != eslOK) goto ERROR;
esl_memtok(&p, &n, " \t", &tok, &toklen);
if (esl_mem_strtoi32(tok, toklen, 0, NULL, &nseq) != eslOK) { status = eslFAIL; goto ERROR; }
if (esl_memtok(&p, &n, " \t", &tok, &toklen) != eslOK) { status = eslFAIL; goto ERROR; }
if (esl_mem_strtoi32(tok, toklen, 0, NULL, &alen) != eslOK) { status = eslFAIL; goto ERROR; }
/* skip any blank lines, load p,n as first line of putative block */
while ( (status = esl_buffer_GetLine(bf, &p, &n)) == eslOK && esl_memspn(p, n, " \t") == n) ;
if (status == eslEOF) status = eslFAIL;
if (status != eslOK) goto ERROR;
*ret_nseq = nseq;
*ret_alen = alen;
*ret_p = p;
*ret_n = n;
return eslOK;
ERROR:
*ret_nseq = 0;
*ret_alen = 0;
*ret_p = NULL;
*ret_n = 0;
return status;
}
/*
* '?' : unset
* 'x' : column consists only of valid data symbols
* '.' : column consists only of spaces
* 'o' : column consists only of spaces or other (invalid) symbols such as numbers
* 'n' : column mixes x and (. or o): must be a name column
*
* A name can contain spaces, valid symbols, other symbols ('x', 'n', '.', 'o')
* Any whitespace between name, data must contain only '.'
* Spacer columns elsewhere can contain spaces or other symbols (such as numbers): 'o'
* Alignment columns must contain only valid data symbols: 'x'.
*/
static int
phylip_collate_colcodes(char *p, esl_pos_t n, char *colcodes, int ncols)
{
int c;
for (c = 0; c < ncols && c < n; c++)
{
if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL)
{
switch (colcodes[c]) {
case '?': colcodes[c] = 'x'; break;
case '.': colcodes[c] = 'n'; break;
}
}
else if (p[c] == ' ')
{
switch (colcodes[c]) {
case '?': colcodes[c] = '.'; break;
case 'x': colcodes[c] = 'n'; break;
}
}
else if (isgraph(p[c]))
{
switch (colcodes[c]) {
case '?': colcodes[c] = 'o'; break;
case 'x': colcodes[c] = 'n'; break;
case '.': colcodes[c] = 'o'; break;
}
}
else return eslFAIL;
}
for ( ; c < n; c++)
if (strchr(eslMSAFILE_PHYLIP_LEGALSYMS, p[c]) != NULL) return eslFAIL;
return eslOK;
}
static int
phylip_deduce_namewidth(char *colcodes0, int ncols0, int alen, int nres2, int *ret_namewidth)
{
int nres1;
int c;
int nwA, nwB, namewidth;
/* nres1 = alen - nres2 : # of columns that must be provided in first block */
nres1 = alen - nres2;
if (nres1 <= 0) return eslFAIL;
/* search leftward in first block until we've accounted for enough cols */
/* c becomes the position of the rightmost col before ali data, and hence = maximal namewidth-1 */
for (c = ncols0-1; nres1 && c >= 0; c--)
if (colcodes0[c] == 'x') nres1--;
if (nres1 > 0) return eslFAIL;
nwB = c+1;
/* search leftward past whitespace columns that might be trailing the name */
/* c becomes the position of the rightmost non-whitespace column, and hence = minimal namewidth-1 */
for ( ; c >= 0; c--)
if (colcodes0[c] != '.') break;
nwA = c+1;
/* if nwA <= 10 <= nwB, the namewidth is consistent with strict PHYLIP namewidth of 10 */
if (nwA <= 10 && nwB >= 10) namewidth = 10;
else namewidth = nwA;
*ret_namewidth = namewidth;
return eslOK;
}
/*-------------- end, format autodetection ----------------------*/
/*****************************************************************
* 5. Rectifying valid input/output symbols in names and seqs
*****************************************************************/
/* We allow any isprint() char in a name, except tabs.
* Phylip allows spaces in names, but Easel doesn't.
* Internal spaces are converted to underscore _
* Leading and trailing spaces are ignored
* namebuf must be allocated at least n+1 chars
* returns eslEINVAL if it sees an invalid character
*/
static int
phylip_rectify_input_name(char *namebuf, char *p, int n)
{
int pos, endpos;
int npos = 0;
for (endpos = n-1; endpos > 0; endpos--) if (p[endpos] != ' ') break;
for (pos = 0; pos <= endpos; pos++) if (p[pos] != ' ') break;
for ( ; pos <= endpos; pos++)
{
if (! isgraph(p[pos]) && p[pos] != ' ') return eslEINVAL;
namebuf[npos++] = (p[pos] == ' ' ? '_' : p[pos]);
}
namebuf[npos] = '\0';
return eslOK;
}
/* Easel's internal alphabet (and output syms) are compatible with PHYLIP
* (upper case, '-' for gaps, '*' for stop codon; except that PHYLIP uses
* '?' for missing data
*/
static int
phylip_rectify_output_seq_digital(char *buf)
{
int i;
for (i = 0; buf[i]; i++)
{
if (buf[i] == '~') buf[i] = '?';
}
return eslOK;
}
static int
phylip_rectify_output_seq_text(char *buf)
{
int i;
for (i = 0; buf[i]; i++)
{
if (islower(buf[i])) buf[i] = toupper(buf[i]);
if (strchr("._ ", buf[i]) != NULL) buf[i] = '-';
if (buf[i] == '~') buf[i] = '?';
}
return eslOK;
}
/*---------- end, rectification of name, seq symbols ------------*/
/*****************************************************************
* 6. Unit tests
*****************************************************************/
#ifdef eslMSAFILE_PHYLIP_TESTDRIVE
static void
utest_write_good1(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 5 42\n", ofp);
fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp);
fputs("H. SapiensACCGGTTGGC CGTTCAGGGT\n", ofp);
fputs("Chimp AAACCCTTGC CGTTACGCTT\n", ofp);
fputs("Gorilla AAACCCTTGC CGGTACGCTT\n", ofp);
fputs("\n", ofp);
fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp);
fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp);
fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp);
fputs("AAACCGAGGC CGGGACACTC AT\n", ofp);
fputs("AAACCATTGC CGGTACGCTT AA\n", ofp);
*ret_format = eslMSAFILE_PHYLIP;
*ret_alphatype = eslDNA;
*ret_nseq = 5;
*ret_alen = 42;
}
static void
utest_write_good2(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs("7 50 \n", ofp);
fputs("thermotogaATGGCGAAGGAAAAATTTGTGAGAACAAAACCGCATGTTAACGTTGGAAC\n", ofp);
fputs("TthermophiATGGCGAAGGGCGAGTTTGTTCGGACGAAGCCTCACGTGAACGTGGGGAC \n", ofp);
fputs("TaquaticusATGGCGAAGGGCGAGTTTATCCGGACGAAGCCCCACGTGAACGTGGGGAC \n", ofp);
fputs("deinonema-ATGGCTAAGGGAACGTTTGAACGCACCAAACCCCACGTGAACGTGGGCAC \n", ofp);
fputs("ChlamydiaBATGTCAAAAGAAACTTTTCAACGTAATAAGCCTCATATCAACATAGGGGC \n", ofp);
fputs("flexistipsATGTCCAAGCAAAAGTACGAAAGGAAGAAACCTCACGTAAACGTAGGCAC \n", ofp);
fputs("borrelia-bATGGCAAAAGAAGTTTTTCAAAGAACAAAGCCGCACATGAATGTTGGAAC \n", ofp);
*ret_format = eslMSAFILE_PHYLIP;
*ret_alphatype = eslDNA;
*ret_nseq = 7;
*ret_alen = 50;
}
static void
utest_write_good3(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 3 384\n", ofp);
fputs("CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- --------SQ \n", ofp);
fputs("ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG ALGRTRHALR \n", ofp);
fputs("CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- --------FH \n", ofp);
fputs("\n", ofp);
fputs(" FLEFQDKFNK KY-SHEEYLE RFEIFKSNLG KIEELNLIAI NHKADTKFGV NKFADLSSDE \n", ofp);
fputs(" FARFAVRYGK SYESAAEVRR RFRIFSESLE EVRSTN---- RKGLPYRLGI NRFSDMSWEE \n", ofp);
fputs(" FKSWMSKHRK TY-STEEYHH RLQTFASNWR KINAHN---- NGNHTFKMAL NQFSDMSFAE \n", ofp);
fputs("\n", ofp);
fputs(" FKNYYLNNKE AIFTDDLPVA DYLDDEFINS IPTAFDWRTR G-AVTPVKNQ GQCGSCWSFS \n", ofp);
fputs(" FQATRL-GAA QTCSATLAGN HLMRDA--AA LPETKDWRED G-IVSPVKNQ AHCGSCWTFS \n", ofp);
fputs(" IKHKYLWSEP QNCSAT--KS NYLRGT--GP YPPSVDWRKK GNFVSPVKNQ GACGSCWTFS \n", ofp);
fputs("\n", ofp);
fputs(" TTGNVEGQHF ISQNKLVSLS EQNLVDCDHE CMEYEGEEAC DEGCNGGLQP NAYNYIIKNG \n", ofp);
fputs(" TTGALEAAYT QATGKNISLS EQQLVDCAGG FNNF------ --GCNGGLPS QAFEYIKYNG \n", ofp);
fputs(" TTGALESAIA IATGKMLSLA EQQLVDCAQD FNNY------ --GCQGGLPS QAFEYILYNK \n", ofp);
fputs("\n", ofp);
fputs(" GIQTESSYPY TAETGTQCNF NSANIGAKIS NFTMIP-KNE TVMAGYIVST GPLAIAADAV \n", ofp);
fputs(" GIDTEESYPY KGVNGV-CHY KAENAAVQVL DSVNITLNAE DELKNAVGLV RPVSVAFQVI \n", ofp);
fputs(" GIMGEDTYPY QGKDGY-CKF QPGKAIGFVK DVANITIYDE EAMVEAVALY NPVSFAFEVT \n", ofp);
fputs("\n", ofp);
fputs(" E-WQFYIGGV F-DIPCN--P NSLDHGILIV GYSAKNTIFR KNMPYWIVKN SWGADWGEQG \n", ofp);
fputs(" DGFRQYKSGV YTSDHCGTTP DDVNHAVLAV GYGVENGV-- ---PYWLIKN SWGADWGDNG \n", ofp);
fputs(" QDFMMYRTGI YSSTSCHKTP DKVNHAVLAV GYGEKNGI-- ---PYWIVKN SWGPQWGMNG \n", ofp);
fputs("\n", ofp);
fputs(" YIYLRRGKNT CGVSNFVSTS II-- \n", ofp);
fputs(" YFKMEMGKNM CAIATCASYP VVAA \n", ofp);
fputs(" YFLIERGKNM CGLAACASYP IPLV\n", ofp);
*ret_format = eslMSAFILE_PHYLIP;
*ret_alphatype = eslAMINO;
*ret_nseq = 3;
*ret_alen = 384;
}
static void
utest_write_good4(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 5 42\n", ofp);
fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp);
fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp);
fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp);
fputs("H. SapiensACCGGTTGGC CGTTCAGGGT\n", ofp);
fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp);
fputs("Chimp AAACCCTTGC CGTTACGCTT\n", ofp);
fputs("AAACCGAGGC CGGGACACTC AT\n", ofp);
fputs("Gorilla AAACCCTTGC CGGTACGCTT\n", ofp);
fputs("AAACCATTGC CGGTACGCTT AA\n", ofp);
*ret_format = eslMSAFILE_PHYLIPS;
*ret_alphatype = eslDNA;
*ret_nseq = 5;
*ret_alen = 42;
}
static void
utest_write_good5(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 3 384\n", ofp);
fputs("CYS1_DICDI-----MKVILLFVLAVFTVFVSS---------------RGIPPEEQ------------SQFLEFQDKFNKKY-SHEEY\n", ofp);
fputs("LERFEIFKSNLGKIEELNLIAINHKADTKFGVNKFADLSSDEFKNYYLNNKEAIFTDDLPVADYLDDEFINSIPTAFDWRTRG-AVTP\n", ofp);
fputs("VKNQGQCGSCWSFSTTGNVEGQHFISQNKLVSLSEQNLVDCDHECMEYEGEEACDEGCNGGLQPNAYNYIIKNGGIQTESSYPYTAET\n", ofp);
fputs("GTQCNFNSANIGAKISNFTMIP-KNETVMAGYIVSTGPLAIAADAVE-WQFYIGGVF-DIPCN--PNSLDHGILIVGYSAKNTIFRKN\n", ofp);
fputs("MPYWIVKNSWGADWGEQGYIYLRRGKNTCGVSNFVSTSII--\n", ofp);
fputs("\n", ofp);
fputs("ALEU_HORVUMAHARVLLLALAVLATAAVAVASSSSFADSNPIRPVTDRAASTLESAVLGALGRTRHALRFARFAVRYGKSYESAAEVR\n", ofp);
fputs("RRFRIFSESLEEVRSTN----RKGLPYRLGINRFSDMSWEEFQATRL-GAAQTCSATLAGNHLMRDA--AALPETKDWREDG-IVSPVK\n", ofp);
fputs("NQAHCGSCWTFSTTGALEAAYTQATGKNISLSEQQLVDCAGGFNNF--------GCNGGLPSQAFEYIKYNGGIDTEESYPYKGVNGV-\n", ofp);
fputs("CHYKAENAAVQVLDSVNITLNAEDELKNAVGLVRPVSVAFQVIDGFRQYKSGVYTSDHCGTTPDDVNHAVLAVGYGVENGV-----PYW\n", ofp);
fputs("LIKNSWGADWGDNGYFKMEMGKNMCAIATCASYPVVAA\n", ofp);
fputs("\n", ofp);
fputs("CATH_HUMAN------MWATLPLLCAGAWLLGV--------PVCGAAELSVNSLEK------------FHFKSWMSKHRKTY-STEEYH\n", ofp);
fputs("HRLQTFASNWRKINAHN----NGNHTFKMALNQFSDMSFAEIKHKYLWSEPQNCSAT--KSNYLRGT--GPYPPSVDWRKKGNFVSPVK\n", ofp);
fputs("NQGACGSCWTFSTTGALESAIAIATGKMLSLAEQQLVDCAQDFNNY--------GCQGGLPSQAFEYILYNKGIMGEDTYPYQGKDGY-\n", ofp);
fputs("CKFQPGKAIGFVKDVANITIYDEEAMVEAVALYNPVSFAFEVTQDFMMYRTGIYSSTSCHKTPDKVNHAVLAVGYGEKNGI-----PYW\n", ofp);
fputs("IVKNSWGPQWGMNGYFLIERGKNMCGLAACASYPIPLV\n", ofp);
fputs("\n", ofp);
*ret_format = eslMSAFILE_PHYLIPS;
*ret_alphatype = eslAMINO;
*ret_nseq = 3;
*ret_alen = 384;
}
/* a tricky one, with a nonstandard name width of 12 (all names end in xx) */
static void
utest_write_good6(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 5 42\n", ofp);
fputs("Turkey xxAAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairxxAAGCCTTGGC AGTGCAGGGT\n", ofp);
fputs("H. SapiensxxACCGGTTGGC CGTTCAGGGT\n", ofp);
fputs("Chimp xxAAACCCTTGC CGTTACGCTT\n", ofp);
fputs("Gorilla xxAAACCCTTGC CGGTACGCTT\n", ofp);
fputs("\n", ofp);
fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp);
fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp);
fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp);
fputs("AAACCGAGGC CGGGACACTC AT\n", ofp);
fputs("AAACCATTGC CGGTACGCTT AA\n", ofp);
*ret_format = eslMSAFILE_PHYLIP;
*ret_alphatype = eslDNA;
*ret_nseq = 5;
*ret_alen = 42;
}
/* nonstandard name field width in a sequential file. */
static void
utest_write_good7(FILE *ofp, int *ret_format, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs(" 5 42\n", ofp);
fputs("Turkey xxAAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("GAGCCCGGGC AATACAGGGT AT\n", ofp);
fputs("\n", ofp);
fputs("Salmo gairxxAAGCCTTGGC AGTGCAGGGT\n", ofp);
fputs("GAGCCGTGGC CGGGCACGGT AT\n", ofp);
fputs("\n", ofp);
fputs("H. SapiensxxACCGGTTGGC CGTTCAGGGT\n", ofp);
fputs("ACAGGTTGGC CGTTCAGGGT AA\n", ofp);
fputs("\n", ofp);
fputs("Chimp xxAAACCCTTGC CGTTACGCTT\n", ofp);
fputs("AAACCGAGGC CGGGACACTC AT\n", ofp);
fputs("\n", ofp);
fputs("Gorilla xxAAACCCTTGC CGGTACGCTT\n", ofp);
fputs("AAACCATTGC CGGTACGCTT AA\n", ofp);
*ret_format = eslMSAFILE_PHYLIPS;
*ret_alphatype = eslDNA;
*ret_nseq = 5;
*ret_alen = 42;
}
/* if using strict PHYLIP, 10 char namewidth results in reading a M=21 alignment,
* which disagrees with alen=20
*/
static void
utest_write_bad1(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("Turkey xAAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairxAAGCCTTGGC AGTGCAGGGT\n", ofp);
*ret_alphatype = eslDNA;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 3;
strcpy(errmsg, "alignment length disagrees");
}
static void
utest_write_bad2(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" x 2 20\n", ofp);
fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp);
*ret_alphatype = eslDNA;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 1;
strcpy(errmsg, "first field isn't an integer");
}
static void
utest_write_bad3(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 x\n", ofp);
fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp);
*ret_alphatype = eslDNA;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 1;
strcpy(errmsg, "second field isn't an integer");
}
static void
utest_write_bad4(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2\n", ofp);
fputs("Turkey AAGCTNGGGC ATTTCAGGGT\n", ofp);
fputs("Salmo gairAAGCCTTGGC AGTGCAGGGT\n", ofp);
*ret_alphatype = eslDNA;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 1;
strcpy(errmsg, "only one field found");
}
static void
utest_write_bad5(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("\n", ofp);
*ret_alphatype = eslDNA;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 2;
strcpy(errmsg, "no alignment data");
}
static void
utest_write_bad6(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("seq1 \n", ofp);
fputs("seq2 ACDEFGHIKLMNPQRSTVWY\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 2;
strcpy(errmsg, "line too short");
}
static void
utest_write_bad7(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("\tseq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 2;
strcpy(errmsg, "invalid character(s) in sequence name");
}
static void
utest_write_bad8(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("seq2_name ACDEFGHI~LMNPQRSTVWY\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 3;
strcpy(errmsg, "one or more invalid sequence characters");
}
static void
utest_write_bad9(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 20\n", ofp);
fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("seq2_name ACDEFGHIKLMNPQRSTVW\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 3;
strcpy(errmsg, "number of residues on line differs from previous seqs");
}
static void
utest_write_bad10(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 3 40\n", ofp);
fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("\n", ofp);
fputs("ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("ACDEFGHIKLMNPQRSTVWY\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 4;
strcpy(errmsg, "unexpected number of sequences in block");
}
static void
utest_write_bad11(FILE *ofp, int *ret_alphatype, int *ret_errstatus, int *ret_linenumber, char *errmsg)
{
fputs(" 2 30\n", ofp);
fputs("seq1_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("seq2_name ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("\n", ofp);
fputs(" ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs(" ACDEFGHIKLMNPQRSTVWY\n", ofp);
fputs("\n", ofp);
*ret_alphatype = eslAMINO;
*ret_errstatus = eslEFORMAT;
*ret_linenumber = 7;
strcpy(errmsg, "alignment length disagrees with header");
}
/* example of a pathological file where interleaved, sequential
* can't be distinguished. If we read this as interleaved:
* seq1 AAAAAAAAAA CCCCCCCCCC YYYYYYYYY FFFFFFFFFF GGGGGGGGGG
* YYYYYYYYY DDDDDDDDDD EEEEEEEEEE HHHHHHHHH IIIIIIIIII KKKKKKKKKK
* whereas if we read it as sequential:
* seq1 AAAAAAAAAA CCCCCCCCCC YYYYYYYYY DDDDDDDDDD EEEEEEEEEE
* YYYYYYYYY FFFFFFFFFF GGGGGGGGGG HHHHHHHHH IIIIIIIIII KKKKKKKKKK
*
* To fall into this, sequence names have to look like a chunk of
* aligned sequence, with exactly the right length to give a correct
* <alen> whichever way we read the file.
*/
static void
utest_write_ambig1(FILE *ofp)
{
fputs(" 2 49\n", ofp);
fputs("seq1 AAAAAAAAAA CCCCCCCCCC\n", ofp);
fputs("YYYYYYYYY DDDDDDDDDD EEEEEEEEEE\n", ofp);
fputs("YYYYYYYYY FFFFFFFFFF GGGGGGGGGG\n", ofp);
fputs("HHHHHHHHH IIIIIIIIII KKKKKKKKKK\n", ofp);
}
static void
utest_goodfile(char *filename, int testnumber, int expected_format, int expected_alphatype, int expected_nseq, int expected_alen)
{
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSAFILE_FMTDATA fmtd; /* for writing formats with nonstandard name field widths */
ESL_MSA *msa1 = NULL;
ESL_MSA *msa2 = NULL;
char tmpfile1[32] = "esltmpXXXXXX";
char tmpfile2[32] = "esltmpXXXXXX";
FILE *ofp = NULL;
int status;
esl_msafile_fmtdata_Init(&fmtd);
/* guessing both the format and the alphabet should work: this is a digital open */
if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: digital open", testnumber);
if (afp->format != expected_format) esl_fatal("phylip good file unit test %d failed: format autodetection", testnumber);
if (abc->type != expected_alphatype) esl_fatal("phylip good file unit test %d failed: alphabet autodetection", testnumber);
/* This is a digital read, using <abc>. */
if ( (status = esl_msafile_phylip_Read(afp, &msa1)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa read, digital", testnumber);
if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("phylip good file unit test %d failed: nseq/alen", testnumber);
if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa1 invalid", testnumber);
fmtd.namewidth = afp->fmtd.namewidth;
esl_msafile_Close(afp);
/* write it back out to a new tmpfile (digital write) */
if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("phylip good file unit test %d failed: tmpfile creation", testnumber);
if ( (status = esl_msafile_phylip_Write(ofp, msa1, expected_format, &fmtd)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa write, digital", testnumber);
fclose(ofp);
/* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, expected_format, &fmtd, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: text mode open", testnumber);
if ( (status = esl_msafile_phylip_Read(afp, &msa2)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa read, text", testnumber);
if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa2 invalid", testnumber);
if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("phylip good file unit test %d failed: nseq/alen", testnumber);
fmtd.namewidth = afp->fmtd.namewidth;
esl_msafile_Close(afp);
/* write it back out to a new tmpfile (text write) */
if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("phylip good file unit test %d failed: tmpfile creation", testnumber);
if ( (status = esl_msafile_phylip_Write(ofp, msa2, expected_format, &fmtd)) != eslOK) esl_fatal("phylip good file unit test %d failed: msa write, text", testnumber);
fclose(ofp);
esl_msa_Destroy(msa2);
/* open and read it in digital mode */
if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, expected_format, &fmtd, &afp)) != eslOK) esl_fatal("phylip good file unit test %d failed: 2nd digital mode open", testnumber);
if ( (status = esl_msafile_phylip_Read(afp, &msa2)) != eslOK) esl_fatal("phylip good file unit test %d failed: 2nd digital msa read", testnumber);
if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("phylip good file test %d failed: msa2 invalid", testnumber);
esl_msafile_Close(afp);
/* this msa <msa2> should be identical to <msa1> */
if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("phylip good file unit test %d failed: msa compare", testnumber);
remove(tmpfile1);
remove(tmpfile2);
esl_msa_Destroy(msa1);
esl_msa_Destroy(msa2);
esl_alphabet_Destroy(abc);
}
/* utest_badfile:
* Test the strict PHYLIP parser's ability to detect bad input;
* regression test its user-directed error messages.
*
* note: this test uses strict phylip, with 10 char name width.
* it's not using the format guesser, which could determine a nonstandard name width
* some "bad" utests can be parsed differently by esl_msafile_phylip_example,
* which does use the format guesser
*
* TODO: we should also have "bad" tests for sequential PHYLIP, and for
* nonstandard name widths.
*
*/
static void
utest_badfile(char *filename, int testnumber, int expected_alphatype, int expected_status, int expected_linenumber, char *expected_errmsg)
{
ESL_ALPHABET *abc = esl_alphabet_Create(expected_alphatype);
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
int status;
if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_PHYLIP, NULL, &afp)) != eslOK) esl_fatal("phylip bad file unit test %d failed: unexpected open failure", testnumber);
if ( (status = esl_msafile_phylip_Read(afp, &msa)) != expected_status) esl_fatal("phylip bad file unit test %d failed: unexpected error code", testnumber);
if (strstr(afp->errmsg, expected_errmsg) == NULL) esl_fatal("phylip bad file unit test %d failed: unexpected errmsg", testnumber);
if (afp->linenumber != expected_linenumber) esl_fatal("phylip bad file unit test %d failed: unexpected linenumber", testnumber);
esl_msafile_Close(afp);
esl_alphabet_Destroy(abc);
esl_msa_Destroy(msa);
}
static void
utest_ambigfile(char *filename, int testnumber)
{
ESL_BUFFER *bf = NULL;
int fmt;
int namewidth;
if ( esl_buffer_Open(filename, NULL, &bf) != eslOK) esl_fatal("phylip ambig file unit test %d failed: buffer open", testnumber);
if ( esl_msafile_phylip_CheckFileFormat(bf, &fmt, &namewidth) != eslEAMBIGUOUS) esl_fatal("phylip ambig file unit test %d failed: ambiguity detection", testnumber);
if ( fmt != eslMSAFILE_UNKNOWN ) esl_fatal("phylip ambig file unit test %d failed: format code", testnumber);
if ( namewidth != 0 ) esl_fatal("phylip ambig file unit test %d failed: namewidth not 0", testnumber);
esl_buffer_Close(bf);
}
/* PHYLIP's seqboot program can output many MSAs to the same phylip file.
* For this reason (only), we allow PHYLIP format to have multiple MSAs per file.
* PHYLIP format 'officially' does not document this!
*/
static void
utest_seqboot(void)
{
char msg[] = "seqboot unit test failed";
char tmpfile[32];
FILE *ofp;
int expected_fmt;
int expected_alphatype;
int expected_nseq;
int expected_alen;
int expected_nali;
int i;
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
/* Write a tmp testfile with good1 concatenated 3 times. */
expected_nali = 3;
strcpy(tmpfile, "esltmpXXXXXX");
if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
for (i = 0; i < expected_nali; i++)
utest_write_good1(ofp, &expected_fmt, &expected_alphatype, &expected_nseq, &expected_alen);
fclose(ofp);
/* open it and loop over it, reading MSAs. there should be 3, of the expected size. */
if (esl_msafile_Open(&abc, tmpfile, /*env=*/NULL, eslMSAFILE_UNKNOWN, /*fmtd=*/NULL, &afp) != eslOK) esl_fatal(msg);
if (abc->type != expected_alphatype) esl_fatal(msg);
if (afp->format != expected_fmt) esl_fatal(msg);
i = 0;
while ( esl_msafile_Read(afp, &msa) == eslOK)
{
i++;
if (msa->nseq != expected_nseq) esl_fatal(msg);
if (msa->alen != expected_alen) esl_fatal(msg);
esl_msa_Destroy(msa);
}
if (i != expected_nali) esl_fatal(msg);
remove(tmpfile);
esl_msafile_Close(afp);
esl_alphabet_Destroy(abc);
}
#endif /*eslMSAFILE_PHYLIP_TESTDRIVE*/
/*---------------------- end, unit tests ------------------------*/
/*****************************************************************
* 7. Test driver
*****************************************************************/
#ifdef eslMSAFILE_PHYLIP_TESTDRIVE
/* compile: gcc -g -Wall -I. -L. -o esl_msafile_phylip_utest -DeslMSAFILE_PHYLIP_TESTDRIVE esl_msafile_phylip.c -leasel -lm
* (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_phylip_utest -DeslMSAFILE_PHYLIP_TESTDRIVE esl_msafile_phylip.c -leasel -lm
* run: ./esl_msafile_phylip_utest
*/
#include "esl_config.h"
#include <stdio.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_msafile.h"
#include "esl_msafile_phylip.h"
static ESL_OPTIONS options[] = {
/* name type default env range togs reqs incomp help docgrp */
{"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
{ 0,0,0,0,0,0,0,0,0,0},
};
static char usage[] = "[-options]";
static char banner[] = "test driver for PHYLIP MSA format module";
int
main(int argc, char **argv)
{
char msg[] = "PHYLIP MSA i/o module test driver failed";
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
FILE *ofp = NULL;
int ngoodtests = 7;
int nbadtests = 11;
int nambigtests = 1;
int testnumber = 0;
char tmpfile[32];
int expected_format;
int expected_alphatype;
int expected_errstatus;
int expected_linenumber;
int expected_nseq;
int expected_alen;
char expected_errmsg[eslERRBUFSIZE];
/* Test various correct versions of the format */
for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
{
strcpy(tmpfile, "esltmpXXXXXX");
if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
switch (testnumber) {
case 1: utest_write_good1 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 2: utest_write_good2 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 3: utest_write_good3 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 4: utest_write_good4 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 5: utest_write_good5 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 6: utest_write_good6 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 7: utest_write_good7 (ofp, &expected_format, &expected_alphatype, &expected_nseq, &expected_alen); break;
}
fclose(ofp);
utest_goodfile(tmpfile, testnumber, expected_format, expected_alphatype, expected_nseq, expected_alen);
remove(tmpfile);
}
/* Test for all the possible EFORMAT errors (using strict Phylip interleaved parser) */
for (testnumber = 1; testnumber <= nbadtests; testnumber++)
{
strcpy(tmpfile, "esltmpXXXXXX");
if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
switch (testnumber) {
case 1: utest_write_bad1 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 2: utest_write_bad2 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 3: utest_write_bad3 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 4: utest_write_bad4 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 5: utest_write_bad5 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 6: utest_write_bad6 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 7: utest_write_bad7 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 8: utest_write_bad8 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 9: utest_write_bad9 (ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 10: utest_write_bad10(ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
case 11: utest_write_bad11(ofp, &expected_alphatype, &expected_errstatus, &expected_linenumber, expected_errmsg); break;
}
fclose(ofp);
utest_badfile(tmpfile, testnumber, expected_alphatype, expected_errstatus, expected_linenumber, expected_errmsg);
remove(tmpfile);
}
/* Test that we correctly detect pathological files that look both interleaved and sequential */
for (testnumber = 1; testnumber <= nambigtests; testnumber++)
{
strcpy(tmpfile, "esltmpXXXXXX");
if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
switch (testnumber) {
case 1: utest_write_ambig1 (ofp);
}
fclose(ofp);
utest_ambigfile(tmpfile, testnumber);
remove(tmpfile);
}
/* Other tests */
utest_seqboot();
esl_getopts_Destroy(go);
return 0;
}
#endif /*eslMSAFILE_PHYLIP_TESTDRIVE*/
/*--------------------- end, test driver ------------------------*/
/*****************************************************************
* 8. Example.
*****************************************************************/
#ifdef eslMSAFILE_PHYLIP_EXAMPLE
/* A full-featured example of reading/writing an MSA in Phylip format(s).
gcc -g -Wall -o esl_msafile_phylip_example -I. -L. -DeslMSAFILE_PHYLIP_EXAMPLE esl_msafile_phylip.c -leasel -lm
./esl_msafile_phylip_example <msafile>
*/
/*::cexcerpt::msafile_phylip_example::begin::*/
#include <stdio.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_phylip.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-1", "no autodetection; use interleaved PHYLIP", 0 },
{ "-2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-2", "no autodetection; use sequential PHYLIPS", 0 },
{ "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
{ "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
{ "-w", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "specify that format's name width is <n>", 0 },
{ "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
{ "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
{ "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options] <msafile>";
static char banner[] = "example of guessing, reading, writing PHYLIP formats";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
char *filename = esl_opt_GetArg(go, 1);
int infmt = eslMSAFILE_UNKNOWN;
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
ESL_MSAFILE_FMTDATA fmtd;
int status;
if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_PHYLIP; /* interleaved format */
else if (esl_opt_GetBoolean(go, "-2")) infmt = eslMSAFILE_PHYLIPS; /* sequential format */
if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
/* Variant PHYLIP formats allow nonstandard name field width.
* Usually we can successfully guess this, when guessing format.
* But if PHYLIP or PHYLIPS format is set (no guessing), caller may also
* want to allow a nonstandard name field width to be set.
*/
esl_msafile_fmtdata_Init(&fmtd);
fmtd.namewidth = esl_opt_GetInteger(go, "-w");
/* Text mode: pass NULL for alphabet.
* Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
*/
if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, &fmtd, &afp);
else status = esl_msafile_Open(&abc, filename, NULL, infmt, &fmtd, &afp);
if (status != eslOK) esl_msafile_OpenFailure(afp, status);
if ( (status = esl_msafile_phylip_Read(afp, &msa)) != eslOK)
esl_msafile_ReadFailure(afp, status);
printf("format variant: %s\n", esl_msafile_DecodeFormat(afp->format));
printf("name width: %d\n", afp->fmtd.namewidth);
printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
printf("# of seqs: %d\n", msa->nseq);
printf("# of cols: %d\n", (int) msa->alen);
printf("\n");
if (afp->fmtd.namewidth != 10) fmtd.namewidth = afp->fmtd.namewidth;
if (! esl_opt_GetBoolean(go, "-q"))
esl_msafile_phylip_Write(stdout, msa, eslMSAFILE_PHYLIP, &fmtd);
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
if (abc) esl_alphabet_Destroy(abc);
esl_getopts_Destroy(go);
exit(0);
}
/*::cexcerpt::msafile_phylip_example::end::*/
#endif /*eslMSAFILE_PHYLIP_EXAMPLE*/
#ifdef eslMSAFILE_PHYLIP_EXAMPLE2
/* A minimal example. Reading a strict interleaved PHYLIP MSA in text mode.
gcc -g -Wall -o esl_msafile_phylip_example2 -I. -L. -DeslMSAFILE_PHYLIP_EXAMPLE2 esl_msafile_phylip.c -leasel -lm
./esl_msafile_phylip_example2 <msafile>
*/
/*::cexcerpt::msafile_phylip_example2::begin::*/
#include <stdio.h>
#include "easel.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_phylip.h"
int
main(int argc, char **argv)
{
char *filename = argv[1];
int infmt = eslMSAFILE_PHYLIP; /* or eslMSAFILE_PHYLIPS, for sequential format */
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
int status;
if ( (status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
if ( (status = esl_msafile_phylip_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
printf("# of seqs: %d\n", msa->nseq);
printf("# of cols: %d\n", (int) msa->alen);
printf("\n");
esl_msafile_phylip_Write(stdout, msa, eslMSAFILE_PHYLIP, NULL);
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
exit(0);
}
/*::cexcerpt::msafile_phylip_example::end::*/
#endif /*eslMSAFILE_PHYLIP_EXAMPLE*/
/*--------------------- end of examples -------------------------*/
You can’t perform that action at this time.