Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
810 lines (706 sloc) 34.2 KB
/* I/O of multiple sequence alignments in PSI-BLAST format
*
* Contents:
* 1. API for reading/writing PSI-BLAST format
* 2. Unit tests.
* 3. Test driver.
* 4. Examples.
*/
#include "esl_config.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_mem.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_psiblast.h"
/*****************************************************************
*# 1. API for reading/writing PSI-BLAST format
*****************************************************************/
/* Function: esl_msafile_psiblast_SetInmap()
* Synopsis: Set input map specific for PSI-BLAST input.
*
* Purpose: Set the <afp->inmap> for PSI-BLAST format.
*
* PSI-BLAST only allows - for a gap. It also disallows O residues.
*
* Text mode accepts any <isalpha()> character plus '-' but not 'O' or 'o'.
* Digital mode enforces the usual Easel alphabets, but disallows "._*~".
*/
int
esl_msafile_psiblast_SetInmap(ESL_MSAFILE *afp)
{
int sym;
if (afp->abc)
{
for (sym = 0; sym < 128; sym++)
afp->inmap[sym] = afp->abc->inmap[sym];
afp->inmap[0] = esl_abc_XGetUnknown(afp->abc);
afp->inmap['.'] = eslDSQ_ILLEGAL;
afp->inmap['_'] = eslDSQ_ILLEGAL;
afp->inmap['*'] = eslDSQ_ILLEGAL;
afp->inmap['~'] = eslDSQ_ILLEGAL;
}
if (! afp->abc)
{
for (sym = 1; sym < 128; sym++)
afp->inmap[sym] = (isalpha(sym) ? sym : eslDSQ_ILLEGAL);
afp->inmap[0] = '?';
afp->inmap['-'] = '-';
}
afp->inmap['O'] = eslDSQ_ILLEGAL;
afp->inmap['o'] = eslDSQ_ILLEGAL;
return eslOK;
}
/* Function: esl_msafile_psiblast_GuessAlphabet()
* Synopsis: Guess the alphabet of an open PSI-BLAST MSA file.
*
* Purpose: Guess the alpbabet of the sequences in open
* PSI-BLAST format MSA file <afp>.
*
* On a normal return, <*ret_type> is set to <eslDNA>,
* <eslRNA>, or <eslAMINO>, and <afp> is reset to its
* original position.
*
* Args: afp - open PSI-BLAST format MSA file
* ret_type - RETURN: <eslDNA>, <eslRNA>, or <eslAMINO>
*
* Returns: <eslOK> on success.
* <eslENOALPHABET> if alphabet type can't be determined.
* In either case, <afp> is rewound to the position it
* started at.
*/
int
esl_msafile_psiblast_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type)
{
int alphatype = eslUNKNOWN;
esl_pos_t anchor = -1;
int threshold[3] = { 500, 5000, 50000 }; /* we check after 500, 5000, 50000 residues; else we go to EOF */
int nsteps = 3;
int step = 0;
int nres = 0;
int x;
int64_t ct[26];
char *p, *tok;
esl_pos_t n, toklen, pos;
int status;
for (x = 0; x < 26; x++) ct[x] = 0;
anchor = esl_buffer_GetOffset(afp->bf);
if ((status = esl_buffer_SetAnchor(afp->bf, anchor)) != eslOK) { status = eslEINCONCEIVABLE; goto ERROR; } /* [eslINVAL] can't happen here */
while ( (status = esl_buffer_GetLine(afp->bf, &p, &n)) == eslOK)
{
if ((status = esl_memtok(&p, &n, " \t", &tok, &toklen)) != eslOK) continue; /* blank lines */
/* p now points to the rest of the sequence line, after a name */
/* count characters into ct[] array */
for (pos = 0; pos < n; pos++)
if (isalpha(p[pos])) {
x = toupper(p[pos]) - 'A';
ct[x]++;
nres++;
}
/* try to stop early, checking after 500, 5000, and 50000 residues: */
if (step < nsteps && nres > threshold[step]) {
if ((status = esl_abc_GuessAlphabet(ct, &alphatype)) == eslOK) goto DONE; /* (eslENOALPHABET) */
step++;
}
}
if (status != eslEOF) goto ERROR; /* [eslEMEM,eslESYS,eslEINCONCEIVABLE] */
status = esl_abc_GuessAlphabet(ct, &alphatype); /* (eslENOALPHABET) */
DONE:
esl_buffer_SetOffset(afp->bf, anchor); /* Rewind to where we were. */
esl_buffer_RaiseAnchor(afp->bf, anchor);
*ret_type = alphatype;
return status;
ERROR:
if (anchor != -1) {
esl_buffer_SetOffset(afp->bf, anchor);
esl_buffer_RaiseAnchor(afp->bf, anchor);
}
*ret_type = eslUNKNOWN;
return status;
}
/* Function: esl_msafile_psiblast_Read()
* Synopsis: Read an alignment in PSI-BLAST's input format.
*
* Purpose: Read an MSA from an open <ESL_MSAFILE> <afp>, parsing for
* PSI-BLAST input format, starting from the current point.
* Create a new multiple alignment, and return a ptr to
* that alignment via <*ret_msa>. Caller is responsible for
* free'ing this <ESL_MSA>.
*
* The <msa> has a reference line (<msa->rf[]>) that
* corresponds to the uppercase/lowercase columns in the
* alignment: consensus (uppercase) columns are marked 'x',
* and insert (lowercase) columns are marked '.' in this RF
* line.
*
* Args: afp - open <ESL_MSAFILE>
* ret_msa - RETURN: newly parsed <ESL_MSA>
*
* Returns: <eslOK> on success. <*ret_msa> contains the newly
* allocated MSA. <afp> is at EOF.
*
* <eslEOF> if no (more) alignment data are found in
* <afp>, and <afp> is returned at EOF.
*
* <eslEFORMAT> on a parse error. <*ret_msa> is set to
* <NULL>. <afp> contains information sufficient for
* constructing useful diagnostic output:
* | <afp->errmsg> | user-directed error message |
* | <afp->linenumber> | line # where error was detected |
* | <afp->line> | offending line (not NUL-term) |
* | <afp->n> | length of offending line |
* | <afp->bf->filename> | name of the file |
* and <afp> is poised at the start of the following line,
* so (in principle) the caller could try to resume
* parsing.
*
* Throws: <eslEMEM> on allocation error.
* <eslESYS> if a system call fails, such as fread().
* <eslEINCONCEIVABLE> - "impossible" corruption
* On these, <*ret_msa> is returned <NULL>, and the state of
* <afp> is undefined.
*/
int
esl_msafile_psiblast_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa)
{
ESL_MSA *msa = NULL;
int idx = 0; /* counter over sequences in a block */
int nblocks = 0; /* counter over blocks */
int64_t alen = 0;
int nseq = 0;
int64_t cur_alen;
esl_pos_t pos; /* position on a line */
esl_pos_t name_start, name_len;
esl_pos_t seq_start, seq_len;
esl_pos_t block_seq_start, block_seq_len;
int status;
ESL_DASSERT1( (afp->format == eslMSAFILE_PSIBLAST) );
afp->errmsg[0] = '\0';
/* allocate a growable MSA. We set msa->{nseq,alen} only when we're done. */
if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
/* skip leading blank lines in file */
while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
if (status != eslOK) goto ERROR; /* includes normal EOF */
/* Read the file a line at a time; if a parsing error occurs, detect immediately, with afp->linenumber set correctly */
do { /* while in the file... */
idx = 0;
do { /* while in a block... */
for (pos = 0; pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
name_start = pos;
for (pos = pos+1; pos < afp->n; pos++) if ( isspace(afp->line[pos])) break;
name_len = pos - name_start;
for (pos = pos+1; pos < afp->n; pos++) if (! isspace(afp->line[pos])) break;
seq_start = pos;
if (pos >= afp->n) ESL_XFAIL(eslEFORMAT, afp->errmsg, "invalid alignment line");
for (pos = afp->n-1; pos > 0; pos--) if (! isspace(afp->line[pos])) break;
seq_len = pos - seq_start + 1;
if (idx == 0) {
block_seq_start = seq_start;
block_seq_len = seq_len;
} else {
if (seq_start != block_seq_start) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence start is misaligned");
if (seq_len != block_seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "sequence end is misaligned");
}
/* Process the consensus #=RF line. */
if (idx == 0) {
ESL_REALLOC(msa->rf, sizeof(char) * (alen + seq_len + 1));
for (pos = 0; pos < seq_len; pos++) msa->rf[alen+pos] = '-'; /* anything neutral other than . or x will do. */
msa->rf[alen+pos] = '\0';
}
for (pos = 0; pos < seq_len; pos++)
{
if (afp->line[seq_start+pos] == '-') continue;
if (isupper(afp->line[seq_start+pos])) {
if (msa->rf[alen+pos] == '.') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected upper case residue (#%d on line)", (int) pos+1);
msa->rf[alen+pos] = 'x';
}
if (islower(afp->line[seq_start+pos])) {
if (msa->rf[alen+pos] == 'x') ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected lower case residue (#%d on line)", (int) pos+1);
msa->rf[alen+pos] = '.';
}
}
/* Store the sequence name. */
if (nblocks == 0) {
/* make sure we have room for another sequence */
if (idx >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
if ( (status = esl_msa_SetSeqName(msa, idx, afp->line+name_start, name_len)) != eslOK) goto ERROR;
} else {
if (! esl_memstrcmp(afp->line+name_start, name_len, msa->sqname[idx]))
ESL_XFAIL(eslEFORMAT, afp->errmsg, "expected sequence %s on this line, but saw %.*s", msa->sqname[idx], (int) name_len, afp->line+name_start);
}
/* Append the sequence. */
cur_alen = alen;
if (msa->abc) { status = esl_abc_dsqcat(afp->inmap, &(msa->ax[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
if (! msa->abc) { status = esl_strmapcat (afp->inmap, &(msa->aseq[idx]), &(cur_alen), afp->line+seq_start, seq_len); }
if (status == eslEINVAL) ESL_XFAIL(eslEFORMAT, afp->errmsg, "one or more invalid sequence characters");
else if (status != eslOK) goto ERROR;
if (cur_alen - alen != seq_len) ESL_XFAIL(eslEFORMAT, afp->errmsg, "unexpected number of seq characters");
/* get next line. if it's blank, or if we're EOF, we're done with the block */
idx++;
status = esl_msafile_GetLine(afp, NULL, NULL);
} while (status == eslOK && esl_memspn(afp->line, afp->n, " \t") < afp->n); /* blank line ends a block. */
if (status != eslOK && status != eslEOF) goto ERROR;
/* End of one block */
if (nblocks == 0) nseq = idx;
else if (idx != nseq) ESL_XFAIL(eslEFORMAT, afp->errmsg, "last block didn't contain same # of seqs as earlier blocks");
alen += block_seq_len;
nblocks++;
/* skip blank lines to start of next block, if any */
while ( (status = esl_msafile_GetLine(afp, NULL, NULL)) == eslOK && esl_memspn(afp->line, afp->n, " \t") == afp->n) ;
} while (status == eslOK);
if (status != eslEOF) goto ERROR;
msa->nseq = nseq;
msa->alen = alen;
if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;
*ret_msa = msa;
return eslOK;
ERROR:
if (msa) esl_msa_Destroy(msa);
*ret_msa = NULL;
return status;
}
/* Function: esl_msafile_psiblast_Write()
* Synopsis: Write an MSA to a stream in PSI-BLAST format
*
* Purpose: Write alignment <msa> in NCBI PSI-BLAST format to
* stream <fp>.
*
* The <msa> should have a valid reference line <msa->rf>,
* with alphanumeric characters marking consensus (match)
* columns, and non-alphanumeric characters marking
* nonconsensus (insert) columns. If it does not have RF
* annotation, then the first sequence in the <msa>
* defines the "consensus".
*
* PSI-BLAST format allows only one symbol ('-') for gaps,
* and cannot represent missing data symbols (Easel's
* '~'). Any missing data symbols are converted to gaps.
*
* Args: fp - open output stream
* msa - MSA to write
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation failure.
* <eslEWRITE> on any system write failure, such as filled disk.
*/
int
esl_msafile_psiblast_Write(FILE *fp, const ESL_MSA *msa)
{
char *buf = NULL;
int cpl = 60;
int acpl;
int i;
int sym;
int64_t pos, bpos;
int maxnamewidth = esl_str_GetMaxWidth(msa->sqname, msa->nseq);
int is_consensus;
int is_residue;
int status;
ESL_ALLOC(buf, sizeof(char) * (cpl+1));
for (pos = 0; pos < msa->alen; pos += cpl)
{
for (i = 0; i < msa->nseq; i++)
{
acpl = (msa->alen - pos > cpl)? cpl : msa->alen - pos;
if (msa->abc)
{
for (bpos = 0; bpos < acpl; bpos++)
{
sym = msa->abc->sym[msa->ax[i][pos + bpos + 1]];
is_residue = esl_abc_XIsResidue(msa->abc, msa->ax[i][pos+bpos+1]);
if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
else is_consensus = (esl_abc_XIsResidue(msa->abc, msa->ax[0][pos+bpos+1]) ? TRUE : FALSE);
if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
else { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
}
}
if (! msa->abc)
{
for (bpos = 0; bpos < acpl; bpos++)
{
sym = msa->aseq[i][pos + bpos];
is_residue = isalnum(sym);
if (msa->rf) is_consensus = (isalnum(msa->rf[pos + bpos]) ? TRUE : FALSE);
else is_consensus = (isalnum(msa->aseq[0][pos+bpos]) ? TRUE : FALSE);
if (is_consensus) { buf[bpos] = (is_residue ? toupper(sym) : '-'); }
else { buf[bpos] = (is_residue ? tolower(sym) : '-'); }
}
}
buf[acpl] = '\0';
if (fprintf(fp, "%-*s %s\n", maxnamewidth, msa->sqname[i], buf) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed");
} /* end loop over sequences */
if (pos + cpl < msa->alen)
{ if (fputc('\n', fp) < 0) ESL_XEXCEPTION_SYS(eslEWRITE, "psiblast msa write failed"); }
}
free(buf);
return eslOK;
ERROR:
if (buf) free(buf);
return status;
}
/*----------- end, API for i/o of psi-blast format --------------*/
/*****************************************************************
* 2. Unit tests.
*****************************************************************/
#ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
static void
utest_write_good1(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs("MYG_PHYCA --------V-LSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKT\n", ofp);
fputs("GLB5_PETMA pivdtgsvApLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLTT\n", ofp);
fputs("HBB_HUMAN --------VhLTPEEKSAVTALWGKV--NVDEVGGEALGRLLVVYPWTQRFFESFGDLST\n", ofp);
fputs("HBA_HUMAN --------V-LSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHF-----\n", ofp);
fputs("\n", ofp);
fputs("MYG_PHYCA EAEMKASEDLKKHGVTVLTALGAILKKKGH---HEAELKPLAQSHATKHKIPIKYLEFIS\n", ofp);
fputs("GLB5_PETMA ADQLKKSADVRWHAERIINAVNDAVASMDDtekMSMKLRDLSGKHAKSFQVDPQYFKVLA\n", ofp);
fputs("HBB_HUMAN PDAVMGNPKVKAHGKKVLGAFSDGLAHLDN---LKGTFATLSELHCDKLHVDPENFRLLG\n", ofp);
fputs("HBA_HUMAN -DLSHGSAQVKGHGKKVADALTNAVAHVDD---MPNALSALSDLHAHKLRVDPVNFKLLS\n", ofp);
fputs("\n", ofp);
fputs("MYG_PHYCA EAIIHVLHSRHPGDFGADAQGAMNKALELFRKDIAAKYKELGYQG\n", ofp);
fputs("GLB5_PETMA AVI---------ADTVAAGDAGFEKLMSMICILLRSAY-------\n", ofp);
fputs("HBB_HUMAN NVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH------\n", ofp);
fputs("HBA_HUMAN HCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR------\n", ofp);
*ret_alphatype = eslAMINO;
*ret_nseq = 4;
*ret_alen = 165;
}
static void
utest_write_good2(FILE *ofp, int *ret_alphatype, int *ret_nseq, int *ret_alen)
{
fputs("tRNA2 UCCGAUAUAGUGUAACGGCUAUCACAUCACGCUUUCACCGUGG-AGACCGGGGUUCGACU\n", ofp);
fputs("tRNA3 UCCGUGAUAGUUUAAUGGUCAGAAUGG-GCGCUUGUCGCGUGCcAGAUCGGGGUUCAAUU\n", ofp);
fputs("tRNA5 GGGCACAUGGCGCAGUUGGUAGCGCGCUUCCCUUGCAAGGAAGaGGUCAUCGGUUCGAUU\n", ofp);
fputs("tRNA1 GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGaGGUCCUGUGUUCGAUC\n", ofp);
fputs("tRNA4 GCUCGUAUGGCGCAGUGG-UAGCGCAGCAGAUUGCAAAUCUGUuGGUCCUUAGUUCGAUC\n", ofp);
fputs("\n", ofp);
fputs("tRNA2 CCCCGUAUCGGAG\n", ofp);
fputs("tRNA3 CCCCGUCGCGGAG\n", ofp);
fputs("tRNA5 CCGGUUGCGUCCA\n", ofp);
fputs("tRNA1 CACAGAAUUCGCA\n", ofp);
fputs("tRNA4 CUGAGUGCGAGCU\n", ofp);
*ret_alphatype = eslRNA;
*ret_nseq = 5;
*ret_alen = 73;
}
static void
utest_goodfile(char *filename, int testnumber, int expected_alphatype, int expected_nseq, int expected_alen)
{
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa1 = NULL;
ESL_MSA *msa2 = NULL;
char tmpfile1[32] = "esltmpXXXXXX";
char tmpfile2[32] = "esltmpXXXXXX";
FILE *ofp = NULL;
int status;
/* guessing both the format and the alphabet should work: this is a digital open */
/* PSIBLAST format is autodetected as SELEX, which is fine - selex parser is more general */
if ( (status = esl_msafile_Open(&abc, filename, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: digital open", testnumber);
if (afp->format != eslMSAFILE_SELEX) esl_fatal("psiblast good file test %d failed: format autodetection", testnumber);
if (abc->type != expected_alphatype) esl_fatal("psiblast good file test %d failed: alphabet autodetection", testnumber);
afp->format = eslMSAFILE_PSIBLAST;
/* This is a digital read, using <abc>. */
if ( (status = esl_msafile_psiblast_Read(afp, &msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, digital", testnumber);
if (msa1->nseq != expected_nseq || msa1->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber);
if (esl_msa_Validate(msa1, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa1 invalid", testnumber);
esl_msafile_Close(afp);
/* write it back out to a new tmpfile (digital write) */
if ( (status = esl_tmpfile_named(tmpfile1, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber);
if ( (status = esl_msafile_psiblast_Write(ofp, msa1)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, digital", testnumber);
fclose(ofp);
/* now open and read it as text mode, in known format. (We have to pass fmtd now, to deal with the possibility of a nonstandard name width) */
if ( (status = esl_msafile_Open(NULL, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: text mode open", testnumber);
if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa read, text", testnumber);
if (msa2->nseq != expected_nseq || msa2->alen != expected_alen) esl_fatal("psiblast good file test %d failed: nseq/alen", testnumber);
if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber);
esl_msafile_Close(afp);
/* write it back out to a new tmpfile (text write) */
if ( (status = esl_tmpfile_named(tmpfile2, &ofp)) != eslOK) esl_fatal("psiblast good file test %d failed: tmpfile creation", testnumber);
if ( (status = esl_msafile_psiblast_Write(ofp, msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: msa write, text", testnumber);
fclose(ofp);
esl_msa_Destroy(msa2);
/* open and read it in digital mode */
if ( (status = esl_msafile_Open(&abc, tmpfile1, NULL, eslMSAFILE_PSIBLAST, NULL, &afp)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital mode open", testnumber);
if ( (status = esl_msafile_psiblast_Read(afp, &msa2)) != eslOK) esl_fatal("psiblast good file test %d failed: 2nd digital msa read", testnumber);
if (esl_msa_Validate(msa2, NULL) != eslOK) esl_fatal("psiblast good file test %d failed: msa2 invalid", testnumber);
esl_msafile_Close(afp);
/* this msa <msa2> should be identical to <msa1> */
if (esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal("psiblast good file test %d failed: msa compare", testnumber);
remove(tmpfile1);
remove(tmpfile2);
esl_msa_Destroy(msa1);
esl_msa_Destroy(msa2);
esl_alphabet_Destroy(abc);
}
static void
write_test_msas(FILE *ofp1, FILE *ofp2)
{
fprintf(ofp1, "\n");
fprintf(ofp1, "seq1 --ACDEFGHIKLMNPQRSTVWY\n");
fprintf(ofp1, "seq2 --ACDEFGHIKLMNPQRSTV-- \n");
fprintf(ofp1, "seq3 aaACDEFGHIKLMNPQRSTV-- \n");
fprintf(ofp1, "seq4 --ACDEFGHIKLMNPQRSTVWY \n");
fprintf(ofp1, "\n");
fprintf(ofp1, "seq1 ACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp1, "seq2 ACDEFGHIKLMNPQRSTVWYyy\n");
fprintf(ofp1, "seq3 ACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp1, "seq4 ACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp1, "\n");
fprintf(ofp2, "# STOCKHOLM 1.0\n");
fprintf(ofp2, "\n");
fprintf(ofp2, "#=GC RF ..xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..\n");
fprintf(ofp2, "seq1 --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp2, "seq2 --ACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWYyy\n");
fprintf(ofp2, "seq3 aaACDEFGHIKLMNPQRSTV--ACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp2, "seq4 --ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY--\n");
fprintf(ofp2, "//\n");
}
static void
read_test_msas_digital(char *pbfile, char *stkfile)
{
char msg[] = "PSIBLAST msa digital read unit test failed";
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp1 = NULL;
ESL_MSAFILE *afp2 = NULL;
ESL_MSA *msa1, *msa2, *msa3, *msa4;
FILE *pbfp, *stkfp;
char pbfile2[32] = "esltmppb2XXXXXX";
char stkfile2[32] = "esltmpstk2XXXXXX";
if ( esl_msafile_Open(&abc, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
if ( !abc || abc->type != eslAMINO) esl_fatal(msg);
if ( esl_msafile_Open(&abc, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
esl_msafile_Close(afp2);
esl_msafile_Close(afp1);
/* Now write stk to psiblast file, and vice versa; then retest */
if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg);
if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
fclose(pbfp);
fclose(stkfp);
if ( esl_msafile_Open(&abc, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
if ( esl_msafile_Open(&abc, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
remove(pbfile2);
remove(stkfile2);
esl_msafile_Close(afp2);
esl_msafile_Close(afp1);
esl_msa_Destroy(msa1);
esl_msa_Destroy(msa2);
esl_msa_Destroy(msa3);
esl_msa_Destroy(msa4);
esl_alphabet_Destroy(abc);
}
static void
read_test_msas_text(char *pbfile, char *stkfile)
{
char msg[] = "PSIBLAST msa text-mode read unit test failed";
ESL_MSAFILE *afp1 = NULL;
ESL_MSAFILE *afp2 = NULL;
ESL_MSA *msa1, *msa2, *msa3, *msa4;
FILE *pbfp, *stkfp;
char pbfile2[32] = "esltmppb2XXXXXX";
char stkfile2[32] = "esltmpstk2XXXXXX";
/* vvvv-- everything's the same as the digital utest except these NULLs */
if ( esl_msafile_Open(NULL, pbfile, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
if ( esl_msafile_Open(NULL, stkfile, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa1) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa2) != eslOK) esl_fatal(msg);
if ( esl_msa_Compare(msa1, msa2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslEOF) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa3) != eslEOF) esl_fatal(msg);
esl_msafile_Close(afp2);
esl_msafile_Close(afp1);
if ( esl_tmpfile_named(pbfile2, &pbfp) != eslOK) esl_fatal(msg);
if ( esl_tmpfile_named(stkfile2, &stkfp) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Write (pbfp, msa2) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Write(stkfp, msa1, eslMSAFILE_STOCKHOLM) != eslOK) esl_fatal(msg);
fclose(pbfp);
fclose(stkfp);
if ( esl_msafile_Open(NULL, pbfile2, NULL, eslMSAFILE_PSIBLAST, NULL, &afp1) != eslOK) esl_fatal(msg);
if ( esl_msafile_Open(NULL, stkfile2, NULL, eslMSAFILE_STOCKHOLM, NULL, &afp2) != eslOK) esl_fatal(msg);
if ( esl_msafile_psiblast_Read (afp1, &msa3) != eslOK) esl_fatal(msg);
if ( esl_msafile_stockholm_Read(afp2, &msa4) != eslOK) esl_fatal(msg);
if ( esl_msa_Compare(msa3, msa4) != eslOK) esl_fatal(msg);
remove(pbfile2);
remove(stkfile2);
esl_msafile_Close(afp2);
esl_msafile_Close(afp1);
esl_msa_Destroy(msa1);
esl_msa_Destroy(msa2);
esl_msa_Destroy(msa3);
esl_msa_Destroy(msa4);
}
#endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
/*---------------------- end, unit tests ------------------------*/
/*****************************************************************
* 3. Test driver.
*****************************************************************/
#ifdef eslMSAFILE_PSIBLAST_TESTDRIVE
/* compile: gcc -g -Wall -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
* (gcov): gcc -g -Wall -fprofile-arcs -ftest-coverage -I. -L. -o esl_msafile_psiblast_utest -DeslMSAFILE_PSIBLAST_TESTDRIVE esl_msafile_psiblast.c -leasel -lm
* run: ./esl_msafile_psiblast_utest
*/
#include "esl_config.h"
#include <stdio.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_msafile.h"
#include "esl_msafile_psiblast.h"
static ESL_OPTIONS options[] = {
/* name type default env range togs reqs incomp help docgrp */
{"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
{ 0,0,0,0,0,0,0,0,0,0},
};
static char usage[] = "[-options]";
static char banner[] = "test driver for PSIBLAST MSA format module";
int
main(int argc, char **argv)
{
char msg[] = "PSI-BLAST MSA i/o module test driver failed";
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
char pbfile[32] = "esltmppbXXXXXX";
char stkfile[32] = "esltmpstkXXXXXX";
FILE *pbfp, *stkfp;
int testnumber;
int ngoodtests = 2;
char tmpfile[32];
FILE *ofp;
int expected_alphatype;
int expected_nseq;
int expected_alen;
if ( esl_tmpfile_named(pbfile, &pbfp) != eslOK) esl_fatal(msg);
if ( esl_tmpfile_named(stkfile, &stkfp) != eslOK) esl_fatal(msg);
write_test_msas(pbfp, stkfp);
fclose(pbfp);
fclose(stkfp);
read_test_msas_digital(pbfile, stkfile);
read_test_msas_text (pbfile, stkfile);
/* Various "good" files that should be parsed correctly */
for (testnumber = 1; testnumber <= ngoodtests; testnumber++)
{
strcpy(tmpfile, "esltmpXXXXXX");
if (esl_tmpfile_named(tmpfile, &ofp) != eslOK) esl_fatal(msg);
switch (testnumber) {
case 1: utest_write_good1 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
case 2: utest_write_good2 (ofp, &expected_alphatype, &expected_nseq, &expected_alen); break;
}
fclose(ofp);
utest_goodfile(tmpfile, testnumber, expected_alphatype, expected_nseq, expected_alen);
remove(tmpfile);
}
remove(pbfile);
remove(stkfile);
esl_getopts_Destroy(go);
return 0;
}
#endif /*eslMSAFILE_PSIBLAST_TESTDRIVE*/
/*--------------------- end, test driver ------------------------*/
/*****************************************************************
* 4. Examples.
*****************************************************************/
#ifdef eslMSAFILE_PSIBLAST_EXAMPLE
/* A full-featured example of reading/writing an MSA in PSIBLAST format.
gcc -g -Wall -o esl_msafile_psiblast_example -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE esl_msafile_psiblast.c -leasel -lm
./esl_msafile_psiblast_example <msafile>
*/
/*::cexcerpt::msafile_psiblast_example::begin::*/
#include <stdio.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_psiblast.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-1", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "override autodetection; force PSIBLAST format", 0 },
{ "-q", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "quieter: don't write msa back, just summary", 0 },
{ "-t", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
{ "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is DNA", 0 },
{ "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is RNA", 0 },
{ "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-t", "specify that alphabet is protein", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options] <msafile>";
static char banner[] = "example of guessing, reading, writing PSIBLAST format";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
char *filename = esl_opt_GetArg(go, 1);
int infmt = eslMSAFILE_UNKNOWN;
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
int status;
if (esl_opt_GetBoolean(go, "-1")) infmt = eslMSAFILE_PSIBLAST; /* override format autodetection */
if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
/* Text mode: pass NULL for alphabet.
* Digital mode: pass ptr to expected ESL_ALPHABET; and if abc=NULL, alphabet is guessed
*/
if (esl_opt_GetBoolean(go, "-t")) status = esl_msafile_Open(NULL, filename, NULL, infmt, NULL, &afp);
else status = esl_msafile_Open(&abc, filename, NULL, infmt, NULL, &afp);
if (status != eslOK) esl_msafile_OpenFailure(afp, status);
if ((status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK)
esl_msafile_ReadFailure(afp, status);
printf("alphabet: %s\n", (abc ? esl_abc_DecodeType(abc->type) : "none (text mode)"));
printf("# of seqs: %d\n", msa->nseq);
printf("# of cols: %d\n", (int) msa->alen);
printf("\n");
if (! esl_opt_GetBoolean(go, "-q"))
esl_msafile_psiblast_Write(stdout, msa);
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
if (abc) esl_alphabet_Destroy(abc);
esl_getopts_Destroy(go);
exit(0);
}
/*::cexcerpt::msafile_psiblast_example::end::*/
#endif /*eslMSAFILE_PSIBLAST_EXAMPLE*/
#ifdef eslMSAFILE_PSIBLAST_EXAMPLE2
/* A minimal example. Read PSIBLAST format MSA, in text mode.
gcc -g -Wall -o esl_msafile_psiblast_example2 -I. -L. -DeslMSAFILE_PSIBLAST_EXAMPLE2 esl_msafile_psiblast.c -leasel -lm
./esl_msafile_psiblast_example2 <msafile>
*/
/*::cexcerpt::msafile_psiblast_example2::begin::*/
#include <stdio.h>
#include "easel.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msafile_psiblast.h"
int
main(int argc, char **argv)
{
char *filename = argv[1];
int fmt = eslMSAFILE_PSIBLAST;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
int status;
if ( (status = esl_msafile_Open(NULL, filename, NULL, fmt, NULL, &afp)) != eslOK) esl_msafile_OpenFailure(afp, status);
if ( (status = esl_msafile_psiblast_Read(afp, &msa)) != eslOK) esl_msafile_ReadFailure(afp, status);
printf("%6d seqs, %5d columns\n", msa->nseq, (int) msa->alen);
esl_msafile_psiblast_Write(stdout, msa);
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
exit(0);
}
/*::cexcerpt::msafile_psiblast_example2::end::*/
#endif /*eslMSAFILE_PSIBLAST_EXAMPLE2*/
/*--------------------- end of examples -------------------------*/
You can’t perform that action at this time.