Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
137 lines (115 sloc) 6.76 KB
/* Multiple sequence alignment file i/o
 *
 * See also: esl_msafile2.[ch], which contains a legacy ESL_MSAFILE2 interface
 * that includes support for the --small option in various tools.
 */
#include "esl_config.h"
#include <stdio.h> /* FILE, used by esl_msafile_Write() */
#include "esl_alphabet.h" /* digital alphabets */
#include "esl_buffer.h" /* ESL_BUFFER: the input file/data being parsed */
#include "esl_msa.h" /* ESL_MSA structure */
#include "esl_ssi.h" /* indexes of large flatfiles on disk */
/* Object: ESL_MSAFILE_FMTDATA
 *
 * Additional (often optional) information about variants of some file
 * formats. There is not much in here right now, but this may need to
 * expand in the future, so it is best to have the mechanism in place.
 *
 * Used in three ways:
 *   1. When opening an MSA file in a known format (as opposed to
 *      guessing an unknown format), the caller may provide an
 *      <ESL_MSAFILE_FMTDATA> structure containing any additional
 *      constraints on the format. The new <afp> will copy this
 *      information into <afp->fmtd>.
 *   2. When opening an MSA file in an unknown format (calling
 *      GuessFileFormat()), format-specific autodetectors fill in
 *      <afp->fmtd> with any additional constraints.
 *   3. When writing an MSA file, the caller may provide additional
 *      constraints on the format; notably <fmtd->rpl>, the number of
 *      residues per line, used for many formats.
 *
 * TODO: If this fills up with more information, we should eventually
 *       consolidate the format code too: create an ESL_MSAFORMAT structure
 *       to hold both the integer code and the optional information;
 *       implement it in esl_msaformat.[ch]; put the format guessing
 *       routines there; rename eslMSAFILE_* -> eslMSAFORMAT_*. For now,
 *       this is not worth the time, because it is really only a
 *       placeholder dealing with a small PHYLIP-specific format issue.
 *       <format>, <fmtd> are generally an ordered pair, to facilitate
 *       eventual replacement with a single <fmt>. [SRE, 19 Jul 11]
 */
/* Optional, format-specific details that accompany an alignment format code.
 * A field value of 0 means "unset".
 *
 * NOTE(review): the closing "} ESL_MSAFILE_FMTDATA;" line of this typedef is
 * not present in this copy of the file; the type name is taken from its uses
 * elsewhere in this header (e.g. the <fmtd> member of ESL_MSAFILE). Restore
 * the closing line before compiling.
 */
typedef struct {
int namewidth; /* PHYLIP only: width of the name field (usually 10, but can vary); 0 = unset */
int rpl; /* several formats: residues per line; 0 = unset */
/* Object: ESL_MSAFILE
 *
 * An alignment file open for parsing.
 */
/* Parser state for one alignment file that is open for reading: the input
 * buffer, the format being read, the current line and its position, and the
 * optional digital alphabet and SSI index.
 *
 * NOTE(review): the closing "} ESL_MSAFILE;" line of this typedef is not
 * present in this copy of the file; the type name is taken from the
 * prototypes later in this header. Restore the closing line before compiling.
 */
typedef struct {
ESL_BUFFER *bf; /* input file/data being parsed */
int32_t format; /* format of alignment file we're reading (an eslMSAFILE_* code) */
ESL_MSAFILE_FMTDATA fmtd; /* additional (often optional) format-specific details. */
char *line; /* line read from <bf> by <esl_msafile_GetLine()>; use <n> for its length */
esl_pos_t n; /* length of <line> in bytes (<line> is not NUL-terminated) */
int64_t linenumber; /* input linenumber for diagnostics; -1 if we lose track */
esl_pos_t lineoffset; /* offset of start of <line> in <bf>; -1 if line unset */
ESL_DSQ inmap[128]; /* input map, indexed 0..127 */
const ESL_ALPHABET *abc; /* non-NULL if in digital mode */
ESL_SSI *ssi; /* open SSI index; or NULL if none */
char errmsg[eslERRBUFSIZE]; /* user-directed message for normal errors */
/* Alignment file format codes.
 * Must coexist with sqio unaligned file format codes.
 * Rules:
 *   - 0 is an unknown/unassigned format
 *   - <=100 reserved for unaligned formats
 *   - >100 reserved for aligned formats
 */
/* Integer codes identifying alignment file formats.
 * 0 means unknown; every aligned-format code below is greater than 100 and
 * the codes are assigned consecutively from 101.
 */
#define eslMSAFILE_UNKNOWN 0 /* unknown format */
#define eslMSAFILE_STOCKHOLM 101 /* Stockholm format, interleaved */
#define eslMSAFILE_PFAM 102 /* Pfam/Rfam one-line-per-seq Stockholm format */
#define eslMSAFILE_A2M 103 /* UCSC SAM's fasta-like a2m format */
#define eslMSAFILE_PSIBLAST 104 /* NCBI PSI-BLAST alignment format */
#define eslMSAFILE_SELEX 105 /* old SELEX format (largely obsolete) */
#define eslMSAFILE_AFA 106 /* aligned FASTA format */
#define eslMSAFILE_CLUSTAL 107 /* CLUSTAL format */
#define eslMSAFILE_CLUSTALLIKE 108 /* CLUSTAL-like formats (MUSCLE, PROBCONS) */
#define eslMSAFILE_PHYLIP 109 /* interleaved PHYLIP format */
#define eslMSAFILE_PHYLIPS 110 /* sequential PHYLIP format */
/* 1. Opening/closing an ESL_MSAFILE
 *
 * The three Open*() calls share the arguments <byp_abc>, <format>, <fmtd> and
 * <ret_afp>, and differ only in the input they are given: a <msafile> string
 * plus <env>, a memory region <p> of length <n>, or an ESL_BUFFER <bf>.
 *
 *   byp_abc - alphabet, passed as pointer-to-pointer
 *             (NOTE(review): in/out semantics live in the .c file - confirm)
 *   format  - alignment format code (eslMSAFILE_*)
 *   fmtd    - optional extra constraints for a known format; the new
 *             ESL_MSAFILE keeps its own copy in <afp->fmtd>
 *   ret_afp - presumably receives the newly opened ESL_MSAFILE - confirm
 *
 * OpenFailure() and Close() return nothing; the other calls return an int
 * status whose values are not defined in this header.
 */
extern int esl_msafile_Open (ESL_ALPHABET **byp_abc, const char *msafile, const char *env, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp);
extern int esl_msafile_OpenMem (ESL_ALPHABET **byp_abc, const char *p, esl_pos_t n, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp);
extern int esl_msafile_OpenBuffer(ESL_ALPHABET **byp_abc, ESL_BUFFER *bf, int format, ESL_MSAFILE_FMTDATA *fmtd, ESL_MSAFILE **ret_afp);
/* Takes the <afp> and <status> of a failed open; presumably reports the failure - confirm in the .c file. */
extern void esl_msafile_OpenFailure(ESL_MSAFILE *afp, int status);
/* Associates alphabet <abc> with <afp>; <afp->abc> is non-NULL when the file is in digital mode. */
extern int esl_msafile_SetDigital (ESL_MSAFILE *afp, const ESL_ALPHABET *abc);
extern void esl_msafile_Close(ESL_MSAFILE *afp);
/* 2. ESL_MSAFILE_FMTDATA: optional extra constraints on formats
 *
 * Both calls operate on caller-provided structures and return an int status.
 */
/* NOTE(review): presumably sets every field of <fmtd> to its "unset" value (0) - confirm in the .c file. */
extern int esl_msafile_fmtdata_Init(ESL_MSAFILE_FMTDATA *fmtd);
/* Argument order is source first, destination second. */
extern int esl_msafile_fmtdata_Copy(ESL_MSAFILE_FMTDATA *src, ESL_MSAFILE_FMTDATA *dst);
/* 3. Utilities for different file formats */
/* Guesses the format of the input in <bf>. The format code is returned through
 * <ret_fmtcode>, and format-specific autodetectors fill <fmtd> with any
 * additional constraints they find.
 * NOTE(review): <errbuf> is presumably a caller-allocated message buffer;
 * its required size is not stated in this header - confirm.
 */
extern int esl_msafile_GuessFileFormat(ESL_BUFFER *bf, int *ret_fmtcode, ESL_MSAFILE_FMTDATA *fmtd, char *errbuf);
/* Takes a format code <fmt>; NOTE(review): presumably a TRUE/FALSE answer on whether that format can hold more than one alignment - confirm. */
extern int esl_msafile_IsMultiRecord(int fmt);
/* EncodeFormat() maps a string to an int and DecodeFormat() maps an int to a string;
 * NOTE(review): presumably format names <-> eslMSAFILE_* codes - confirm the accepted names in the .c file.
 */
extern int esl_msafile_EncodeFormat(char *fmtstring);
extern char *esl_msafile_DecodeFormat(int fmt);
/* 4. Utilities for different alphabets */
/* Examines the open file <afp> and reports a result through <ret_type>.
 * NOTE(review): <ret_type> is presumably an alphabet type code from
 * esl_alphabet.h - confirm the possible values and status codes in the .c file.
 */
extern int esl_msafile_GuessAlphabet(ESL_MSAFILE *afp, int *ret_type);
/* 5. Random access in a MSA flatfile database */
/* Repositions <afp> according to <key>.
 * NOTE(review): presumably relies on the open SSI index in <afp->ssi>, which
 * may be NULL when there is no index - confirm how that case is reported.
 */
extern int esl_msafile_PositionByKey(ESL_MSAFILE *afp, const char *key);
/* 6. Reading an MSA from an ESL_MSAFILE */
/* Reads an alignment from <afp>; the ESL_MSA is handed back through <ret_msa>.
 * NOTE(review): ownership of the returned ESL_MSA and the end-of-file status
 * code are not stated in this header - confirm in the .c file.
 */
extern int esl_msafile_Read(ESL_MSAFILE *afp, ESL_MSA **ret_msa);
/* Takes the <afp> and <status> of a failed read; presumably reports the failure - confirm in the .c file. */
extern void esl_msafile_ReadFailure(ESL_MSAFILE *afp, int status);
/* 7. Writing an MSA to a stream */
/* Writes <msa> to the stdio stream <fp> in the format given by code <fmt>
 * (an eslMSAFILE_* value). Note that this entry point takes no
 * ESL_MSAFILE_FMTDATA argument, so per-format options such as residues per
 * line cannot be passed through it.
 */
extern int esl_msafile_Write(FILE *fp, ESL_MSA *msa, int fmt);
/* 8. Utilities for specific parsers */
/* Reads the next line from <afp->bf> into <afp->line>, which is not
 * NUL-terminated; its length in bytes is <afp->n>.
 * NOTE(review): <opt_p> and <opt_n> are presumably optional outputs for the
 * same line pointer and length (NULL allowed) - confirm in the .c file.
 */
extern int esl_msafile_GetLine(ESL_MSAFILE *afp, char **opt_p, esl_pos_t *opt_n);
/* NOTE(review): presumably the inverse of GetLine(), pushing the current line back onto the input - confirm. */
extern int esl_msafile_PutLine(ESL_MSAFILE *afp);
#include "esl_msafile_a2m.h"
#include "esl_msafile_afa.h"
#include "esl_msafile_clustal.h"
#include "esl_msafile_phylip.h"
#include "esl_msafile_psiblast.h"
#include "esl_msafile_selex.h"
#include "esl_msafile_stockholm.h"
#endif /*eslMSAFILE_INCLUDED*/
You can’t perform that action at this time.