Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
503 lines (445 sloc) 19.6 KB
/* Shuffling, bootstrapping, permuting alignments, by column or row.
*
* Table of contents:
* 1. Randomizing MSAs by column.
* 2. Permuting sequence order (i.e. by row)
* 3. Shuffling pairwise (QRNA) alignments.
*/
#include "esl_config.h"
#include <string.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_msa.h"
#include "esl_random.h"
#include "esl_msashuffle.h"
/*****************************************************************
* 1. Randomizing MSAs by column.
*****************************************************************/
/* Function: esl_msashuffle_Shuffle()
* Synopsis: Shuffle an alignment's columns.
*
* Purpose: Returns a column-shuffled version of <msa> in <shuf>,
* using random generator <r>. Shuffling by columns
* preserves the \% identity of the original
* alignment. <msa> and <shuf> can be identical, to shuffle
* in place.
*
* The caller sets up the rest of the data (everything but
* the alignment itself) in <shuf> the way it wants,
* including sequence names, MSA name, and other
* annotation. The easy thing to do is to make <shuf>
* a copy of <msa>: the caller might create <shuf> by
* a call to <esl_msa_Clone()>.
*
* The alignments <msa> and <shuf> can both be in digital
* mode, or can both be in text mode; you cannot mix
* digital and text modes.
*
* Returns: <eslOK> on success.
*
* Throws: <eslEINVAL> if <msa>,<shuf> aren't in the same mode (digital vs. text).
*/
int
esl_msashuffle_Shuffle(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *shuf)
{
int i, pos, alen;
if (! (msa->flags & eslMSA_DIGITAL))
{
char c;
if (shuf->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in text mode if <msa> is");
if (msa != shuf) {
for (i = 0; i < msa->nseq; i++)
strcpy(shuf->aseq[i], msa->aseq[i]);
}
for (i = 0; i < msa->nseq; i++)
shuf->aseq[i][msa->alen] = '\0';
for (alen = msa->alen; alen > 1; alen--)
{
pos = esl_rnd_Roll(r, alen);
for (i = 0; i < msa->nseq; i++)
{
c = msa->aseq[i][pos];
shuf->aseq[i][pos] = shuf->aseq[i][alen-1];
shuf->aseq[i][alen-1] = c;
}
}
}
else
{
ESL_DSQ x;
if (! (shuf->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in digital mode if <msa> is");
if (msa != shuf) {
for (i = 0; i < msa->nseq; i++)
memcpy(shuf->ax[i], msa->ax[i], (msa->alen + 2) * sizeof(ESL_DSQ));
}
for (i = 0; i < msa->nseq; i++)
shuf->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
for (alen = msa->alen; alen > 1; alen--)
{
pos = esl_rnd_Roll(r, alen) + 1;
for (i = 0; i < msa->nseq; i++)
{
x = msa->ax[i][pos];
shuf->ax[i][pos] = shuf->ax[i][alen];
shuf->ax[i][alen] = x;
}
}
}
return eslOK;
}
/* Function: esl_msashuffle_Bootstrap()
* Synopsis: Bootstrap sample an MSA.
* Incept: SRE, Tue Jan 22 11:05:07 2008 [Janelia]
*
* Purpose: Takes a bootstrap sample of <msa> (sample <alen> columns,
* with replacement) and puts it in <bootsample>, using
* random generator <r>.
*
* The caller provides allocated space for <bootsample>.
* It must be different space than <msa>; you cannot take
* a bootstrap sample "in place". The caller sets up the
* rest of the data in <bootsample> (everything but the
* alignment itself) the way it wants, including sequence
* names, MSA name, and other annotation. The easy thing to
* do is to initialize <bootsample> by cloning <msa>.
*
* The alignments <msa> and <bootsample> can both be in digital
* mode, or can both be in text mode; you cannot mix
* digital and text modes.
*
* Returns: <eslOK> on success, and the alignment in <bootsample> is
* set to be a bootstrap resample of the alignment in <msa>.
*
* Throws: <eslEINVAL> if <msa>,<bootsample> aren't in the same mode
* (digital vs. text).
*/
int
esl_msashuffle_Bootstrap(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *bootsample)
{
int i, pos, col;
/* contract checks */
if ( (msa->flags & eslMSA_DIGITAL) && ! (bootsample->flags & eslMSA_DIGITAL))
ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
if (! (msa->flags & eslMSA_DIGITAL) && (bootsample->flags & eslMSA_DIGITAL))
ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");
if (! (msa->flags & eslMSA_DIGITAL))
{
for (pos = 0; pos < msa->alen; pos++)
{
col = esl_rnd_Roll(r, msa->alen);
for (i = 0; i < msa->nseq; i++)
bootsample->aseq[i][pos] = msa->aseq[i][col];
}
for (i = 0; i < msa->nseq; i++)
bootsample->aseq[i][msa->alen] = '\0';
}
else
{
for (i = 0; i < msa->nseq; i++)
bootsample->ax[i][0] = eslDSQ_SENTINEL;
for (pos = 1; pos <= msa->alen; pos++)
{
col = esl_rnd_Roll(r, msa->alen) + 1;
for (i = 0; i < msa->nseq; i++)
bootsample->ax[i][pos] = msa->ax[i][col];
}
for (i = 0; i < msa->nseq; i++)
bootsample->ax[i][msa->alen+1] = eslDSQ_SENTINEL;
}
return eslOK;
}
/*****************************************************************
* 2. Permuting the sequence order
*****************************************************************/
/* Function: esl_msashuffle_PermuteSequenceOrder()
* Synopsis: Permutes the order of the sequences.
*
* Purpose: Randomly permute the order of the sequences in <msa>,
* and any associated sequence annotation, in place.
*
* Returns: <eslOK> on success.
*
* Throws: (no abnormal error conditions)
*/
int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
{
void *tmp;
double tmpwgt;
int64_t tmplen;
int N, i, tag;
for (N = msa->nseq; N > 1; N--)
{
i = esl_rnd_Roll(r, N); /* idx = 0..N-1 */
if ( ! (msa->flags & eslMSA_DIGITAL)) { tmp = msa->aseq[i]; msa->aseq[i] = msa->aseq[N-1]; msa->aseq[N-1] = tmp; }
else { tmp = msa->ax[i]; msa->ax[i] = msa->ax[N-1]; msa->ax[N-1] = tmp; }
tmp = msa->sqname[i]; msa->sqname[i] = msa->sqname[N-1]; msa->sqname[N-1] = tmp;
tmpwgt = msa->wgt[i]; msa->wgt[i] = msa->wgt[N-1]; msa->wgt[N-1] = tmpwgt;
if (msa->sqacc) { tmp = msa->sqacc[i]; msa->sqacc[i] = msa->sqacc[N-1]; msa->sqacc[N-1] = tmp; }
if (msa->sqdesc) { tmp = msa->sqdesc[i]; msa->sqdesc[i] = msa->sqdesc[N-1]; msa->sqdesc[N-1] = tmp; }
if (msa->ss) { tmp = msa->ss[i]; msa->ss[i] = msa->ss[N-1]; msa->ss[N-1] = tmp; }
if (msa->sa) { tmp = msa->sa[i]; msa->sa[i] = msa->sa[N-1]; msa->sa[N-1] = tmp; }
if (msa->pp) { tmp = msa->pp[i]; msa->pp[i] = msa->pp[N-1]; msa->pp[N-1] = tmp; }
if (msa->sqlen) { tmplen = msa->sqlen[i]; msa->sqlen[i] = msa->sqlen[N-1]; msa->sqlen[N-1] = tmplen; }
if (msa->sslen) { tmplen = msa->sslen[i]; msa->sslen[i] = msa->sslen[N-1]; msa->sslen[N-1] = tmplen; }
if (msa->salen) { tmplen = msa->salen[i]; msa->salen[i] = msa->salen[N-1]; msa->salen[N-1] = tmplen; }
if (msa->pplen) { tmplen = msa->pplen[i]; msa->pplen[i] = msa->pplen[N-1]; msa->pplen[N-1] = tmplen; }
for (tag = 0; tag < msa->ngs; tag++) if (msa->gs[tag]) { tmp = msa->gs[tag][i]; msa->gs[tag][i] = msa->gs[tag][N-1]; msa->gs[tag][N-1] = tmp; }
for (tag = 0; tag < msa->ngr; tag++) if (msa->gr[tag]) { tmp = msa->gr[tag][i]; msa->gr[tag][i] = msa->gr[tag][N-1]; msa->gr[tag][N-1] = tmp; }
}
/* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
if (msa->index)
{
esl_keyhash_Reuse(msa->index);
for (i = 0; i < msa->nseq; i++)
esl_keyhash_Store(msa->index, msa->sqname[i], -1, NULL);
}
return eslOK;
}
/*****************************************************************
* 3. Shuffling pairwise (QRNA) alignments
*****************************************************************/
/* Function: esl_msashuffle_XQRNA()
* Synopsis: Gap-preserving column shuffle of a digital pairwise alignment.
* Incept: SRE, Tue Jan 22 09:09:52 2008 [Market Street Cafe, Leesburg]
*
* Purpose: Shuffle a digital pairwise alignment <x>,<y> while
* preserving the position of gaps, where both sequences are
* in digital alphabet <abc>, using the random number
* generator <r>. Return the shuffled alignment in <xs>,
* <ys>. Caller provides allocated space for <xs> and <ys>
* for at least the same length of <x>,<y>.
*
* Works by doing three separate
* shuffles, of (1) columns with residues in both
* <x> and <y>, (2) columns with residue in <x> and gap in <y>,
* and (3) columns with gap in <x> and residue in <y>.
*
* <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
* an alignment "in place", destroying the original
* alignment, just call <esl_msashuffle_XQRNA(r, abc, x,y,x,y)>.
*
* Returns: <eslOK> on success, and the shuffled alignment is
* returned in <xs>, <ys>.
*
* Throws: <eslEMEM> on allocation failure.
*/
int
esl_msashuffle_XQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, ESL_DSQ *x, ESL_DSQ *y, ESL_DSQ *xs, ESL_DSQ *ys)
{
int L;
int *xycol = NULL;
int *xcol = NULL;
int *ycol = NULL;
int nxy, nx, ny;
int i;
int pos, c;
char xsym, ysym;
int status;
L = esl_abc_dsqlen(x);
if (esl_abc_dsqlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
if (xs != x) esl_abc_dsqcpy(x, L, xs);
if (ys != y) esl_abc_dsqcpy(y, L, ys);
/* First, construct three arrays containing lists of the column positions
* of the three types of columns. (If a column contains gaps in both x and y,
* we've already simply copied it to the shuffled sequence.)
*/
ESL_ALLOC(xycol, sizeof(int) * L);
ESL_ALLOC(xcol, sizeof(int) * L);
ESL_ALLOC(ycol, sizeof(int) * L);
nxy = nx = ny = 0;
for (i = 1; i <= L; i++)
{
if ( esl_abc_XIsGap(abc, x[i]) && esl_abc_XIsGap(abc, y[i])) { continue; }
else if (! esl_abc_XIsGap(abc, x[i]) && ! esl_abc_XIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
else if ( esl_abc_XIsGap(abc, x[i])) { ycol[ny] = i; ny++; }
else if ( esl_abc_XIsGap(abc, y[i])) { xcol[nx] = i; nx++; }
}
/* Second, shuffle the sequences indirectly, via shuffling these arrays.
* Yow, careful with those indices, and with order of the statements...
*/
for (; nxy > 1; nxy--) {
pos = esl_rnd_Roll(r, nxy);
xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos];
xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1];
xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = c;
}
for (; nx > 1; nx--) {
pos = esl_rnd_Roll(r, nx);
xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos];
xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1];
xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c;
}
for (; ny > 1; ny--) {
pos = esl_rnd_Roll(r, ny);
xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos];
xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1];
xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c;
}
free(xycol); free(xcol); free(ycol);
return eslOK;
ERROR:
if (xycol != NULL) free(xycol);
if (xcol != NULL) free(xcol);
if (ycol != NULL) free(ycol);
return status;
}
/* Function: esl_msashuffle_CQRNA()
* Synopsis: Gap-preserving column shuffle of a pairwise alignment.
* Incept: SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
*
* Purpose: Shuffle a pairwise alignment <x>,<y> while preserving the
* position of gaps, using the random number generator <r>.
* Return the shuffled alignment in <xs>,
* <ys>. Caller provides allocated space for <xs> and <ys>.
*
* An alphabet <abc> must also be provided, solely for the
* definition of gap characters. Because Easel's default
* alphabets (DNA, RNA, and protein) all use the same
* definition of gap characters <-_.>, you can actually
* provide any alphabet here, and get the same results.
* (This may save having to determine the alphabet of input
* sequences.)
*
* Works by doing three separate
* shuffles, of (1) columns with residues in both
* <x> and <y>, (2) columns with residue in <x> and gap in <y>,
* and (3) columns with gap in <x> and residue in <y>.
*
* <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
* an alignment "in place", destroying the original
* alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
*
* Returns: <eslOK> on success, and the shuffled alignment is
* returned in <xs>, <ys>.
*
* Throws: <eslEMEM> on allocation failure.
*/
int
esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
{
int L;
int *xycol = NULL;
int *xcol = NULL;
int *ycol = NULL;
int nxy, nx, ny;
int i;
int pos, c;
char xsym, ysym;
int status;
if (xs != x) strcpy(xs, x);
if (ys != y) strcpy(ys, y);
/* First, construct three arrays containing lists of the column positions
* of the three types of columns. (If a column contains gaps in both x and y,
* we've already simply copied it to the shuffled sequence.)
*/
L = strlen(x);
if (strlen(y) != L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");
ESL_ALLOC(xycol, sizeof(int) * L);
ESL_ALLOC(xcol, sizeof(int) * L);
ESL_ALLOC(ycol, sizeof(int) * L);
nxy = nx = ny = 0;
for (i = 0; i < L; i++)
{
if ( esl_abc_CIsGap(abc, x[i]) && esl_abc_CIsGap(abc, y[i])) { continue; }
else if (! esl_abc_CIsGap(abc, x[i]) && ! esl_abc_CIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
else if ( esl_abc_CIsGap(abc, x[i])) { ycol[ny] = i; ny++; }
else if ( esl_abc_CIsGap(abc, y[i])) { xcol[nx] = i; nx++; }
}
/* Second, shuffle the sequences indirectly, via shuffling these arrays.
* Yow, careful with those indices, and with order of the statements...
*/
for (; nxy > 1; nxy--) {
pos = esl_rnd_Roll(r, nxy);
xsym = xs[xycol[pos]]; ysym = ys[xycol[pos]]; c = xycol[pos];
xs[xycol[pos]] = xs[xycol[nxy-1]]; ys[xycol[pos]] = ys[xycol[nxy-1]]; xycol[pos] = xycol[nxy-1];
xs[xycol[nxy-1]] = xsym; ys[xycol[nxy-1]] = ysym; xycol[pos] = c;
}
for (; nx > 1; nx--) {
pos = esl_rnd_Roll(r, nx);
xsym = xs[xcol[pos]]; ysym = ys[xcol[pos]]; c = xcol[pos];
xs[xcol[pos]] = xs[xcol[nx-1]]; ys[xcol[pos]] = ys[xcol[nx-1]]; xcol[pos] = xcol[nx-1];
xs[xcol[nx-1]] = xsym; ys[xcol[nx-1]] = ysym; xcol[nx-1] = c;
}
for (; ny > 1; ny--) {
pos = esl_rnd_Roll(r, ny);
xsym = xs[ycol[pos]]; ysym = ys[ycol[pos]]; c = ycol[pos];
xs[ycol[pos]] = xs[ycol[ny-1]]; ys[ycol[pos]] = ys[ycol[ny-1]]; ycol[pos] = ycol[ny-1];
xs[ycol[ny-1]] = xsym; ys[ycol[ny-1]] = ysym; ycol[ny-1] = c;
}
free(xycol); free(xcol); free(ycol);
return eslOK;
ERROR:
if (xycol != NULL) free(xycol);
if (xcol != NULL) free(xcol);
if (ycol != NULL) free(ycol);
return status;
}
/*****************************************************************
* 4. Example.
*****************************************************************/
#ifdef eslMSASHUFFLE_EXAMPLE
#include <stdio.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_random.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 },
{ "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 },
{ "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 },
{ "--text", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use text mode: no digital alphabet", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options] <msafile>";
static char banner[] = "example of multiple alignment shuffling/permuting";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
ESL_RANDOMNESS *rng = esl_randomness_Create(0);
char *msafile = esl_opt_GetArg(go, 1);
int fmt = eslMSAFILE_UNKNOWN;
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
int textmode = esl_opt_GetBoolean(go, "--text");
int nali = 0;
int status;
/* If you know the alphabet you want, create it - you'll pass it to esl_msafile_Open() */
if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
/* Open in text or digital mode.
* To let the Open() function autoguess the format, you pass <infmt=eslMSAFILE_UNKNOWN>.
* To let it autoguess the alphabet, you set <abc=NULL> and pass <&abc>.
* To open in text mode instead of digital, you pass <NULL> for the alphabet argument.
* esl_msafile_OpenFailure() is a convenience, printing various diagnostics of any
* open failure to <stderr>. You can of course handle your own diagnostics instead.
*/
if (textmode) status = esl_msafile_Open(NULL, msafile, NULL, fmt, NULL, &afp);
else status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp);
if (status != eslOK) esl_msafile_OpenFailure(afp, status);
fmt = afp->format;
while ((status = esl_msafile_Read(afp, &msa)) == eslOK)
{
/* if digital MSA: msa->ax[idx=0..nseq-1][acol=1..alen] is the alignment data;
* if text MSA: msa->aseq[idx=0..nseq-1][acol=0..alen-1] */
nali++;
/* permute it */
esl_msashuffle_PermuteSequenceOrder(rng, msa);
esl_msafile_Write(stdout, msa, fmt);
esl_msa_Destroy(msa);
}
if (nali == 0 || status != eslEOF) esl_msafile_ReadFailure(afp, status); /* a convenience, like esl_msafile_OpenFailure() */
esl_alphabet_Destroy(abc);
esl_msafile_Close(afp);
esl_randomness_Destroy(rng);
esl_getopts_Destroy(go);
exit(0);
}
#endif
You can’t perform that action at this time.