Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
402 lines (346 sloc) 13.4 KB
/* Clustering sequences in an MSA by % identity.
*
* Table of contents:
* 1. Single linkage clustering an MSA by %id
* 2. Internal functions, interface to the clustering API
* 3. Some internal functions needed for regression tests
* 4. Unit tests
* 5. Test driver
* 6. Example
*
* (Wondering why isn't this just part of the cluster or MSA modules?
* esl_cluster itself is a core module, dependent only on easel. MSA
* clustering involves at least the distance, cluster, and msa
* modules. We're better off separating its functionality away into a
* more highly derived module.)
*/
#include "esl_config.h"
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_cluster.h"
#include "esl_distance.h"
#include "esl_msa.h"
#include "esl_msacluster.h"
/* These functions are going to get defined in an internal regression
* testing section further below:
*/
#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
#include <ctype.h>
static double squid_distance(char *s1, char *s2);
static double squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2);
#endif
/* These functions will define linkage between a pair of text or
* digital aseq's:
*/
static int msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link);
static int msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link);
/* In digital mode, we'll need to pass the clustering routine two parameters -
* %id threshold and alphabet ptr - so make a structure that bundles them.
*/
struct msa_param_s {
double maxid;
ESL_ALPHABET *abc;
};
/*****************************************************************
* 1. Single linkage clustering an MSA by %id
*****************************************************************/
/* Function: esl_msacluster_SingleLinkage()
* Synopsis: Single linkage clustering by percent identity.
* Incept: SRE, Sun Nov 5 10:11:45 2006 [Janelia]
*
* Purpose: Perform single link clustering of the sequences in
* multiple alignment <msa>. Any pair of sequences with
* percent identity $\geq$ <maxid> are linked (using
* the definition from the \eslmod{distance} module).
*
* The resulting clustering is optionally returned in one
* or more of <opt_c>, <opt_nin>, and <opt_nc>. The
* <opt_c[0..nseq-1]> array assigns a cluster index
* <(0..nc-1)> to each sequence. For example, <c[4] = 1>
* means that sequence 4 is assigned to cluster 1. The
* <opt_nin[0..nc-1]> array is the number of sequences
* in each cluster. <opt_nc> is the number of clusters.
*
* Importantly, this algorithm runs in $O(N)$ memory, and
* produces one discrete clustering. Compare to
* <esl_tree_SingleLinkage()>, which requires an $O(N^2)$
* adjacency matrix, and produces a hierarchical clustering
* tree.
*
* The algorithm is worst case $O(LN^2)$ time, for $N$
* sequences of length $L$. However, the worst case is no
* links at all, and this is unusual. More typically, time
* scales as about $LN \log N$. The best case scales as
* $LN$, when there is just one cluster in a completely
* connected graph.
*
* Args: msa - multiple alignment to cluster
* maxid - pairwise identity threshold: cluster if $\geq$ <maxid>
* opt_c - optRETURN: cluster assignments for each sequence, [0..nseq-1]
* opt_nin - optRETURN: number of seqs in each cluster, [0..nc-1]
* opt_nc - optRETURN: number of clusters
*
* Returns: <eslOK> on success; the <opt_c[0..nseq-1]> array contains
* cluster indices <0..nc-1> assigned to each sequence; the
* <opt_nin[0..nc-1]> array contains the number of seqs in
* each cluster; and <opt_nc> contains the number of
* clusters. The <opt_c> array and <opt_nin> arrays will be
* allocated here, if non-<NULL>, and must be free'd by the
* caller. The input <msa> is unmodified.
*
* The caller may pass <NULL> for either <opt_c> or
* <opt_nc> if it is only interested in one of the two
* results.
*
* Throws: <eslEMEM> on allocation failure, and <eslEINVAL> if a pairwise
* comparison is invalid (which means the MSA is corrupted, so it
* shouldn't happen). In either case, <opt_c> and <opt_nin> are set to <NULL>
* and <opt_nc> is set to 0, and the <msa> is unmodified.
*/
int
esl_msacluster_SingleLinkage(const ESL_MSA *msa, double maxid,
int **opt_c, int **opt_nin, int *opt_nc)
{
int status;
int *workspace = NULL;
int *assignment = NULL;
int *nin = NULL;
int nc;
int i;
struct msa_param_s param;
/* Allocations */
ESL_ALLOC(workspace, sizeof(int) * msa->nseq * 2);
ESL_ALLOC(assignment, sizeof(int) * msa->nseq);
/* call to SLC API: */
if (! (msa->flags & eslMSA_DIGITAL))
status = esl_cluster_SingleLinkage((void *) msa->aseq, (size_t) msa->nseq, sizeof(char *),
msacluster_clinkage, (void *) &maxid,
workspace, assignment, &nc);
else {
param.maxid = maxid;
param.abc = msa->abc;
status = esl_cluster_SingleLinkage((void *) msa->ax, (size_t) msa->nseq, sizeof(ESL_DSQ *),
msacluster_xlinkage, (void *) &param,
workspace, assignment, &nc);
}
if (status != eslOK) goto ERROR;
if (opt_nin != NULL)
{
ESL_ALLOC(nin, sizeof(int) * nc);
for (i = 0; i < nc; i++) nin[i] = 0;
for (i = 0; i < msa->nseq; i++)
nin[assignment[i]]++;
*opt_nin = nin;
}
/* cleanup and return */
free(workspace);
if (opt_c != NULL) *opt_c = assignment; else free(assignment);
if (opt_nc != NULL) *opt_nc = nc;
return eslOK;
ERROR:
if (workspace != NULL) free(workspace);
if (assignment != NULL) free(assignment);
if (nin != NULL) free(nin);
if (opt_c != NULL) *opt_c = NULL;
if (opt_nc != NULL) *opt_nc = 0;
return status;
}
/*****************************************************************
* 2. Internal functions, interface to the clustering API
*****************************************************************/
/* Definition of %id linkage in text-mode aligned seqs (>= maxid): */
static int
msacluster_clinkage(const void *v1, const void *v2, const void *p, int *ret_link)
{
char *as1 = *(char **) v1;
char *as2 = *(char **) v2;
double maxid = *(double *) p;
double pid;
int status = eslOK;
#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
pid = 1. - squid_distance(as1, as2);
#else
if ((status = esl_dst_CPairId(as1, as2, &pid, NULL, NULL)) != eslOK) return status;
#endif
*ret_link = (pid >= maxid ? TRUE : FALSE);
return status;
}
/* Definition of % id linkage in digital aligned seqs (>= maxid) */
static int
msacluster_xlinkage(const void *v1, const void *v2, const void *p, int *ret_link)
{
ESL_DSQ *ax1 = *(ESL_DSQ **) v1;
ESL_DSQ *ax2 = *(ESL_DSQ **) v2;
struct msa_param_s *param = (struct msa_param_s *) p;
double pid;
int status = eslOK;
#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
pid = 1. - squid_xdistance(param->abc, ax1, ax2);
#else
if ( (status = esl_dst_XPairId(param->abc, ax1, ax2, &pid, NULL, NULL)) != eslOK) return status;
#endif
*ret_link = (pid >= param->maxid ? TRUE : FALSE);
return status;
}
/*****************************************************************
* 3. Some internal functions needed for regression tests
*****************************************************************/
/* When regression testing against squid, we have to replace
* Easel's distance calculations with a simpler, (even) less robust
* calculation that squid did.
*/
#if defined(eslMSACLUSTER_REGRESSION) || defined(eslMSAWEIGHT_REGRESSION)
static double
squid_distance(char *s1, char *s2)
{
int diff = 0;
int valid = 0;
for (; *s1 != '\0'; s1++, s2++)
{
if (!isalpha(*s1) || !isalpha(*s2)) continue;
if (*s1 != *s2) diff++;
valid++;
}
return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
}
static double
squid_xdistance(ESL_ALPHABET *a, ESL_DSQ *x1, ESL_DSQ *x2)
{
int diff = 0;
int valid = 0;
for (; *x1 != eslDSQ_SENTINEL; x1++, x2++)
{
if (esl_abc_XIsGap(a, *x1) || esl_abc_XIsGap(a, *x2)) continue;
if (*x1 != *x2) diff++;
valid++;
}
return (valid > 0 ? ((double) diff / (double) valid) : 0.0);
}
#endif /* eslMSACLUSTER_REGRESSION || eslMSAWEIGHT_REGRESSION */
/*****************************************************************
* 4. Unit tests
*****************************************************************/
#ifdef eslMSACLUSTER_TESTDRIVE
#include "esl_getopts.h"
static void
utest_SingleLinkage(ESL_GETOPTS *go, const ESL_MSA *msa, double maxid, int expected_nc, int last_assignment)
{
char *msg = "utest_SingleLinkage() failed";
int *assignment = NULL;
int *nin = NULL;
int nc;
if (esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nc) != eslOK) esl_fatal(msg);
if (nc != expected_nc) esl_fatal(msg);
if (assignment[msa->nseq-1] != last_assignment) esl_fatal(msg);
free(assignment);
free(nin);
}
#endif /*eslMSACLUSTER_TESTDRIVE*/
/*****************************************************************
* 5. Test driver
*****************************************************************/
#ifdef eslMSACLUSTER_TESTDRIVE
/* gcc -g -Wall -o msacluster_utest -I. -L. -DeslMSACLUSTER_TESTDRIVE esl_msacluster.c -leasel -lm
*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_alphabet.h"
#include "esl_getopts.h"
#include "esl_msa.h"
#include "esl_msacluster.h"
#include "esl_msafile.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "test driver for msacluster module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
ESL_MSA *msa = esl_msa_CreateFromString("\
# STOCKHOLM 1.0\n\
\n\
seq0 AAAAAAAAAA\n\
seq1 AAAAAAAAAA\n\
seq2 AAAAAAAAAC\n\
seq3 AAAAAAAADD\n\
seq4 AAAAAAAEEE\n\
seq5 AAAAAAFFFF\n\
seq6 AAAAAGGGGG\n\
seq7 AAAAHHHHHH\n\
seq8 AAAIIIIIII\n\
seq9 AAKKKKKKKK\n\
seq10 ALLLLLLLLL\n\
seq11 MMMMMMMMMM\n\
//", eslMSAFILE_STOCKHOLM);
utest_SingleLinkage(go, msa, 1.0, 11, 10); /* at 100% id, only seq0/seq1 cluster */
utest_SingleLinkage(go, msa, 0.5, 6, 5); /* at 50% id, seq0-seq6 cluster */
utest_SingleLinkage(go, msa, 0.0, 1, 0); /* at 0% id, everything clusters */
/* Do the same tests, but now with a digital MSA */
esl_msa_Digitize(abc, msa, NULL);
utest_SingleLinkage(go, msa, 1.0, 11, 10); /* at 100% id, only seq0/seq1 cluster */
utest_SingleLinkage(go, msa, 0.5, 6, 5); /* at 50% id, seq0-seq6 cluster */
utest_SingleLinkage(go, msa, 0.0, 1, 0); /* at 0% id, everything clusters */
esl_msa_Destroy(msa);
esl_alphabet_Destroy(abc);
esl_getopts_Destroy(go);
return 0;
}
#endif /* eslMSACLUSTER_TESTDRIVE*/
/*****************************************************************
* 6. Example
*****************************************************************/
#ifdef eslMSACLUSTER_EXAMPLE
/*::cexcerpt::msacluster_example::begin::*/
/*
gcc -g -Wall -o msacluster_example -I. -L. -DeslMSACLUSTER_EXAMPLE esl_msacluster.c -leasel -lm
./msacluster_example <MSA file>
*/
#include <stdio.h>
#include "easel.h"
#include "esl_msa.h"
#include "esl_msacluster.h"
#include "esl_msafile.h"
int
main(int argc, char **argv)
{
char *filename = argv[1];
int fmt = eslMSAFILE_UNKNOWN;
ESL_ALPHABET *abc = NULL;
ESL_MSAFILE *afp = NULL;
ESL_MSA *msa = NULL;
double maxid = 0.62; /* cluster at 62% identity: the BLOSUM62 rule */
int *assignment = NULL;
int *nin = NULL;
int nclusters;
int c, i;
int status;
/* Open; guess alphabet; set to digital mode */
if ((status = esl_msafile_Open(&abc, filename, NULL, fmt, NULL, &afp)) != eslOK)
esl_msafile_OpenFailure(afp, status);
/* read one alignment */
if ((status = esl_msafile_Read(afp, &msa)) != eslOK)
esl_msafile_ReadFailure(afp, status);
/* do the clustering */
esl_msacluster_SingleLinkage(msa, maxid, &assignment, &nin, &nclusters);
printf("%d clusters at threshold of %f fractional identity\n", nclusters, maxid);
for (c = 0; c < nclusters; c++) {
printf("cluster %d:\n", c);
for (i = 0; i < msa->nseq; i++) if (assignment[i] == c) printf(" %s\n", msa->sqname[i]);
printf("(%d sequences)\n\n", nin[c]);
}
esl_msa_Destroy(msa);
esl_msafile_Close(afp);
free(assignment);
free(nin);
return 0;
}
/*::cexcerpt::msacluster_example::end::*/
#endif /*eslMSACLUSTER_EXAMPLE*/
/*------------------------ end of example -----------------------*/
You can’t perform that action at this time.