Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
1161 lines (1031 sloc) 39.7 KB
/* Foundation and miscellenea for the statistics modules.
*
* Contents:
* 1. Summary statistics (means, variances)
* 2. Special functions
* 3. Standard statistical tests
* 4. Data fitting.
* 5. Unit tests.
* 6. Test driver.
* 7. Examples.
* - driver for linear regression
* - driver for G-test
*/
#include "esl_config.h"
#include <math.h>
#include "easel.h"
#include "esl_stats.h"
/*****************************************************************
* 1. Summary statistics calculations (means, variances)
*****************************************************************/
/* Function: esl_stats_DMean()
* Synopsis: Calculates mean and $\sigma^2$ for samples $x_i$.
*
* Purpose: Calculates the sample mean and $s^2$, the unbiased
* estimator of the population variance, for a
* sample of <n> numbers <x[0]..x[n-1]>, and optionally
* returns either or both through <ret_mean> and
* <ret_var>.
*
* <esl_stats_FMean()> and <esl_stats_IMean()> do the same,
* for float and integer vectors.
*
* Args: x - samples x[0]..x[n-1]
* n - number of samples
* opt_mean - optRETURN: mean
* opt_var - optRETURN: estimate of population variance
*
* Returns: <eslOK> on success.
*/
int
esl_stats_DMean(const double *x, int n, double *opt_mean, double *opt_var)
{
double sum = 0.;
double sqsum = 0.;
int i;
for (i = 0; i < n; i++)
{
sum += x[i];
sqsum += x[i]*x[i];
}
if (opt_mean != NULL) *opt_mean = sum / (double) n;
if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
return eslOK;
}
int
esl_stats_FMean(const float *x, int n, double *opt_mean, double *opt_var)
{
double sum = 0.;
double sqsum = 0.;
int i;
for (i = 0; i < n; i++)
{
sum += x[i];
sqsum += x[i]*x[i];
}
if (opt_mean != NULL) *opt_mean = sum / (double) n;
if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
return eslOK;
}
int
esl_stats_IMean(const int *x, int n, double *opt_mean, double *opt_var)
{
double sum = 0.;
double sqsum = 0.;
int i;
for (i = 0; i < n; i++)
{
sum += x[i];
sqsum += x[i]*x[i];
}
if (opt_mean != NULL) *opt_mean = sum / (double) n;
if (opt_var != NULL) *opt_var = (sqsum - sum*sum/(double)n) / ((double)n-1);
return eslOK;
}
/*--------------- end, summary statistics -----------------------*/
/*****************************************************************
* 2. Special functions.
*****************************************************************/
/* Function: esl_stats_LogGamma()
* Synopsis: Calculates $\log \Gamma(x)$.
*
* Purpose: Returns natural log of $\Gamma(x)$, for $x > 0$.
*
* Credit: Adapted from a public domain implementation in the
* NCBI core math library. Thanks to John Spouge and
* the NCBI. (According to NCBI, that's Dr. John
* "Gammas Galore" Spouge to you, pal.)
*
* Args: x : argument, x > 0.0
* ret_answer : RETURN: the answer
*
* Returns: Put the answer in <ret_answer>; returns <eslOK>.
*
* Throws: <eslERANGE> if $x <= 0$.
*/
int
esl_stats_LogGamma(double x, double *ret_answer)
{
int i;
double xx, tx;
double tmp, value;
static double cof[11] = {
4.694580336184385e+04,
-1.560605207784446e+05,
2.065049568014106e+05,
-1.388934775095388e+05,
5.031796415085709e+04,
-9.601592329182778e+03,
8.785855930895250e+02,
-3.155153906098611e+01,
2.908143421162229e-01,
-2.319827630494973e-04,
1.251639670050933e-10
};
/* Protect against invalid x<=0 */
if (x <= 0.0) ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_LogGamma()");
xx = x - 1.0;
tx = tmp = xx + 11.0;
value = 1.0;
for (i = 10; i >= 0; i--) /* sum least significant terms first */
{
value += cof[i] / tmp;
tmp -= 1.0;
}
value = log(value);
tx += 0.5;
value += 0.918938533 + (xx+0.5)*log(tx) - tx;
*ret_answer = value;
return eslOK;
}
/* Function: esl_stats_Psi()
* Synopsis: Calculates $\Psi(x)$ (the digamma function).
*
* Purpose: Computes $\Psi(x)$ (the "digamma" function), which is
* the derivative of log of the Gamma function:
* $d/dx \log \Gamma(x) = \frac{\Gamma'(x)}{\Gamma(x)} = \Psi(x)$.
* Argument $x$ is $> 0$.
*
* This is J.M. Bernardo's "Algorithm AS103",
* Appl. Stat. 25:315-317 (1976).
*/
int
esl_stats_Psi(double x, double *ret_answer)
{
double answer = 0.;
double x2;
if (x <= 0.0) ESL_EXCEPTION(eslERANGE, "invalid x <= 0 in esl_stats_Psi()");
/* For small x, Psi(x) ~= -0.5772 - 1/x + O(x), we're done.
*/
if (x <= 1e-5) {
*ret_answer = -eslCONST_EULER - 1./x;
return eslOK;
}
/* For medium x, use Psi(1+x) = \Psi(x) + 1/x to c.o.v. x,
* big enough for Stirling approximation to work...
*/
while (x < 8.5) {
answer = answer - 1./x;
x += 1.;
}
/* For large X, use Stirling approximation
*/
x2 = 1./x;
answer += log(x) - 0.5 * x2;
x2 = x2*x2;
answer -= (1./12.)*x2;
answer += (1./120.)*x2*x2;
answer -= (1./252.)*x2*x2*x2;
*ret_answer = answer;
return eslOK;
}
/* Function: esl_stats_IncompleteGamma()
* Synopsis: Calculates the incomplete Gamma function.
*
* Purpose: Returns $P(a,x)$ and $Q(a,x)$ where:
*
* \begin{eqnarray*}
* P(a,x) & = & \frac{1}{\Gamma(a)} \int_{0}^{x} t^{a-1} e^{-t} dt \\
* & = & \frac{\gamma(a,x)}{\Gamma(a)} \\
* Q(a,x) & = & \frac{1}{\Gamma(a)} \int_{x}^{\infty} t^{a-1} e^{-t} dt\\
* & = & 1 - P(a,x) \\
* \end{eqnarray*}
*
* $P(a,x)$ is the CDF of a gamma density with $\lambda = 1$,
* and $Q(a,x)$ is the survival function.
*
* For $x \simeq 0$, $P(a,x) \simeq 0$ and $Q(a,x) \simeq 1$; and
* $P(a,x)$ is less prone to roundoff error.
*
* The opposite is the case for large $x >> a$, where
* $P(a,x) \simeq 1$ and $Q(a,x) \simeq 0$; there, $Q(a,x)$ is
* less prone to roundoff error.
*
* Method: Based on ideas from Numerical Recipes in C, Press et al.,
* Cambridge University Press, 1988.
*
* Args: a - for instance, degrees of freedom / 2 [a > 0]
* x - for instance, chi-squared statistic / 2 [x >= 0]
* ret_pax - RETURN: P(a,x)
* ret_qax - RETURN: Q(a,x)
*
* Return: <eslOK> on success.
*
* Throws: <eslERANGE> if <a> or <x> is out of accepted range.
* <eslENOHALT> if approximation fails to converge.
*/
int
esl_stats_IncompleteGamma(double a, double x, double *ret_pax, double *ret_qax)
{
int iter; /* iteration counter */
double pax; /* P(a,x) */
double qax; /* Q(a,x) */
int status;
if (a <= 0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): a must be > 0");
if (x < 0.) ESL_EXCEPTION(eslERANGE, "esl_stats_IncompleteGamma(): x must be >= 0");
/* For x > a + 1 the following gives rapid convergence;
* calculate Q(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)},
* using a continued fraction development for \Gamma(a,x).
*/
if (x > a+1)
{
double oldp; /* previous value of p */
double nu0, nu1; /* numerators for continued fraction calc */
double de0, de1; /* denominators for continued fraction calc */
nu0 = 0.; /* A_0 = 0 */
de0 = 1.; /* B_0 = 1 */
nu1 = 1.; /* A_1 = 1 */
de1 = x; /* B_1 = x */
oldp = nu1;
for (iter = 1; iter < 100; iter++)
{
/* Continued fraction development:
* set A_j = b_j A_j-1 + a_j A_j-2
* B_j = b_j B_j-1 + a_j B_j-2
* We start with A_2, B_2.
*/
/* j = even: a_j = iter-a, b_j = 1 */
/* A,B_j-2 are in nu0, de0; A,B_j-1 are in nu1,de1 */
nu0 = nu1 + ((double)iter - a) * nu0;
de0 = de1 + ((double)iter - a) * de0;
/* j = odd: a_j = iter, b_j = x */
/* A,B_j-2 are in nu1, de1; A,B_j-1 in nu0,de0 */
nu1 = x * nu0 + (double) iter * nu1;
de1 = x * de0 + (double) iter * de1;
/* rescale */
if (de1 != 0.)
{
nu0 /= de1;
de0 /= de1;
nu1 /= de1;
de1 = 1.;
}
/* check for convergence */
if (fabs((nu1-oldp)/nu1) < 1.e-7)
{
if ((status = esl_stats_LogGamma(a, &qax)) != eslOK) return status;
qax = nu1 * exp(a * log(x) - x - qax);
if (ret_pax != NULL) *ret_pax = 1 - qax;
if (ret_qax != NULL) *ret_qax = qax;
return eslOK;
}
oldp = nu1;
}
ESL_EXCEPTION(eslENOHALT,
"esl_stats_IncompleteGamma(): fraction failed to converge");
}
else /* x <= a+1 */
{
double p; /* current sum */
double val; /* current value used in sum */
/* For x <= a+1 we use a convergent series instead:
* P(a,x) = \frac{\gamma(a,x)}{\Gamma(a)},
* where
* \gamma(a,x) = e^{-x}x^a \sum_{n=0}{\infty} \frac{\Gamma{a}}{\Gamma{a+1+n}} x^n
* which looks appalling but the sum is in fact rearrangeable to
* a simple series without the \Gamma functions:
* = \frac{1}{a} + \frac{x}{a(a+1)} + \frac{x^2}{a(a+1)(a+2)} ...
* and it's obvious that this should converge nicely for x <= a+1.
*/
p = val = 1. / a;
for (iter = 1; iter < 10000; iter++)
{
val *= x / (a+(double)iter);
p += val;
if (fabs(val/p) < 1.e-7)
{
if ((status = esl_stats_LogGamma(a, &pax)) != eslOK) return status;
pax = p * exp(a * log(x) - x - pax);
if (ret_pax != NULL) *ret_pax = pax;
if (ret_qax != NULL) *ret_qax = 1. - pax;
return eslOK;
}
}
ESL_EXCEPTION(eslENOHALT,
"esl_stats_IncompleteGamma(): series failed to converge");
}
/*NOTREACHED*/
return eslOK;
}
/* Function: esl_stats_erfc()
* Synopsis: Complementary error function.
*
* Purpose: Calculate and return the complementary error function,
* erfc(x).
*
* erfc(x) is mandated by the ANSI C99 standard but that
* doesn't mean it's available on supposedly modern systems
* (looking at you here, Microsoft).
*
* Used for cumulative distribution function calculations
* for the normal (Gaussian) distribution. See <esl_normal>
* module.
*
* erfc(-inf) = 2.0
* erfc(0) = 1.0
* erfc(inf) = 0.0
* erfc(NaN) = NaN
*
* Args: x : any real-numbered value -inf..inf
*
* Returns: erfc(x)
*
* Throws: (no abnormal error conditions)
*
* Source:
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this software is
* freely granted, provided that this notice is preserved.
* [as posted by eggcrook at stackexchange.com, 21 Dec 2012]
*
* Removed arcane incantations for runtime detection of endianness,
* and for treating IEEE754 doubles as two adjacent uint32_t;
* replaced with ANSI-compliant macros and compile-time detection
* of endianness. [Apr 2015]
*/
double
esl_stats_erfc(double x)
{
static const double tiny = 1e-300;
static const double half = 5.00000000000000000000e-01; /* 0x3FE00000, 0x00000000 */
static const double one = 1.00000000000000000000e+00; /* 0x3FF00000, 0x00000000 */
static const double two = 2.00000000000000000000e+00; /* 0x40000000, 0x00000000 */
static const double erx = 8.45062911510467529297e-01; /* 0x3FEB0AC1, 0x60000000 */
/*
* Coefficients for approximation to erf on [0,0.84375]
*/
static const double pp0 = 1.28379167095512558561e-01; /* 0x3FC06EBA, 0x8214DB68 */
static const double pp1 = -3.25042107247001499370e-01; /* 0xBFD4CD7D, 0x691CB913 */
static const double pp2 = -2.84817495755985104766e-02; /* 0xBF9D2A51, 0xDBD7194F */
static const double pp3 = -5.77027029648944159157e-03; /* 0xBF77A291, 0x236668E4 */
static const double pp4 = -2.37630166566501626084e-05; /* 0xBEF8EAD6, 0x120016AC */
static const double qq1 = 3.97917223959155352819e-01; /* 0x3FD97779, 0xCDDADC09 */
static const double qq2 = 6.50222499887672944485e-02; /* 0x3FB0A54C, 0x5536CEBA */
static const double qq3 = 5.08130628187576562776e-03; /* 0x3F74D022, 0xC4D36B0F */
static const double qq4 = 1.32494738004321644526e-04; /* 0x3F215DC9, 0x221C1A10 */
static const double qq5 = -3.96022827877536812320e-06; /* 0xBED09C43, 0x42A26120 */
/*
* Coefficients for approximation to erf in [0.84375,1.25]
*/
static const double pa0 = -2.36211856075265944077e-03; /* 0xBF6359B8, 0xBEF77538 */
static const double pa1 = 4.14856118683748331666e-01; /* 0x3FDA8D00, 0xAD92B34D */
static const double pa2 = -3.72207876035701323847e-01; /* 0xBFD7D240, 0xFBB8C3F1 */
static const double pa3 = 3.18346619901161753674e-01; /* 0x3FD45FCA, 0x805120E4 */
static const double pa4 = -1.10894694282396677476e-01; /* 0xBFBC6398, 0x3D3E28EC */
static const double pa5 = 3.54783043256182359371e-02; /* 0x3FA22A36, 0x599795EB */
static const double pa6 = -2.16637559486879084300e-03; /* 0xBF61BF38, 0x0A96073F */
static const double qa1 = 1.06420880400844228286e-01; /* 0x3FBB3E66, 0x18EEE323 */
static const double qa2 = 5.40397917702171048937e-01; /* 0x3FE14AF0, 0x92EB6F33 */
static const double qa3 = 7.18286544141962662868e-02; /* 0x3FB2635C, 0xD99FE9A7 */
static const double qa4 = 1.26171219808761642112e-01; /* 0x3FC02660, 0xE763351F */
static const double qa5 = 1.36370839120290507362e-02; /* 0x3F8BEDC2, 0x6B51DD1C */
static const double qa6 = 1.19844998467991074170e-02; /* 0x3F888B54, 0x5735151D */
/*
* Coefficients for approximation to erfc in [1.25,1/0.35]
*/
static const double ra0 = -9.86494403484714822705e-03; /* 0xBF843412, 0x600D6435 */
static const double ra1 = -6.93858572707181764372e-01; /* 0xBFE63416, 0xE4BA7360 */
static const double ra2 = -1.05586262253232909814e+01; /* 0xC0251E04, 0x41B0E726 */
static const double ra3 = -6.23753324503260060396e+01; /* 0xC04F300A, 0xE4CBA38D */
static const double ra4 = -1.62396669462573470355e+02; /* 0xC0644CB1, 0x84282266 */
static const double ra5 = -1.84605092906711035994e+02; /* 0xC067135C, 0xEBCCABB2 */
static const double ra6 = -8.12874355063065934246e+01; /* 0xC0545265, 0x57E4D2F2 */
static const double ra7 = -9.81432934416914548592e+00; /* 0xC023A0EF, 0xC69AC25C */
static const double sa1 = 1.96512716674392571292e+01; /* 0x4033A6B9, 0xBD707687 */
static const double sa2 = 1.37657754143519042600e+02; /* 0x4061350C, 0x526AE721 */
static const double sa3 = 4.34565877475229228821e+02; /* 0x407B290D, 0xD58A1A71 */
static const double sa4 = 6.45387271733267880336e+02; /* 0x40842B19, 0x21EC2868 */
static const double sa5 = 4.29008140027567833386e+02; /* 0x407AD021, 0x57700314 */
static const double sa6 = 1.08635005541779435134e+02; /* 0x405B28A3, 0xEE48AE2C */
static const double sa7 = 6.57024977031928170135e+00; /* 0x401A47EF, 0x8E484A93 */
static const double sa8 = -6.04244152148580987438e-02; /* 0xBFAEEFF2, 0xEE749A62 */
/*
* Coefficients for approximation to erfc in [1/.35,28]
*/
static const double rb0 = -9.86494292470009928597e-03; /* 0xBF843412, 0x39E86F4A */
static const double rb1 = -7.99283237680523006574e-01; /* 0xBFE993BA, 0x70C285DE */
static const double rb2 = -1.77579549177547519889e+01; /* 0xC031C209, 0x555F995A */
static const double rb3 = -1.60636384855821916062e+02; /* 0xC064145D, 0x43C5ED98 */
static const double rb4 = -6.37566443368389627722e+02; /* 0xC083EC88, 0x1375F228 */
static const double rb5 = -1.02509513161107724954e+03; /* 0xC0900461, 0x6A2E5992 */
static const double rb6 = -4.83519191608651397019e+02; /* 0xC07E384E, 0x9BDC383F */
static const double sb1 = 3.03380607434824582924e+01; /* 0x403E568B, 0x261D5190 */
static const double sb2 = 3.25792512996573918826e+02; /* 0x40745CAE, 0x221B9F0A */
static const double sb3 = 1.53672958608443695994e+03; /* 0x409802EB, 0x189D5118 */
static const double sb4 = 3.19985821950859553908e+03; /* 0x40A8FFB7, 0x688C246A */
static const double sb5 = 2.55305040643316442583e+03; /* 0x40A3F219, 0xCEDF3BE6 */
static const double sb6 = 4.74528541206955367215e+02; /* 0x407DA874, 0xE79FE763 */
static const double sb7 = -2.24409524465858183362e+01; /* 0xC03670E2, 0x42712D62 */
int hx,ix;
double R,S,P,Q,s,y,z,r;
ESL_GET_HIGHWORD(hx, x); // SRE: replaced original Sun incantation here.
ix = hx & 0x7fffffff;
if (ix>=0x7ff00000) /* erfc(nan)=nan; erfc(+-inf)=0,2 */
return (double)(((unsigned)hx>>31)<<1)+one/x;
if (ix < 0x3feb0000) /* |x|<0.84375 */
{
if (ix < 0x3c700000) return one-x; /* |x|<2**-56 */
z = x*x;
r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
y = r/s;
if (hx < 0x3fd00000) /* x<1/4 */
{
return one-(x+x*y);
}
else
{
r = x*y;
r += (x-half);
return half - r ;
}
}
if (ix < 0x3ff40000) /* 0.84375 <= |x| < 1.25 */
{
s = fabs(x)-one;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
if (hx>=0)
{
z = one-erx;
return z - P/Q;
}
else
{
z = erx+P/Q;
return one+z;
}
}
if (ix < 0x403c0000) /* |x|<28 */
{
x = fabs(x);
s = one/(x*x);
if (ix< 0x4006DB6D) /* |x| < 1/.35 ~ 2.857143*/
{
R = ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(ra5+s*(ra6+s*ra7))))));
S = one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(sa5+s*(sa6+s*(sa7+s*sa8)))))));
}
else /* |x| >= 1/.35 ~ 2.857143 */
{
if (hx < 0 && ix >= 0x40180000) return two-tiny; /* x < -6 */
R = rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(rb5+s*rb6)))));
S = one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(sb5+s*(sb6+s*sb7))))));
}
z = x;
ESL_SET_LOWWORD(z, 0); // SRE: replaced original Sun incantation here.
r = exp(-z*z-0.5625) * exp((z-x)*(z+x)+R/S);
if (hx>0) return r/x;
else return two-r/x;
}
else
{
if (hx>0) return tiny*tiny;
else return two-tiny;
}
}
/*----------------- end, special functions ----------------------*/
/*****************************************************************
* 3. Standard statistical tests.
*****************************************************************/
/* Function: esl_stats_GTest()
* Synopsis: Calculates a G-test on 2 vs. 1 binomials.
*
* Purpose: In experiment a, we've drawn <ca> successes in <na> total
* trials; in experiment b, we've drawn <cb> successes in
* <nb> total trials. Are the counts different enough to
* conclude that the two experiments are different? The
* null hypothesis is that the successes in both experiments
* were drawn from the same binomial distribution with
* per-trial probability $p$. The tested hypothesis is that
* experiments a,b have different binomial probabilities
* $p_a,p_b$. The G-test is a log-likelihood-ratio statistic,
* assuming maximum likelihood values for $p,p_a,p_b$.
* $2G$ is distributed approximately as $X^2(1)$,
* %"X" is "Chi"
* which we use to calculate a P-value for the G statistic.
*
* Args: ca - number of positives in experiment a
* na - total number in experiment a
* cb - number of positives in experiment b
* nb - total number in experiment b
* ret_G - RETURN: G statistic, a log likelihood ratio, in nats
* ret_P - RETURN: P-value for the G-statistic
*
* Returns: <eslOK> on success.
*
* Throws: (no abnormal error conditions)
*
* Xref: Archive1999/0906-sagescore/sagescore.c
*/
int
esl_stats_GTest(int ca, int na, int cb, int nb, double *ret_G, double *ret_P)
{
double a,b,c,d,n;
double G = 0.;
a = (double) ca;
b = (double) (na - ca);
c = (double) cb;
d = (double) (nb - cb);
n = (double) na+nb;
/* Yes, the calculation here is correct; algebraic
* rearrangement of the log-likelihood-ratio with
* p_a = ca/na, p_b = cb/nb, and p = (ca+cb)/(na+nb).
* Guard against 0 probabilities; assume 0 log 0 => 0.
*/
if (a > 0.) G = a * log(a);
if (b > 0.) G += b * log(b);
if (c > 0.) G += c * log(c);
if (d > 0.) G += d * log(d);
if (n > 0.) G += n * log(n);
if (a+b > 0.) G -= (a+b) * log(a+b);
if (c+d > 0.) G -= (c+d) * log(c+d);
if (a+c > 0.) G -= (a+c) * log(a+c);
if (b+d > 0.) G -= (b+d) * log(b+d);
*ret_G = G;
return esl_stats_IncompleteGamma( 0.5, G, NULL, ret_P);
}
/* Function: esl_stats_ChiSquaredTest()
* Synopsis: Calculates a $\chi^2$ P-value.
* Incept: SRE, Tue Jul 19 11:39:32 2005 [St. Louis]
*
* Purpose: Calculate the probability that a chi-squared statistic
* with <v> degrees of freedom would exceed the observed
* chi-squared value <x>; return it in <ret_answer>. If
* this probability is less than some small threshold (say,
* 0.05 or 0.01), then we may reject the hypothesis we're
* testing.
*
* Args: v - degrees of freedom
* x - observed chi-squared value
* ret_answer - RETURN: P(\chi^2 > x)
*
* Returns: <eslOK> on success.
*
* Throws: <eslERANGE> if <v> or <x> are out of valid range.
* <eslENOHALT> if iterative calculation fails.
*/
int
esl_stats_ChiSquaredTest(int v, double x, double *ret_answer)
{
return esl_stats_IncompleteGamma((double)v/2., x/2., NULL, ret_answer);
}
/*----------------- end, statistical tests ---------------------*/
/*****************************************************************
* 4. Data fitting.
*****************************************************************/
/* Function: esl_stats_LinearRegression()
* Synopsis: Fit data to a straight line.
* Incept: SRE, Sat May 26 11:33:46 2007 [Janelia]
*
* Purpose: Fit <n> points <x[i]>, <y[i]> to a straight line
* $y = a + bx$ by linear regression.
*
* The $x_i$ are taken to be known, and the $y_i$ are taken
* to be observed quantities associated with a sampling
* error $\sigma_i$. If known, the standard deviations
* $\sigma_i$ for $y_i$ are provided in the <sigma> array.
* If they are unknown, pass <sigma = NULL>, and the
* routine will proceed with the assumption that $\sigma_i
* = 1$ for all $i$.
*
* The maximum likelihood estimates for $a$ and $b$ are
* optionally returned in <opt_a> and <opt_b>.
*
* The estimated standard deviations of $a$ and $b$ and
* their estimated covariance are optionally returned in
* <opt_sigma_a>, <opt_sigma_b>, and <opt_cov_ab>.
*
* The Pearson correlation coefficient is optionally
* returned in <opt_cc>.
*
* The $\chi^2$ P-value for the regression fit is
* optionally returned in <opt_Q>. This P-value may only be
* obtained when the $\sigma_i$ are known. If <sigma> is
* passed as <NULL> and <opt_Q> is requested, <*opt_Q> is
* set to 1.0.
*
* This routine follows the description and algorithm in
* \citep[pp.661-666]{Press93}.
*
* <n> must be greater than 2; at least two x[i] must
* differ; and if <sigma> is provided, all <sigma[i]> must
* be $>0$. If any of these conditions isn't met, the
* routine throws <eslEINVAL>.
*
* Args: x - x[0..n-1]
* y - y[0..n-1]
* sigma - sample error in observed y_i
* n - number of data points
* opt_a - optRETURN: intercept estimate
* opt_b - optRETURN: slope estimate
* opt_sigma_a - optRETURN: error in estimate of a
* opt_sigma_b - optRETURN: error in estimate of b
* opt_cov_ab - optRETURN: covariance of a,b estimates
* opt_cc - optRETURN: Pearson correlation coefficient for x,y
* opt_Q - optRETURN: X^2 P-value for linear fit
*
* Returns: <eslOK> on success.
*
* Throws: <eslEMEM> on allocation error;
* <eslEINVAL> if a contract condition isn't met;
* <eslENORESULT> if the chi-squared test fails.
* In these cases, all optional return values are set to 0.
*/
int
esl_stats_LinearRegression(const double *x, const double *y, const double *sigma, int n,
double *opt_a, double *opt_b,
double *opt_sigma_a, double *opt_sigma_b, double *opt_cov_ab,
double *opt_cc, double *opt_Q)
{
int status;
double *t = NULL;
double S, Sx, Sy, Stt;
double Sxy, Sxx, Syy;
double a, b, sigma_a, sigma_b, cov_ab, cc, X2, Q;
double xdev, ydev;
double tmp;
int i;
/* Contract checks. */
if (n <= 2) ESL_XEXCEPTION(eslEINVAL, "n must be > 2 for linear regression fitting");
if (sigma != NULL)
for (i = 0; i < n; i++) if (sigma[i] <= 0.) ESL_XEXCEPTION(eslEINVAL, "sigma[%d] <= 0", i);
status = eslEINVAL;
for (i = 0; i < n; i++) if (x[i] != 0.) { status = eslOK; break; }
if (status != eslOK) ESL_XEXCEPTION(eslEINVAL, "all x[i] are 0.");
/* Allocations */
ESL_ALLOC(t, sizeof(double) * n);
/* S = \sum_{i=1}{n} \frac{1}{\sigma_i^2}. (S > 0.) */
if (sigma != NULL) { for (S = 0., i = 0; i < n; i++) S += 1./ (sigma[i] * sigma[i]); }
else S = (double) n;
/* S_x = \sum_{i=1}{n} \frac{x[i]}{ \sigma_i^2} (Sx real.) */
for (Sx = 0., i = 0; i < n; i++) {
if (sigma == NULL) Sx += x[i];
else Sx += x[i] / (sigma[i] * sigma[i]);
}
/* S_y = \sum_{i=1}{n} \frac{y[i]}{\sigma_i^2} (Sy real.) */
for (Sy = 0., i = 0; i < n; i++) {
if (sigma == NULL) Sy += y[i];
else Sy += y[i] / (sigma[i] * sigma[i]);
}
/* t_i = \frac{1}{\sigma_i} \left( x_i - \frac{S_x}{S} \right) (t_i real) */
for (i = 0; i < n; i++) {
t[i] = x[i] - Sx/S;
if (sigma != NULL) t[i] /= sigma[i];
}
/* S_{tt} = \sum_{i=1}^n t_i^2 (if at least one x is != 0, Stt > 0) */
for (Stt = 0., i = 0; i < n; i++) { Stt += t[i] * t[i]; }
/* b = \frac{1}{S_{tt}} \sum_{i=1}^{N} \frac{t_i y_i}{\sigma_i} */
for (b = 0., i = 0; i < n; i++) {
if (sigma != NULL) { b += t[i]*y[i] / sigma[i]; }
else { b += t[i]*y[i]; }
}
b /= Stt;
/* a = \frac{ S_y - S_x b } {S} */
a = (Sy - Sx * b) / S;
/* \sigma_a^2 = \frac{1}{S} \left( 1 + \frac{ S_x^2 }{S S_{tt}} \right) */
sigma_a = sqrt ((1. + (Sx*Sx) / (S*Stt)) / S);
/* \sigma_b = \frac{1}{S_{tt}} */
sigma_b = sqrt (1. / Stt);
/* Cov(a,b) = - \frac{S_x}{S S_{tt}} */
cov_ab = -Sx / (S * Stt);
/* Pearson correlation coefficient */
Sxy = Sxx = Syy = 0.;
for (i = 0; i < n; i++) {
if (sigma != NULL) {
xdev = (x[i] / (sigma[i] * sigma[i])) - (Sx / n);
ydev = (y[i] / (sigma[i] * sigma[i])) - (Sy / n);
} else {
xdev = x[i] - (Sx / n);
ydev = y[i] - (Sy / n);
}
Sxy += xdev * ydev;
Sxx += xdev * xdev;
Syy += ydev * ydev;
}
cc = Sxy / (sqrt(Sxx) * sqrt(Syy));
/* \chi^2 */
for (X2 = 0., i = 0; i < n; i++) {
tmp = y[i] - a - b*x[i];
if (sigma != NULL) tmp /= sigma[i];
X2 += tmp*tmp;
}
/* We can calculate a goodness of fit if we know the \sigma_i */
if (sigma != NULL) {
if (esl_stats_ChiSquaredTest(n-2, X2, &Q) != eslOK) { status = eslENORESULT; goto ERROR; }
} else Q = 1.0;
/* If we didn't use \sigma_i, adjust the sigmas for a,b */
if (sigma == NULL) {
tmp = sqrt(X2 / (double)(n-2));
sigma_a *= tmp;
sigma_b *= tmp;
}
/* Done. Set up for normal return.
*/
free(t);
if (opt_a != NULL) *opt_a = a;
if (opt_b != NULL) *opt_b = b;
if (opt_sigma_a != NULL) *opt_sigma_a = sigma_a;
if (opt_sigma_b != NULL) *opt_sigma_b = sigma_b;
if (opt_cov_ab != NULL) *opt_cov_ab = cov_ab;
if (opt_cc != NULL) *opt_cc = cc;
if (opt_Q != NULL) *opt_Q = Q;
return eslOK;
ERROR:
if (t != NULL) free(t);
if (opt_a != NULL) *opt_a = 0.;
if (opt_b != NULL) *opt_b = 0.;
if (opt_sigma_a != NULL) *opt_sigma_a = 0.;
if (opt_sigma_b != NULL) *opt_sigma_b = 0.;
if (opt_cov_ab != NULL) *opt_cov_ab = 0.;
if (opt_cc != NULL) *opt_cc = 0.;
if (opt_Q != NULL) *opt_Q = 0.;
return status;
}
/*------------------- end, data fitting -------------------------*/
/*****************************************************************
* 5. Unit tests.
*****************************************************************/
#ifdef eslSTATS_TESTDRIVE
#include "esl_random.h"
#include "esl_stopwatch.h"
#ifdef HAVE_LIBGSL
#include <gsl/gsl_sf_gamma.h>
#endif
/* Macros for treating IEEE754 double as two uint32_t halves, with
* compile-time handling of endianness; see esl_stats.h.
*/
static void
utest_doublesplitting(ESL_RANDOMNESS *rng)
{
char msg[] = "esl_stats:: doublesplitting unit test failed";
uint32_t ix0, ix1;
double x;
double x2;
int iteration; // iteration 0 uses x = 2; iteration 1 uses random x = [0,1).
for (iteration = 0; iteration < 2; iteration++)
{
x = (iteration == 0 ? 2.0 : esl_random(rng));
ESL_GET_WORDS(ix0, ix1, x);
ESL_SET_WORDS(x2, ix0, ix1);
if (x2 != x) esl_fatal(msg);
ESL_GET_HIGHWORD(ix0, x);
ESL_SET_HIGHWORD(x2, ix0);
if (x2 != x) esl_fatal(msg);
ESL_GET_LOWWORD(ix0, x);
ESL_SET_LOWWORD(x2, ix0);
if (iteration == 0 && ix0 != 0) esl_fatal(msg);
if (x2 != x) esl_fatal(msg);
}
}
/* The LogGamma() function is rate-limiting in hmmbuild, because it is
* used so heavily in mixture Dirichlet calculations.
* ./configure --with-gsl; [compile test driver]
* ./stats_utest -v
* runs a comparison of time/precision against GSL.
* SRE, Sat May 23 10:04:41 2009, on home Mac:
* LogGamma = 1.29u / N=1e8 = 13 nsec/call
* gsl_sf_lngamma = 1.43u / N=1e8 = 14 nsec/call
*/
static void
utest_LogGamma(ESL_RANDOMNESS *r, int N, int be_verbose)
{
char *msg = "esl_stats_LogGamma() unit test failed";
ESL_STOPWATCH *w = esl_stopwatch_Create();
double *x = malloc(sizeof(double) * N);
double *lg = malloc(sizeof(double) * N);
double *lg2 = malloc(sizeof(double) * N);
int i;
for (i = 0; i < N; i++)
x[i] = esl_random(r) * 100.;
esl_stopwatch_Start(w);
for (i = 0; i < N; i++)
if (esl_stats_LogGamma(x[i], &(lg[i])) != eslOK) esl_fatal(msg);
esl_stopwatch_Stop(w);
if (be_verbose) esl_stopwatch_Display(stdout, w, "esl_stats_LogGamma() timing: ");
#ifdef HAVE_LIBGSL
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) lg2[i] = gsl_sf_lngamma(x[i]);
esl_stopwatch_Stop(w);
if (be_verbose) esl_stopwatch_Display(stdout, w, "gsl_sf_lngamma() timing: ");
for (i = 0; i < N; i++)
if (esl_DCompare(lg[i], lg2[i], 1e-2) != eslOK) esl_fatal(msg);
#endif
free(lg2);
free(lg);
free(x);
esl_stopwatch_Destroy(w);
}
/* The test of esl_stats_LinearRegression() is a statistical test,
* so we can't be too aggressive about testing results.
*
* Args:
* r - a source of randomness
* use_sigma - TRUE to pass sigma to the regression fit.
* be_verbose - TRUE to print results (manual, not automated test mode)
*/
static void
utest_LinearRegression(ESL_RANDOMNESS *r, int use_sigma, int be_verbose)
{
char msg[] = "linear regression unit test failed";
double a = -3.;
double b = 1.;
int n = 100;
double xori = -20.;
double xstep = 1.0;
double setsigma = 1.0; /* sigma on all points */
int i;
double *x = NULL;
double *y = NULL;
double *sigma = NULL;
double ae, be, siga, sigb, cov_ab, cc, Q;
if ((x = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
if ((y = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
if ((sigma = malloc(sizeof(double) * n)) == NULL) esl_fatal(msg);
/* Simulate some linear data */
for (i = 0; i < n; i++)
{
sigma[i] = setsigma;
x[i] = xori + i*xstep;
y[i] = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
}
if (use_sigma) {
if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
} else {
if (esl_stats_LinearRegression(x, y, NULL, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK) esl_fatal(msg);
}
if (be_verbose) {
printf("Linear regression test:\n");
printf("estimated intercept a = %8.4f [true = %8.4f]\n", ae, a);
printf("estimated slope b = %8.4f [true = %8.4f]\n", be, b);
printf("estimated sigma on a = %8.4f\n", siga);
printf("estimated sigma on b = %8.4f\n", sigb);
printf("estimated cov(a,b) = %8.4f\n", cov_ab);
printf("correlation coeff = %8.4f\n", cc);
printf("P-value = %8.4f\n", Q);
}
/* The following tests are statistical.
*/
if ( fabs(ae-a) > 2*siga ) esl_fatal(msg);
if ( fabs(be-b) > 2*sigb ) esl_fatal(msg);
if ( cc < 0.95) esl_fatal(msg);
if (use_sigma) {
if (Q < 0.001) esl_fatal(msg);
} else {
if (Q != 1.0) esl_fatal(msg);
}
free(x);
free(y);
free(sigma);
}
static void
utest_erfc(ESL_RANDOMNESS *rng, int be_verbose)
{
char msg[] = "esl_stats:: erfc unit test failed";
double x;
double result;
int i;
if (be_verbose) {
printf("#--------------------------\n");
printf("# erfc unit testing...\n");
}
result = esl_stats_erfc( eslNaN);
if (! isnan(result)) esl_fatal(msg);
if (esl_stats_erfc(-eslINFINITY) != 2.0) esl_fatal(msg);
if (esl_stats_erfc( 0.0) != 1.0) esl_fatal(msg);
if (esl_stats_erfc( eslINFINITY) != 0.0) esl_fatal(msg);
for (i = 0; i < 42; i++)
{
x = esl_random(rng) * 10. - 5.;
result = esl_stats_erfc(x);
if (!isfinite(result)) esl_fatal(msg);
#ifdef HAVE_ERFC
if (esl_DCompare(result, erfc(x), 1e-6) != eslOK) esl_fatal(msg);
if (be_verbose)
printf("%15f %15f %15f\n", x, result, erfc(x));
#endif
}
if (be_verbose)
printf("#--------------------------\n");
return;
}
#endif /*eslSTATS_TESTDRIVE*/
/*-------------------- end of unit tests ------------------------*/
/*****************************************************************
* 6. Test driver.
*****************************************************************/
#ifdef eslSTATS_TESTDRIVE
/* gcc -g -Wall -o stats_utest -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lm
* gcc -DHAVE_LIBGSL -O2 -o stats_utest -L. -I. -DeslSTATS_TESTDRIVE esl_stats.c -leasel -lgsl -lm
*/
#include <stdio.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_stats.h"
static ESL_OPTIONS options[] = {
/* name type default env range togs reqs incomp help docgrp */
{"-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show help and usage", 0},
{"-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0},
{"-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "verbose: show verbose output", 0},
{"-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials in LogGamma test", 0},
{ 0,0,0,0,0,0,0,0,0,0},
};
static char usage[] = "[-options]";
static char banner[] = "test driver for stats special functions";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
int be_verbose = esl_opt_GetBoolean(go, "-v");
int N = esl_opt_GetInteger(go, "-N");
if (be_verbose) printf("seed = %" PRIu32 "\n", esl_randomness_GetSeed(r));
utest_doublesplitting(r);
utest_erfc(r, be_verbose);
utest_LogGamma(r, N, be_verbose);
utest_LinearRegression(r, TRUE, be_verbose);
utest_LinearRegression(r, FALSE, be_verbose);
esl_getopts_Destroy(go);
esl_randomness_Destroy(r);
exit(0);
}
#endif /*eslSTATS_TESTDRIVE*/
/*------------------- end of test driver ------------------------*/
/*****************************************************************
* 7. Examples.
*****************************************************************/
/* Compile: gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm
* or gcc -g -Wall -o example -I. -L. -DeslSTATS_EXAMPLE esl_stats.c -leasel -lm
*/
#ifdef eslSTATS_EXAMPLE
/*::cexcerpt::stats_example::begin::*/
/* gcc -g -Wall -o example -I. -DeslSTATS_EXAMPLE esl_stats.c esl_random.c easel.c -lm */
#include <stdio.h>
#include "easel.h"
#include "esl_random.h"
#include "esl_stats.h"
int main(void)
{
ESL_RANDOMNESS *r = esl_randomness_Create(0);
double a = -3.;
double b = 1.;
double xori = -20.;
double xstep = 1.0;
double setsigma = 1.0; /* sigma on all points */
int n = 100;
double *x = malloc(sizeof(double) * n);
double *y = malloc(sizeof(double) * n);
double *sigma = malloc(sizeof(double) * n);
int i;
double ae, be, siga, sigb, cov_ab, cc, Q;
/* Simulate some linear data, with Gaussian noise added to y_i */
for (i = 0; i < n; i++) {
sigma[i] = setsigma;
x[i] = xori + i*xstep;
y[i] = esl_rnd_Gaussian(r, a + b*x[i], sigma[i]);
}
if (esl_stats_LinearRegression(x, y, sigma, n, &ae, &be, &siga, &sigb, &cov_ab, &cc, &Q) != eslOK)
esl_fatal("linear regression failed");
printf("estimated intercept a = %8.4f [true = %8.4f]\n", ae, a);
printf("estimated slope b = %8.4f [true = %8.4f]\n", be, b);
printf("estimated sigma on a = %8.4f\n", siga);
printf("estimated sigma on b = %8.4f\n", sigb);
printf("estimated cov(a,b) = %8.4f\n", cov_ab);
printf("correlation coeff = %8.4f\n", cc);
printf("P-value = %8.4f\n", Q);
free(x); free(y); free(sigma);
esl_randomness_Destroy(r);
exit(0);
}
/*::cexcerpt::stats_example::end::*/
#endif /* eslSTATS_EXAMPLE */
#ifdef eslSTATS_EXAMPLE2
#include <stdlib.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_stats.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options] <ca> <na> <cb> <nb>";
static char banner[] = "example from the stats module: using a G-test";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 4, argc, argv, banner, usage);
int ca = strtol(esl_opt_GetArg(go, 1), NULL, 10);
int na = strtol(esl_opt_GetArg(go, 2), NULL, 10);
int cb = strtol(esl_opt_GetArg(go, 3), NULL, 10);
int nb = strtol(esl_opt_GetArg(go, 4), NULL, 10);
double G, P;
int status;
if (ca > na || cb > nb) esl_fatal("argument order wrong? expect ca, na, cb, nb for ca/na, cb/nb");
if ( (status = esl_stats_GTest(ca, na, cb, nb, &G, &P)) != eslOK) esl_fatal("G-test failed?");
printf("%-10.3g %12.2f\n", P, G);
exit(0);
}
#endif /* eslSTATS_EXAMPLE2 */
/*--------------------- end of examples -------------------------*/
You can’t perform that action at this time.