Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
/**Probability distribution CDFs, PDFs/PMFs, and a few inverse CDFs.
*
* Authors: David Simcha, Don Clugston*/
/*
* Acknowledgements: Some of this module was borrowed the mathstat module
* of Don Clugston's MathExtra library. This was done to create a
* coherent, complete library without massive dependencies, and without
* reinventing the wheel. These functions have been renamed to
* fit the naming conventions of this library, and are noted below.
* The code from Don Clugston's MathExtra library was based on the Cephes
* library by Stephen Moshier.
*
* Conventions:
* Cumulative distribution functions are named <distribution>CDF. For
* discrete distributions, are the P(X <= x) where X is the random variable,
* NOT P(X < x).
*
* All CDFs have a complement, named <distribution>CDFR, which stands for
* "Cumulative Distribution Function Right". For discrete distributions,
* this is P(X >= x), NOT P(X > x) and is therefore NOT equal to
* 1 - <distribution>CDF. Also, even for continuous distributions, the
* numerical accuracy is higher for small p-values if the CDFR is used than
* if 1 - CDF is used.
*
* If a PDF/PMF function is included for a distribution, it is named
* <distribution>PMF or <distribution>PDF (PMF for discrete, PDF for
* continuous distributions).
*
* If an inverse CDF is included, it is named inv<Distribution>CDF.
*
* For all distributions, the test statistic is the first function parameter
* and the distribution parameters are further down the function parameter
* list. This is important for certain generic code, such as tests and
* the parametrize template.
*
* The following functions are identical or functionally equivalent to
* functions found in MathExtra/Tango.Math.probability. This information
* might be useful if someone is trying to integrate this library into other code:
*
* normalCDF <=> normalDistribution
*
* normalCDFR <=> normalDistributionCompl
*
* invNormalCDF <=> normalDistributionComplInv
*
* studentsTCDF <=> studentsTDistribution (Note reversal in argument order)
*
* invStudentsTCDF <=> studentsTDistributionInv (Again, arg order reversed)
*
* binomialCDF <=> binomialDistribution
*
* negBinomCDF <=> negativeBinomialDistribution
*
* poissonCDF <=> poissonDistribution
*
* chiSqrCDF <=> chiSqrDistribution (Note reversed arg order)
*
* chiSqrCDFR <=> chiSqrDistributionCompl (Note reversed arg order)
*
* invChiSqCDFR <=> chiSqrDistributionComplInv
*
* fisherCDF <=> fDistribution (Note reversed arg order)
*
* fisherCDFR <=> fDistributionCompl (Note reversed arg order)
*
* invFisherCDFR <=> fDistributionComplInv
*
* gammaCDF <=> gammaDistribution (Note arg reversal)
*
* gammaCDFR <=> gammaDistributionCompl (Note arg reversal)
*
* Note that CDFRs/Compls of continuous distributions are not equivalent,
* because in Tango/MathExtra they represent P(X > x) while in dstats they
* represent P(X >= x).
*
*
* Copyright (c) 2008-2009, David Simcha and Don Clugston
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name of the authors nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ''AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
module dstats.distrib;
import std.algorithm, std.conv, std.exception, std.math, std.traits,
std.mathspecial, std.range;
alias std.mathspecial.erfc erfc;
alias std.mathspecial.erf erf;
import dstats.base;
// CTFE doesn't work yet for sqrt() in GDC. This value is sqrt(2 * PI).
enum SQ2PI = 2.50662827463100050241576528481104525300698674060993831662992;
version(unittest) {
import std.stdio, std.random;
alias std.math.approxEqual ae;
}
/**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
* parameters as function arguments in the order that they appear in the
* function declaration and returns a delegate that binds the supplied
* parameters to the distribution function. Assumes the non-parameter
* argument is the first argument to the distribution function.
*
* Examples:
* ---
* auto stdNormal = parametrize!(normalCDF)(0.0L, 1.0L);
* ---
*
* stdNormal is now a delegate for the normal(0, 1) distribution.*/
double delegate(ParameterTypeTuple!(distrib)[0])
parametrize(alias distrib)(ParameterTypeTuple!(distrib)[1..$]
parameters) {
double calculate(ParameterTypeTuple!(distrib)[0] arg) {
return distrib(arg, parameters);
}
return &calculate;
}
unittest {
// Just basically see if this compiles.
auto stdNormal = parametrize!normalCDF(0, 1);
assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
}
///
struct ParamFunctor(alias distrib) {
ParameterTypeTuple!(distrib)[1..$] parameters;
double opCall(ParameterTypeTuple!(distrib)[0] arg) {
return distrib(arg, parameters);
}
}
/**Takes a distribution function (CDF or PDF/PMF) as a template argument, and
* parameters as function arguments in the order that they appear in the
* function declaration and returns a functor that binds the supplied
* parameters to the distribution function. Assumes the non-parameter
* argument is the first argument to the distribution function.
*
* Examples:
* ---
* auto stdNormal = paramFunctor!(normalCDF)(0.0L, 1.0L);
* ---
*
* stdNormal is now a functor for the normal(0, 1) distribution.*/
ParamFunctor!(distrib) paramFunctor(alias distrib)
(ParameterTypeTuple!(distrib)[1..$] parameters) {
ParamFunctor!(distrib) ret;
foreach(ti, elem; parameters) {
ret.tupleof[ti] = elem;
}
return ret;
}
unittest {
// Just basically see if this compiles.
auto stdNormal = paramFunctor!normalCDF(0, 1);
assert(approxEqual(stdNormal(2.5), normalCDF(2.5, 0, 1)));
}
///
double uniformPDF(double X, double lower, double upper) {
dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
return 1.0L / (upper - lower);
}
///
double uniformCDF(double X, double lower, double upper) {
dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
return (X - lower) / (upper - lower);
}
///
double uniformCDFR(double X, double lower, double upper) {
dstatsEnforce(X >= lower, "Can't have X < lower bound in uniform distribution.");
dstatsEnforce(X <= upper, "Can't have X > upper bound in uniform distribution.");
return (upper - X) / (upper - lower);
}
///
double poissonPMF(ulong k, double lambda) {
dstatsEnforce(lambda > 0, "Cannot have a Poisson with lambda <= 0 or nan.");
return exp(cast(double) k * log(lambda) -
(lambda + logFactorial(k))); //Grouped for best precision.
}
unittest {
assert(approxEqual(poissonPMF(1, .1), .0904837));
}
enum POISSON_NORMAL = 1UL << 12; // Where to switch to normal approx.
// The gamma incomplete function is too unstable and the distribution
// is for all practical purposes normal anyhow.
private double normApproxPoisCDF(ulong k, double lambda)
in {
assert(lambda > 0);
} do {
double sd = sqrt(lambda);
// mean == lambda.
return normalCDF(k + 0.5L, lambda, sd);
}
/**P(K <= k) where K is r.v.*/
double poissonCDF(ulong k, double lambda) {
dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");
return (max(k, lambda) >= POISSON_NORMAL) ?
normApproxPoisCDF(k, lambda) :
gammaIncompleteCompl(k + 1, lambda);
}
unittest {
// Make sure this jives with adding up PMF elements, since this is a
// discrete distribution.
static double pmfSum(uint k, double lambda) {
double ret = 0;
foreach(i; 0..k + 1) {
ret += poissonPMF(i, lambda);
}
return ret;
}
assert(approxEqual(poissonCDF(1, 0.5), pmfSum(1, 0.5)));
assert(approxEqual(poissonCDF(3, 0.7), pmfSum(3, 0.7)));
// Absurdly huge values: Test normal approximation.
// Values from R.
double ans = poissonCDF( (1UL << 50) - 10_000_000, 1UL << 50);
assert(approxEqual(ans, 0.3828427));
// Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
// and normal branch get roughly the same answer near the cutoff.
for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
double normAns = normApproxPoisCDF(k, lambda);
double gammaAns = gammaIncompleteCompl(k + 1, lambda);
assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
}
}
}
// The gamma incomplete function is too unstable and the distribution
// is for all practical purposes normal anyhow.
private double normApproxPoisCDFR(ulong k, double lambda)
in {
assert(lambda > 0);
} do {
double sd = sqrt(lambda);
// mean == lambda.
return normalCDFR(k - 0.5L, lambda, sd);
}
/**P(K >= k) where K is r.v.*/
double poissonCDFR(ulong k, double lambda) {
dstatsEnforce(lambda > 0, "Can't have a poisson with lambda <= 0 or nan.");
return (max(k, lambda) >= POISSON_NORMAL) ?
normApproxPoisCDFR(k, lambda) :
gammaIncomplete(k, lambda);
}
unittest {
// Make sure this jives with adding up PMF elements, since this is a
// discrete distribution.
static double pmfSum(uint k, double lambda) {
double ret = 0;
foreach(i; 0..k + 1) {
ret += poissonPMF(i, lambda);
}
return ret;
}
assert(approxEqual(poissonCDFR(1, 0.5), 1 - pmfSum(0, 0.5)));
assert(approxEqual(poissonCDFR(3, 0.7), 1 - pmfSum(2, 0.7)));
// Absurdly huge value to test normal approximation.
// Values from R.
double ans = poissonCDFR( (1UL << 50) - 10_000_000, 1UL << 50);
assert(approxEqual(ans, 0.6171573));
// Make sure cutoff is reasonable, i.e. make sure gamma incomplete branch
// and normal branch get roughly the same answer near the cutoff.
for(double lambda = POISSON_NORMAL / 2; lambda <= POISSON_NORMAL * 2; lambda += 100) {
for(ulong k = POISSON_NORMAL / 2; k <= POISSON_NORMAL * 2; k += 100) {
double normAns = normApproxPoisCDFR(k, lambda);
double gammaAns = gammaIncomplete(k, lambda);
assert(abs(normAns - gammaAns) < 0.01, text(normAns, '\t', gammaAns));
}
}
}
/**Returns the value of k for the given p-value and lambda. If p-val
* doesn't exactly map to a value of k, the k for which poissonCDF(k, lambda)
* is closest to pVal is used.*/
uint invPoissonCDF(double pVal, double lambda) {
dstatsEnforce(lambda > 0, "Cannot have a poisson with lambda <= 0 or nan.");
dstatsEnforce(pVal >= 0 && pVal <= 1, "P-values must be between 0, 1.");
// Use normal approximation to get approx answer, then brute force search.
// This works better than you think because for small n, there's not much
// search space and for large n, the normal approx. is doublely good.
uint guess = cast(uint) max(round(
invNormalCDF(pVal, lambda, sqrt(lambda)) + 0.5), 0.0L);
double guessP = poissonCDF(guess, lambda);
if(guessP == pVal) {
return guess;
} else if(guessP < pVal) {
for(uint k = guess + 1; ; k++) {
double newP = guessP + poissonPMF(k, lambda);
if(newP >= 1)
return k;
if(abs(newP - pVal) > abs(guessP - pVal)) {
return k - 1;
} else {
guessP = newP;
}
}
} else {
for(uint k = guess - 1; k != uint.max; k--) {
double newP = guessP - poissonPMF(k + 1, lambda);
if(abs(newP - pVal) > abs(guessP - pVal)) {
return k + 1;
} else {
guessP = newP;
}
}
return 0;
}
}
unittest {
foreach(i; 0..1_000) {
// Restricted variable ranges are because, in the tails, more than one
// value of k can map to the same p-value at machine precision.
// Obviously, this is one of those corner cases that nothing can be
// done about.
double lambda = uniform(.05L, 8.0L);
uint k = uniform(0U, cast(uint) ceil(3.0L * lambda));
double pVal = poissonCDF(k, lambda);
assert(invPoissonCDF(pVal, lambda) == k);
}
}
///
double binomialPMF(ulong k, ulong n, double p) {
dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
return exp(logNcomb(n, k) + k * log(p) + (n - k) * log(1 - p));
}
unittest {
assert(approxEqual(binomialPMF(0, 10, .5), cast(double) 1/1024));
assert(approxEqual(binomialPMF(100, 1000, .11), .024856));
}
// Determines what value of n we switch to normal approximation at b/c
// betaIncomplete becomes unstable.
private enum BINOM_APPROX = 1UL << 24;
// Cutoff value of n * p for deciding whether to go w/ normal or poisson approx
// when betaIncomplete becomes unstable.
private enum BINOM_POISSON = 1_024;
// betaIncomplete is numerically unstable for huge values of n.
// Luckily this is exactly when the normal approximation becomes
// for all practical purposes exact.
private double normApproxBinomCDF(double k, double n, double p)
in {
assert(k <= n);
assert(p >= 0 && p <= 1);
} do {
double mu = p * n;
double sd = sqrt( to!double(n) ) * sqrt(p) * sqrt(1 - p);
double xCC = k + 0.5L;
return normalCDF(xCC, mu, sd);
}
///P(K <= k) where K is random variable.
double binomialCDF(ulong k, ulong n, double p) {
dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
if(k == n) {
return 1;
} else if(k == 0) {
return pow(1.0 - p, cast(double) n);
}
if(n > BINOM_APPROX) {
if(n * p < BINOM_POISSON) {
return poissonCDF(k, n * p);
} else if(n * (1 - p) < BINOM_POISSON) {
return poissonCDFR(n - k, n * (1 - p));
} else {
return normApproxBinomCDF(k, n, p);
}
}
return betaIncomplete(n - k, k + 1, 1.0 - p);
}
unittest {
assert(approxEqual(binomialCDF(10, 100, .11), 0.4528744401));
assert(approxEqual(binomialCDF(15, 100, .12), 0.8585510507));
assert(approxEqual(binomialCDF(50, 1000, .04), 0.95093595));
assert(approxEqual(binomialCDF(7600, 15000, .5), .9496193045414));
assert(approxEqual(binomialCDF(0, 10, 0.2), 0.1073742));
// Absurdly huge numbers:
{
ulong k = (1UL << 60) - 100_000_000;
ulong n = 1UL << 61;
assert(approxEqual(binomialCDF(k, n, 0.5L), 0.4476073));
}
// Test Poisson branch.
double poisAns = binomialCDF(85, 1UL << 26, 1.49e-6);
assert(approxEqual(poisAns, 0.07085327));
// Test poissonCDFR branch.
poisAns = binomialCDF( (1UL << 25) - 100, 1UL << 25, 0.9999975L);
assert(approxEqual(poisAns, 0.04713316));
// Make sure cutoff is reasonable: Just below it, we should get similar
// results for normal, exact.
for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
for(double p = 0.01; p <= 0.99; p += 0.05) {
long lowerK = roundTo!long( n * p * 0.99);
long upperK = roundTo!long( n * p / 0.99);
for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
double normRes = normApproxBinomCDF(k, n, p);
double exactRes = binomialCDF(k, n, p);
assert(abs(normRes - exactRes) < 0.001,
text(normRes, '\t', exactRes));
}
}
}
}
// betaIncomplete is numerically unstable for huge values of n.
// Luckily this is exactly when the normal approximation becomes
// for all practical purposes exact.
private double normApproxBinomCDFR(ulong k, ulong n, double p)
in {
assert(k <= n);
assert(p >= 0 && p <= 1);
} do {
double mu = p * n;
double sd = sqrt( to!double(n) ) * sqrt(p) * sqrt(1 - p);
double xCC = k - 0.5L;
return normalCDFR(xCC, mu, sd);
}
///P(K >= k) where K is random variable.
double binomialCDFR(ulong k, ulong n, double p) {
dstatsEnforce(k <= n, "k cannot be > n in binomial distribution.");
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
if(k == 0) {
return 1;
} else if(k == n) {
return pow(p, cast(double) n);
}
if(n > BINOM_APPROX) {
if(n * p < BINOM_POISSON) {
return poissonCDFR(k, n * p);
} else if(n * (1 - p) < BINOM_POISSON) {
return poissonCDF(n - k, n * (1 - p));
} else {
return normApproxBinomCDFR(k, n, p);
}
}
return betaIncomplete(k, n - k + 1, p);
}
unittest {
// Values from R, Maxima.
assert(approxEqual(binomialCDF(10, 100, .11), 1 -
binomialCDFR(11, 100, .11)));
assert(approxEqual(binomialCDF(15, 100, .12), 1 -
binomialCDFR(16, 100, .12)));
assert(approxEqual(binomialCDF(50, 1000, .04), 1 -
binomialCDFR(51, 1000, .04)));
assert(approxEqual(binomialCDF(7600, 15000, .5), 1 -
binomialCDFR(7601, 15000, .5)));
assert(approxEqual(binomialCDF(9, 10, 0.3), 1 -
binomialCDFR(10, 10, 0.3)));
// Absurdly huge numbers, test normal branch.
{
ulong k = (1UL << 60) - 100_000_000;
ulong n = 1UL << 61;
assert(approxEqual(binomialCDFR(k, n, 0.5L), 0.5523927));
}
// Test Poisson inversion branch.
double poisRes = binomialCDFR((1UL << 25) - 70, 1UL << 25, 0.9999975L);
assert(approxEqual(poisRes, 0.06883905));
// Test Poisson branch.
poisRes = binomialCDFR(350, 1UL << 25, 1e-5);
assert(approxEqual(poisRes, 0.2219235));
// Make sure cutoff is reasonable: Just below it, we should get similar
// results for normal, exact.
for(ulong n = BINOM_APPROX / 2; n < BINOM_APPROX; n += 200_000) {
for(double p = 0.01; p <= 0.99; p += 0.05) {
long lowerK = roundTo!long( n * p * 0.99);
long upperK = roundTo!long( n * p / 0.99);
for(ulong k = lowerK; k <= min(n, upperK); k += 1_000) {
double normRes = normApproxBinomCDFR(k, n, p);
double exactRes = binomialCDFR(k, n, p);
assert(abs(normRes - exactRes) < 0.001,
text(normRes, '\t', exactRes));
}
}
}
}
/**Returns the value of k for the given p-value, n and p. If p-value does
* not exactly map to a value of k, the value for which binomialCDF(k, n, p)
* is closest to pVal is used.*/
uint invBinomialCDF(double pVal, uint n, double p) {
dstatsEnforce(pVal >= 0 && pVal <= 1, "p-values must be between 0, 1.");
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in binomial distribution.");
// Use normal approximation to get approx answer, then brute force search.
// This works better than you think because for small n, there's not much
// search space and for large n, the normal approx. is doublely good.
uint guess = cast(uint) max(round(
invNormalCDF(pVal, n * p, sqrt(n * p * (1 - p)))) + 0.5, 0);
if(guess > n) {
if(pVal < 0.5) // Numerical issues/overflow.
guess = 0;
else guess = n;
}
double guessP = binomialCDF(guess, n, p);
if(guessP == pVal) {
return guess;
} else if(guessP < pVal) {
for(uint k = guess + 1; k <= n; k++) {
double newP = guessP + binomialPMF(k, n, p);
if(abs(newP - pVal) > abs(guessP - pVal)) {
return k - 1;
} else {
guessP = newP;
}
}
return n;
} else {
for(uint k = guess - 1; k != uint.max; k--) {
double newP = guessP - binomialPMF(k + 1, n, p);
if(abs(newP - pVal) > abs(guessP - pVal)) {
return k + 1;
} else {
guessP = newP;
}
}
return 0;
}
}
unittest {
Random gen = Random(unpredictableSeed);
foreach(i; 0..1_000) {
// Restricted variable ranges are because, in the tails, more than one
// value of k can map to the same p-value at machine precision.
// Obviously, this is one of those corner cases that nothing can be
// done about. Using small n's, moderate p's prevents this.
uint n = uniform(5U, 10U);
uint k = uniform(0U, n);
double p = uniform(0.1L, 0.9L);
double pVal = binomialCDF(k, n, p);
assert(invBinomialCDF(pVal, n, p) == k);
}
}
///
double hypergeometricPMF(long x, long n1, long n2, long n)
in {
assert(x <= n);
} do {
if(x > n1 || x < (n - n2)) {
return 0;
}
double result = logNcomb(n1, x) + logNcomb(n2, n - x) - logNcomb(n1 + n2, n);
return exp(result);
}
unittest {
assert(approxEqual(hypergeometricPMF(5, 10, 10, 10), .3437182));
assert(approxEqual(hypergeometricPMF(9, 12, 10, 15), .27089783));
assert(approxEqual(hypergeometricPMF(9, 100, 100, 15), .15500003));
}
/**P(X <= x), where X is random variable. Uses either direct summation,
* normal or binomial approximation depending on parameters.*/
// If anyone knows a better algorithm for this, feel free...
// I've read a decent amount about it, though, and getting hypergeometric
// CDFs that are both accurate and fast is just plain hard. This
// implementation attempts to strike a balance between the two, so that
// both speed and accuracy are "good enough" for most practical purposes.
double hypergeometricCDF(long x, long n1, long n2, long n) {
dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
ulong expec = (n1 * n) / (n1 + n2);
long nComp = n1 + n2 - n, xComp = n2 + x - n;
// Try to reduce number of calculations using identities.
if(x >= n1 || x == n) {
return 1;
} else if(x > expec && x > n / 2) {
return 1 - hypergeometricCDF(n - x - 1, n2, n1, n);
} else if(xComp < x && xComp > 0) {
return hypergeometricCDF(xComp, n2, n1, nComp);
}
// Speed depends on x mostly, so always use exact for small x.
if(x <= 100) {
return hyperExact(x, n1, n2, n);
}
// Determine whether to use exact, normal approx or binomial approx.
// Using obviously arbitrary but relatively stringent standards
// for determining whether to approximate.
enum NEXACT = 50L;
double p = cast(double) n1 / (n1 + n2);
double pComp = cast(double) n2 / (n1 + n2);
double pMin = min(p, pComp);
if(min(n, nComp) * pMin >= 100) {
// Since high relative error in the lower tail is a significant problem,
// this is a hack to improve the normal approximation: Use the normal
// approximation, except calculate the last NEXACT elements exactly,
// since elements around the e.v. are where absolute error is highest.
// For large x, gives most of the accuracy of an exact calculation in
// only a small fraction of the time.
if(x <= expec + NEXACT / 2) {
return min(1, normApproxHyper(x - NEXACT, n1, n2, n) +
hyperExact(x, n1, n2, n, x - NEXACT + 1));
} else {
// Just use plain old normal approx. Since P is large, the
// relative error won't be so bad anyhow.
return normApproxHyper(x, n1, n2, n);
}
}
// Try to make n as small as possible by applying mathematically equivalent
// transformations so that binomial approx. works as well as possible.
ulong bSc1 = (n1 + n2) / n, bSc2 = (n1 + n2) / n1;
if(bSc1 >= 50 && bSc1 > bSc2) {
// Same hack as normal approximation for rel. acc. in lower tail.
if(x <= expec + NEXACT / 2) {
return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint) n, p) +
hyperExact(x, n1, n2, n, x - NEXACT + 1));
} else {
return binomialCDF(cast(uint) x, cast(uint) n, p);
}
} else if(bSc2 >= 50 && bSc2 > bSc1) {
double p2 = cast(double) n / (n1 + n2);
if(x <= expec + NEXACT / 2) {
return min(1, binomialCDF(cast(uint) (x - NEXACT), cast(uint) n1, p2) +
hyperExact(x, n1, n2, n, x - NEXACT + 1));
} else {
return binomialCDF(cast(uint) x, cast(uint) n1, p2);
}
} else {
return hyperExact(x, n1, n2, n);
}
}
unittest {
// Values from R and the Maxima CAS.
// Test exact branch, including reversing, complementing.
assert(approxEqual(hypergeometricCDF(5, 10, 10, 10), 0.6718591));
assert(approxEqual(hypergeometricCDF(3, 11, 15, 10), 0.27745322));
assert(approxEqual(hypergeometricCDF(18, 27, 31, 35), 0.88271714));
assert(approxEqual(hypergeometricCDF(21, 29, 31, 35), 0.99229253));
// Normal branch.
assert(approxEqual(hypergeometricCDF(501, 2000, 1000, 800), 0.002767073));
assert(approxEqual(hypergeometricCDF(565, 2000, 1000, 800), 0.9977068));
assert(approxEqual(hypergeometricCDF(2700, 10000, 20000, 8000), 0.825652));
// Binomial branch. One for each transformation.
assert(approxEqual(hypergeometricCDF(110, 5000, 7000, 239), 0.9255627));
assert(approxEqual(hypergeometricCDF(19840, 2950998, 12624, 19933), 0.2020618));
assert(approxEqual(hypergeometricCDF(130, 24195, 52354, 295), 0.9999973));
assert(approxEqual(hypergeometricCDF(103, 901, 49014, 3522), 0.999999));
}
///P(X >= x), where X is random variable.
double hypergeometricCDFR(ulong x, ulong n1, ulong n2, ulong n) {
dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
return hypergeometricCDF(n - x, n2, n1, n);
}
unittest {
//Reverses n1, n2 and subtracts x from n to get mirror image.
assert(approxEqual(hypergeometricCDF(5,10,10,10),
hypergeometricCDFR(5,10,10,10)));
assert(approxEqual(hypergeometricCDF(3, 11, 15, 10),
hypergeometricCDFR(7, 15, 11, 10)));
assert(approxEqual(hypergeometricCDF(18, 27, 31, 35),
hypergeometricCDFR(17, 31, 27, 35)));
assert(approxEqual(hypergeometricCDF(21, 29, 31, 35),
hypergeometricCDFR(14, 31, 29, 35)));
}
double hyperExact(ulong x, ulong n1, ulong n2, ulong n, ulong startAt = 0) {
dstatsEnforce(x <= n, "x must be <= n in hypergeometric distribution.");
dstatsEnforce(n <= n1 + n2, "n must be <= n1 + n2 in hypergeometric distribution.");
dstatsEnforce(x >= 0, "x must be >= 0 in hypergeometric distribution.");
immutable double constPart = logFactorial(n1) + logFactorial(n2) +
logFactorial(n) + logFactorial(n1 + n2 - n) - logFactorial(n1 + n2);
double sum = 0;
for(ulong i = x; i != startAt - 1; i--) {
double oldSum = sum;
sum += exp(constPart - logFactorial(i) - logFactorial(n1 - i) -
logFactorial(n2 + i - n) - logFactorial(n - i));
if(isIdentical(sum, oldSum)) { // At full machine precision.
break;
}
}
return sum;
}
double normApproxHyper(ulong x, ulong n1, ulong n2, ulong n) {
double p1 = cast(double) n1 / (n1 + n2);
double p2 = cast(double) n2 / (n1 + n2);
double numer = x + 0.5L - n * p1;
double denom = sqrt(n * p1 * p2 * (n1 + n2 - n) / (n1 + n2 - 1));
return normalCDF(numer / denom);
}
// Aliases for old names. Not documented because new names should be used.
deprecated {
alias chiSquareCDF chiSqrCDF;
alias chiSquareCDFR chiSqrCDFR;
alias invChiSquareCDFR invChiSqCDFR;
}
///
double chiSquarePDF(double x, double v) {
dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
// Calculate in log space for stability.
immutable logX = log(x);
immutable numerator = logX * (0.5 * v - 1) - 0.5 * x;
immutable denominator = LN2 * (0.5 * v) + logGamma(0.5 * v);
return exp(numerator - denominator);
}
unittest {
assert( approxEqual(chiSquarePDF(1, 2), 0.3032653));
assert( approxEqual(chiSquarePDF(2, 1), 0.1037769));
}
/**
* $(POWER &chi;,2) distribution function and its complement.
*
* Returns the area under the left hand tail (from 0 to x)
* of the Chi square probability density function with
* v degrees of freedom. The complement returns the area under
* the right hand tail (from x to &infin;).
*
* chiSquareCDF(x | v) = ($(INTEGRATE 0, x)
* $(POWER t, v/2-1) $(POWER e, -t/2) dt )
* / $(POWER 2, v/2) $(GAMMA)(v/2)
*
* chiSquareCDFR(x | v) = ($(INTEGRATE x, &infin;)
* $(POWER t, v/2-1) $(POWER e, -t/2) dt )
* / $(POWER 2, v/2) $(GAMMA)(v/2)
*
* Params:
* v = degrees of freedom. Must be positive.
* x = the $(POWER &chi;,2) variable. Must be positive.
*
*/
double chiSquareCDF(double x, double v) {
dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
// These are very common special cases where we can make the calculation
// a lot faster and/or more accurate.
if(v == 1) {
// Then it's the square of a normal(0, 1).
return 1.0L - erfc(sqrt(x) * SQRT1_2);
} else if(v == 2) {
// Then it's an exponential w/ lambda == 1/2.
return 1.0L - exp(-0.5 * x);
} else {
return gammaIncomplete(0.5 * v, 0.5 * x);
}
}
///
double chiSquareCDFR(double x, double v) {
dstatsEnforce(x >= 0, "x must be >= 0 in chi-square distribution.");
dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
// These are very common special cases where we can make the calculation
// a lot faster and/or more accurate.
if(v == 1) {
// Then it's the square of a normal(0, 1).
return erfc(sqrt(x) * SQRT1_2);
} else if(v == 2) {
// Then it's an exponential w/ lambda == 1/2.
return exp(-0.5 * x);
} else {
return gammaIncompleteCompl(0.5 * v, 0.5 * x);
}
}
/**
* Inverse of complemented $(POWER &chi;, 2) distribution
*
* Finds the $(POWER &chi;, 2) argument x such that the integral
* from x to &infin; of the $(POWER &chi;, 2) density is equal
* to the given cumulative probability p.
*
* Params:
* p = Cumulative probability. 0<= p <=1.
* v = Degrees of freedom. Must be positive.
*
*/
double invChiSquareCDFR(double v, double p) {
dstatsEnforce(v >= 1.0, "Must have at least 1 degree of freedom for chi-square.");
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
return 2.0 * gammaIncompleteComplInverse( 0.5*v, p);
}
unittest {
assert(feqrel(chiSquareCDFR(invChiSquareCDFR(3.5, 0.1), 3.5), 0.1)>=double.mant_dig-3);
assert(approxEqual(
chiSquareCDF(0.4L, 19.02L) + chiSquareCDFR(0.4L, 19.02L), 1.0L));
assert(ae( invChiSquareCDFR( 3, chiSquareCDFR(1, 3)), 1));
assert(ae(chiSquareCDFR(0.2, 1), 0.6547208));
assert(ae(chiSquareCDFR(0.2, 2), 0.9048374));
assert(ae(chiSquareCDFR(0.8, 1), 0.3710934));
assert(ae(chiSquareCDFR(0.8, 2), 0.67032));
assert(ae(chiSquareCDF(0.2, 1), 0.3452792));
assert(ae(chiSquareCDF(0.2, 2), 0.09516258));
assert(ae(chiSquareCDF(0.8, 1), 0.6289066));
assert(ae(chiSquareCDF(0.8, 2), 0.3296800));
}
///
double normalPDF(double x, double mean = 0, double sd = 1) {
dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");
double dev = x - mean;
return exp(-(dev * dev) / (2 * sd * sd)) / (sd * SQ2PI);
}
unittest {
assert(approxEqual(normalPDF(3, 1, 2), 0.1209854));
}
///P(X < x) for normal distribution where X is random var.
double normalCDF(double x, double mean = 0, double stdev = 1) {
dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");
// Using a slightly non-obvious implementation in terms of erfc because
// it seems more accurate than erf for very small values of Z.
double Z = (-x + mean) / stdev;
return erfc(Z*SQRT1_2)/2;
}
unittest {
assert(approxEqual(normalCDF(2), .9772498));
assert(approxEqual(normalCDF(-2), .02275013));
assert(approxEqual(normalCDF(1.3), .90319951));
}
///P(X > x) for normal distribution where X is random var.
double normalCDFR(double x, double mean = 0, double stdev = 1) {
dstatsEnforce(stdev > 0, "Standard deviation must be > 0 for normal distribution.");
double Z = (x - mean) / stdev;
return erfc(Z * SQRT1_2) / 2;
}
unittest {
//Should be essentially a mirror image of normalCDF.
for(double i = -8; i < 8; i += .1) {
assert(approxEqual(normalCDF(i), normalCDFR(-i)));
}
}
private enum SQRT2PI = 0x1.40d931ff62705966p+1; // 2.5066282746310005024
private enum EXP_2 = 0.13533528323661269189L; /* exp(-2) */
/******************************
* Inverse of Normal distribution function
*
* Returns the argument, x, for which the area under the
* Normal probability density function (integrated from
* minus infinity to x) is equal to p.
*/
double invNormalCDF(double p, double mean = 0, double sd = 1) {
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
dstatsEnforce(sd > 0, "Standard deviation must be > 0 for normal distribution.");
return normalDistributionInverse(p) * sd + mean;
}
unittest {
// The values below are from Excel 2003.
assert(fabs(invNormalCDF(0.001) - (-3.09023230616779))< 0.00000000000005);
assert(fabs(invNormalCDF(1e-50) - (-14.9333375347885))< 0.00000000000005);
assert(feqrel(invNormalCDF(0.999), -invNormalCDF(0.001))>double.mant_dig-6);
// Excel 2003 gets all the following values wrong!
assert(invNormalCDF(0.0)==-double.infinity);
assert(invNormalCDF(1.0)==double.infinity);
assert(invNormalCDF(0.5)==0);
// I don't know the correct result for low values
// (Excel 2003 returns norminv(p) = -30 for all p < 1e-200).
// The value tested here is the one the function returned in Jan 2006.
double unknown1 = invNormalCDF(1e-250L);
assert( fabs(unknown1 -(-33.79958617269L) ) < 0.00000005);
Random gen;
gen.seed(unpredictableSeed);
// normalCDF function trivial given ERF, unlikely to contain subtle bugs.
// Just make sure invNormalCDF works like it should as the inverse.
foreach(i; 0..1000) {
double x = uniform(0.0L, 1.0L);
double mean = uniform(0.0L, 100.0L);
double sd = uniform(1.0L, 3.0L);
double inv = invNormalCDF(x, mean, sd);
double rec = normalCDF(inv, mean, sd);
assert(approxEqual(x, rec));
}
}
///
double logNormalPDF(double x, double mu = 0, double sigma = 1) {
dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
immutable mulTerm = 1.0L / (x * sigma * SQ2PI);
double expTerm = log(x) - mu;
expTerm *= expTerm;
expTerm /= 2 * sigma * sigma;
return mulTerm * exp(-expTerm);
}
unittest {
// Values from R.
assert(approxEqual(logNormalPDF(1, 0, 1), 0.3989423));
assert(approxEqual(logNormalPDF(2, 2, 3), 0.06047173));
}
///
double logNormalCDF(double x, double mu = 0, double sigma = 1) {
dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
return 0.5L + 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
}
unittest {
assert(approxEqual(logNormalCDF(4), 0.9171715));
assert(approxEqual(logNormalCDF(1, -2, 3), 0.7475075));
}
///
double logNormalCDFR(double x, double mu = 0, double sigma = 1) {
dstatsEnforce(sigma > 0, "sigma must be > 0 for log-normal distribution.");
return 0.5L - 0.5L * erf((log(x) - mu) / (sigma * SQRT2));
}
unittest {
assert(approxEqual(logNormalCDF(4) + logNormalCDFR(4), 1));
assert(approxEqual(logNormalCDF(1, -2, 3) + logNormalCDFR(1, -2, 3), 1));
}
///
double weibullPDF(double x, double shape, double scale = 1) {
dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
if(x < 0) {
return 0;
}
double ret = pow(x / scale, shape - 1) * exp( -pow(x / scale, shape));
return ret * (shape / scale);
}
unittest {
assert(approxEqual(weibullPDF(2,1,3), 0.1711390));
}
///
double weibullCDF(double x, double shape, double scale = 1) {
dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
double exponent = pow(x / scale, shape);
return 1 - exp(-exponent);
}
unittest {
assert(approxEqual(weibullCDF(2, 3, 4), 0.1175031));
}
///
double weibullCDFR(double x, double shape, double scale = 1) {
dstatsEnforce(shape > 0, "shape must be > 0 for weibull distribution.");
dstatsEnforce(scale > 0, "scale must be > 0 for weibull distribution.");
double exponent = pow(x / scale, shape);
return exp(-exponent);
}
unittest {
assert(approxEqual(weibullCDF(2, 3, 4) + weibullCDFR(2, 3, 4), 1));
}
// For K-S tests in dstats.random. Todo: Flesh out.
double waldCDF(double x, double mu, double lambda) {
double sqr = sqrt(lambda / (2 * x));
double term1 = 1 + erf(sqr * (x / mu - 1));
double term2 = exp(2 * lambda / mu);
double term3 = 1 - erf(sqr * (x / mu + 1));
return 0.5L * term1 + 0.5L * term2 * term3;
}
// ditto.
double rayleighCDF(double x, double mode) {
return 1.0L - exp(-x * x / (2 * mode * mode));
}
///
double studentsTPDF(double t, double df) {
dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
immutable logPart = logGamma(0.5 * df + 0.5) - logGamma(0.5 * df);
immutable term1 = exp(logPart) / sqrt(df * PI);
immutable term2 = (1.0 + t / df * t) ^^ (-0.5 * df - 0.5);
return term1 * term2;
}
///
double studentsTCDF(double t, double df) {
dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
double x = (t + sqrt(t * t + df)) / (2 * sqrt(t * t + df));
return betaIncomplete(df * 0.5L, df * 0.5L, x);
}
///
double studentsTCDFR(double t, double df) {
return studentsTCDF(-t, df);
}
unittest {
assert(approxEqual(studentsTPDF(1, 1), 0.1591549));
assert(approxEqual(studentsTPDF(3, 10), 0.0114055));
assert(approxEqual(studentsTPDF(-4, 5), 0.005123727));
assert(approxEqual(studentsTCDF(1, 1), 0.75));
assert(approxEqual(studentsTCDF(1.061, 2), 0.8));
assert(approxEqual(studentsTCDF(5.959, 5), 0.9995));
assert(approxEqual(studentsTCDF(.667, 20), 0.75));
assert(approxEqual(studentsTCDF(2.353, 3), 0.95));
}
/******************************************
* Inverse of Student's t distribution
*
* Given probability p and degrees of freedom df,
* finds the argument t such that the one-sided
* studentsDistribution(nu,t) is equal to p.
* Used to test whether two distributions have the same
* standard deviation.
*
* Params:
* df = degrees of freedom. Must be >1
* p = probability. 0 < p < 1
*/
double invStudentsTCDF(double p, double df) {
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
dstatsEnforce(df > 0, "Student's T must have >0 degrees of freedom.");
if (p==0) return -double.infinity;
if (p==1) return double.infinity;
double rk, z;
rk = df;
if ( p > 0.25L && p < 0.75L ) {
if ( p == 0.5L ) return 0;
z = 1.0L - 2.0L * p;
z = betaIncompleteInverse( 0.5L, 0.5L*rk, fabs(z) );
double t = sqrt( rk*z/(1.0L-z) );
if( p < 0.5L )
t = -t;
return t;
}
int rflg = -1; // sign of the result
if (p >= 0.5L) {
p = 1.0L - p;
rflg = 1;
}
z = betaIncompleteInverse( 0.5L*rk, 0.5L, 2.0L*p );
if (z<0) return rflg * double.infinity;
return rflg * sqrt( rk/z - rk );
}
unittest {
// The remaining values listed here are from Excel, and are unlikely to be accurate
// in the last decimal places. However, they are helpful as a sanity check.
// Microsoft Excel 2003 gives TINV(2*(1-0.995), 10) == 3.16927267160917
assert(approxEqual(invStudentsTCDF(0.995, 10), 3.169_272_67L));
assert(approxEqual(invStudentsTCDF(0.6, 8), 0.261_921_096_769_043L));
assert(approxEqual(invStudentsTCDF(0.4, 18), -0.257_123_042_655_869L));
assert(approxEqual(studentsTCDF(invStudentsTCDF(0.4L, 18), 18), .4L));
assert(approxEqual(studentsTCDF( invStudentsTCDF(0.9L, 11), 11), 0.9L));
}
/**
* The Fisher distribution, its complement, and inverse.
*
* The F density function (also known as Snedcor's density or the
* variance ratio density) is the density
* of x = (u1/df1)/(u2/df2), where u1 and u2 are random
* variables having $(POWER &chi;,2) distributions with df1
* and df2 degrees of freedom, respectively.
*
* fisherCDF returns the area from zero to x under the F density
* function. The complementary function,
* fisherCDFR, returns the area from x to &infin; under the F density function.
*
* The inverse of the complemented Fisher distribution,
* invFisherCDFR, finds the argument x such that the integral
* from x to infinity of the F density is equal to the given probability y.
* Params:
* df1 = Degrees of freedom of the first variable. Must be >= 1
* df2 = Degrees of freedom of the second variable. Must be >= 1
* x = Must be >= 0
*/
double fisherCDF(double x, double df1, double df2) {
dstatsEnforce(df1 > 0 && df2 > 0,
"Fisher distribution must have >0 degrees of freedom.");
dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");
double a = cast(double)(df1);
double b = cast(double)(df2);
double w = a * x;
w = w/(b + w);
return betaIncomplete(0.5L*a, 0.5L*b, w);
}
/** ditto */
double fisherCDFR(double x, double df1, double df2) {
dstatsEnforce(df1 > 0 && df2 > 0,
"Fisher distribution must have >0 degrees of freedom.");
dstatsEnforce(x >= 0, "x must be >=0 for Fisher distribution.");
double a = cast(double)(df1);
double b = cast(double)(df2);
double w = b / (b + a * x);
return betaIncomplete( 0.5L*b, 0.5L*a, w );
}
/**
* Inverse of complemented Fisher distribution
*
* Finds the F density argument x such that the integral
* from x to infinity of the F density is equal to the
* given probability p.
*
* This is accomplished using the inverse beta integral
* function and the relations
*
* z = betaIncompleteInverse( df2/2, df1/2, p ),
* x = df2 (1-z) / (df1 z).
*
* Note that the following relations hold for the inverse of
* the uncomplemented F distribution:
*
* z = betaIncompleteInverse( df1/2, df2/2, p ),
* x = df2 z / (df1 (1-z)).
*/
double invFisherCDFR(double df1, double df2, double p ) {
dstatsEnforce(df1 > 0 && df2 > 0,
"Fisher distribution must have >0 degrees of freedom.");
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
double a = df1;
double b = df2;
/* Compute probability for x = 0.5. */
double w = betaIncomplete( 0.5L*b, 0.5L*a, 0.5L );
/* If that is greater than p, then the solution w < .5.
Otherwise, solve at 1-p to remove cancellation in (b - b*w). */
if ( w > p || p < 0.001L) {
w = betaIncompleteInverse( 0.5L*b, 0.5L*a, p );
return (b - b*w)/(a*w);
} else {
w = betaIncompleteInverse( 0.5L*a, 0.5L*b, 1.0L - p );
return b*w/(a*(1.0L-w));
}
}
unittest {
// fDistCompl(df1, df2, x) = Excel's FDIST(x, df1, df2)
assert(fabs(fisherCDFR(16.5, 6, 4) - 0.00858719177897249L)< 0.0000000000005L);
assert(fabs((1-fisherCDF(0.1, 12, 23)) - 0.99990562845505L)< 0.0000000000005L);
assert(fabs(invFisherCDFR(8, 34, 0.2) - 1.48267037661408L)< 0.0000000005L);
assert(fabs(invFisherCDFR(4, 16, 0.008) - 5.043_537_593_48596L)< 0.0000000005L);
// This one used to fail because of a bug in the definition of MINLOG.
assert(approxEqual(fisherCDFR(invFisherCDFR(4,16, 0.008), 4, 16), 0.008));
}
///
double negBinomPMF(ulong k, ulong n, double p) {
dstatsEnforce(p >= 0 && p <= 1,
"p must be between 0, 1 for negative binomial distribution.");
return exp(logNcomb(k - 1 + n, k) + n * log(p) + k * log(1 - p));
}
unittest {
// Values from R.
assert(approxEqual(negBinomPMF(1, 8, 0.7), 0.1383552));
assert(approxEqual(negBinomPMF(3, 2, 0.5), 0.125));
}
/**********************
* Negative binomial distribution.
*
* Returns the sum of the terms 0 through k of the negative
* binomial distribution:
*
* $(BIGSUM j=0, k) $(CHOOSE n+j-1, j) $(POWER p, n) $(POWER (1-p), j)
* ???? In mathworld, it is
* $(BIGSUM j=0, k) $(CHOOSE n+j-1, j-1) $(POWER p, j) $(POWER (1-p), n)
*
* In a sequence of Bernoulli trials, this is the probability
* that k or fewer failures precede the n-th success.
*
* The arguments must be positive, with 0 < p < 1 and r>0.
*
* The Geometric Distribution is a special case of the negative binomial
* distribution.
* -----------------------
* geometricDistribution(k, p) = negativeBinomialDistribution(k, 1, p);
* -----------------------
* References:
* $(LINK http://mathworld.wolfram.com/NegativeBinomialDistribution.html)
*/
double negBinomCDF(ulong k, ulong n, double p ) {
dstatsEnforce(p >= 0 && p <= 1,
"p must be between 0, 1 for negative binomial distribution.");
if ( k == 0 ) return pow(p, cast(double) n);
return betaIncomplete( n, k + 1, p );
}
unittest {
// Values from R.
assert(approxEqual(negBinomCDF(50, 50, 0.5), 0.5397946));
assert(approxEqual(negBinomCDF(2, 1, 0.5), 0.875));
}
/**Probability that k or more failures precede the nth success.*/
double negBinomCDFR(ulong k, ulong n, double p) {
dstatsEnforce(p >= 0 && p <= 1,
"p must be between 0, 1 for negative binomial distribution.");
if(k == 0)
return 1;
return betaIncomplete(k, n, 1.0L - p);
}
unittest {
assert(approxEqual(negBinomCDFR(10, 20, 0.5), 1 - negBinomCDF(9, 20, 0.5)));
}
///
ulong invNegBinomCDF(double pVal, ulong n, double p) {
dstatsEnforce(p >= 0 && p <= 1,
"p must be between 0, 1 for negative binomial distribution.");
dstatsEnforce(pVal >= 0 && pVal <= 1,
"P-values must be between 0, 1.");
// Normal or gamma approx, then adjust.
double mean = n * (1 - p) / p;
double var = n * (1 - p) / (p * p);
double skew = (2 - p) / sqrt(n * (1 - p));
double kk = 4.0L / (skew * skew);
double theta = sqrt(var / kk);
double offset = (kk * theta) - mean + 0.5L;
ulong guess;
// invGammaCDFR is very expensive, but worth it in cases where normal approx
// would be the worst. Otherwise, use normal b/c it's *MUCH* cheaper to
// calculate.
if(skew > 1.5 && var > 1_048_576)
guess = cast(long) max(round(
invGammaCDFR(1 - pVal, 1 / theta, kk) - offset), 0.0L);
else
guess = cast(long) max(round(
invNormalCDF(pVal, mean, sqrt(var)) + 0.5), 0.0L);
// This is pretty arbitrary behavior, but I don't want to use exceptions
// and it has to be handled as a special case.
if(pVal > 1 - double.epsilon)
return ulong.max;
double guessP = negBinomCDF(guess, n, p);
if(guessP == pVal) {
return guess;
} else if(guessP < pVal) {
for(ulong k = guess + 1; ; k++) {
double newP = guessP + negBinomPMF(k, n, p);
// Test for aliasing.
if(newP == guessP)
return k - 1;
if(abs(pVal - newP) > abs(guessP - pVal)) {
return k - 1;
} else if(newP >= 1) {
return k;
} else {
guessP = newP;
}
}
} else {
for(ulong k = guess - 1; guess != ulong.max; k--) {
double newP = guessP - negBinomPMF(k + 1, n, p);
// Test for aliasing.
if(newP == guessP)
return k + 1;
if(abs(newP - pVal) > abs(guessP - pVal)) {
return k + 1;
} else {
guessP = newP;
}
}
return 0;
}
}
unittest {
Random gen = Random(unpredictableSeed);
uint nSkipped;
foreach(i; 0..1000) {
uint n = uniform(1u, 10u);
double p = uniform(0.0L, 1L);
uint k = uniform(0, 20);
double pVal = negBinomCDF(k, n, p);
// In extreme tails, p-values can alias, giving incorrect results.
// This is a corner case that nothing can be done about. Just skip
// these.
if(pVal >= 1 - 10 * double.epsilon) {
nSkipped++;
continue;
}
assert(invNegBinomCDF(pVal, n, p) == k);
}
}
///
double exponentialPDF(double x, double lambda) {
dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
return lambda * exp(-lambda * x);
}
///
double exponentialCDF(double x, double lambda) {
dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
return 1.0 - exp(-lambda * x);
}
///
double exponentialCDFR(double x, double lambda) {
dstatsEnforce(x >= 0, "x must be >0 in exponential distribution");
dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
return exp(-lambda * x);
}
///
double invExponentialCDF(double p, double lambda) {
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in exponential distribution");
dstatsEnforce(lambda > 0, "lambda must be >0 in exponential distribution");
return log(-1.0 / (p - 1.0)) / lambda;
}
unittest {
// Values from R.
assert(approxEqual(exponentialPDF(0.75, 3), 0.3161977));
assert(approxEqual(exponentialCDF(0.75, 3), 0.8946008));
assert(approxEqual(exponentialCDFR(0.75, 3), 0.1053992));
assert(approxEqual(invExponentialCDF(0.8, 2), 0.804719));
assert(approxEqual(invExponentialCDF(0.2, 7), 0.03187765));
}
///
double gammaPDF(double x, double rate, double shape) {
dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
immutable scale = 1.0 / rate;
immutable firstPart = x ^^ (shape - 1);
immutable logNumer = -x / scale;
immutable logDenom = logGamma(shape) + shape * log(scale);
return firstPart * exp(logNumer - logDenom);
}
///
double gammaCDF(double x, double rate, double shape) {
dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
return gammaIncomplete(shape, rate * x);
}
///
double gammaCDFR(double x, double rate, double shape) {
dstatsEnforce(x > 0, "x must be >0 in gamma distribution.");
dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
return gammaIncompleteCompl(shape, rate * x);
}
/**This just calls invGammaCDFR w/ 1 - p b/c invGammaCDFR is more accurate,
* but this function is necessary for consistency.
*/
double invGammaCDF(double p, double rate, double shape) {
return invGammaCDFR(1.0 - p, rate, shape);
}
///
double invGammaCDFR(double p, double rate, double shape) {
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 in gamma distribution.");
dstatsEnforce(rate > 0, "rate must be >0 in gamma distribution.");
dstatsEnforce(shape > 0, "shape must be >0 in gamma distribution.");
double ratex = gammaIncompleteComplInverse(shape, p);
return ratex / rate;
}
unittest {
assert(approxEqual(gammaPDF(1, 2, 5), 0.1804470));
assert(approxEqual(gammaPDF(0.5, 8, 4), 1.562935));
assert(approxEqual(gammaPDF(3, 2, 7), 0.3212463));
assert(approxEqual(gammaCDF(1, 2, 5), 0.05265302));
assert(approxEqual(gammaCDFR(1, 2, 5), 0.947347));
double inv = invGammaCDFR(0.78, 2, 1);
assert(approxEqual(gammaCDFR(inv, 2, 1), 0.78));
double inv2 = invGammaCDF(0.78, 2, 1);
assert(approxEqual(gammaCDF(inv2, 2, 1), 0.78));
}
///
double betaPDF(double x, double alpha, double beta) {
dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
return x ^^ (alpha - 1) * (1 - x) ^^ (beta - 1) /
std.mathspecial.beta(alpha, beta);
}
///
double betaCDF(double x, double alpha, double beta) {
dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
return std.mathspecial.betaIncomplete(alpha, beta, x);
}
///
double betaCDFR(double x, double alpha, double beta) {
dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
dstatsEnforce(x >= 0 && x <= 1, "x must be between 0, 1 for beta distribution.");
return std.mathspecial.betaIncomplete(beta, alpha, 1 - x);
}
///
double invBetaCDF(double p, double alpha, double beta) {
dstatsEnforce(alpha > 0, "Alpha must be >0 for beta distribution.");
dstatsEnforce(beta > 0, "Beta must be >0 for beta distribution.");
dstatsEnforce(p >= 0 && p <= 1, "p must be between 0, 1 for beta distribution.");
return std.mathspecial.betaIncompleteInverse(alpha, beta, p);
}
unittest {
// Values from R.
assert(approxEqual(betaPDF(0.3, 2, 3), 1.764));
assert(approxEqual(betaPDF(0.78, 0.9, 4), 0.03518569));
assert(approxEqual(betaCDF(0.3, 2, 3), 0.3483));
assert(approxEqual(betaCDF(0.78, 0.9, 4), 0.9980752));
assert(approxEqual(betaCDFR(0.3, 2, 3), 0.6517));
assert(approxEqual(betaCDFR(0.78, 0.9, 4), 0.001924818));
assert(approxEqual(invBetaCDF(0.3483, 2, 3), 0.3));
assert(approxEqual(invBetaCDF(0.9980752, 0.9, 4), 0.78));
}
/**
The Dirichlet probability density.
Params:
x = An input range of observed values. All must be between [0, 1]. They
must also sum to 1, though this is not checked because small deviations from
this may result due to numerical error.
alpha = A forward range of parameters. This must have the same length as
x.
*/
double dirichletPDF(X, A)(X x, A alpha)
if(isInputRange!X && isForwardRange!A && is(ElementType!X : double) &&
is(ElementType!A : double)) {
// Evaluating the multinomial beta function = product(gamma(alpha_1)) over
// gamma(sum(alpha)), in log space.
double logNormalizer = 0;
double sumAlpha = 0;
foreach(a; alpha.save) {
dstatsEnforce(a > 0, "All alpha values must be > 0 for Dirichlet distribution.");
logNormalizer += logGamma(a);
sumAlpha += a;
}
logNormalizer -= logGamma(sumAlpha);
double sum = 0;
foreach(xElem, a; lockstep(x, alpha)) {
dstatsEnforce(xElem > 0, "All x values must be > 0 for Dirichlet distribution.");
sum += log(xElem) * (a - 1);
}
sum -= logNormalizer;
return exp(sum);
}
unittest {
// Test against beta
assert(approxEqual(dirichletPDF([0.1, 0.9], [2, 3]), betaPDF(0.1, 2, 3)));
// A few values from R's gregmisc package
assert(approxEqual(dirichletPDF([0.1, 0.2, 0.7], [4, 5, 6]), 1.356672));
assert(approxEqual(dirichletPDF([0.8, 0.05, 0.15], [8, 5, 6]), 0.04390199));
}
///
double cauchyPDF(double X, double X0 = 0, double gamma = 1) {
dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
double toSquare = (X - X0) / gamma;
return 1.0L / (
PI * gamma * (1 + toSquare * toSquare));
}
unittest {
assert(approxEqual(cauchyPDF(5), 0.01224269));
assert(approxEqual(cauchyPDF(2), 0.06366198));
}
///
double cauchyCDF(double X, double X0 = 0, double gamma = 1) {
dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
return M_1_PI * atan((X - X0) / gamma) + 0.5L;
}
unittest {
// Values from R
assert(approxEqual(cauchyCDF(-10), 0.03172552));
assert(approxEqual(cauchyCDF(1), 0.75));
}
///
double cauchyCDFR(double X, double X0 = 0, double gamma = 1) {
dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
return M_1_PI * atan((X0 - X) / gamma) + 0.5L;
}
unittest {
// Values from R
assert(approxEqual(1 - cauchyCDFR(-10), 0.03172552));
assert(approxEqual(1 - cauchyCDFR(1), 0.75));
}
///
double invCauchyCDF(double p, double X0 = 0, double gamma = 1) {
dstatsEnforce(gamma > 0, "gamma must be > 0 for Cauchy distribution.");
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
return X0 + gamma * tan(PI * (p - 0.5L));
}
unittest {
// cauchyCDF already tested. Just make sure this is the inverse.
assert(approxEqual(invCauchyCDF(cauchyCDF(.5)), .5));
assert(approxEqual(invCauchyCDF(cauchyCDF(.99)), .99));
assert(approxEqual(invCauchyCDF(cauchyCDF(.03)), .03));
}
// For K-S tests in dstats.random. To be fleshed out later. Intentionally
// lacking ddoc.
double logisticCDF(double x, double loc, double shape) {
return 1.0L / (1 + exp(-(x - loc) / shape));
}
///
double laplacePDF(double x, double mu = 0, double b = 1) {
dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
return (exp(-abs(x - mu) / b)) / (2 * b);
}
unittest {
// Values from Maxima.
assert(approxEqual(laplacePDF(3, 2, 1), 0.18393972058572));
assert(approxEqual(laplacePDF(-8, 6, 7), 0.0096668059454723));
}
///
double laplaceCDF(double X, double mu = 0, double b = 1) {
dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
double diff = (X - mu);
double sign = (diff > 0) ? 1 : -1;
return 0.5L *(1 + sign * (1 - exp(-abs(diff) / b)));
}
unittest {
// Values from Octave.
assert(approxEqual(laplaceCDF(5), 0.9963));
assert(approxEqual(laplaceCDF(-3.14), .021641));
assert(approxEqual(laplaceCDF(0.012), 0.50596));
}
///
double laplaceCDFR(double X, double mu = 0, double b = 1) {
dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
double diff = (mu - X);
double sign = (diff > 0) ? 1 : -1;
return 0.5L *(1 + sign * (1 - exp(-abs(diff) / b)));
}
unittest {
// Values from Octave.
assert(approxEqual(1 - laplaceCDFR(5), 0.9963));
assert(approxEqual(1 - laplaceCDFR(-3.14), .021641));
assert(approxEqual(1 - laplaceCDFR(0.012), 0.50596));
}
///
double invLaplaceCDF(double p, double mu = 0, double b = 1) {
dstatsEnforce(p >= 0 && p <= 1, "P-values must be between 0, 1.");
dstatsEnforce(b > 0, "b must be > 0 for laplace distribution.");
double p05 = p - 0.5L;
double sign = (p05 < 0) ? -1.0L : 1.0L;
return mu - b * sign * log(1.0L - 2 * abs(p05));
}
unittest {
assert(approxEqual(invLaplaceCDF(0.012), -3.7297));
assert(approxEqual(invLaplaceCDF(0.82), 1.0217));
}
double kolmDist()(double x) {
pragma(msg, "kolmDist is scheduled for deprecation. Please use " ~
"kolmogorovDistrib instead.");
return kolmogorovDistrib(x);
}
/**Kolmogorov distribution. Used in Kolmogorov-Smirnov testing.
*
* References: http://en.wikipedia.org/wiki/Kolmogorov-Smirnov
*/
double kolmogorovDistrib(immutable double x) {
dstatsEnforce(x >= 0, "x must be >= 0 for Kolmogorov distribution.");
if(x == 0) {
//Handle as a special case. Otherwise, get NAN b/c of divide by zero.
return 0;
}
double result = 0;
double i = 1;
immutable xSquared = x * x;
while(true) {
immutable delta = exp(-(2 * i - 1) * (2 * i - 1) * PI * PI / (8 * xSquared));
i++;
immutable oldResult = result;
result += delta;
if(isNaN(result) || oldResult == result) {
break;
}
}
result *= (sqrt(2 * PI) / x);
return result;
}
unittest {
assert(approxEqual(1 - kolmogorovDistrib(.75), 0.627167));
assert(approxEqual(1 - kolmogorovDistrib(.5), 0.9639452436));
assert(approxEqual(1 - kolmogorovDistrib(.9), 0.39273070));
assert(approxEqual(1 - kolmogorovDistrib(1.2), 0.112249666));
}