JuliaStats · diegozea · May 21, 2016 · May 24, 2016 · May 24, 2016 · May 24, 2016
diff --git a/src/HypothesisTests.jl b/src/HypothesisTests.jl
@@ -37,7 +37,7 @@ check_same_length(x::Vector, y::Vector) = if length(x) != length(y)
 end
 
 # Basic function for finding a p-value given a distribution and tail
-pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) = 
+pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) =
     if tail == :both
         min(2 * min(cdf(dist, x), ccdf(dist, x)), 1.0)
     elseif tail == :left
@@ -48,7 +48,7 @@ pvalue(dist::ContinuousUnivariateDistribution, x::Number; tail=:both) =
         throw(ArgumentError("tail=$(tail) is invalid"))
     end
 
-pvalue(dist::DiscreteUnivariateDistribution, x::Number; tail=:both) = 
+pvalue(dist::DiscreteUnivariateDistribution, x::Number; tail=:both) =
     if tail == :both
         min(2 * min(ccdf(dist, x-1), cdf(dist, x)), 1.0)
     elseif tail == :left
@@ -138,4 +138,5 @@ include("t.jl")
 include("wilcoxon.jl")
 include("power_divergence.jl")
 include("anderson_darling.jl")
+include("spearman.jl")
 end
diff --git a/src/spearman.jl b/src/spearman.jl
@@ -0,0 +1,169 @@
+# spearman.jl
+# Spearman's rank correlation test in Julia
+#
+# Copyright (C) 2016   Diego Javier Zea
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+export CorrelationTest, SpearmanCorrelationTest
+
+abstract CorrelationTest <: HypothesisTest
+
+"Sum squared difference of ranks (midranks for ties)"
+spearman_S(xrank, yrank) = sumabs2(xrank .- yrank)
+
+immutable SpearmanCorrelationTest <: CorrelationTest
+    # Tied ranking for x and y
+    xrank::Vector{Float64}
+    yrank::Vector{Float64}
+    # Adjustment for ties
+    xtiesadj::Float64
+    ytiesadj::Float64
+    # Sum squared difference of ranks
+    S::Float64
+    # Number of points
+    n::Int
+    # Spearman's ρ
+    ρ::Float64
+
+    function SpearmanCorrelationTest(x, y)
+
+        n = length(x)
+        (n != length(y)) && throw(ErrorException("x and y must have the same length"))
+
+        xrank, xtiesadj = HypothesisTests.tiedrank_adj(x)
+        yrank, ytiesadj = HypothesisTests.tiedrank_adj(y)
+
+        S = spearman_S(xrank, yrank)
+
+        ρ = corspearman(x, y)
+
+        new(xrank, yrank, xtiesadj, ytiesadj, S, n, ρ)
+    end
+
+end
+
+testname(::SpearmanCorrelationTest) = "Spearman's rank correlation test"
+
+# parameter of interest: name, value under h0, point estimate
+population_param_of_interest(x::SpearmanCorrelationTest) = ("Spearman's ρ", 0.0, x.ρ)
+
+function show_params(io::IO, x::SpearmanCorrelationTest, ident)
+    println(io, ident, "Number of points:                    ", x.n)
+    println(io, ident, "Spearman's ρ:                        ", x.ρ)
+    println(io, ident, "S (Sum squared difference of ranks): ", x.S)
+    println(io, ident, "adjustment for ties in x:            ", x.xtiesadj)
+    println(io, ident, "adjustment for ties in y:            ", x.ytiesadj)
+end
+
+function P_from_null_S_values(S_null, x::SpearmanCorrelationTest, tail)
+    S_null_mean = mean(S_null)
+    # S is approximately normally distributed
+    # S and ρ are inversely proportional
+    S_null[:] = S_null .- S_null_mean # center
+    S_centered = x.S - S_null_mean    # center
+    if tail == :both
+        modS = abs(S_centered)
+        mean(S_null .<= -modS) + mean(S_null .>= modS)
+    elseif tail == :right
+        mean(S_null .<= S_centered)
+    elseif tail == :left
+        mean(S_null .>= S_centered)
+    else
+        throw(ArgumentError("tail=$(tail) is invalid"))
+    end
+end
+
+function spearman_P_exact(x::SpearmanCorrelationTest, tail)
+    S_null = Float64[ spearman_S(perm, x.yrank) for perm in permutations(x.xrank) ]
+    P_from_null_S_values(S_null, x, tail)
+end
+
+function spearman_P_sampling(x::SpearmanCorrelationTest, tail)
+    # 360000 samples gives an se(P) < 0.0005 for P < 0.1
+    X = copy(x.xrank)
+    S_null = Float64[ spearman_S(shuffle!(X), x.yrank) for sample in 1:360000 ]
+    P_from_null_S_values(S_null, x, tail)
+end
+
+# Use estimated mean and std for the S null distribution as in:
+#
+# Press WH, Teukolsky SA, Vetterling WT, Flannery BP.
+# Numerical recipes in C.
+# Cambridge: Cambridge university press; 1996.
+function spearman_P_estimated(x::SpearmanCorrelationTest, tail)
+    N = float(x.n)
+    a = (N^3 - N)
+    # Numerical Recipes (14.6.6)
+    S_null_mean = (a/6.) - (x.xtiesadj/12.) - (x.ytiesadj/12.)
+    # Numerical Recipes (14.6.7)
+    S_null_std  = sqrt( ((N - 1.)/36.) * (1. - (x.xtiesadj/a) ) * (1. - (x.ytiesadj/a) ) ) * N * (N + 1.)
+    zscore = (x.S - S_null_mean)/S_null_std
+    # S is approximately normally distributed
+    # S and ρ are inversely proportional
+    if tail == :both
+        cdf(Normal(), -abs(zscore)) + ccdf(Normal(), abs(zscore))
+    elseif tail == :right
+        cdf(Normal(),  zscore)
+    elseif tail == :left
+        ccdf(Normal(), zscore)
+    else
+        throw(ArgumentError("tail=$(tail) is invalid"))
+    end
+end
+
+# Using T test for n > 10 as in:
+#
+# McDonald JH.
+# Handbook of biological statistics.
+# Baltimore, MD: Sparky House Publishing; 2009 Aug.
+function spearman_P_ttest(x::SpearmanCorrelationTest, tail)
+    ρ2 = x.ρ^2
+    df = x.n-2
+    t = sqrt((df*ρ2)/(1-ρ2))
+    if tail == :both
+        cdf(TDist(df), -t) + ccdf(Normal(), t)
+    elseif tail == :right
+        ccdf(TDist(df), t)
+    elseif tail == :left
+        cdf(TDist(df),  t)
+    else
+        throw(ArgumentError("tail=$(tail) is invalid"))
+    end
+end
+
+function pvalue(x::SpearmanCorrelationTest; tail=:both, method=:estimated)
+    if x.n <= 10
+        # Exact P value using permutations
+        return( spearman_P_exact(x, tail) )
+    end
+    if method == :sampling
+        return( spearman_P_sampling(x, tail) )
+    elseif method == :exact
+        return( spearman_P_exact(x, tail) )
+    elseif method == :estimated
+        return( spearman_P_estimated(x, tail) )
+    elseif method == :ttest
+        return( spearman_P_ttest(x, tail) )
+    else
+        throw(ArgumentError("method=$(method) is invalid"))
+    end
+end
+
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -9,4 +9,5 @@ include("t.jl")
 include("wilcoxon.jl")
 include("power_divergence.jl")
 include("anderson_darling.jl")
+include("spearman.jl")
 include("common.jl")
diff --git a/test/spearman.jl b/test/spearman.jl
@@ -0,0 +1,67 @@
+using HypothesisTests, Base.Test
+
+# Test Exact P value: n <= 10
+let x = [ 44.4, 45.9, 41.9, 53.3, 44.7, 44.1 ],
+    y = [  2.6,  3.1,  2.5,  5.0,  3.6,  4.0 ]
+
+    corr = HypothesisTests.SpearmanCorrelationTest(x, y)
+
+    # R values
+    @test_approx_eq corr.ρ 0.6
+    @test_approx_eq_eps HypothesisTests.pvalue(corr) 0.2417 0.0001
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right) 0.1208 0.0001
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:left) 0.9125 0.0001
+end
+
+show(IOBuffer(),
+     HypothesisTests.SpearmanCorrelationTest([ 44.4, 45.9, 41.9, 53.3, 44.7, 44.1 ],
+                                             [  2.6,  3.1,  2.5,  5.0,  3.6,  4.0 ])
+     )
+
+let x = collect(1:11),
+    y = [6,5,4,3,2,1,7,11,10,9,8]
+    # https://stat.ethz.ch/pipermail/r-devel/2009-February/052112.html
+    # correct P value 0.03044548
+
+    corr = HypothesisTests.SpearmanCorrelationTest(x, y)
+
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:exact)     0.03044548 1e-8
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:sampling)  0.030      1e-3
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:estimated) 0.030      1e-3
+    @test_approx_eq_eps HypothesisTests.pvalue(corr, tail=:right, method=:ttest)     0.03       1e-2
+end
+
+let x = collect(1:10),
+    y = [5,4,3,2,1,6,10,9,8,7]
+
+    # R's pspearman: 0.05443067 is the exact value
+    corr = HypothesisTests.SpearmanCorrelationTest(x, y)
+    @test_approx_eq_eps HypothesisTests.pvalue(corr) 0.05443067 1e-8
+end
+
+# Using (N-1)N²(N+1)² overflows with N = 10153
+let x = rand(10153)
+
+    corr = SpearmanCorrelationTest(x,x)
+
+    @test_approx_eq corr.ρ 1.0
+    @test_approx_eq pvalue(corr) 0.0
+
+end
+
+# Test S value with ties
+
+function rho_with_ties(S,N,tx,ty) # S == D
+    # Equation (14.6.5) from Numerical Recipes for rho with ties
+    a=(N^3)-N
+    (1-((6/a)*(S+(tx/12)+(ty/12)))) / (sqrt(1-(tx/a))*sqrt(1-(ty/a)))
+end
+
+function diff_rho(x,y)
+    corr = SpearmanCorrelationTest(x, y)
+    corr.ρ - rho_with_ties(corr.S, corr.n, corr.xtiesadj, corr.ytiesadj)
+end
+
+for i in 20:100
+    @test_approx_eq_eps diff_rho(rand(1:10,i), rand(1:10,i)) 0.0 1e-10
+end