add H-test

fslaborg · Sep 1, 2020 · 8fc3c5f · 8fc3c5f
1 parent ce00f67
commit 8fc3c5f
Show file tree

Hide file tree

Showing 5 changed files with 139 additions and 0 deletions.
diff --git a/FSharp.Stats.sln b/FSharp.Stats.sln
@@ -35,6 +35,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "content", "content", "{8E6D
 		docsrc\content\Fitting.fsx = docsrc\content\Fitting.fsx
 		docsrc\content\GoodnessOfFit.fsx = docsrc\content\GoodnessOfFit.fsx
 		docsrc\content\GrowthCurve.fsx = docsrc\content\GrowthCurve.fsx
+		HTest.fsx = HTest.fsx
 		docsrc\content\Imputation.fsx = docsrc\content\Imputation.fsx
 		docsrc\content\index.fsx = docsrc\content\index.fsx
 		docsrc\content\Integration.fsx = docsrc\content\Integration.fsx

diff --git a/HTest.fsx b/HTest.fsx
@@ -0,0 +1,59 @@
+(*** hide ***)
+// This block of code is omitted in the generated HTML documentation. Use 
+// it to define helpers that you do not want to show in the documentation.
+#r @"C:\Users\Selly\source\repos\FSharp.Stats\bin\FSharp.Stats\netstandard2.0\FSharp.Stats.dll"
+#r @"C:\Users\Selly\source\repos\FSharp.Plotly-developer\FSharp.Plotly-developer\bin\FSharp.Plotly\netstandard2.0\FSharp.Plotly.dll"
+
+
+open FSharp.Plotly
+open FSharp.Plotly.Axis
+open FSharp.Plotly.StyleParam
+
+// Linear axis labelled `title` with the documentation's default look:
+// Mirror.All, inside ticks, grid off, axis line on, zero line on.
+let myAxis title =
+    LinearAxis.init(
+        Title = title,
+        Mirror = Mirror.All,
+        Ticks = TickOptions.Inside,
+        Showgrid = false,
+        Showline = true,
+        Zeroline = true)
+// Same axis settings as the default documentation axis, but with the visible
+// interval fixed to `range` (passed on as Range.MinMax).
+let myAxisRange title range =
+    LinearAxis.init(
+        Title = title,
+        Range = Range.MinMax range,
+        Mirror = Mirror.All,
+        Ticks = TickOptions.Inside,
+        Showgrid = false,
+        Showline = true,
+        Zeroline = true)
+// Applies the `myAxis` styling to both axes of `chart`; `x` and `y` are the axis titles.
+let styleChart x y chart =
+    let xAxis = myAxis x
+    let yAxis = myAxis y
+    chart
+    |> Chart.withX_Axis xAxis
+    |> Chart.withY_Axis yAxis
+// Applies the `myAxisRange` styling to both axes of `chart`; `x`/`y` are the axis
+// titles and `rx`/`ry` the ranges handed to the x and y axis respectively.
+let styleChartRange x y rx ry chart =
+    let xAxis = myAxisRange x rx
+    let yAxis = myAxisRange y ry
+    chart
+    |> Chart.withX_Axis xAxis
+    |> Chart.withY_Axis yAxis
+
+(**
+#Statistical testing
+FSharp.Stats provides hypothesis tests for different applications.
+A hypothesis test is a statistical test that is used to determine whether there is enough evidence 
+in a sample of data to infer that a certain condition is true for the entire population. 
+A hypothesis test examines two opposing hypotheses about a population: the null hypothesis and the alternative hypothesis.
+<a name="TestStatistics"></a>
+##Test Statistics
+<a name="Anova"></a>
+##Anova
+*)
+
+open FSharp.Stats
+open FSharp.Stats.Testing
+
+(** 
+<a name="HTest"></a>
+##H-Test
+The H test is also known as Kruskal-Wallis one-way analysis-of-variance-by-ranks and is the nonparametric equivalent of one-way ANOVA. 
+It is a non-parametric test for comparing the mean ranks of more than two independent samples (equal or different sample size), and therefore is an extension of the Wilcoxon-Mann-Whitney two sample test.
+Testing with H test gives information whether the samples are from the same distribution.
+A benefit of the H-test is that it does not require the samples to be normally distributed.
+The downside is that there is no information which samples are different from each other, or how many differences occur. For further investigation a Post Hoc test is recommended. 
+    Prerequisite : 
+        - random and independent samples
+        - observations are from populations with same shape of distribution
+        - nominal scale, ordinal scale, ratio scale or interval scale data
+The distribution of the H test statistic is approximated by a chi-square distribution with k - 1 degrees of freedom, where k is the number of groups. 
+
+References : 
+        - E. Ostertagová,  Methodology and Application of the Kruskal-Wallis Test (2014)
+        - Y. Chan, RP Walmsley, Learning and understanding the Kruskal-Wallis one-way analysis-of-variance-by-ranks test for differences among three or more independent groups (1997)
+
+*H-test*
+input : seq{seq<float>} 
+*)
+
+// three independent groups with five observations each
+let groupA = seq {23.;41.;54.;66.;78.} 
+let groupB = seq {45.;55.;60.;70.;72.}
+let groupC = seq {18.;30.;34.;40.;44.} 
+// the H-test expects the groups as a sequence of sequences
+let samples = seq{groupA;groupB;groupC}
+
+// calculation of p-Value
+// hResult is whatever TestStatistics.createChiSquare builds from the H statistic
+// and (number of groups - 1) degrees of freedom.
+// NOTE(review): presumably this exposes the p-value as a field - confirm against TestStatistics.
+let hResult = HTest.htest samples
diff --git a/docsrc/content/Testing.fsx b/docsrc/content/Testing.fsx
@@ -310,6 +310,7 @@ let fTestFromData = FTest.testVariances sampleFA sampleFB
 (** 
 *F-Test from given parameters:*
 *)
+
 // sample properties are given as (variance,degree of freedom) 
 let sampleF1 = (0.1, 15.)
 let sampleF2 = (0.05, 7.)

diff --git a/src/FSharp.Stats/FSharp.Stats.fsproj b/src/FSharp.Stats/FSharp.Stats.fsproj
@@ -106,6 +106,7 @@
     <Compile Include="Testing\SAM.fs" />
     <Compile Include="Testing\FisherHotelling.fs" />
     <Compile Include="Testing\RMT.fs" />
+    <Compile Include="HTest.fs" />
     <!-- Fitting -->
     <Compile Include="Fitting\CrossValidation.fs" />
     <Compile Include="Fitting\LinearRegression.fs" />

diff --git a/src/FSharp.Stats/HTest.fs b/src/FSharp.Stats/HTest.fs
@@ -0,0 +1,77 @@
+namespace FSharp.Stats.Testing
+
+
+module HTest =
+
+    open FSharp.Stats
+    /// Kruskal-Wallis H-test (one-way analysis of variance by ranks).
+    ///
+    /// All observations are pooled and ranked with Rank.rankAverage, then
+    ///     H = 12 / (N * (N + 1)) * sum_i (R_i^2 / n_i) - 3 * (N + 1)
+    /// is computed, where N is the total number of observations, n_i the size of
+    /// group i and R_i the sum of the ranks belonging to group i.
+    /// If the pooled data contain equal values, H is divided by the tie correction
+    ///     1 - sum_t (t^3 - t) / (N^3 - N)
+    /// where t runs over the sizes of the sets of equal values.
+    ///
+    /// samples: one sequence of float observations per group.
+    /// Returns TestStatistics.createChiSquare applied to the (tie corrected)
+    /// statistic and (number of groups - 1) degrees of freedom.
+    let htest (samples : seq<#seq<float>>) =
+        // materialise every group once, so that lazy inputs are enumerated a single time
+        let groups =
+            samples
+            |> Seq.map (fun group -> Seq.toArray (group :> seq<float>))
+            |> Seq.toArray
+
+        // n_i: number of observations in each group
+        let n = groups |> Array.map (fun (group : float []) -> float group.Length)
+
+        // pooled observations (group order is preserved) and total sample size N
+        let allValues = Array.concat groups
+        let samplesize = float allValues.Length
+
+        // ranks of the pooled observations, position-aligned with allValues
+        // (rankAverage is expected to give tied values their mean rank)
+        let ranked = FSharp.Stats.Rank.rankAverage allValues
+
+        // R_i: rank sum of each group. Group i owns the slice of `ranked` that
+        // starts at the combined size of all groups before it.
+        let offsets = groups |> Array.scan (fun acc (group : float []) -> acc + group.Length) 0
+        let ranks =
+            groups
+            |> Array.mapi (fun i (group : float []) ->
+                Array.sub ranked offsets.[i] group.Length
+                |> Array.sum
+                |> float)
+
+        // sum of (t^3 - t) over all sets of t equal observations;
+        // values that occur once contribute 1 - 1 = 0, so this is 0. without ties
+        let totalties =
+            allValues
+            |> Array.countBy id
+            |> Array.sumBy (fun (_, count) ->
+                let t = float count
+                t ** 3. - t)
+
+        // sum_i (R_i^2 / n_i)
+        let sums = Array.map2 (fun rankSum size -> rankSum ** 2. / size) ranks n |> Array.sum
+
+        // degrees of freedom: number of groups - 1
+        let dof = float groups.Length - 1.
+
+        // H statistic before the tie correction
+        let testWithoutBindings = 12. / (samplesize * (samplesize + 1.)) * sums - 3. * (samplesize + 1.)
+
+        // the correction factor only applies when ties are present;
+        // without ties the uncorrected statistic is used unchanged
+        let statistic =
+            if totalties > 0. then
+                let correctionFactor = 1. - totalties / (samplesize ** 3. - samplesize)
+                testWithoutBindings / correctionFactor
+            else
+                testWithoutBindings
+
+        FSharp.Stats.Testing.TestStatistics.createChiSquare statistic dof