From e4b3bc2ebda9a505d2050a85020d059fa1cff565 Mon Sep 17 00:00:00 2001 From: Pierre-Alain Castella Date: Fri, 22 Aug 2025 23:21:28 +0200 Subject: [PATCH 1/4] feat: Remove unused imports. --- src/DataFrame/Operations/Aggregation.hs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 43c794ac..219fe9c9 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -20,8 +20,6 @@ import qualified Data.Vector.Algorithms.Merge as VA import qualified Data.Vector.Generic as VG import qualified Data.Vector.Mutable as VM import qualified Data.Vector.Unboxed as VU -import qualified Statistics.Quantile as SS -import qualified Statistics.Sample as SS import Control.Exception (throw) import Control.Monad (foldM_) From 926f6334e33f36b8e86a843b374c8a6aae950610 Mon Sep 17 00:00:00 2001 From: Pierre-Alain Castella Date: Fri, 22 Aug 2025 23:22:06 +0200 Subject: [PATCH 2/4] feat: Create custom version of standard deviation. --- src/DataFrame/Operations/Statistics.hs | 12 +++++++- tests/Operations/Statistics.hs | 42 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/DataFrame/Operations/Statistics.hs b/src/DataFrame/Operations/Statistics.hs index 62e9c83f..e30f10dd 100644 --- a/src/DataFrame/Operations/Statistics.hs +++ b/src/DataFrame/Operations/Statistics.hs @@ -62,7 +62,7 @@ median :: T.Text -> DataFrame -> Maybe Double median = applyStatistic median' standardDeviation :: T.Text -> DataFrame -> Maybe Double -standardDeviation = applyStatistic SS.fastStdDev +standardDeviation = applyStatistic standardDeviation' skewness :: T.Text -> DataFrame -> Maybe Double skewness = applyStatistic SS.skewness @@ -178,6 +178,16 @@ median' samp = runST $ do else if odd length then sortedSamp VU.! middleIndex else (sortedSamp VU.! (middleIndex - 1) + sortedSamp VU.! 
middleIndex) / 2 +standardDeviation' :: VU.Vector Double -> Double +standardDeviation' samp + | length == 0 = throw $ EmptyDataSetException "standardDeviation" + | otherwise = sqrt variance + where + m = mean' samp + length = VU.length samp + squares = VU.map (\x -> (x - m) ^ 2) samp + variance = VU.sum squares / fromIntegral length + -- accumulator: count, mean, m2 data VarAcc = VarAcc !Int !Double !Double deriving (Show) diff --git a/tests/Operations/Statistics.hs b/tests/Operations/Statistics.hs index 72211629..3646bdfe 100644 --- a/tests/Operations/Statistics.hs +++ b/tests/Operations/Statistics.hs @@ -10,6 +10,7 @@ import qualified DataFrame as DE import Assertions import Test.HUnit +-- median' medianOfOddLengthDataSet :: Test medianOfOddLengthDataSet = TestCase @@ -37,9 +38,50 @@ medianOfEmptyDataSet = (print $ D.median' (VU.fromList [])) ) +-- standardDeviation' +standardDeviationOfSingleElementDataSet :: Test +standardDeviationOfSingleElementDataSet = + TestCase + ( assertEqual + "Standard deviation of a data set with a single element" + (D.standardDeviation' (VU.fromList [-3.5])) + 0 + ) + +standardDeviationOfSameElementsDataSet :: Test +standardDeviationOfSameElementsDataSet = + TestCase + ( assertEqual + "Standard deviation of a data set with the same elements" + (D.standardDeviation' (VU.fromList [3.5, 3.5, 3.5, 3.5])) + 0 + ) + +standardDeviationOfSimpleDataSet :: Test +standardDeviationOfSimpleDataSet = + TestCase + ( assertEqual + "Standard deviation of a simple data set" + (D.standardDeviation' (VU.fromList [2, 4, 4, 4, 5, 5, 7, 9])) + 2 + ) + +standardDeviationOfEmptyDataSet :: Test +standardDeviationOfEmptyDataSet = + TestCase + ( assertExpectException + "[Error Case]" + (DE.emptyDataSetError "standardDeviation") + (print $ D.standardDeviation' (VU.fromList [])) + ) + tests :: [Test] tests = [ TestLabel "medianOfOddLengthDataSet" medianOfOddLengthDataSet , TestLabel "medianOfEvenLengthDataSet" medianOfEvenLengthDataSet , TestLabel 
"medianOfEmptyDataSet" medianOfEmptyDataSet + , TestLabel "standardDeviationOfSingleElementDataSet" standardDeviationOfSingleElementDataSet + , TestLabel "standardDeviationOfSameElementsDataSet" standardDeviationOfSameElementsDataSet + , TestLabel "standardDeviationOfSimpleDataSet" standardDeviationOfSimpleDataSet + , TestLabel "standardDeviationOfEmptyDataSet" standardDeviationOfEmptyDataSet ] From c9f97d78ca6d67c4f3100d415636fec3efde268c Mon Sep 17 00:00:00 2001 From: Pierre-Alain Castella Date: Fri, 22 Aug 2025 23:46:33 +0200 Subject: [PATCH 3/4] feat: Rewrite standard deviation implementation using existing variance. --- src/DataFrame/Operations/Statistics.hs | 11 ++----- tests/Operations/Statistics.hs | 42 -------------------------- 2 files changed, 3 insertions(+), 50 deletions(-) diff --git a/src/DataFrame/Operations/Statistics.hs b/src/DataFrame/Operations/Statistics.hs index e30f10dd..57318469 100644 --- a/src/DataFrame/Operations/Statistics.hs +++ b/src/DataFrame/Operations/Statistics.hs @@ -62,7 +62,7 @@ median :: T.Text -> DataFrame -> Maybe Double median = applyStatistic median' standardDeviation :: T.Text -> DataFrame -> Maybe Double -standardDeviation = applyStatistic standardDeviation' +standardDeviation = applyStatistic (sqrt . 
variance') skewness :: T.Text -> DataFrame -> Maybe Double skewness = applyStatistic SS.skewness @@ -180,13 +180,8 @@ median' samp = runST $ do standardDeviation' :: VU.Vector Double -> Double standardDeviation' samp - | length == 0 = throw $ EmptyDataSetException "standardDeviation" - | otherwise = sqrt variance - where - m = mean' samp - length = VU.length samp - squares = VU.map (\x -> (x - m) ^ 2) samp - variance = VU.sum squares / fromIntegral length + | VU.length samp == 0 = throw $ EmptyDataSetException "standardDeviation" + | otherwise = sqrt $ variance' samp -- accumulator: count, mean, m2 data VarAcc = VarAcc !Int !Double !Double deriving (Show) diff --git a/tests/Operations/Statistics.hs b/tests/Operations/Statistics.hs index 3646bdfe..72211629 100644 --- a/tests/Operations/Statistics.hs +++ b/tests/Operations/Statistics.hs @@ -10,7 +10,6 @@ import qualified DataFrame as DE import Assertions import Test.HUnit --- median' medianOfOddLengthDataSet :: Test medianOfOddLengthDataSet = TestCase @@ -38,50 +37,9 @@ medianOfEmptyDataSet = (print $ D.median' (VU.fromList [])) ) --- standardDeviation' -standardDeviationOfSingleElementDataSet :: Test -standardDeviationOfSingleElementDataSet = - TestCase - ( assertEqual - "Standard deviation of a data set with a single element" - (D.standardDeviation' (VU.fromList [-3.5])) - 0 - ) - -standardDeviationOfSameElementsDataSet :: Test -standardDeviationOfSameElementsDataSet = - TestCase - ( assertEqual - "Standard deviation of a data set with the same elements" - (D.standardDeviation' (VU.fromList [3.5, 3.5, 3.5, 3.5])) - 0 - ) - -standardDeviationOfSimpleDataSet :: Test -standardDeviationOfSimpleDataSet = - TestCase - ( assertEqual - "Standard deviation of a simple data set" - (D.standardDeviation' (VU.fromList [2, 4, 4, 4, 5, 5, 7, 9])) - 2 - ) - -standardDeviationOfEmptyDataSet :: Test -standardDeviationOfEmptyDataSet = - TestCase - ( assertExpectException - "[Error Case]" - (DE.emptyDataSetError "standardDeviation") - 
(print $ D.standardDeviation' (VU.fromList [])) - ) - tests :: [Test] tests = [ TestLabel "medianOfOddLengthDataSet" medianOfOddLengthDataSet , TestLabel "medianOfEvenLengthDataSet" medianOfEvenLengthDataSet , TestLabel "medianOfEmptyDataSet" medianOfEmptyDataSet - , TestLabel "standardDeviationOfSingleElementDataSet" standardDeviationOfSingleElementDataSet - , TestLabel "standardDeviationOfSameElementsDataSet" standardDeviationOfSameElementsDataSet - , TestLabel "standardDeviationOfSimpleDataSet" standardDeviationOfSimpleDataSet - , TestLabel "standardDeviationOfEmptyDataSet" standardDeviationOfEmptyDataSet ] From a602271b414a4e78eef70994d84cf8db07b88a0d Mon Sep 17 00:00:00 2001 From: Pierre-Alain Castella Date: Fri, 22 Aug 2025 23:48:08 +0200 Subject: [PATCH 4/4] chore: Remove unused standard deviation implementation. --- src/DataFrame/Operations/Statistics.hs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/DataFrame/Operations/Statistics.hs b/src/DataFrame/Operations/Statistics.hs index 57318469..1352f680 100644 --- a/src/DataFrame/Operations/Statistics.hs +++ b/src/DataFrame/Operations/Statistics.hs @@ -178,11 +178,6 @@ median' samp = runST $ do else if odd length then sortedSamp VU.! middleIndex else (sortedSamp VU.! (middleIndex - 1) + sortedSamp VU.! middleIndex) / 2 -standardDeviation' :: VU.Vector Double -> Double -standardDeviation' samp - | VU.length samp == 0 = throw $ EmptyDataSetException "standardDeviation" - | otherwise = sqrt $ variance' samp - -- accumulator: count, mean, m2 data VarAcc = VarAcc !Int !Double !Double deriving (Show)