From d2ce39387f9a3ef09582a7a914952d76c69e77ac Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Tue, 5 Dec 2023 07:44:42 -0800 Subject: [PATCH 1/9] implemented series_outlier method --- .../functions/time-series-functions.md | 51 ++++++ src/Functions/seriesOutliersTukey.cpp | 173 ++++++++++++++++++ .../02813_seriesOutliersTukey.reference | 3 + .../0_stateless/02813_seriesOutliersTukey.sql | 12 ++ .../aspell-ignore/en/aspell-dict.txt | 1 + 5 files changed, 240 insertions(+) create mode 100644 src/Functions/seriesOutliersTukey.cpp create mode 100644 tests/queries/0_stateless/02813_seriesOutliersTukey.reference create mode 100644 tests/queries/0_stateless/02813_seriesOutliersTukey.sql diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 016c3410944e..4eb838721b27 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -8,6 +8,57 @@ sidebar_label: Time Series Below functions are used for time series analysis. +## seriesOutliersTukey + +Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersTukey(series); +``` + +**Arguments** + +- `series` - An array of numeric values + +**Returned value** + +- Returns an array of the same length where each value represents a modified Z-score of possible anomaly of corresponding element in the series. +- A value greater than 3 or lesser than -3 indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql +seriesOutliersTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0──────────────────────────────────────────────────────────────────┐ +│[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] │ +└────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +``` + +Result: + +``` text +┌───────────print_0────────────────────────────────────────────┐ +│[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] │ +└──────────────────────────────────────────────────────────────┘ +``` + ## seriesPeriodDetectFFT Finds the period of the given time series data using FFT diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersTukey.cpp new file mode 100644 index 000000000000..1291d5bf09c9 --- /dev/null +++ b/src/Functions/seriesOutliersTukey.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_COLUMN; +} + +//Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) +class FunctionSeriesOutliersTukey : public IFunction +{ +public: + static constexpr auto name = "seriesOutliersTukey"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + 
FunctionArgumentDescriptors args{{"time_series", &isArray, nullptr, "Array"}}; + validateFunctionArgumentTypes(*this, arguments, args); + + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + { + ColumnPtr array_ptr = arguments[0].column; + const ColumnArray * array = checkAndGetColumn(array_ptr.get()); + + const IColumn & src_data = array->getData(); + const ColumnArray::Offsets & src_offsets = array->getOffsets(); + + ColumnPtr res; + + if (executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) + || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) + || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) + || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) + || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res)) + { + return res; + } + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), + getName()); + } + + template + bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, ColumnPtr & res_ptr) const + { + const ColumnVector * src_data_concrete = checkAndGetColumn>(&src_data); + if (!src_data_concrete) + return false; + + const PaddedPODArray & src_vec = src_data_concrete->getData(); + + auto outliers = ColumnFloat64::create(); + auto & outlier_data = outliers->getData(); + + ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create(); + auto & res_offsets_data = res_offsets->getData(); + + ColumnArray::Offset prev_src_offset = 0; + for (auto curr_src_offset : src_offsets) + { + chassert(prev_src_offset < curr_src_offset); + size_t len = curr_src_offset - prev_src_offset; + if (len < 4) + throw 
Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); + + std::vector src_sorted(src_vec.begin() + prev_src_offset, src_vec.begin() + curr_src_offset); + std::sort(src_sorted.begin(), src_sorted.end()); + + size_t q1_index = len / 4; + size_t q3_index = (len * 3) / 4; + + Float64 q1 = (len % 2 != 0) ? src_sorted[q1_index] : (src_sorted[q1_index - 1] + src_sorted[q1_index]) / 2; + Float64 q3 = (len % 2 != 0) ? src_sorted[q3_index] : (src_sorted[q3_index - 1] + src_sorted[q3_index]) / 2; + + Float64 iqr = q3 - q1; + + Float64 lower_fence = q1 - 1.5 * iqr; + Float64 upper_fence = q3 + 1.5 * iqr; + + for (auto elem : src_vec) + { + auto score = std::min((elem - lower_fence) / iqr, 0.0) + std::max((elem - upper_fence) / iqr, 0.0); + outlier_data.push_back(score); + } + res_offsets_data.push_back(outlier_data.size()); + prev_src_offset = curr_src_offset; + } + + res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets)); + return true; + } +}; + +REGISTER_FUNCTION(SeriesOutliersTukey) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersTukey(series); +``` + +**Arguments** + +- `series` - An array of numeric values + +**Returned value** + +- Returns an array of the same length where each value represents a modified Z-score of possible anomaly of corresponding element in the series. +- A value greater than 3 or lesser than -3 indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql +seriesOutliersTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0──────────────────────────────────────────────────────────────────┐ +│[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] │ +└────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +``` + +Result: + +``` text +┌───────────print_0────────────────────────────────────────────┐ +│[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] │ +└──────────────────────────────────────────────────────────────┘ +```)", + .categories{"Time series analysis"}}); +} +} diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference new file mode 100644 index 000000000000..b370b4b5fe22 --- /dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference @@ -0,0 +1,3 @@ +[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] +[0,0,0,0,0,0,0,0,0,5.228971962616823,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql new file mode 100644 index 000000000000..4b24f1bffbb7 --- /dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql @@ -0,0 +1,12 @@ +DROP TABLE IF EXISTS tb1; + +CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; +INSERT INTO tb1 VALUES (1, [-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]); +INSERT INTO tb1 VALUES (2, [-3,2.4,15,3.9,5,6,4.5,5.2,12,45,12,3.4,3,4,5,6]); + +SELECT seriesOutliersTukey(a) FROM tb1 ORDER BY n; +DROP TABLE IF EXISTS tb1; +SELECT seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersTukey([-3, 2.4, 15, NULL]); -- { 
serverError ILLEGAL_COLUMN} +SELECT seriesOutliersTukey([]); -- { serverError ILLEGAL_COLUMN} +SELECT seriesOutliersTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 9f87255c9fa0..64327aba2d10 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -939,6 +939,7 @@ TotalTemporaryFiles Tradeoff Transactional TwoColumnList +Tukey UBSan UDFs UInt From f4c14e15a2c4010b16240dfa4990a2f45c6d2f37 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Wed, 10 Jan 2024 09:20:49 -0800 Subject: [PATCH 2/9] added a fix for failing testcase --- src/Functions/seriesOutliersTukey.cpp | 4 ++-- tests/queries/0_stateless/02813_seriesOutliersTukey.sql | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersTukey.cpp index 1291d5bf09c9..4640258acc19 100644 --- a/src/Functions/seriesOutliersTukey.cpp +++ b/src/Functions/seriesOutliersTukey.cpp @@ -102,9 +102,9 @@ class FunctionSeriesOutliersTukey : public IFunction Float64 lower_fence = q1 - 1.5 * iqr; Float64 upper_fence = q3 + 1.5 * iqr; - for (auto elem : src_vec) + for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) { - auto score = std::min((elem - lower_fence) / iqr, 0.0) + std::max((elem - upper_fence) / iqr, 0.0); + auto score = std::min((src_vec[j] - lower_fence) / iqr, 0.0) + std::max((src_vec[j] - upper_fence) / iqr, 0.0); outlier_data.push_back(score); } res_offsets_data.push_back(outlier_data.size()); diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql index 4b24f1bffbb7..baf8a229403d 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql @@ -1,8 +1,7 @@ DROP 
TABLE IF EXISTS tb1; CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; -INSERT INTO tb1 VALUES (1, [-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]); -INSERT INTO tb1 VALUES (2, [-3,2.4,15,3.9,5,6,4.5,5.2,12,45,12,3.4,3,4,5,6]); +INSERT INTO tb1 VALUES (1, [-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]), (2, [-3,2.4,15,3.9,5,6,4.5,5.2,12,45,12,3.4,3,4,5,6]); SELECT seriesOutliersTukey(a) FROM tb1 ORDER BY n; DROP TABLE IF EXISTS tb1; From 90dfacd5a67e0addde4973484a4bc1944a6ca4b1 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Wed, 10 Jan 2024 09:20:49 -0800 Subject: [PATCH 3/9] added a fix for failing testcase --- src/Functions/seriesOutliersTukey.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersTukey.cpp index 4640258acc19..697f32f20464 100644 --- a/src/Functions/seriesOutliersTukey.cpp +++ b/src/Functions/seriesOutliersTukey.cpp @@ -83,7 +83,7 @@ class FunctionSeriesOutliersTukey : public IFunction ColumnArray::Offset prev_src_offset = 0; for (auto curr_src_offset : src_offsets) { - chassert(prev_src_offset < curr_src_offset); + chassert(prev_src_offset <= curr_src_offset); size_t len = curr_src_offset - prev_src_offset; if (len < 4) throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); From 487ef67e9759394de5ebfcc91749db5624d2fd03 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Thu, 18 Jan 2024 07:26:32 -0800 Subject: [PATCH 4/9] Added custom tukey --- .../functions/time-series-functions.md | 8 +- src/Functions/seriesOutliersTukey.cpp | 120 ++++++++++++++---- .../02813_seriesOutliersTukey.reference | 9 ++ .../0_stateless/02813_seriesOutliersTukey.sql | 19 ++- 4 files changed, 124 insertions(+), 32 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 4eb838721b27..9eea5a8eb1fa 100644 --- 
a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -8,14 +8,14 @@ sidebar_label: Time Series Below functions are used for time series analysis. -## seriesOutliersTukey +## seriesOutliersDetectTukey Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). **Syntax** ``` sql -seriesOutliersTukey(series); +seriesOutliersDetectTukey(series); ``` **Arguments** @@ -34,7 +34,7 @@ Type: [Array](../../sql-reference/data-types/array.md). Query: ``` sql -seriesOutliersTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +seriesOutliersDetectTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; ``` Result: @@ -48,7 +48,7 @@ Result: Query: ``` sql -seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; ``` Result: diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersTukey.cpp index 697f32f20464..72a16949605e 100644 --- a/src/Functions/seriesOutliersTukey.cpp +++ b/src/Functions/seriesOutliersTukey.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -16,16 +17,18 @@ extern const int ILLEGAL_COLUMN; } //Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) -class FunctionSeriesOutliersTukey : public IFunction +class FunctionSeriesOutliersDetectTukey : public IFunction { public: - static constexpr auto name = "seriesOutliersTukey"; + static constexpr auto name = "seriesOutliersDetectTukey"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } std::string getName() const override { return name; } - size_t getNumberOfArguments() const override { return 1; } + bool isVariadic() const override { return true; } + + size_t getNumberOfArguments() 
const override { return 0; } bool useDefaultImplementationForConstants() const override { return true; } @@ -33,12 +36,20 @@ class FunctionSeriesOutliersTukey : public IFunction DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { - FunctionArgumentDescriptors args{{"time_series", &isArray, nullptr, "Array"}}; - validateFunctionArgumentTypes(*this, arguments, args); + FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray, nullptr, "Array"}}; + FunctionArgumentDescriptors optional_args{ + {"kind", &isString, isColumnConst, "const String"}, + {"min_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"max_percentile", &isNativeNumber, isColumnConst, "Number"} + }; + + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); return std::make_shared(std::make_shared()); } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1,2,3}; } + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { ColumnPtr array_ptr = arguments[0].column; @@ -47,13 +58,54 @@ class FunctionSeriesOutliersTukey : public IFunction const IColumn & src_data = array->getData(); const ColumnArray::Offsets & src_offsets = array->getOffsets(); + Float64 min_percentile = 0.25; + Float64 max_percentile = 0.75; + + if(arguments.size() > 1) + { + //const IColumn * arg_column = arguments[1].column.get(); + const ColumnConst * arg_string = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); + + if (!arg_string) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument of function {} must be constant String", getName()); + + String kind = arg_string->getValue(); + if(kind == "ctukey"){ + min_percentile = 0.10; //default 10th percentile + max_percentile = 0.90; //default 90th percentile + + if(arguments.size() > 2) + { + Float64 p_min = arguments[2].column->getFloat64(0); + if(p_min >= 2.0 && p_min <= 98.0) + min_percentile = 
p_min/100; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argumet of function {} must be in range [2.0, 98.0]", getName()); + } + + if(arguments.size() == 4) + { + Float64 p_max = arguments[3].column->getFloat64(0); + if(p_max >= 2.0 && p_max <= 98.0 && p_max > min_percentile*100) + max_percentile = p_max/100; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argumet of function {} must be in range [2.0, 98.0]", getName()); + } + } + else + { + if(kind != "tukey") + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} can only be 'tukey' or 'ctukey'.", getName()); + } + } + ColumnPtr res; - if (executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) - || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) - || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) - || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res) - || executeNumber(src_data, src_offsets, res) || executeNumber(src_data, src_offsets, res)) + if (executeNumber(src_data, src_offsets, min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) + || executeNumber(src_data, src_offsets, min_percentile, max_percentile,res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) + || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) + || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets, min_percentile, max_percentile,res) + || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res)) { return res; } @@ -66,7 +118,11 @@ class FunctionSeriesOutliersTukey : public IFunction } template - bool 
executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, ColumnPtr & res_ptr) const + bool executeNumber(const IColumn & src_data, + const ColumnArray::Offsets & src_offsets, + Float64 min_percentile, + Float64 max_percentile, + ColumnPtr & res_ptr) const { const ColumnVector * src_data_concrete = checkAndGetColumn>(&src_data); if (!src_data_concrete) @@ -91,16 +147,34 @@ class FunctionSeriesOutliersTukey : public IFunction std::vector src_sorted(src_vec.begin() + prev_src_offset, src_vec.begin() + curr_src_offset); std::sort(src_sorted.begin(), src_sorted.end()); - size_t q1_index = len / 4; - size_t q3_index = (len * 3) / 4; + Float64 q1, q2; - Float64 q1 = (len % 2 != 0) ? src_sorted[q1_index] : (src_sorted[q1_index - 1] + src_sorted[q1_index]) / 2; - Float64 q3 = (len % 2 != 0) ? src_sorted[q3_index] : (src_sorted[q3_index - 1] + src_sorted[q3_index]) / 2; + auto p1 = len * min_percentile; + if(p1 == static_cast(p1)){ + size_t index = static_cast(p1)-1; + q1 = (src_sorted[index] + src_sorted[index+1])/2; + } + else + { + size_t index = static_cast(std::ceil(p1))-1; + q1 = src_sorted[index]; + } + + auto p2 = len * max_percentile; + if(p2 == static_cast(p2)){ + size_t index = static_cast(p2)-1; + q2 = (src_sorted[index] + src_sorted[index+1])/2; + } + else + { + size_t index = static_cast(std::ceil(p2))-1; + q2 = src_sorted[index]; + } - Float64 iqr = q3 - q1; + Float64 iqr = q2 - q1; Float64 lower_fence = q1 - 1.5 * iqr; - Float64 upper_fence = q3 + 1.5 * iqr; + Float64 upper_fence = q2 + 1.5 * iqr; for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) { @@ -116,16 +190,16 @@ class FunctionSeriesOutliersTukey : public IFunction } }; -REGISTER_FUNCTION(SeriesOutliersTukey) +REGISTER_FUNCTION(SeriesOutliersDetectTukey) { - factory.registerFunction(FunctionDocumentation{ + factory.registerFunction(FunctionDocumentation{ .description = R"( Detects a possible anomaly in series using [Tukey 
Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). **Syntax** ``` sql -seriesOutliersTukey(series); +seriesOutliersDetectTukey(series); ``` **Arguments** @@ -144,7 +218,7 @@ Type: [Array](../../sql-reference/data-types/array.md). Query: ``` sql -seriesOutliersTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +seriesOutliersDetectTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; ``` Result: @@ -158,7 +232,7 @@ seriesOutliersTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; Query: ``` sql -seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; ``` Result: diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference index b370b4b5fe22..990c8c11e9e7 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference @@ -1,3 +1,12 @@ [-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] [0,0,0,0,0,0,0,0,0,5.228971962616823,0,0,0,0,0,0] +[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] +[0,0,0,0,0,0,0,0,0,5.228971962616823,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] +[-0.9615384615384613,0,1.9615384615384612,0,0,0,0,0,0.8076923076923075,19.26923076923077,0.8076923076923075,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0.8076923076923077,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql index baf8a229403d..f8debc7b6dbc 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql +++ 
b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql @@ -3,9 +3,18 @@ DROP TABLE IF EXISTS tb1; CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; INSERT INTO tb1 VALUES (1, [-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]), (2, [-3,2.4,15,3.9,5,6,4.5,5.2,12,45,12,3.4,3,4,5,6]); -SELECT seriesOutliersTukey(a) FROM tb1 ORDER BY n; +SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; +SELECT seriesOutliersDetectTukey(a,'ctukey', 25,75) FROM tb1 ORDER BY n; DROP TABLE IF EXISTS tb1; -SELECT seriesOutliersTukey(arrayMap(x -> sin(x / 10), range(30))); -SELECT seriesOutliersTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} -SELECT seriesOutliersTukey([]); -- { serverError ILLEGAL_COLUMN} -SELECT seriesOutliersTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6]); +SELECT seriesOutliersDetectTukey([-3, 2.4, 15, 3.9, 5, 6, 4.5, 5.2, 12, 60, 12, 3.4, 3, 4, 5, 6, 3.4, 2.7]); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 25, 75); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 10, 90); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'tukey', 10, 90); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 2, 98) +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'ctukey', 2, 98); +SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'xyz', 33, 53); -- { serverError BAD_ARGUMENTS} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} +SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file From 
4bc26fe45fa78fbfd1394d22f73a86cf1f83e586 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Wed, 31 Jan 2024 10:35:08 -0800 Subject: [PATCH 5/9] Added support for custom percentiles and K --- .../functions/time-series-functions.md | 41 ++-- src/Functions/seriesOutliersTukey.cpp | 180 +++++++++++------- .../02813_seriesOutliersTukey.reference | 20 +- .../0_stateless/02813_seriesOutliersTukey.sql | 26 ++- .../aspell-ignore/en/aspell-dict.txt | 3 + 5 files changed, 166 insertions(+), 104 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 9eea5a8eb1fa..bd50ef556f73 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -6,7 +6,7 @@ sidebar_label: Time Series # Time Series Functions -Below functions are used for time series analysis. +Below functions are used for series data analysis. ## seriesOutliersDetectTukey @@ -16,16 +16,27 @@ Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.o ``` sql seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K); ``` **Arguments** -- `series` - An array of numeric values +- `series` - An array of numeric values. +- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'. +- `K` - Non-negative constant value to detect mild or stronger outliers. 
The default value is 1.5 + +At least four data points are required in `series` to detect outliers. + +Default quantile range: +- `tukey` - 25%/75% +- `ctukey` - 10%/90% **Returned value** -- Returns an array of the same length where each value represents a modified Z-score of possible anomaly of corresponding element in the series. -- A value greater than 3 or lesser than -3 indicates a possible anomaly. +- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series. +- A non-zero score indicates a possible anomaly. Type: [Array](../../sql-reference/data-types/array.md). @@ -34,34 +45,34 @@ Type: [Array](../../sql-reference/data-types/array.md). Query: ``` sql -seriesOutliersDetectTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; ``` Result: ``` text -┌───────────print_0──────────────────────────────────────────────────────────────────┐ -│[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] │ -└────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────print_0───────────────────┐ +│[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] │ +└─────────────────────────────────────┘ ``` Query: ``` sql -seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0; ``` Result: ``` text -┌───────────print_0────────────────────────────────────────────┐ -│[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] │ -└──────────────────────────────────────────────────────────────┘ +┌─print_0────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0] │ +└────────────────────────────────────┘ ``` ## seriesPeriodDetectFFT -Finds the period of the given time series data using FFT +Finds the 
period of the given series data data using FFT FFT - [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) **Syntax** @@ -76,7 +87,7 @@ seriesPeriodDetectFFT(series); **Returned value** -- A real value equal to the period of time series +- A real value equal to the period of series data - Returns NAN when number of data points are less than four. Type: [Float64](../../sql-reference/data-types/float.md). @@ -111,7 +122,7 @@ Result: ## seriesDecomposeSTL -Decomposes a time series using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. +Decomposes a series data using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. **Syntax** diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersTukey.cpp index 72a16949605e..4c2c1ccd8820 100644 --- a/src/Functions/seriesOutliersTukey.cpp +++ b/src/Functions/seriesOutliersTukey.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -16,7 +16,7 @@ extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; } -//Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) +///Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) class FunctionSeriesOutliersDetectTukey : public IFunction { public: @@ -40,28 +40,28 @@ class FunctionSeriesOutliersDetectTukey : public IFunction FunctionArgumentDescriptors optional_args{ {"kind", &isString, isColumnConst, "const String"}, {"min_percentile", &isNativeNumber, isColumnConst, "Number"}, - {"max_percentile", &isNativeNumber, isColumnConst, "Number"} - }; + {"max_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"k", &isNativeNumber, isColumnConst, 
"Number"}}; validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); return std::make_shared(std::make_shared()); } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1,2,3}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3, 4}; } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { - ColumnPtr array_ptr = arguments[0].column; - const ColumnArray * array = checkAndGetColumn(array_ptr.get()); + ColumnPtr col = arguments[0].column; + const ColumnArray * col_arr = checkAndGetColumn(col.get()); - const IColumn & src_data = array->getData(); - const ColumnArray::Offsets & src_offsets = array->getOffsets(); + const IColumn & arr_data = col_arr->getData(); + const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); - Float64 min_percentile = 0.25; - Float64 max_percentile = 0.75; + Float64 min_percentile = 0.10; //default 10th percentile + Float64 max_percentile = 0.90; //default 90th percentile - if(arguments.size() > 1) + if (arguments.size() > 1) { //const IColumn * arg_column = arguments[1].column.get(); const ColumnConst * arg_string = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); @@ -70,44 +70,62 @@ class FunctionSeriesOutliersDetectTukey : public IFunction throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument of function {} must be constant String", getName()); String kind = arg_string->getValue(); - if(kind == "ctukey"){ - min_percentile = 0.10; //default 10th percentile - max_percentile = 0.90; //default 90th percentile - - if(arguments.size() > 2) + if (kind == "ctukey") + { + if (arguments.size() > 2) { Float64 p_min = arguments[2].column->getFloat64(0); - if(p_min >= 2.0 && p_min <= 98.0) - min_percentile = p_min/100; + if (p_min >= 2.0 && p_min <= 98.0) + min_percentile = p_min / 100; else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argumet of function {} must be 
in range [2.0, 98.0]", getName()); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "The third argumet of function {} must be in range [2.0, 98.0]", getName()); } - if(arguments.size() == 4) + if (arguments.size() > 3) { Float64 p_max = arguments[3].column->getFloat64(0); - if(p_max >= 2.0 && p_max <= 98.0 && p_max > min_percentile*100) - max_percentile = p_max/100; + if (p_max >= 2.0 && p_max <= 98.0 && p_max > min_percentile * 100) + max_percentile = p_max / 100; else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argumet of function {} must be in range [2.0, 98.0]", getName()); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "The fourth argumet of function {} must be in range [2.0, 98.0]", getName()); } - } - else + } + else if (kind == "tukey") { - if(kind != "tukey") - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} can only be 'tukey' or 'ctukey'.", getName()); + min_percentile = 0.25; + max_percentile = 0.75; } + else + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} can only be 'tukey' or 'ctukey'.", getName()); } - ColumnPtr res; + Float64 K = 1.50; + if (arguments.size() == 5) + { + auto k_val = arguments[4].column->getFloat64(0); + if (k_val >= 0.0) + K = k_val; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fifth argumet of function {} must be a positive number", getName()); + } - if (executeNumber(src_data, src_offsets, min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) - || executeNumber(src_data, src_offsets, min_percentile, max_percentile,res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) - || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) - || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets, min_percentile, 
max_percentile,res) - || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res) || executeNumber(src_data, src_offsets,min_percentile, max_percentile, res)) + ColumnPtr col_res; + + if (executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)) { - return res; + return col_res; } else throw Exception( @@ -117,14 +135,17 @@ class FunctionSeriesOutliersDetectTukey : public IFunction getName()); } +private: template - bool executeNumber(const IColumn & src_data, - const ColumnArray::Offsets & src_offsets, - Float64 min_percentile, - Float64 max_percentile, - ColumnPtr & res_ptr) const + bool executeNumber( + const IColumn & arr_data, + const ColumnArray::Offsets & arr_offsets, + Float64 min_percentile, + Float64 max_percentile, + Float64 K, + ColumnPtr & res_ptr) const { - const ColumnVector * src_data_concrete = checkAndGetColumn>(&src_data); + const ColumnVector * src_data_concrete = checkAndGetColumn>(&arr_data); if (!src_data_concrete) return false; @@ -136,53 +157,57 @@ class FunctionSeriesOutliersDetectTukey : public IFunction ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create(); auto & res_offsets_data = 
res_offsets->getData(); + std::vector src_sorted; + ColumnArray::Offset prev_src_offset = 0; - for (auto curr_src_offset : src_offsets) + for (auto src_offset : arr_offsets) { - chassert(prev_src_offset <= curr_src_offset); - size_t len = curr_src_offset - prev_src_offset; + chassert(prev_src_offset <= src_offset); + size_t len = src_offset - prev_src_offset; if (len < 4) throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); - std::vector src_sorted(src_vec.begin() + prev_src_offset, src_vec.begin() + curr_src_offset); + src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset); std::sort(src_sorted.begin(), src_sorted.end()); Float64 q1, q2; auto p1 = len * min_percentile; - if(p1 == static_cast(p1)){ - size_t index = static_cast(p1)-1; - q1 = (src_sorted[index] + src_sorted[index+1])/2; + if (p1 == static_cast(p1)) + { + size_t index = static_cast(p1) - 1; + q1 = (src_sorted[index] + src_sorted[index + 1]) / 2; } else - { - size_t index = static_cast(std::ceil(p1))-1; + { + size_t index = static_cast(std::ceil(p1)) - 1; q1 = src_sorted[index]; } auto p2 = len * max_percentile; - if(p2 == static_cast(p2)){ - size_t index = static_cast(p2)-1; - q2 = (src_sorted[index] + src_sorted[index+1])/2; + if (p2 == static_cast(p2)) + { + size_t index = static_cast(p2) - 1; + q2 = (src_sorted[index] + src_sorted[index + 1]) / 2; } else - { - size_t index = static_cast(std::ceil(p2))-1; + { + size_t index = static_cast(std::ceil(p2)) - 1; q2 = src_sorted[index]; } - Float64 iqr = q2 - q1; + Float64 iqr = q2 - q1; /// interquantile range - Float64 lower_fence = q1 - 1.5 * iqr; - Float64 upper_fence = q2 + 1.5 * iqr; + Float64 lower_fence = q1 - K * iqr; + Float64 upper_fence = q2 + K * iqr; - for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) + for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j) { - auto score = std::min((src_vec[j] - lower_fence) / iqr, 0.0) + 
std::max((src_vec[j] - upper_fence) / iqr, 0.0); + auto score = std::min((src_vec[j] - lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0); outlier_data.push_back(score); } res_offsets_data.push_back(outlier_data.size()); - prev_src_offset = curr_src_offset; + prev_src_offset = src_offset; } res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets)); @@ -196,20 +221,31 @@ REGISTER_FUNCTION(SeriesOutliersDetectTukey) .description = R"( Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). +Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + **Syntax** ``` sql seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K); ``` **Arguments** -- `series` - An array of numeric values +- `series` - An array of numeric values. +- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'. +- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 + +Default quantile range: +- `tukey` - 25%/75% +- `ctukey` - 10%/90% **Returned value** -- Returns an array of the same length where each value represents a modified Z-score of possible anomaly of corresponding element in the series. -- A value greater than 3 or lesser than -3 indicates a possible anomaly. 
+- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series. +- A non-zero score indicates a possible anomaly. Type: [Array](../../sql-reference/data-types/array.md). @@ -218,29 +254,29 @@ Type: [Array](../../sql-reference/data-types/array.md). Query: ``` sql -seriesOutliersDetectTukey([-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; ``` Result: ``` text -┌───────────print_0──────────────────────────────────────────────────────────────────┐ -│[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] │ -└────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────print_0───────────────────┐ +│[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] │ +└─────────────────────────────────────┘ ``` Query: ``` sql -seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0; ``` Result: ``` text -┌───────────print_0────────────────────────────────────────────┐ -│[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] │ -└──────────────────────────────────────────────────────────────┘ +┌─print_0────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0] │ +└────────────────────────────────────┘ ```)", .categories{"Time series analysis"}}); } diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference index 990c8c11e9e7..bdcde0419a42 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference @@ -1,12 +1,14 @@ -[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] 
-[0,0,0,0,0,0,0,0,0,5.228971962616823,0,0,0,0,0,0] -[-2.7121212121212137,0,4.196969696969699,0,0,0,0,0,0,0,0,4.803030303030305,0,0,0,0] -[0,0,0,0,0,0,0,0,0,5.228971962616823,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] -[-0.9615384615384613,0,1.9615384615384612,0,0,0,0,0,0.8076923076923075,19.26923076923077,0.8076923076923075,0,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,0.8076923076923077,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,4.706896551724138,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] +[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,26.1,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] [0,0,0,0] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql index f8debc7b6dbc..7efe4903249c 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql @@ -1,19 +1,29 @@ DROP TABLE IF EXISTS tb1; CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; -INSERT INTO tb1 VALUES (1, [-3,2.4,15,3.9,5,6,4.5,5.2,3,4,5,16,7,5,5,4]), (2, [-3,2.4,15,3.9,5,6,4.5,5.2,12,45,12,3.4,3,4,5,6]); +INSERT INTO tb1 VALUES (1, [-3,2.40,15,3.90,5,6,4.50,5.20,3,4,5,16,7,5,5,4]), (2, [-3,2.40,15,3.90,5,6,4.50,5.20,12,45,12,3.40,3,4,5,6]); +-- non-const inputs SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; SELECT seriesOutliersDetectTukey(a,'ctukey', 25,75) FROM tb1 ORDER BY n; DROP TABLE IF EXISTS tb1; -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 
4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6]); -SELECT seriesOutliersDetectTukey([-3, 2.4, 15, 3.9, 5, 6, 4.5, 5.2, 12, 60, 12, 3.4, 3, 4, 5, 6, 3.4, 2.7]); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 25, 75); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 10, 90); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'tukey', 10, 90); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.5, 5, 12, 45, 12, 3.4, 3, 4, 5, 6], 'ctukey', 2, 98) + +-- const inputs +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6]); +SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]); + +-- const inputs with optional arguments +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 25, 75); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 10, 90); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'tukey', 10, 90); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 2, 98); SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'ctukey', 2, 98); -SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)), 'tukey'); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 3); + +-- negative tests +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, -1); -- { serverError BAD_ARGUMENTS} SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'xyz', 33, 53); -- 
{ serverError BAD_ARGUMENTS} SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN} diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 64327aba2d10..3c8ec34de8df 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1332,6 +1332,7 @@ cryptographic csv csvwithnames csvwithnamesandtypes +ctukey curdate currentDatabase currentProfiles @@ -2271,6 +2272,7 @@ seektable sequenceCount sequenceMatch sequenceNextNode +seriesOutliersDetectTukey seriesDecomposeSTL seriesPeriodDetectFFT serverTimeZone @@ -2564,6 +2566,7 @@ tryPunycodeDecode tskv tsv tui +tukey tumbleEnd tumbleStart tupleConcat From 6d24ffc9761d05422f0d98d3a4797d0c348e3303 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Wed, 31 Jan 2024 10:57:43 -0800 Subject: [PATCH 6/9] Fix filename and spelling errors --- ...eriesOutliersTukey.cpp => seriesOutliersDetectTukey.cpp} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename src/Functions/{seriesOutliersTukey.cpp => seriesOutliersDetectTukey.cpp} (98%) diff --git a/src/Functions/seriesOutliersTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp similarity index 98% rename from src/Functions/seriesOutliersTukey.cpp rename to src/Functions/seriesOutliersDetectTukey.cpp index 4c2c1ccd8820..ce5ed391fa0f 100644 --- a/src/Functions/seriesOutliersTukey.cpp +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -79,7 +79,7 @@ class FunctionSeriesOutliersDetectTukey : public IFunction min_percentile = p_min / 100; else throw Exception( - ErrorCodes::BAD_ARGUMENTS, "The third argumet of function {} must be in range [2.0, 98.0]", getName()); + ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); } if (arguments.size() > 3) @@ -89,7 +89,7 @@ class 
FunctionSeriesOutliersDetectTukey : public IFunction max_percentile = p_max / 100; else throw Exception( - ErrorCodes::BAD_ARGUMENTS, "The fourth argumet of function {} must be in range [2.0, 98.0]", getName()); + ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be in range [2.0, 98.0]", getName()); } } else if (kind == "tukey") @@ -109,7 +109,7 @@ class FunctionSeriesOutliersDetectTukey : public IFunction if (k_val >= 0.0) K = k_val; else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fifth argumet of function {} must be a positive number", getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fifth argument of function {} must be a positive number", getName()); } ColumnPtr col_res; From ce31fa912b2f67411350d2de85a21090726101fa Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Thu, 1 Feb 2024 09:24:12 -0800 Subject: [PATCH 7/9] removed unnecessary method overloading and fixed documentation --- .../functions/time-series-functions.md | 30 ++--- src/Functions/seriesOutliersDetectTukey.cpp | 121 +++++++----------- .../02813_seriesOutliersTukey.reference | 10 +- .../0_stateless/02813_seriesOutliersTukey.sql | 23 ++-- 4 files changed, 75 insertions(+), 109 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index bd50ef556f73..bb6f3da25fb7 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -10,33 +10,27 @@ Below functions are used for series data analysis. ## seriesOutliersDetectTukey -Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). 
**Syntax** ``` sql seriesOutliersDetectTukey(series); -seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); ``` **Arguments** - `series` - An array of numeric values. -- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'. -- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'. -- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. - `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 At least four data points are required in `series` to detect outliers. -Default quantile range: -- `tukey` - 25%/75% -- `ctukey` - 10%/90% - **Returned value** -- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series. -- A non-zero score indicates a possible anomaly. +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. Type: [Array](../../sql-reference/data-types/array.md). 
@@ -51,23 +45,23 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, Result: ``` text -┌───────────print_0───────────────────┐ -│[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] │ -└─────────────────────────────────────┘ +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ ``` Query: ``` sql -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; ``` Result: ``` text -┌─print_0────────────────────────────┐ -│ [0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0] │ -└────────────────────────────────────┘ +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ ``` ## seriesPeriodDetectFFT diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp index ce5ed391fa0f..66fda8ce976f 100644 --- a/src/Functions/seriesOutliersDetectTukey.cpp +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -14,9 +14,10 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -///Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) +/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) class FunctionSeriesOutliersDetectTukey : public IFunction { public: @@ -36,9 +37,15 @@ class FunctionSeriesOutliersDetectTukey : public IFunction DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { + if (arguments.size() != 1 && arguments.size() != 4) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Function {} needs either 1 or 4 arguments; passed {}.", + getName(), + 
arguments.size()); + FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray, nullptr, "Array"}}; FunctionArgumentDescriptors optional_args{ - {"kind", &isString, isColumnConst, "const String"}, {"min_percentile", &isNativeNumber, isColumnConst, "Number"}, {"max_percentile", &isNativeNumber, isColumnConst, "Number"}, {"k", &isNativeNumber, isColumnConst, "Number"}}; @@ -48,9 +55,9 @@ class FunctionSeriesOutliersDetectTukey : public IFunction return std::make_shared(std::make_shared()); } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3, 4}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { ColumnPtr col = arguments[0].column; const ColumnArray * col_arr = checkAndGetColumn(col.get()); @@ -58,61 +65,35 @@ class FunctionSeriesOutliersDetectTukey : public IFunction const IColumn & arr_data = col_arr->getData(); const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); - Float64 min_percentile = 0.10; //default 10th percentile - Float64 max_percentile = 0.90; //default 90th percentile + ColumnPtr col_res; + if (input_rows_count == 0) + return ColumnArray::create(ColumnFloat64::create()); + + + Float64 min_percentile = 0.25; /// default 25th percentile + Float64 max_percentile = 0.75; /// default 75th percentile + Float64 K = 1.50; if (arguments.size() > 1) { - //const IColumn * arg_column = arguments[1].column.get(); - const ColumnConst * arg_string = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); + Float64 p_min = arguments[1].column->getFloat64(0); + if (p_min < 2.0 || p_min > 98.0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName()); - if 
(!arg_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument of function {} must be constant String", getName()); + min_percentile = p_min / 100; - String kind = arg_string->getValue(); - if (kind == "ctukey") - { - if (arguments.size() > 2) - { - Float64 p_min = arguments[2].column->getFloat64(0); - if (p_min >= 2.0 && p_min <= 98.0) - min_percentile = p_min / 100; - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); - } - - if (arguments.size() > 3) - { - Float64 p_max = arguments[3].column->getFloat64(0); - if (p_max >= 2.0 && p_max <= 98.0 && p_max > min_percentile * 100) - max_percentile = p_max / 100; - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be in range [2.0, 98.0]", getName()); - } - } - else if (kind == "tukey") - { - min_percentile = 0.25; - max_percentile = 0.75; - } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} can only be 'tukey' or 'ctukey'.", getName()); - } + Float64 p_max = arguments[2].column->getFloat64(0); + if (p_max < 2.0 || p_max > 98.0 || p_max < min_percentile * 100) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); - Float64 K = 1.50; - if (arguments.size() == 5) - { - auto k_val = arguments[4].column->getFloat64(0); - if (k_val >= 0.0) - K = k_val; - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fifth argument of function {} must be a positive number", getName()); - } + max_percentile = p_max / 100; - ColumnPtr col_res; + auto k_val = arguments[3].column->getFloat64(0); + if (k_val < 0.0) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); + + K = k_val; + } if (executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) || executeNumber(arr_data, arr_offsets, 
min_percentile, max_percentile, K, col_res) @@ -172,7 +153,7 @@ class FunctionSeriesOutliersDetectTukey : public IFunction Float64 q1, q2; - auto p1 = len * min_percentile; + Float64 p1 = len * min_percentile; if (p1 == static_cast(p1)) { size_t index = static_cast(p1) - 1; @@ -184,7 +165,7 @@ class FunctionSeriesOutliersDetectTukey : public IFunction q1 = src_sorted[index]; } - auto p2 = len * max_percentile; + Float64 p2 = len * max_percentile; if (p2 == static_cast(p2)) { size_t index = static_cast(p2) - 1; @@ -219,33 +200,27 @@ REGISTER_FUNCTION(SeriesOutliersDetectTukey) { factory.registerFunction(FunctionDocumentation{ .description = R"( -Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). - -Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). **Syntax** ``` sql seriesOutliersDetectTukey(series); -seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); ``` **Arguments** - `series` - An array of numeric values. -- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'. -- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'. -- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. 
+- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. - `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 -Default quantile range: -- `tukey` - 25%/75% -- `ctukey` - 10%/90% +At least four data points are required in `series` to detect outliers. **Returned value** -- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series. -- A non-zero score indicates a possible anomaly. +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. Type: [Array](../../sql-reference/data-types/array.md). @@ -260,23 +235,23 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, Result: ``` text -┌───────────print_0───────────────────┐ -│[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] │ -└─────────────────────────────────────┘ +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ ``` Query: ``` sql -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0; +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; ``` Result: ``` text -┌─print_0────────────────────────────┐ -│ [0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0] │ -└────────────────────────────────────┘ +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ ```)", .categories{"Time series analysis"}}); } diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference index bdcde0419a42..85c65ab10ba2 100644 --- 
a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.reference @@ -1,14 +1,12 @@ -[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] [-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0] [0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,26.1,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] [0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] +[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0] [0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] [0,0,0,0] [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] [0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql index 7efe4903249c..b43fa40e82b2 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql +++ b/tests/queries/0_stateless/02813_seriesOutliersTukey.sql @@ -1,11 +1,11 @@ DROP TABLE IF EXISTS tb1; CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; -INSERT INTO tb1 VALUES (1, [-3,2.40,15,3.90,5,6,4.50,5.20,3,4,5,16,7,5,5,4]), (2, [-3,2.40,15,3.90,5,6,4.50,5.20,12,45,12,3.40,3,4,5,6]); +INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]); -- non-const inputs SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; -SELECT seriesOutliersDetectTukey(a,'ctukey', 25,75) FROM tb1 ORDER BY n; +SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n; DROP TABLE IF EXISTS tb1; -- const inputs @@ -13,18 +13,17 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 
12, 45, 12, 3.40, SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]); -- const inputs with optional arguments -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 25, 75); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 10, 90); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'tukey', 10, 90); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 2, 98); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'ctukey', 2, 98); -SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)), 'tukey'); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 1.5); -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 3); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3); -- negative tests -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, -1); -- { serverError BAD_ARGUMENTS} -SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'xyz', 33, 53); -- { serverError BAD_ARGUMENTS} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS} +SELECT seriesOutliersDetectTukey([-3, 2, 
15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN} SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file From 28e6e290045c54dffedcbdb8879e785368f7bf24 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Fri, 2 Feb 2024 07:46:36 -0800 Subject: [PATCH 8/9] Minor fixes --- .../functions/time-series-functions.md | 2 +- src/Functions/seriesOutliersDetectTukey.cpp | 41 ++++++++++--------- ...02813_seriesOutliersDetectTukey.reference} | 0 ...ql => 02813_seriesOutliersDetectTukey.sql} | 3 ++ 4 files changed, 26 insertions(+), 20 deletions(-) rename tests/queries/0_stateless/{02813_seriesOutliersTukey.reference => 02813_seriesOutliersDetectTukey.reference} (100%) rename tests/queries/0_stateless/{02813_seriesOutliersTukey.sql => 02813_seriesOutliersDetectTukey.sql} (94%) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index bb6f3da25fb7..ce36c89f473d 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -24,7 +24,7 @@ seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); - `series` - An array of numeric values. - `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. - `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. -- `K` - Non-negative constant value to detect mild or stronger outliers. 
The default value is 1.5 +- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5. At least four data points are required in `series` to detect outliers. diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp index 66fda8ce976f..a2fc9cf8eb62 100644 --- a/src/Functions/seriesOutliersDetectTukey.cpp +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -23,6 +23,9 @@ class FunctionSeriesOutliersDetectTukey : public IFunction public: static constexpr auto name = "seriesOutliersDetectTukey"; + static constexpr Float64 min_quartile = 2.0; + static constexpr Float64 max_quartile = 98.0; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } std::string getName() const override { return name; } @@ -72,18 +75,18 @@ class FunctionSeriesOutliersDetectTukey : public IFunction Float64 min_percentile = 0.25; /// default 25th percentile Float64 max_percentile = 0.75; /// default 75th percentile - Float64 K = 1.50; + Float64 k = 1.50; if (arguments.size() > 1) { Float64 p_min = arguments[1].column->getFloat64(0); - if (p_min < 2.0 || p_min > 98.0) + if (p_min < min_quartile|| p_min > max_quartile) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName()); min_percentile = p_min / 100; Float64 p_max = arguments[2].column->getFloat64(0); - if (p_max < 2.0 || p_max > 98.0 || p_max < min_percentile * 100) + if (p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); max_percentile = p_max / 100; @@ -92,19 +95,19 @@ class FunctionSeriesOutliersDetectTukey : public IFunction if (k_val < 0.0) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); - K = k_val; + k = k_val; } - if (executeNumber(arr_data, arr_offsets, 
min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res) - || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)) + if (executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)) { return col_res; } @@ -123,7 +126,7 @@ class FunctionSeriesOutliersDetectTukey : public IFunction const ColumnArray::Offsets & arr_offsets, Float64 min_percentile, Float64 max_percentile, - Float64 K, + Float64 k, ColumnPtr & res_ptr) const { const ColumnVector * src_data_concrete = checkAndGetColumn>(&arr_data); @@ 
-179,8 +182,8 @@ class FunctionSeriesOutliersDetectTukey : public IFunction Float64 iqr = q2 - q1; /// interquantile range - Float64 lower_fence = q1 - K * iqr; - Float64 upper_fence = q2 + K * iqr; + Float64 lower_fence = q1 - k * iqr; + Float64 upper_fence = q2 + k * iqr; for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j) { @@ -206,7 +209,7 @@ Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wi ``` sql seriesOutliersDetectTukey(series); -seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, k); ``` **Arguments** @@ -214,7 +217,7 @@ seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); - `series` - An array of numeric values. - `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. - `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. -- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 +- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 At least four data points are required in `series` to detect outliers. 
diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference similarity index 100% rename from tests/queries/0_stateless/02813_seriesOutliersTukey.reference rename to tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference diff --git a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql similarity index 94% rename from tests/queries/0_stateless/02813_seriesOutliersTukey.sql rename to tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql index b43fa40e82b2..ca116e8b7ede 100644 --- a/tests/queries/0_stateless/02813_seriesOutliersTukey.sql +++ b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql @@ -1,3 +1,6 @@ +-- Tags: no-cpu-aarch64 +-- Tag no-cpu-aarch64: values generated are slightly different on aarch64 + DROP TABLE IF EXISTS tb1; CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; From c9954a7eb8e75c130435b84fd14e0be17765adb2 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Mon, 5 Feb 2024 09:23:11 -0800 Subject: [PATCH 9/9] added check for NaN and infinite values --- src/Functions/seriesOutliersDetectTukey.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp index a2fc9cf8eb62..8a2e276c74a3 100644 --- a/src/Functions/seriesOutliersDetectTukey.cpp +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace DB @@ -80,19 +81,19 @@ class FunctionSeriesOutliersDetectTukey : public IFunction if (arguments.size() > 1) { Float64 p_min = arguments[1].column->getFloat64(0); - if (p_min < min_quartile|| p_min > max_quartile) + if (isnan(p_min) || !isFinite(p_min) || p_min < min_quartile|| p_min > max_quartile) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range 
[2.0, 98.0]", getName()); min_percentile = p_min / 100; Float64 p_max = arguments[2].column->getFloat64(0); - if (p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) + if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); max_percentile = p_max / 100; auto k_val = arguments[3].column->getFloat64(0); - if (k_val < 0.0) + if (k_val < 0.0 || isnan(k_val) || !isFinite(k_val)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); k = k_val;