diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 144d832b36a7..eb4fdc2899f5 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -6,11 +6,67 @@ sidebar_label: Time Series # Time Series Functions -Below functions are used for time series analysis. +Below functions are used for series data analysis. + +## seriesOutliersDetectTukey + +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, K); +``` + +**Arguments** + +- `series` - An array of numeric values. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. +- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5. + +At least four data points are required in `series` to detect outliers. + +**Returned value** + +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; +``` + +Result: + +``` text +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ +``` ## seriesPeriodDetectFFT -Finds the period of the given time series data using FFT +Finds the period of the given series data using FFT FFT - [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform) **Syntax** @@ -25,7 +81,7 @@ seriesPeriodDetectFFT(series); **Returned value** -- A real value equal to the period of time series +- A real value equal to the period of the series data - Returns NAN when number of data points are less than four. Type: [Float64](../../sql-reference/data-types/float.md). @@ -60,7 +116,7 @@ Result: ## seriesDecomposeSTL -Decomposes a time series using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. +Decomposes series data using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component. 
**Syntax** diff --git a/src/Functions/seriesOutliersDetectTukey.cpp b/src/Functions/seriesOutliersDetectTukey.cpp new file mode 100644 index 000000000000..8a2e276c74a3 --- /dev/null +++ b/src/Functions/seriesOutliersDetectTukey.cpp @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_COLUMN; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) +class FunctionSeriesOutliersDetectTukey : public IFunction +{ +public: + static constexpr auto name = "seriesOutliersDetectTukey"; + + static constexpr Float64 min_quartile = 2.0; + static constexpr Float64 max_quartile = 98.0; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + std::string getName() const override { return name; } + + bool isVariadic() const override { return true; } + + size_t getNumberOfArguments() const override { return 0; } + + bool useDefaultImplementationForConstants() const override { return true; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.size() != 1 && arguments.size() != 4) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Function {} needs either 1 or 4 arguments; passed {}.", + getName(), + arguments.size()); + + FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray, nullptr, "Array"}}; + FunctionArgumentDescriptors optional_args{ + {"min_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"max_percentile", &isNativeNumber, isColumnConst, "Number"}, + {"k", &isNativeNumber, isColumnConst, "Number"}}; + + validateFunctionArgumentTypes(*this, 
arguments, mandatory_args, optional_args); + + return std::make_shared(std::make_shared()); + } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + ColumnPtr col = arguments[0].column; + const ColumnArray * col_arr = checkAndGetColumn(col.get()); + + const IColumn & arr_data = col_arr->getData(); + const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); + + ColumnPtr col_res; + if (input_rows_count == 0) + return ColumnArray::create(ColumnFloat64::create()); + + + Float64 min_percentile = 0.25; /// default 25th percentile + Float64 max_percentile = 0.75; /// default 75th percentile + Float64 k = 1.50; + + if (arguments.size() > 1) + { + Float64 p_min = arguments[1].column->getFloat64(0); + if (isnan(p_min) || !isFinite(p_min) || p_min < min_quartile|| p_min > max_quartile) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName()); + + min_percentile = p_min / 100; + + Float64 p_max = arguments[2].column->getFloat64(0); + if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); + + max_percentile = p_max / 100; + + auto k_val = arguments[3].column->getFloat64(0); + if (k_val < 0.0 || isnan(k_val) || !isFinite(k_val)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); + + k = k_val; + } + + if (executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, 
arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) + || executeNumber(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)) + { + return col_res; + } + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), + getName()); + } + +private: + template + bool executeNumber( + const IColumn & arr_data, + const ColumnArray::Offsets & arr_offsets, + Float64 min_percentile, + Float64 max_percentile, + Float64 k, + ColumnPtr & res_ptr) const + { + const ColumnVector * src_data_concrete = checkAndGetColumn>(&arr_data); + if (!src_data_concrete) + return false; + + const PaddedPODArray & src_vec = src_data_concrete->getData(); + + auto outliers = ColumnFloat64::create(); + auto & outlier_data = outliers->getData(); + + ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create(); + auto & res_offsets_data = res_offsets->getData(); + + std::vector src_sorted; + + ColumnArray::Offset prev_src_offset = 0; + for (auto src_offset : arr_offsets) + { + chassert(prev_src_offset <= src_offset); + size_t len = src_offset - prev_src_offset; + if (len < 4) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); + + src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset); + std::sort(src_sorted.begin(), src_sorted.end()); + + Float64 q1, q2; + + Float64 p1 = len * min_percentile; + if (p1 == static_cast(p1)) + { + size_t index = static_cast(p1) - 1; 
+ q1 = (src_sorted[index] + src_sorted[index + 1]) / 2; + } + else + { + size_t index = static_cast(std::ceil(p1)) - 1; + q1 = src_sorted[index]; + } + + Float64 p2 = len * max_percentile; + if (p2 == static_cast(p2)) + { + size_t index = static_cast(p2) - 1; + q2 = (src_sorted[index] + src_sorted[index + 1]) / 2; + } + else + { + size_t index = static_cast(std::ceil(p2)) - 1; + q2 = src_sorted[index]; + } + + Float64 iqr = q2 - q1; /// interquantile range + + Float64 lower_fence = q1 - k * iqr; + Float64 upper_fence = q2 + k * iqr; + + for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j) + { + auto score = std::min((src_vec[j] - lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0); + outlier_data.push_back(score); + } + res_offsets_data.push_back(outlier_data.size()); + prev_src_offset = src_offset; + } + + res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets)); + return true; + } +}; + +REGISTER_FUNCTION(SeriesOutliersDetectTukey) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences). + +**Syntax** + +``` sql +seriesOutliersDetectTukey(series); +seriesOutliersDetectTukey(series, min_percentile, max_percentile, k); +``` + +**Arguments** + +- `series` - An array of numeric values. +- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25. +- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75. +- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5 + +At least four data points are required in `series` to detect outliers. 
+ +**Returned value** + +- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly. + +Type: [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0; +``` + +Result: + +``` text +┌───────────print_0─────────────────┐ +│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │ +└───────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0; +``` + +Result: + +``` text +┌─print_0──────────────────────────────┐ +│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │ +└──────────────────────────────────────┘ +```)", + .categories{"Time series analysis"}}); +} +} diff --git a/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference new file mode 100644 index 000000000000..85c65ab10ba2 --- /dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference @@ -0,0 +1,12 @@ +[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0] +[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0] diff --git a/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql new file mode 100644 index 000000000000..ca116e8b7ede --- 
/dev/null +++ b/tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql @@ -0,0 +1,32 @@ +-- Tags: no-cpu-aarch64 +-- Tag no-cpu-aarch64: values generated are slightly different on aarch64 + +DROP TABLE IF EXISTS tb1; + +CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; +INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]); + +-- non-const inputs +SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; +SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n; +DROP TABLE IF EXISTS tb1; + +-- const inputs +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6]); +SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]); + +-- const inputs with optional arguments +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5); +SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3); + +-- negative tests +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} +SELECT seriesOutliersDetectTukey([]); -- { serverError 
ILLEGAL_COLUMN} +SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} \ No newline at end of file diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 01af9e14c01e..9db2d95f4c69 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -929,6 +929,7 @@ TotalTemporaryFiles Tradeoff Transactional TwoColumnList +Tukey UBSan UDFs UInt @@ -1317,6 +1318,7 @@ cryptographic csv csvwithnames csvwithnamesandtypes +ctukey curdate currentDatabase currentProfiles @@ -2241,8 +2243,9 @@ seektable sequenceCount sequenceMatch sequenceNextNode -seriesPeriodDetectFFT seriesDecomposeSTL +seriesOutliersDetectTukey +seriesPeriodDetectFFT serverTimeZone serverTimezone serverUUID @@ -2528,6 +2531,7 @@ tryDecrypt tskv tsv tui +tukey tumbleEnd tumbleStart tupleConcat