forked from ClickHouse/ClickHouse
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request ClickHouse#58632 from ClibMouse/series_outlier
Added function `seriesOutliersTukey` for outlier detection in series data
- Loading branch information
1 parent
eb59a0d
commit 80f3001
Showing
5 changed files
with
371 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,262 @@ | ||
#include <Columns/ColumnArray.h> | ||
#include <Columns/ColumnString.h> | ||
#include <Columns/ColumnsNumber.h> | ||
#include <DataTypes/DataTypeArray.h> | ||
#include <DataTypes/DataTypesNumber.h> | ||
#include <Functions/FunctionFactory.h> | ||
#include <Functions/FunctionHelpers.h> | ||
#include <Functions/IFunction.h> | ||
#include <Common/NaNUtils.h> | ||
#include <cmath> | ||
|
||
namespace DB | ||
{ | ||
namespace ErrorCodes | ||
{ | ||
extern const int BAD_ARGUMENTS; | ||
extern const int ILLEGAL_COLUMN; | ||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; | ||
} | ||
|
||
/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences) | ||
class FunctionSeriesOutliersDetectTukey : public IFunction | ||
{ | ||
public: | ||
static constexpr auto name = "seriesOutliersDetectTukey"; | ||
|
||
static constexpr Float64 min_quartile = 2.0; | ||
static constexpr Float64 max_quartile = 98.0; | ||
|
||
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSeriesOutliersDetectTukey>(); } | ||
|
||
std::string getName() const override { return name; } | ||
|
||
bool isVariadic() const override { return true; } | ||
|
||
size_t getNumberOfArguments() const override { return 0; } | ||
|
||
bool useDefaultImplementationForConstants() const override { return true; } | ||
|
||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } | ||
|
||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override | ||
{ | ||
if (arguments.size() != 1 && arguments.size() != 4) | ||
throw Exception( | ||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, | ||
"Function {} needs either 1 or 4 arguments; passed {}.", | ||
getName(), | ||
arguments.size()); | ||
|
||
FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray<IDataType>, nullptr, "Array"}}; | ||
FunctionArgumentDescriptors optional_args{ | ||
{"min_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"}, | ||
{"max_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"}, | ||
{"k", &isNativeNumber<IDataType>, isColumnConst, "Number"}}; | ||
|
||
validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); | ||
|
||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>()); | ||
} | ||
|
||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; } | ||
|
||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override | ||
{ | ||
ColumnPtr col = arguments[0].column; | ||
const ColumnArray * col_arr = checkAndGetColumn<ColumnArray>(col.get()); | ||
|
||
const IColumn & arr_data = col_arr->getData(); | ||
const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets(); | ||
|
||
ColumnPtr col_res; | ||
if (input_rows_count == 0) | ||
return ColumnArray::create(ColumnFloat64::create()); | ||
|
||
|
||
Float64 min_percentile = 0.25; /// default 25th percentile | ||
Float64 max_percentile = 0.75; /// default 75th percentile | ||
Float64 k = 1.50; | ||
|
||
if (arguments.size() > 1) | ||
{ | ||
Float64 p_min = arguments[1].column->getFloat64(0); | ||
if (isnan(p_min) || !isFinite(p_min) || p_min < min_quartile|| p_min > max_quartile) | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName()); | ||
|
||
min_percentile = p_min / 100; | ||
|
||
Float64 p_max = arguments[2].column->getFloat64(0); | ||
if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100) | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName()); | ||
|
||
max_percentile = p_max / 100; | ||
|
||
auto k_val = arguments[3].column->getFloat64(0); | ||
if (k_val < 0.0 || isnan(k_val) || !isFinite(k_val)) | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName()); | ||
|
||
k = k_val; | ||
} | ||
|
||
if (executeNumber<UInt8>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<UInt16>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<UInt32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<UInt64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Int8>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Int16>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Int32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Int64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Float32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res) | ||
|| executeNumber<Float64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)) | ||
{ | ||
return col_res; | ||
} | ||
else | ||
throw Exception( | ||
ErrorCodes::ILLEGAL_COLUMN, | ||
"Illegal column {} of first argument of function {}", | ||
arguments[0].column->getName(), | ||
getName()); | ||
} | ||
|
||
private: | ||
template <typename T> | ||
bool executeNumber( | ||
const IColumn & arr_data, | ||
const ColumnArray::Offsets & arr_offsets, | ||
Float64 min_percentile, | ||
Float64 max_percentile, | ||
Float64 k, | ||
ColumnPtr & res_ptr) const | ||
{ | ||
const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&arr_data); | ||
if (!src_data_concrete) | ||
return false; | ||
|
||
const PaddedPODArray<T> & src_vec = src_data_concrete->getData(); | ||
|
||
auto outliers = ColumnFloat64::create(); | ||
auto & outlier_data = outliers->getData(); | ||
|
||
ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create(); | ||
auto & res_offsets_data = res_offsets->getData(); | ||
|
||
std::vector<Float64> src_sorted; | ||
|
||
ColumnArray::Offset prev_src_offset = 0; | ||
for (auto src_offset : arr_offsets) | ||
{ | ||
chassert(prev_src_offset <= src_offset); | ||
size_t len = src_offset - prev_src_offset; | ||
if (len < 4) | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName()); | ||
|
||
src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset); | ||
std::sort(src_sorted.begin(), src_sorted.end()); | ||
|
||
Float64 q1, q2; | ||
|
||
Float64 p1 = len * min_percentile; | ||
if (p1 == static_cast<Int64>(p1)) | ||
{ | ||
size_t index = static_cast<size_t>(p1) - 1; | ||
q1 = (src_sorted[index] + src_sorted[index + 1]) / 2; | ||
} | ||
else | ||
{ | ||
size_t index = static_cast<size_t>(std::ceil(p1)) - 1; | ||
q1 = src_sorted[index]; | ||
} | ||
|
||
Float64 p2 = len * max_percentile; | ||
if (p2 == static_cast<Int64>(p2)) | ||
{ | ||
size_t index = static_cast<size_t>(p2) - 1; | ||
q2 = (src_sorted[index] + src_sorted[index + 1]) / 2; | ||
} | ||
else | ||
{ | ||
size_t index = static_cast<size_t>(std::ceil(p2)) - 1; | ||
q2 = src_sorted[index]; | ||
} | ||
|
||
Float64 iqr = q2 - q1; /// interquantile range | ||
|
||
Float64 lower_fence = q1 - k * iqr; | ||
Float64 upper_fence = q2 + k * iqr; | ||
|
||
for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j) | ||
{ | ||
auto score = std::min((src_vec[j] - lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0); | ||
outlier_data.push_back(score); | ||
} | ||
res_offsets_data.push_back(outlier_data.size()); | ||
prev_src_offset = src_offset; | ||
} | ||
|
||
res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets)); | ||
return true; | ||
} | ||
}; | ||
|
||
/// Registers seriesOutliersDetectTukey in the function factory together with its user-facing documentation.
REGISTER_FUNCTION(SeriesOutliersDetectTukey)
{
    /// The description is rendered as Markdown; its text is part of the runtime output and must stay verbatim.
    FunctionDocumentation documentation{
        .description = R"(
Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
**Syntax**
``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, k);
```
**Arguments**
- `series` - An array of numeric values.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75.
- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5
At least four data points are required in `series` to detect outliers.
**Returned value**
- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly.
Type: [Array](../../sql-reference/data-types/array.md).
**Examples**
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0;
```
Result:
``` text
┌───────────print_0─────────────────┐
│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │
└───────────────────────────────────┘
```
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```
Result:
``` text
┌─print_0──────────────────────────────┐
│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │
└──────────────────────────────────────┘
```)",
        .categories{"Time series analysis"}};

    factory.registerFunction<FunctionSeriesOutliersDetectTukey>(std::move(documentation));
}
} |
12 changes: 12 additions & 0 deletions
12
tests/queries/0_stateless/02813_seriesOutliersDetectTukey.reference
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] | ||
[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] | ||
[0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] | ||
[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0] |
32 changes: 32 additions & 0 deletions
32
tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
-- Tags: no-cpu-aarch64 | ||
-- Tag no-cpu-aarch64: values generated are slightly different on aarch64 | ||
|
||
DROP TABLE IF EXISTS tb1; | ||
|
||
CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory; | ||
INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]); | ||
|
||
-- non-const inputs | ||
SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n; | ||
SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n; | ||
DROP TABLE IF EXISTS tb1; | ||
|
||
-- const inputs | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6]); | ||
SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]); | ||
|
||
-- const inputs with optional arguments | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5); | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5); | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5); | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5); | ||
SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30))); | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3); | ||
|
||
-- negative tests | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS} | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} | ||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} | ||
SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN} | ||
SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN} | ||
SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS} |
Oops, something went wrong.