Skip to content

Commit

Permalink
Merge pull request ClickHouse#58632 from ClibMouse/series_outlier
Browse files Browse the repository at this point in the history
Added function `seriesOutliersTukey` for outlier detetion in series data
  • Loading branch information
rschu1ze authored and bhavnajindal committed Feb 23, 2024
1 parent eb59a0d commit 80f3001
Show file tree
Hide file tree
Showing 5 changed files with 371 additions and 5 deletions.
64 changes: 60 additions & 4 deletions docs/en/sql-reference/functions/time-series-functions.md
Expand Up @@ -6,11 +6,67 @@ sidebar_label: Time Series

# Time Series Functions

Below functions are used for time series analysis.
The functions below are used for series data analysis.

## seriesOutliersDetectTukey

Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).

**Syntax**

``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, K);
```

**Arguments**

- `series` - An array of numeric values.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75.
- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5.

At least four data points are required in `series` to detect outliers.

**Returned value**

- Returns an array of the same length as the input array where each value represents the score of a possible anomaly for the corresponding element in the series. A non-zero score indicates a possible anomaly.

Type: [Array](../../sql-reference/data-types/array.md).

**Examples**

Query:

``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0;
```

Result:

``` text
┌───────────print_0─────────────────┐
│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │
└───────────────────────────────────┘
```

Query:

``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```

Result:

``` text
┌─print_0──────────────────────────────┐
│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │
└──────────────────────────────────────┘
```

## seriesPeriodDetectFFT

Finds the period of the given time series data using FFT
Finds the period of the given series data using FFT
FFT - [Fast Fourier transform](https://en.wikipedia.org/wiki/Fast_Fourier_transform)

**Syntax**
Expand All @@ -25,7 +81,7 @@ seriesPeriodDetectFFT(series);

**Returned value**

- A real value equal to the period of time series
- A real value equal to the period of series data
- Returns NAN when the number of data points is less than four.

Type: [Float64](../../sql-reference/data-types/float.md).
Expand Down Expand Up @@ -60,7 +116,7 @@ Result:

## seriesDecomposeSTL

Decomposes a time series using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component.
Decomposes series data using STL [(Seasonal-Trend Decomposition Procedure Based on Loess)](https://www.wessa.net/download/stl.pdf) into a season, a trend and a residual component.

**Syntax**

Expand Down
262 changes: 262 additions & 0 deletions src/Functions/seriesOutliersDetectTukey.cpp
@@ -0,0 +1,262 @@
#include <Columns/ColumnArray.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Common/NaNUtils.h>
#include <cmath>

namespace DB
{
/// Error codes thrown by the function in this file.
/// They are only declared here (`extern`); their definitions live outside this file.
namespace ErrorCodes
{
/// Thrown for out-of-range percentile / k arguments and for series with fewer than four points.
extern const int BAD_ARGUMENTS;
/// Thrown when the array's element column is not one of the supported numeric column types.
extern const int ILLEGAL_COLUMN;
/// Thrown when the function is called with a number of arguments other than 1 or 4.
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

/// Scores possible anomalies in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
///
/// For every input array two quantiles `q1` and `q2` are computed (25th and 75th percentile by default),
/// the fences `q1 - k * (q2 - q1)` and `q2 + k * (q2 - q1)` are derived from them, and each element is
/// mapped to its signed distance beyond the fence it violates (0 for elements between the fences).
class FunctionSeriesOutliersDetectTukey : public IFunction
{
public:
    static constexpr auto name = "seriesOutliersDetectTukey";

    /// Inclusive bounds accepted for the `min_percentile` and `max_percentile` arguments.
    static constexpr Float64 min_quartile = 2.0;
    static constexpr Float64 max_quartile = 98.0;

    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSeriesOutliersDetectTukey>(); }

    std::string getName() const override { return name; }

    /// The function takes either 1 or 4 arguments; the exact count is checked in getReturnTypeImpl.
    bool isVariadic() const override { return true; }

    size_t getNumberOfArguments() const override { return 0; }

    bool useDefaultImplementationForConstants() const override { return true; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    /// Checks the argument count (1 or 4) and the argument types, and returns Array(Float64).
    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH for any other argument count.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        if (arguments.size() != 1 && arguments.size() != 4)
            throw Exception(
                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                "Function {} needs either 1 or 4 arguments; passed {}.",
                getName(),
                arguments.size());

        FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray<IDataType>, nullptr, "Array"}};
        FunctionArgumentDescriptors optional_args{
            {"min_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
            {"max_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
            {"k", &isNativeNumber<IDataType>, isColumnConst, "Number"}};

        validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);

        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>());
    }

    /// Positions of `min_percentile`, `max_percentile` and `k`.
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; }

    /// Reads the optional percentile / k arguments, validates them and dispatches on the element type
    /// of the input array.
    ///
    /// Throws BAD_ARGUMENTS when a percentile is outside [2, 98], when `max_percentile` is less than
    /// `min_percentile`, or when `k` is negative or not finite.
    /// Throws ILLEGAL_COLUMN when the first argument is not an array column of a supported numeric type.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        if (input_rows_count == 0)
            return ColumnArray::create(ColumnFloat64::create());

        ColumnPtr col = arguments[0].column;
        const ColumnArray * col_arr = checkAndGetColumn<ColumnArray>(col.get());
        /// checkAndGetColumn returns a pointer; make sure it is valid before it is dereferenced.
        if (!col_arr)
            throw Exception(
                ErrorCodes::ILLEGAL_COLUMN,
                "Illegal column {} of first argument of function {}",
                col->getName(),
                getName());

        const IColumn & arr_data = col_arr->getData();
        const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets();

        Float64 min_percentile = 0.25; /// default 25th percentile
        Float64 max_percentile = 0.75; /// default 75th percentile
        Float64 k = 1.50;

        if (arguments.size() > 1)
        {
            /// !isFinite() also rejects NaN, so no separate NaN check is needed.
            const Float64 p_min = arguments[1].column->getFloat64(0);
            if (!isFinite(p_min) || p_min < min_quartile || p_min > max_quartile)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName());

            const Float64 p_max = arguments[2].column->getFloat64(0);
            if (!isFinite(p_max) || p_max < min_quartile || p_max > max_quartile)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName());

            /// Compare the raw arguments: dividing by 100 and multiplying back is not exact in floating
            /// point and could misjudge the case p_max == p_min.
            if (p_max < p_min)
                throw Exception(
                    ErrorCodes::BAD_ARGUMENTS,
                    "The third argument of function {} must not be less than the second argument",
                    getName());

            const Float64 k_val = arguments[3].column->getFloat64(0);
            if (!isFinite(k_val) || k_val < 0.0)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a non-negative number", getName());

            min_percentile = p_min / 100;
            max_percentile = p_max / 100;
            k = k_val;
        }

        ColumnPtr col_res;
        if (executeNumber<UInt8>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<UInt16>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<UInt32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<UInt64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Int8>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Int16>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Int32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Int64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Float32>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res)
            || executeNumber<Float64>(arr_data, arr_offsets, min_percentile, max_percentile, k, col_res))
        {
            return col_res;
        }

        throw Exception(
            ErrorCodes::ILLEGAL_COLUMN,
            "Illegal column {} of first argument of function {}",
            arguments[0].column->getName(),
            getName());
    }

private:
    /// Returns the quantile of `sorted` (ascending order) at `fraction`.
    ///
    /// With position = size * fraction: if the position is a whole number the result is the mean of the
    /// elements at (1-based) ranks `position` and `position + 1`, otherwise it is the element at rank
    /// ceil(position).
    ///
    /// The caller must guarantee 0 < size * fraction < size; executeNumber does so by requiring at least
    /// four elements, and executeImpl by restricting percentiles to [2, 98].
    static Float64 quantileOfSorted(const std::vector<Float64> & sorted, Float64 fraction)
    {
        const Float64 position = sorted.size() * fraction;
        if (position == static_cast<Int64>(position))
        {
            const size_t index = static_cast<size_t>(position) - 1;
            return (sorted[index] + sorted[index + 1]) / 2;
        }

        const size_t index = static_cast<size_t>(std::ceil(position)) - 1;
        return sorted[index];
    }

    /// Computes the outlier scores when the array elements are stored as ColumnVector<T>.
    ///
    /// Returns false (leaving `res_ptr` untouched) if `arr_data` is not a ColumnVector<T>, so that the
    /// caller can try the next element type. Otherwise stores an Array(Float64) column in `res_ptr`
    /// and returns true.
    /// Throws BAD_ARGUMENTS if any array has fewer than four elements.
    template <typename T>
    bool executeNumber(
        const IColumn & arr_data,
        const ColumnArray::Offsets & arr_offsets,
        Float64 min_percentile,
        Float64 max_percentile,
        Float64 k,
        ColumnPtr & res_ptr) const
    {
        const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&arr_data);
        if (!src_data_concrete)
            return false;

        const PaddedPODArray<T> & src_vec = src_data_concrete->getData();

        auto outliers = ColumnFloat64::create();
        auto & outlier_data = outliers->getData();
        /// Exactly one score is produced per source element.
        outlier_data.reserve(src_vec.size());

        ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create();
        auto & res_offsets_data = res_offsets->getData();
        res_offsets_data.reserve(arr_offsets.size());

        /// Scratch buffer, reused across rows to avoid reallocating for every array.
        std::vector<Float64> src_sorted;

        ColumnArray::Offset prev_src_offset = 0;
        for (auto src_offset : arr_offsets)
        {
            chassert(prev_src_offset <= src_offset);
            const size_t len = src_offset - prev_src_offset;
            if (len < 4)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName());

            /// NOTE(review): NaN elements violate the strict weak ordering std::sort requires;
            /// consider rejecting or filtering them before sorting.
            src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset);
            std::sort(src_sorted.begin(), src_sorted.end());

            const Float64 q1 = quantileOfSorted(src_sorted, min_percentile);
            const Float64 q2 = quantileOfSorted(src_sorted, max_percentile);

            const Float64 iqr = q2 - q1; /// interquantile range

            const Float64 lower_fence = q1 - k * iqr;
            const Float64 upper_fence = q2 + k * iqr;

            for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j)
            {
                /// Negative below the lower fence, positive above the upper fence, zero in between.
                const Float64 score = std::min((src_vec[j] - lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0);
                outlier_data.push_back(score);
            }
            res_offsets_data.push_back(outlier_data.size());
            prev_src_offset = src_offset;
        }

        res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets));
        return true;
    }
};

/// Registers seriesOutliersDetectTukey in the function factory together with its embedded documentation.
/// The description is a raw string literal, so its Markdown text is passed to FunctionDocumentation verbatim.
REGISTER_FUNCTION(SeriesOutliersDetectTukey)
{
factory.registerFunction<FunctionSeriesOutliersDetectTukey>(FunctionDocumentation{
.description = R"(
Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
**Syntax**
``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, k);
```
**Arguments**
- `series` - An array of numeric values.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75.
- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5
At least four data points are required in `series` to detect outliers.
**Returned value**
- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly.
Type: [Array](../../sql-reference/data-types/array.md).
**Examples**
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0;
```
Result:
``` text
┌───────────print_0─────────────────┐
│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │
└───────────────────────────────────┘
```
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```
Result:
``` text
┌─print_0──────────────────────────────┐
│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │
└──────────────────────────────────────┘
```)",
/// Category string attached to this function's documentation.
.categories{"Time series analysis"}});
}
}
@@ -0,0 +1,12 @@
[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0]
[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0]
32 changes: 32 additions & 0 deletions tests/queries/0_stateless/02813_seriesOutliersDetectTukey.sql
@@ -0,0 +1,32 @@
-- Tags: no-cpu-aarch64
-- Tag no-cpu-aarch64: values generated are slightly different on aarch64

DROP TABLE IF EXISTS tb1;

CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory;
INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]);

-- non-const inputs
-- (series read from a table column, first with the default parameters, then with explicit ones)
SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n;
SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n;
DROP TABLE IF EXISTS tb1;

-- const inputs
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6]);
SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]);

-- const inputs with optional arguments
-- (covers the default percentiles passed explicitly, the boundary percentiles 2 and 98,
--  the minimum series length of four, and a larger k)
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5);
SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)));
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3);

-- negative tests
-- negative k is rejected
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS}
-- only 1 or 4 arguments are accepted
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
-- arrays containing NULL are not supported
SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN}
-- an empty array literal is not supported
SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN}
-- fewer than four data points
SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS}

0 comments on commit 80f3001

Please sign in to comment.