From 2f6b88a0c479742ee5a4aaf92a9ee92d1d5d91cf Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 17 Oct 2022 21:08:15 +0200 Subject: [PATCH 01/12] Done --- src/Functions/distribution.cpp | 179 ++++++++++++++++++ .../0_stateless/02462_distributions.reference | 6 + .../0_stateless/02462_distributions.sql | 12 ++ 3 files changed, 197 insertions(+) create mode 100644 src/Functions/distribution.cpp create mode 100644 tests/queries/0_stateless/02462_distributions.reference create mode 100644 tests/queries/0_stateless/02462_distributions.sql diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp new file mode 100644 index 000000000000..6d6c1db265cd --- /dev/null +++ b/src/Functions/distribution.cpp @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int TOO_SLOW; + extern const int ILLEGAL_COLUMN; + extern const int BAD_ARGUMENTS; +} + +struct UniformDistribution +{ + static constexpr const char * getName() { return "uniformDistribution"; } + static constexpr size_t getNumberOfArguments() { return 2; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::uniform_real_distribution<>(parameters[0], parameters[1]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct NormalDistribution +{ + static constexpr const char * getName() { return "normalDistribution"; } + static constexpr size_t getNumberOfArguments() { return 2; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::normal_distribution<>(parameters[0], parameters[1]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct LogNormalDistribution +{ + static constexpr const char * getName() { return "logNormalDistribution"; } + static constexpr size_t getNumberOfArguments() { return 2; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::lognormal_distribution<>(parameters[0], parameters[1]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct ChiSquaredDistribution +{ + static constexpr const char * getName() { return "chiSquaredDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::chi_squared_distribution<>(parameters[0]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct StudentTDistribution +{ + static constexpr const char * getName() { return "studentTDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::student_t_distribution<>(parameters[0]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct FisherFDistribution +{ + static constexpr const char * getName() { return "fisherFDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(std::vector & parameters, ColumnFloat64::Container & container) const + { + auto distribution = std::fisher_f_distribution<>(parameters[0]); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + + +/// Function which will generate values according to the distibution +/// Accepts only constant arguments +template +class FunctionDistribution : public IFunction +{ +private: + mutable Distribution distribution; + +public: + static FunctionPtr create(ContextPtr) + { + return std::make_shared>(); + } + + static constexpr auto name = Distribution::getName(); + String getName() const override { return Distribution::getName(); } + size_t getNumberOfArguments() const override { return Distribution::getNumberOfArguments(); } + bool isStateful() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + for (const auto & type : arguments) + { + WhichDataType which(type); + if (!which.isFloat() && !which.isNativeUInt()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument of function {}, expected Float64", type->getName(), getName()); + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override + { + std::vector parameters(arguments.size()); + for (size_t i = 0; i < parameters.size(); ++i) + { + const IColumn * col = arguments[i].column.get(); + + if (!isColumnConst(*col)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The {}th argument of function must be constant.", getName()); + + parameters[i] = applyVisitor(FieldVisitorConvertToNumber(), assert_cast(*col).getField()); + + if (isNaN(parameters[i]) || !std::isfinite(parameters[i])) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter number {} of function {} cannot be NaN of infinite", i, getName()); + } + + auto res_column = ColumnFloat64::create(input_rows_count); + auto & res_data = res_column->getData(); + distribution.generate(parameters, res_data); + + return res_column; + } +}; + + +REGISTER_FUNCTION(Distribution) +{ + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); +} + +} diff --git a/tests/queries/0_stateless/02462_distributions.reference b/tests/queries/0_stateless/02462_distributions.reference new file mode 100644 index 000000000000..fd2996b1c78c --- /dev/null +++ b/tests/queries/0_stateless/02462_distributions.reference @@ -0,0 +1,6 @@ +Ok +Ok +Ok +Ok +Ok +Ok diff --git a/tests/queries/0_stateless/02462_distributions.sql b/tests/queries/0_stateless/02462_distributions.sql new file mode 100644 index 000000000000..318e87bde2ba --- /dev/null +++ b/tests/queries/0_stateless/02462_distributions.sql @@ -0,0 +1,12 @@ +# Values should be between 0 and 1 +SELECT DISTINCT if (a >= 0 AND a <= 1, 'Ok', 'Fail') FROM (SELECT uniformDistribution(0, 1) AS a FROM numbers(100000)); +# Mean should be around 0 +SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT normalDistribution(0, 5) AS a FROM numbers(100000))); +# Values should be > 0 +SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT logNormalDistribution(0, 5) AS a FROM numbers(100000)); +# Values should be > 0 +SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT chiSquaredDistribution(3) AS a FROM numbers(100000)); +# Mean should be around 0 +SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT studentTDistribution(5) AS a FROM numbers(100000))); +# Values should be > 0 +SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT fisherFDistribution(3) AS a FROM numbers(100000)); From adabd5a0b11950a17906b73993af1415db410209 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 17 Oct 2022 23:12:02 +0200 Subject: [PATCH 02/12] Added documentation --- src/Functions/distribution.cpp | 94 ++++++++++++++++++++++++++++++++-- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp index 6d6c1db265cd..64c8c6e071d9 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/distribution.cpp @@ -168,11 +168,95 @@ class FunctionDistribution : public IFunction REGISTER_FUNCTION(Distribution) { - factory.registerFunction>(); - factory.registerFunction>(); - factory.registerFunction>(); - factory.registerFunction>(); - factory.registerFunction>(); + factory.registerFunction>( + { + R"( +Returns a random number from the uniform distribution in the specified range. +Accepts two parameters - minimum bound and maximum bound. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT uniformDistribution(0, 1) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + factory.registerFunction>( + { + R"( +Returns a random number from the normal distribuion. +Accepts two parameters - mean and variance. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT normalDistribution(0, 5) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the lognormal distribuion (a distribution of a random variable whose logarithm is normally distributed). +Accepts two parameters - mean and variance. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT logNormalDistribution(0, 5) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the chi-squared distribuion (a distribution of a sum of the squares of k independent standart normal random variables). +Accepts one parameter - degree of freedom. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT chiSquaredDistribution(5) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + factory.registerFunction>( + { + R"( +Returns a random number from the t-distribution. +Accepts one parameter - degree of freedom. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT studentTDistribution(5) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the f-distribution. +The F-distribution is the distribution of X = (S1 / d1) / (S2 / d2) where d1 and d2 are degrees of freedom. +Accepts two parameters - degrees of freedom. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT studentTDistribution(5) FROM numbers(100000)"}}, + Documentation::Categories{"Distribution"} + }); + + factory.registerFunction>(); } From 15b6ae3e3a17592ad5fb5e231a66ac54d2956078 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 00:20:21 +0200 Subject: [PATCH 03/12] Added another distributions --- src/Functions/distribution.cpp | 272 +++++++++++++++--- .../0_stateless/02462_distributions.reference | 7 + .../0_stateless/02462_distributions.sql | 28 +- 3 files changed, 258 insertions(+), 49 deletions(-) diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp index 64c8c6e071d9..42296bdbb861 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/distribution.cpp @@ -13,9 +13,6 @@ #include -#include -#include - namespace DB { @@ -27,14 +24,16 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +namespace +{ struct UniformDistribution { static constexpr const char * getName() { return "uniformDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 min, Float64 max, ColumnFloat64::Container & container) const { - auto distribution = std::uniform_real_distribution<>(parameters[0], parameters[1]); + auto distribution = std::uniform_real_distribution<>(min, max); for (auto & elem : container) elem = distribution(thread_local_rng); } @@ -45,9 +44,9 @@ struct NormalDistribution static constexpr const char * getName() { return "normalDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) const { - auto distribution = std::normal_distribution<>(parameters[0], parameters[1]); + auto distribution = std::normal_distribution<>(mean, variance); for (auto & elem : container) elem = distribution(thread_local_rng); } @@ -58,9 +57,22 @@ struct LogNormalDistribution static constexpr const char * getName() { return "logNormalDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) const { - auto distribution = std::lognormal_distribution<>(parameters[0], parameters[1]); + auto distribution = std::lognormal_distribution<>(mean, variance); + for (auto & elem : container) + elem = distribution(thread_local_rng); + } +}; + +struct ExponentialDistribution +{ + static constexpr const char * getName() { return "exponentialDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(Float64 lambda, ColumnFloat64::Container & container) const + { + auto distribution = std::exponential_distribution<>(lambda); for (auto & elem : container) elem = distribution(thread_local_rng); } @@ -71,9 +83,9 @@ struct ChiSquaredDistribution static constexpr const char * getName() { return "chiSquaredDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) const { - auto distribution = std::chi_squared_distribution<>(parameters[0]); + auto distribution = std::chi_squared_distribution<>(degree_of_freedom); for (auto & elem : container) elem = distribution(thread_local_rng); } @@ -84,9 +96,9 @@ struct StudentTDistribution static constexpr const char * getName() { return "studentTDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) const { - auto distribution = std::student_t_distribution<>(parameters[0]); + auto distribution = std::student_t_distribution<>(degree_of_freedom); for (auto & elem : container) elem = distribution(thread_local_rng); } @@ -95,17 +107,78 @@ struct StudentTDistribution struct FisherFDistribution { static constexpr const char * getName() { return "fisherFDistribution"; } - static constexpr size_t getNumberOfArguments() { return 1; } + static constexpr size_t getNumberOfArguments() { return 2; } - void generate(std::vector & parameters, ColumnFloat64::Container & container) const + void generate(Float64 d1, Float64 d2, ColumnFloat64::Container & container) const { - auto distribution = std::fisher_f_distribution<>(parameters[0]); + auto distribution = std::fisher_f_distribution<>(d1, d2); for (auto & elem : container) elem = distribution(thread_local_rng); } }; +struct BernoulliDistribution +{ + static constexpr const char * getName() { return "bernoulliDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(Float64 p, ColumnUInt8::Container & container) const + { + if (p < 0.0f || p > 1.0f) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); + + auto distribution = std::bernoulli_distribution(p); + for (auto & elem : container) + elem = static_cast(distribution(thread_local_rng)); + } +}; + +struct BinomialDistribution +{ + static constexpr const char * getName() { return "binomialDistribution"; } + static constexpr size_t getNumberOfArguments() { return 2; } + + void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) const + { + if (p < 0.0f || p > 1.0f) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); + + auto distribution = std::binomial_distribution(t, p); + for (auto & elem : container) + elem = static_cast(distribution(thread_local_rng)); + } +}; + +struct NegativeBinomialDistribution +{ + static constexpr const char * getName() { return "negativeBinomialDistribution"; } + static constexpr size_t getNumberOfArguments() { return 2; } + + void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) const + { + if (p < 0.0f || p > 1.0f) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); + + auto distribution = std::negative_binomial_distribution(t, p); + for (auto & elem : container) + elem = static_cast(distribution(thread_local_rng)); + } +}; + +struct PoissonDistribution +{ + static constexpr const char * getName() { return "poissonDistribution"; } + static constexpr size_t getNumberOfArguments() { return 1; } + + void generate(UInt64 n, ColumnUInt64::Container & container) const + { + auto distribution = std::poisson_distribution(n); + for (auto & elem : container) + elem = static_cast(distribution(thread_local_rng)); + } +}; +} /// Function which will generate values according to the distibution /// Accepts only constant arguments template @@ -114,6 +187,22 @@ class FunctionDistribution : public IFunction private: mutable Distribution distribution; + template + ResultType getParameterFromConstColumn(size_t parameter_number, const ColumnsWithTypeAndName & arguments) const + { + const IColumn * col = arguments[parameter_number].column.get(); + + if (!isColumnConst(*col)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Parameter number {} of function must be constant.", parameter_number, getName()); + + auto parameter = applyVisitor(FieldVisitorConvertToNumber(), assert_cast(*col).getField()); + + if (isNaN(parameter) || !std::isfinite(parameter)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter number {} of function {} cannot be NaN of infinite", parameter_number, getName()); + + return parameter; + } + public: static FunctionPtr create(ContextPtr) { @@ -138,30 +227,59 @@ class FunctionDistribution : public IFunction "Illegal type {} of argument of function {}, expected Float64", type->getName(), getName()); } - return std::make_shared(); + if constexpr (std::is_same_v) + return std::make_shared(); + else if constexpr ( + std::is_same_v + || std::is_same_v + || std::is_same_v) + return std::make_shared(); + else + return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override { - std::vector parameters(arguments.size()); - for (size_t i = 0; i < parameters.size(); ++i) + if constexpr (std::is_same_v) { - const IColumn * col = arguments[i].column.get(); - - if (!isColumnConst(*col)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The {}th argument of function must be constant.", getName()); - - parameters[i] = applyVisitor(FieldVisitorConvertToNumber(), assert_cast(*col).getField()); - - if (isNaN(parameters[i]) || !std::isfinite(parameters[i])) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter number {} of function {} cannot be NaN of infinite", i, getName()); + auto res_column = ColumnUInt8::create(input_rows_count); + auto & res_data = res_column->getData(); + distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + return res_column; + } + else if constexpr (std::is_same_v || std::is_same_v) + { + auto res_column = ColumnUInt64::create(input_rows_count); + auto & res_data = res_column->getData(); + distribution.generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); + return res_column; + } + else if constexpr (std::is_same_v) + { + auto res_column = ColumnUInt64::create(input_rows_count); + auto & res_data = res_column->getData(); + distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + return res_column; + } + else + { + auto res_column = ColumnFloat64::create(input_rows_count); + auto & res_data = res_column->getData(); + if constexpr (Distribution::getNumberOfArguments() == 1) + { + distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + } + else if constexpr (Distribution::getNumberOfArguments() == 2) + { + distribution.generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "More than two argument specified for function", getName()); + } + + return res_column; } - - auto res_column = ColumnFloat64::create(input_rows_count); - auto & res_data = res_column->getData(); - distribution.generate(parameters, res_data); - - return res_column; } }; @@ -178,7 +296,7 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT uniformDistribution(0, 1) FROM numbers(100000)"}}, + {"typical", "SELECT uniformDistribution(0, 1) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); @@ -192,7 +310,7 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT normalDistribution(0, 5) FROM numbers(100000)"}}, + {"typical", "SELECT normalDistribution(0, 5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); @@ -207,7 +325,22 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT logNormalDistribution(0, 5) FROM numbers(100000)"}}, + {"typical", "SELECT logNormalDistribution(0, 5) FROM numbers(100000);"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the exponential distribuion. +Accepts one parameter. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT exponentialDistribution(0, 5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); @@ -222,7 +355,7 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT chiSquaredDistribution(5) FROM numbers(100000)"}}, + {"typical", "SELECT chiSquaredDistribution(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); @@ -236,7 +369,7 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT studentTDistribution(5) FROM numbers(100000)"}}, + {"typical", "SELECT studentTDistribution(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); @@ -252,12 +385,69 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT studentTDistribution(5) FROM numbers(100000)"}}, + {"typical", "SELECT studentTDistribution(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>(); + factory.registerFunction>( + { + R"( +Returns a random number from the Bernoulli distribution. +Accepts two parameters - probability of success. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT bernoulliDistribution(0.1) FROM numbers(100000);"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the binomial distribution. +Accepts two parameters - number of experiments and probability of success in each experiment. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT binomialDistribution(10, 0.1) FROM numbers(100000);"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the negative binomial distribution. +Accepts two parameters - number of experiments and probability of success in each experiment. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT negativeBinomialDistribution(10, 0.1) FROM numbers(100000);"}}, + Documentation::Categories{"Distribution"} + }); + + + factory.registerFunction>( + { + R"( +Returns a random number from the poisson distribution. +Accepts two parameters - the mean number of occurrences. + +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "SELECT poissonDistribution(3) FROM numbers(100000);"}}, + Documentation::Categories{"Distribution"} + }); } } diff --git a/tests/queries/0_stateless/02462_distributions.reference b/tests/queries/0_stateless/02462_distributions.reference index fd2996b1c78c..17e5154b5f2f 100644 --- a/tests/queries/0_stateless/02462_distributions.reference +++ b/tests/queries/0_stateless/02462_distributions.reference @@ -4,3 +4,10 @@ Ok Ok Ok Ok +Ok +0 +1 +Ok +Ok +Ok +Ok diff --git a/tests/queries/0_stateless/02462_distributions.sql b/tests/queries/0_stateless/02462_distributions.sql index 318e87bde2ba..d6cc1acac1fe 100644 --- a/tests/queries/0_stateless/02462_distributions.sql +++ b/tests/queries/0_stateless/02462_distributions.sql @@ -1,12 +1,24 @@ # Values should be between 0 and 1 -SELECT DISTINCT if (a >= 0 AND a <= 1, 'Ok', 'Fail') FROM (SELECT uniformDistribution(0, 1) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0) AND a <= toFloat64(1), 'Ok', 'Fail') FROM (SELECT uniformDistribution(0, 1) AS a FROM numbers(100000)); # Mean should be around 0 -SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT normalDistribution(0, 5) AS a FROM numbers(100000))); -# Values should be > 0 -SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT logNormalDistribution(0, 5) AS a FROM numbers(100000)); -# Values should be > 0 -SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT chiSquaredDistribution(3) AS a FROM numbers(100000)); +SELECT DISTINCT if (m >= toFloat64(-0.2) AND m <= toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT normalDistribution(0, 5) AS a FROM numbers(100000))); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT logNormalDistribution(0, 5) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT exponentialDistribution(15) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT chiSquaredDistribution(3) AS a FROM numbers(100000)); # Mean should be around 0 SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT studentTDistribution(5) AS a FROM numbers(100000))); -# Values should be > 0 -SELECT DISTINCT if (a > 0, 'Ok', 'Fail') FROM (SELECT fisherFDistribution(3) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT fisherFDistribution(3, 4) AS a FROM numbers(100000)); +# There should be only 0s and 1s +SELECT a FROM (SELECT DISTINCT bernoulliDistribution(0.5) AS a FROM numbers(100000)) ORDER BY a; +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT binomialDistribution(3, 0.5) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT negativeBinomialDistribution(3, 0.5) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT poissonDistribution(44) AS a FROM numbers(100000)); +# Values should be >= 0 +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT exponentialDistribution(23) AS a FROM numbers(100000)); From ff0a9fde063f4e3ad5586b7249e848c7c4f77d3a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 00:22:51 +0200 Subject: [PATCH 04/12] Better test --- tests/queries/0_stateless/02462_distributions.reference | 1 - tests/queries/0_stateless/02462_distributions.sql | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/queries/0_stateless/02462_distributions.reference b/tests/queries/0_stateless/02462_distributions.reference index 17e5154b5f2f..56b04bcb856b 100644 --- a/tests/queries/0_stateless/02462_distributions.reference +++ b/tests/queries/0_stateless/02462_distributions.reference @@ -10,4 +10,3 @@ Ok Ok Ok Ok -Ok diff --git a/tests/queries/0_stateless/02462_distributions.sql b/tests/queries/0_stateless/02462_distributions.sql index d6cc1acac1fe..ea59bc99a679 100644 --- a/tests/queries/0_stateless/02462_distributions.sql +++ b/tests/queries/0_stateless/02462_distributions.sql @@ -20,5 +20,3 @@ SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT binomialDistri SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT negativeBinomialDistribution(3, 0.5) AS a FROM numbers(100000)); # Values should be >= 0 SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT poissonDistribution(44) AS a FROM numbers(100000)); -# Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT exponentialDistribution(23) AS a FROM numbers(100000)); From 8f43d93503be463e09e3b74a2387a56c91429f77 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 01:40:05 +0200 Subject: [PATCH 05/12] Style --- src/Functions/distribution.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp index 42296bdbb861..a362d04b71d1 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/distribution.cpp @@ -19,7 +19,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int TOO_SLOW; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; } @@ -179,7 +178,7 @@ struct PoissonDistribution }; } -/// Function which will generate values according to the distibution +/// Function which will generate values according to the distribution /// Accepts only constant arguments template class FunctionDistribution : public IFunction @@ -303,7 +302,7 @@ Typical usage: factory.registerFunction>( { R"( -Returns a random number from the normal distribuion. +Returns a random number from the normal distribution. Accepts two parameters - mean and variance. Typical usage: @@ -318,7 +317,7 @@ Typical usage: factory.registerFunction>( { R"( -Returns a random number from the lognormal distribuion (a distribution of a random variable whose logarithm is normally distributed). +Returns a random number from the lognormal distribution (a distribution of a random variable whose logarithm is normally distributed). Accepts two parameters - mean and variance. Typical usage: @@ -333,7 +332,7 @@ Typical usage: factory.registerFunction>( { R"( -Returns a random number from the exponential distribuion. +Returns a random number from the exponential distribution. Accepts one parameter. Typical usage: @@ -348,7 +347,7 @@ Typical usage: factory.registerFunction>( { R"( -Returns a random number from the chi-squared distribuion (a distribution of a sum of the squares of k independent standart normal random variables). +Returns a random number from the chi-squared distribution (a distribution of a sum of the squares of k independent standard normal random variables). Accepts one parameter - degree of freedom. Typical usage: From 14308b01cf456214162614dc79a272e3d64431a7 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 13:16:16 +0200 Subject: [PATCH 06/12] Review fixes + fix build --- src/Functions/distribution.cpp | 56 +++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp index a362d04b71d1..90d357ff9524 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/distribution.cpp @@ -21,16 +21,18 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_COLUMN; extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } namespace { struct UniformDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "uniformDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(Float64 min, Float64 max, ColumnFloat64::Container & container) const + static void generate(Float64 min, Float64 max, ColumnFloat64::Container & container) { auto distribution = std::uniform_real_distribution<>(min, max); for (auto & elem : container) @@ -40,10 +42,11 @@ struct UniformDistribution struct NormalDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "normalDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) const + static void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) { auto distribution = std::normal_distribution<>(mean, variance); for (auto & elem : container) @@ -53,10 +56,11 @@ struct NormalDistribution struct LogNormalDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "logNormalDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) const + static void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) { auto distribution = std::lognormal_distribution<>(mean, variance); for (auto & elem : container) @@ -66,10 +70,11 @@ struct LogNormalDistribution struct ExponentialDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "exponentialDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(Float64 lambda, ColumnFloat64::Container & container) const + static void generate(Float64 lambda, ColumnFloat64::Container & container) { auto distribution = std::exponential_distribution<>(lambda); for (auto & elem : container) @@ -79,10 +84,11 @@ struct ExponentialDistribution struct ChiSquaredDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "chiSquaredDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) const + static void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) { auto distribution = std::chi_squared_distribution<>(degree_of_freedom); for (auto & elem : container) @@ -92,10 +98,11 @@ struct ChiSquaredDistribution struct StudentTDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "studentTDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) const + static void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) { auto distribution = std::student_t_distribution<>(degree_of_freedom); for (auto & elem : container) @@ -105,10 +112,11 @@ struct StudentTDistribution struct FisherFDistribution { + using ReturnType = DataTypeFloat64; static constexpr const char * getName() { return "fisherFDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(Float64 d1, Float64 d2, ColumnFloat64::Container & container) const + static void generate(Float64 d1, Float64 d2, ColumnFloat64::Container & container) { auto distribution = std::fisher_f_distribution<>(d1, d2); for (auto & elem : container) @@ -118,10 +126,11 @@ struct FisherFDistribution struct BernoulliDistribution { + using ReturnType = DataTypeUInt8; static constexpr const char * getName() { return "bernoulliDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(Float64 p, ColumnUInt8::Container & container) const + static void generate(Float64 p, ColumnUInt8::Container & container) { if (p < 0.0f || p > 1.0f) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); @@ -134,10 +143,11 @@ struct BernoulliDistribution struct BinomialDistribution { + using ReturnType = DataTypeUInt64; static constexpr const char * getName() { return "binomialDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) const + static void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) { if (p < 0.0f || p > 1.0f) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); @@ -150,10 +160,11 @@ struct BinomialDistribution struct NegativeBinomialDistribution { + using ReturnType = DataTypeUInt64; static constexpr const char * getName() { return "negativeBinomialDistribution"; } static constexpr size_t getNumberOfArguments() { return 2; } - void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) const + static void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) { if (p < 0.0f || p > 1.0f) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Argument of function {} should be inside [0, 1] because it is a probability", getName()); @@ -166,10 +177,11 @@ struct NegativeBinomialDistribution struct PoissonDistribution { + using ReturnType = DataTypeUInt64; static constexpr const char * getName() { return "poissonDistribution"; } static constexpr size_t getNumberOfArguments() { return 1; } - void generate(UInt64 n, ColumnUInt64::Container & container) const + static void generate(UInt64 n, ColumnUInt64::Container & container) { auto distribution = std::poisson_distribution(n); for (auto & elem : container) @@ -184,11 +196,15 @@ template class FunctionDistribution : public IFunction { private: - mutable Distribution distribution; + Distribution distribution; template ResultType getParameterFromConstColumn(size_t parameter_number, const ColumnsWithTypeAndName & arguments) const { + if (parameter_number >= arguments.size()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Parameter number ({}) is greater than the size of arguments ({}). This is a bug", parameter_number, arguments.size()); + const IColumn * col = arguments[parameter_number].column.get(); if (!isColumnConst(*col)) @@ -209,7 +225,7 @@ class FunctionDistribution : public IFunction } static constexpr auto name = Distribution::getName(); - String getName() const override { return Distribution::getName(); } + String getName() const override { return name; } size_t getNumberOfArguments() const override { return Distribution::getNumberOfArguments(); } bool isStateful() const override { return true; } bool isDeterministic() const override { return false; } @@ -223,18 +239,10 @@ class FunctionDistribution : public IFunction WhichDataType which(type); if (!which.isFloat() && !which.isNativeUInt()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument of function {}, expected Float64", type->getName(), getName()); + "Illegal type {} of argument of function {}, expected Float64 or interger", type->getName(), getName()); } - if constexpr (std::is_same_v) - return std::make_shared(); - else if constexpr ( - std::is_same_v - || std::is_same_v - || std::is_same_v) - return std::make_shared(); - else - return std::make_shared(); + return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override @@ -274,7 +282,7 @@ class FunctionDistribution : public IFunction } else { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "More than two argument specified for function", getName()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "More than two argument specified for function {}", getName()); } return res_column; From aa7a22e786a5e2daf4b7e1413fc210bc7927a807 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 14:52:30 +0200 Subject: [PATCH 07/12] Style --- src/Functions/distribution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/distribution.cpp b/src/Functions/distribution.cpp index 90d357ff9524..768a0aacec8b 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/distribution.cpp @@ -239,7 +239,7 @@ class FunctionDistribution : public IFunction WhichDataType which(type); if (!which.isFloat() && !which.isNativeUInt()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of argument of function {}, expected Float64 or interger", type->getName(), getName()); + "Illegal type {} of argument of function {}, expected Float64 or integer", type->getName(), getName()); } return std::make_shared(); From d232fee1420853481d53b4fef979acc964432152 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 17:03:24 +0200 Subject: [PATCH 08/12] Rename functions --- ...{distribution.cpp => randDistribution.cpp} | 70 +++++++++---------- .../0_stateless/02462_distributions.sql | 22 +++--- 2 files changed, 46 insertions(+), 46 deletions(-) rename src/Functions/{distribution.cpp => randDistribution.cpp} (82%) diff --git a/src/Functions/distribution.cpp b/src/Functions/randDistribution.cpp similarity index 82% rename from src/Functions/distribution.cpp rename to src/Functions/randDistribution.cpp index 768a0aacec8b..985abb617e5f 100644 --- a/src/Functions/distribution.cpp +++ b/src/Functions/randDistribution.cpp @@ -29,7 +29,7 @@ namespace struct UniformDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "uniformDistribution"; } + static constexpr const char * getName() { return "randUniform"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(Float64 min, Float64 max, ColumnFloat64::Container & container) @@ -43,7 +43,7 @@ struct UniformDistribution struct NormalDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "normalDistribution"; } + static constexpr const char * getName() { return "randNormal"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) @@ -57,7 +57,7 @@ struct NormalDistribution struct LogNormalDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "logNormalDistribution"; } + static constexpr const char * getName() { return "randLogNormal"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(Float64 mean, Float64 variance, ColumnFloat64::Container & container) @@ -71,7 +71,7 @@ struct LogNormalDistribution struct ExponentialDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "exponentialDistribution"; } + static constexpr const char * getName() { return "randExponential"; } static constexpr size_t getNumberOfArguments() { return 1; } static void generate(Float64 lambda, ColumnFloat64::Container & container) @@ -85,7 +85,7 @@ struct ExponentialDistribution struct ChiSquaredDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "chiSquaredDistribution"; } + static constexpr const char * getName() { return "randChiSquared"; } static constexpr size_t getNumberOfArguments() { return 1; } static void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) @@ -99,7 +99,7 @@ struct ChiSquaredDistribution struct StudentTDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "studentTDistribution"; } + static constexpr const char * getName() { return "randStudentT"; } static constexpr size_t getNumberOfArguments() { return 1; } static void generate(Float64 degree_of_freedom, ColumnFloat64::Container & container) @@ -113,7 +113,7 @@ struct StudentTDistribution struct FisherFDistribution { using ReturnType = DataTypeFloat64; - static constexpr const char * getName() { return "fisherFDistribution"; } + static constexpr const char * getName() { return "randFisherF"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(Float64 d1, Float64 d2, ColumnFloat64::Container & container) @@ -127,7 +127,7 @@ struct FisherFDistribution struct BernoulliDistribution { using ReturnType = DataTypeUInt8; - static constexpr const char * getName() { return "bernoulliDistribution"; } + static constexpr const char * getName() { return "randBernoulli"; } static constexpr size_t getNumberOfArguments() { return 1; } static void generate(Float64 p, ColumnUInt8::Container & container) @@ -144,7 +144,7 @@ struct BernoulliDistribution struct BinomialDistribution { using ReturnType = DataTypeUInt64; - static constexpr const char * getName() { return "binomialDistribution"; } + static constexpr const char * getName() { return "randBinomial"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) @@ -161,7 +161,7 @@ struct BinomialDistribution struct NegativeBinomialDistribution { using ReturnType = DataTypeUInt64; - static constexpr const char * getName() { return "negativeBinomialDistribution"; } + static constexpr const char * getName() { return "randNegativeBinomial"; } static constexpr size_t getNumberOfArguments() { return 2; } static void generate(UInt64 t, Float64 p, ColumnUInt64::Container & container) @@ -178,7 +178,7 @@ struct NegativeBinomialDistribution struct PoissonDistribution { using ReturnType = DataTypeUInt64; - static constexpr const char * getName() { return "poissonDistribution"; } + static constexpr const char * getName() { return "randPoisson"; } static constexpr size_t getNumberOfArguments() { return 1; } static void generate(UInt64 n, ColumnUInt64::Container & container) @@ -193,7 +193,7 @@ struct PoissonDistribution /// Function which will generate values according to the distribution /// Accepts only constant arguments template -class FunctionDistribution : public IFunction +class FunctionRandomDistribution : public IFunction { private: Distribution distribution; @@ -221,7 +221,7 @@ class FunctionDistribution : public IFunction public: static FunctionPtr create(ContextPtr) { - return std::make_shared>(); + return std::make_shared>(); } static constexpr auto name = Distribution::getName(); @@ -293,7 +293,7 @@ class FunctionDistribution : public IFunction REGISTER_FUNCTION(Distribution) { - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the uniform distribution in the specified range. @@ -303,11 +303,11 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT uniformDistribution(0, 1) FROM numbers(100000);"}}, + {"typical", "SELECT randUniform(0, 1) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the normal distribution. @@ -317,12 +317,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT normalDistribution(0, 5) FROM numbers(100000);"}}, + {"typical", "SELECT randNormal(0, 5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the lognormal distribution (a distribution of a random variable whose logarithm is normally distributed). @@ -332,12 +332,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT logNormalDistribution(0, 5) FROM numbers(100000);"}}, + {"typical", "SELECT randLogNormal(0, 5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the exponential distribution. @@ -347,12 +347,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT exponentialDistribution(0, 5) FROM numbers(100000);"}}, + {"typical", "SELECT randExponential(0, 5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the chi-squared distribution (a distribution of a sum of the squares of k independent standard normal random variables). @@ -362,11 +362,11 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT chiSquaredDistribution(5) FROM numbers(100000);"}}, + {"typical", "SELECT randChiSquared(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the t-distribution. @@ -376,12 +376,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT studentTDistribution(5) FROM numbers(100000);"}}, + {"typical", "SELECT randStudentT(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the f-distribution. @@ -392,12 +392,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT studentTDistribution(5) FROM numbers(100000);"}}, + {"typical", "SELECT randFisherF(5) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the Bernoulli distribution. @@ -407,12 +407,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT bernoulliDistribution(0.1) FROM numbers(100000);"}}, + {"typical", "SELECT randBernoulli(0.1) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the binomial distribution. @@ -422,12 +422,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT binomialDistribution(10, 0.1) FROM numbers(100000);"}}, + {"typical", "SELECT randBinomial(10, 0.1) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the negative binomial distribution. @@ -437,12 +437,12 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT negativeBinomialDistribution(10, 0.1) FROM numbers(100000);"}}, + {"typical", "SELECT randNegativeBinomial(10, 0.1) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); - factory.registerFunction>( + factory.registerFunction>( { R"( Returns a random number from the poisson distribution. @@ -452,7 +452,7 @@ Typical usage: [example:typical] )", Documentation::Examples{ - {"typical", "SELECT poissonDistribution(3) FROM numbers(100000);"}}, + {"typical", "SELECT randPoisson(3) FROM numbers(100000);"}}, Documentation::Categories{"Distribution"} }); } diff --git a/tests/queries/0_stateless/02462_distributions.sql b/tests/queries/0_stateless/02462_distributions.sql index ea59bc99a679..8378dd3ff4a8 100644 --- a/tests/queries/0_stateless/02462_distributions.sql +++ b/tests/queries/0_stateless/02462_distributions.sql @@ -1,22 +1,22 @@ # Values should be between 0 and 1 -SELECT DISTINCT if (a >= toFloat64(0) AND a <= toFloat64(1), 'Ok', 'Fail') FROM (SELECT uniformDistribution(0, 1) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0) AND a <= toFloat64(1), 'Ok', 'Fail') FROM (SELECT randUniform(0, 1) AS a FROM numbers(100000)); # Mean should be around 0 -SELECT DISTINCT if (m >= toFloat64(-0.2) AND m <= toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT normalDistribution(0, 5) AS a FROM numbers(100000))); +SELECT DISTINCT if (m >= toFloat64(-0.2) AND m <= toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT randNormal(0, 5) AS a FROM numbers(100000))); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT logNormalDistribution(0, 5) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randLogNormal(0, 5) AS a FROM numbers(100000)); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT exponentialDistribution(15) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randExponential(15) AS a FROM numbers(100000)); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT chiSquaredDistribution(3) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randChiSquared(3) AS a FROM numbers(100000)); # Mean should be around 0 -SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT studentTDistribution(5) AS a FROM numbers(100000))); +SELECT DISTINCT if (m > toFloat64(-0.2) AND m < toFloat64(0.2), 'Ok', 'Fail') FROM (SELECT avg(a) as m FROM (SELECT randStudentT(5) AS a FROM numbers(100000))); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT fisherFDistribution(3, 4) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randFisherF(3, 4) AS a FROM numbers(100000)); # There should be only 0s and 1s -SELECT a FROM (SELECT DISTINCT bernoulliDistribution(0.5) AS a FROM numbers(100000)) ORDER BY a; +SELECT a FROM (SELECT DISTINCT randBernoulli(0.5) AS a FROM numbers(100000)) ORDER BY a; # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT binomialDistribution(3, 0.5) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randBinomial(3, 0.5) AS a FROM numbers(100000)); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT negativeBinomialDistribution(3, 0.5) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randNegativeBinomial(3, 0.5) AS a FROM numbers(100000)); # Values should be >= 0 -SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT poissonDistribution(44) AS a FROM numbers(100000)); +SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randPoisson(44) AS a FROM numbers(100000)); From fac1f067fe8d0942e48d0f059066722bbf93b287 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 18:32:40 +0200 Subject: [PATCH 09/12] Added optional argument --- src/Functions/randDistribution.cpp | 31 +++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/src/Functions/randDistribution.cpp b/src/Functions/randDistribution.cpp index 985abb617e5f..eec61c618f8e 100644 --- a/src/Functions/randDistribution.cpp +++ b/src/Functions/randDistribution.cpp @@ -1,6 +1,7 @@ #include #include #include +#include "Common/Exception.h" #include #include #include @@ -190,13 +191,19 @@ struct PoissonDistribution }; } -/// Function which will generate values according to the distribution -/// Accepts only constant arguments + +/** Function which will generate values according to the specified distribution + * Accepts only constant arguments + * Similar to the functions rand and rand64 an additional 'tag' argument could be added to the + * end of arguments list (this argument will be ignored) which will guarantee that functions are not sticked together + * during optimisations. + * Example: SELECT randNormal(0, 1, 1), randNormal(0, 1, 2) FROM numbers(10) + * This query will return two different columns + */ template class FunctionRandomDistribution : public IFunction { private: - Distribution distribution; template ResultType getParameterFromConstColumn(size_t parameter_number, const ColumnsWithTypeAndName & arguments) const @@ -227,6 +234,7 @@ class FunctionRandomDistribution : public IFunction static constexpr auto name = Distribution::getName(); String getName() const override { return name; } size_t getNumberOfArguments() const override { return Distribution::getNumberOfArguments(); } + bool isVariadic() const override { return true; } bool isStateful() const override { return true; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } @@ -234,8 +242,13 @@ class FunctionRandomDistribution : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - for (const auto & type : arguments) + auto desired = Distribution::getNumberOfArguments(); + if (arguments.size() != desired && arguments.size() != desired + 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong number of arguments for function {}. Should be {} or {}", getName(), desired, desired + 1); + + for (size_t i = 0; i < Distribution::getNumberOfArguments(); ++i) { + auto & type = arguments[i]; WhichDataType which(type); if (!which.isFloat() && !which.isNativeUInt()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, @@ -251,21 +264,21 @@ class FunctionRandomDistribution : public IFunction { auto res_column = ColumnUInt8::create(input_rows_count); auto & res_data = res_column->getData(); - distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + Distribution::generate(getParameterFromConstColumn(0, arguments), res_data); return res_column; } else if constexpr (std::is_same_v || std::is_same_v) { auto res_column = ColumnUInt64::create(input_rows_count); auto & res_data = res_column->getData(); - distribution.generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); + Distribution::generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); return res_column; } else if constexpr (std::is_same_v) { auto res_column = ColumnUInt64::create(input_rows_count); auto & res_data = res_column->getData(); - distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + Distribution::generate(getParameterFromConstColumn(0, arguments), res_data); return res_column; } else @@ -274,11 +287,11 @@ class FunctionRandomDistribution : public IFunction auto & res_data = res_column->getData(); if constexpr (Distribution::getNumberOfArguments() == 1) { - distribution.generate(getParameterFromConstColumn(0, arguments), res_data); + Distribution::generate(getParameterFromConstColumn(0, arguments), res_data); } else if constexpr (Distribution::getNumberOfArguments() == 2) { - distribution.generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); + Distribution::generate(getParameterFromConstColumn(0, arguments), getParameterFromConstColumn(1, arguments), res_data); } else { From 311cac8ecec1279f3954c069393b34f4f1acfe71 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 18 Oct 2022 18:36:02 +0200 Subject: [PATCH 10/12] Small test --- tests/queries/0_stateless/02462_distributions.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02462_distributions.sql b/tests/queries/0_stateless/02462_distributions.sql index 8378dd3ff4a8..b45dc897f2ae 100644 --- a/tests/queries/0_stateless/02462_distributions.sql +++ b/tests/queries/0_stateless/02462_distributions.sql @@ -20,3 +20,5 @@ SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randBinomial(3 SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randNegativeBinomial(3, 0.5) AS a FROM numbers(100000)); # Values should be >= 0 SELECT DISTINCT if (a >= toFloat64(0), 'Ok', 'Fail') FROM (SELECT randPoisson(44) AS a FROM numbers(100000)); +# No errors +SELECT randUniform(1, 2, 1), randNormal(0, 1, 'abacaba'), randLogNormal(0, 10, 'b'), randChiSquared(1, 1), randStudentT(7, '8'), randFisherF(23, 42, 100), randBernoulli(0.5, 2), randBinomial(3, 0.5, 1), randNegativeBinomial(3, 0.5, 2), randPoisson(44, 44) FORMAT Null; From e65265ea304e2b7db0a8c1483a7ef4e287eba879 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 19 Oct 2022 13:55:31 +0200 Subject: [PATCH 11/12] Update randDistribution.cpp --- src/Functions/randDistribution.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/randDistribution.cpp b/src/Functions/randDistribution.cpp index eec61c618f8e..d35a8d404292 100644 --- a/src/Functions/randDistribution.cpp +++ b/src/Functions/randDistribution.cpp @@ -248,7 +248,7 @@ class FunctionRandomDistribution : public IFunction for (size_t i = 0; i < Distribution::getNumberOfArguments(); ++i) { - auto & type = arguments[i]; + const auto & type = arguments[i]; WhichDataType which(type); if (!which.isFloat() && !which.isNativeUInt()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, From f424e16500988de37461f7d11cf7647971dbd654 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 19 Oct 2022 14:30:34 +0200 Subject: [PATCH 12/12] Update randDistribution.cpp --- src/Functions/randDistribution.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Functions/randDistribution.cpp b/src/Functions/randDistribution.cpp index d35a8d404292..94dad4fdc899 100644 --- a/src/Functions/randDistribution.cpp +++ b/src/Functions/randDistribution.cpp @@ -235,7 +235,6 @@ class FunctionRandomDistribution : public IFunction String getName() const override { return name; } size_t getNumberOfArguments() const override { return Distribution::getNumberOfArguments(); } bool isVariadic() const override { return true; } - bool isStateful() const override { return true; } bool isDeterministic() const override { return false; } bool isDeterministicInScopeOfQuery() const override { return false; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }