Skip to content

Commit

Permalink
ARROW-7215: [C++][Gandiva] Implement castVARCHAR(numeric_type) functions
Browse files Browse the repository at this point in the history
This PR implements the castVARCHAR functions for numeric values in Gandiva.

It replaces the logic of PR apache#8158 so that the function output matches the formatting conventions used by Java.

Closes apache#9816 from anthonylouisbsb/feature/fix-castvarchar-to-match-java-impl and squashes the following commits:

7df55a5 <Anthony Louis> Apply formatting changes
7a724c0 <Anthony Louis> Remove unnecessary macros
4fb8a7f <Anthony Louis> Refactor if chain
e787051 <Anthony Louis> Add test to infinity case
b62b856 <Anthony Louis> Add comments for changes
cec11bb <Anthony Louis> Add tests to check Java compatibility
302139c <Anthony Louis> Add emit trailing point tests
efb94b9 <Anthony Louis> Add -0.0 inside cast test
523e60a <Anthony Louis> Add custom constructor inside the class
34f2f92 <Anthony Louis> Add class to print in formatted way
e244502 <Anthony Louis> Fix tests to consider java formatting
33bc5b2 <Projjal Chanda> added castvarchar(numeric_types) functions

Lead-authored-by: Anthony Louis <anthony@simbioseventures.com>
Co-authored-by: Projjal Chanda <iam@pchanda.com>
Signed-off-by: Praveen <praveen@dremio.com>
  • Loading branch information
2 people authored and praveenbingo committed Apr 15, 2021
1 parent 1e6819c commit 15137e2
Show file tree
Hide file tree
Showing 10 changed files with 618 additions and 20 deletions.
18 changes: 18 additions & 0 deletions cpp/src/arrow/util/formatting.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,29 @@ struct FloatToStringFormatter::Impl {
: converter_(DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN, "inf", "nan",
'e', -6, 10, 6, 0) {}

// Builds the implementation with caller-supplied conversion settings.
// Every argument is forwarded, in the same order, to the DoubleToStringConverter
// constructor; this class adds no validation or defaults of its own.
Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
int decimal_in_shortest_low, int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
: converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
max_trailing_padding_zeroes_in_precision_mode) {}

DoubleToStringConverter converter_;
};

// Default formatter: backed by a default-constructed Impl.
FloatToStringFormatter::FloatToStringFormatter() : impl_(new Impl()) {}

// Formatter with custom conversion settings. All arguments are passed through
// unchanged to Impl, which hands them to the underlying DoubleToStringConverter.
FloatToStringFormatter::FloatToStringFormatter(
int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
int decimal_in_shortest_low, int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
: impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
decimal_in_shortest_low, decimal_in_shortest_high,
max_leading_padding_zeroes_in_precision_mode,
max_trailing_padding_zeroes_in_precision_mode)) {}

// Destructor defined in this translation unit, where Impl is a complete type.
// NOTE(review): presumably required because impl_ is a smart pointer to the
// forward-declared Impl -- confirm against the declaration in formatting.h.
FloatToStringFormatter::~FloatToStringFormatter() {}

int FloatToStringFormatter::FormatFloat(float v, char* out_buffer, int out_size) {
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/arrow/util/formatting.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/double_conversion.h"
#include "arrow/util/string_view.h"
#include "arrow/util/time.h"
#include "arrow/util/visibility.h"
Expand Down Expand Up @@ -219,6 +220,11 @@ class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type>
class ARROW_EXPORT FloatToStringFormatter {
public:
FloatToStringFormatter();
// Constructs a formatter with custom conversion settings. The parameters mirror,
// in order, those of double_conversion::DoubleToStringConverter's constructor and
// are forwarded to it unchanged.
FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
char exp_character, int decimal_in_shortest_low,
int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode);
~FloatToStringFormatter();

// Returns the number of characters written
Expand All @@ -239,6 +245,16 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter {

explicit FloatToStringFormatterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}

// Constructs the mixin with custom conversion settings by forwarding every
// argument, unchanged and in order, to the FloatToStringFormatter base class.
FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
char exp_character, int decimal_in_shortest_low,
int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
: FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
decimal_in_shortest_low, decimal_in_shortest_high,
max_leading_padding_zeroes_in_precision_mode,
max_trailing_padding_zeroes_in_precision_mode) {}

template <typename Appender>
Return<Appender> operator()(value_type value, Appender&& append) {
char buffer[buffer_size];
Expand Down
20 changes: 19 additions & 1 deletion cpp/src/arrow/vendored/double-conversion/double-conversion.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,25 @@ void DoubleToStringConverter::CreateExponentialRepresentation(
StringBuilder* result_builder) const {
ASSERT(length != 0);
result_builder->AddCharacter(decimal_digits[0]);
if (length != 1) {

/* If the mantissa of the scientific notation representation is an integer,
* the EMIT_TRAILING_DECIMAL_POINT flag adds a '.' character right after the
* mantissa digit:
* - With EMIT_TRAILING_DECIMAL_POINT enabled -> 0.0009 => 9.E-4
* - With EMIT_TRAILING_DECIMAL_POINT disabled -> 0.0009 => 9E-4
*
* If the mantissa is an integer and the EMIT_TRAILING_ZERO_AFTER_POINT flag is also
* enabled, a '0' character is added after that '.'. Note that
* EMIT_TRAILING_ZERO_AFTER_POINT only takes effect here when
* EMIT_TRAILING_DECIMAL_POINT is enabled as well. */
if(length == 1){
if ((flags_ & EMIT_TRAILING_DECIMAL_POINT) != 0) {
result_builder->AddCharacter('.');

if ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) {
result_builder->AddCharacter('0');
}
}
} else {
result_builder->AddCharacter('.');
result_builder->AddSubstring(&decimal_digits[1], length-1);
}
Expand Down
11 changes: 11 additions & 0 deletions cpp/src/arrow/vendored/double-conversion/double-conversion.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,17 @@ class DoubleToStringConverter {
// ToPrecision(230.0, 2) -> "230"
// ToPrecision(230.0, 2) -> "230." with EMIT_TRAILING_DECIMAL_POINT.
// ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT.
//
// When converting numbers to scientific notation, if the mantissa of the
// representation is an integer, the EMIT_TRAILING_DECIMAL_POINT flag adds a '.'
// character right after the mantissa digit:
// - With EMIT_TRAILING_DECIMAL_POINT enabled -> 0.0009 => 9.E-4
// - With EMIT_TRAILING_DECIMAL_POINT disabled -> 0.0009 => 9E-4
//
// If the mantissa is an integer and the EMIT_TRAILING_ZERO_AFTER_POINT flag is also
// enabled, a '0' character is added after that '.'. Note that
// EMIT_TRAILING_ZERO_AFTER_POINT only takes effect in this case when
// EMIT_TRAILING_DECIMAL_POINT is enabled as well.
// - With both flags enabled -> 0.0009 => 9.0E-4
DoubleToStringConverter(int flags,
const char* infinity_symbol,
const char* nan_symbol,
Expand Down
69 changes: 69 additions & 0 deletions cpp/src/gandiva/formatting_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/type.h"
#include "arrow/util/formatting.h"
#include "arrow/vendored/double-conversion/double-conversion.h"

namespace gandiva {

/// \brief The entry point for conversion to strings.
template <typename ARROW_TYPE, typename Enable = void>
class GdvStringFormatter;

using double_conversion::DoubleToStringConverter;

/// \brief Float-to-string formatter configured to mimic Java's formatting of
/// floating point numbers.
///
/// It reuses arrow's FloatToStringFormatterMixin unchanged and only supplies a
/// different set of conversion parameters through its default constructor.
template <typename ARROW_TYPE>
class FloatToStringGdvMixin
: public arrow::internal::FloatToStringFormatterMixin<ARROW_TYPE> {
public:
using arrow::internal::FloatToStringFormatterMixin<
ARROW_TYPE>::FloatToStringFormatterMixin;

// This mixin is a thin variant of the existing FloatToStringFormatterMixin: it passes
// specific parameters to the base class so that floating point numbers are cast to
// strings using the same patterns as Java.
//
// Java represents real numbers in one of two ways, following these rules:
//- If the number is greater than or equal to 10^7 or less than 10^(-3),
// it is represented using scientific notation, e.g.:
// - 0.000012 -> 1.2E-5
// - 10000002.3 -> 1.00000023E7
//- Otherwise (10^(-3) <= number < 10^7) it is shown in plain decimal notation.
//
// Arguments passed below, in base-constructor order: flags (always emit ".0" after an
// integral mantissa), infinity symbol, NaN symbol, exponent character,
// decimal_in_shortest_low (-3), decimal_in_shortest_high (7),
// max_leading_padding_zeroes_in_precision_mode (3) and
// max_trailing_padding_zeroes_in_precision_mode (1).
explicit FloatToStringGdvMixin(const std::shared_ptr<arrow::DataType>& = NULLPTR)
: arrow::internal::FloatToStringFormatterMixin<ARROW_TYPE>(
DoubleToStringConverter::EMIT_TRAILING_ZERO_AFTER_POINT |
DoubleToStringConverter::EMIT_TRAILING_DECIMAL_POINT,
"Infinity", "NaN", 'E', -3, 7, 3, 1) {}
};

/// \brief GdvStringFormatter specialization for 32-bit floats; all behavior comes
/// from FloatToStringGdvMixin, whose constructors are inherited as-is.
template <>
class GdvStringFormatter<arrow::FloatType>
: public FloatToStringGdvMixin<arrow::FloatType> {
public:
using FloatToStringGdvMixin::FloatToStringGdvMixin;
};

/// \brief GdvStringFormatter specialization for 64-bit floats; all behavior comes
/// from FloatToStringGdvMixin, whose constructors are inherited as-is.
template <>
class GdvStringFormatter<arrow::DoubleType>
: public FloatToStringGdvMixin<arrow::DoubleType> {
public:
using FloatToStringGdvMixin::FloatToStringGdvMixin;
};
} // namespace gandiva
16 changes: 16 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "castVARCHAR_utf8_int64",
NativeFunction::kNeedsContext),

NativeFunction("castVARCHAR", {}, DataTypeVector{int32(), int64()}, utf8(),
kResultNullIfNull, "gdv_fn_castVARCHAR_int32_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("castVARCHAR", {}, DataTypeVector{int64(), int64()}, utf8(),
kResultNullIfNull, "gdv_fn_castVARCHAR_int64_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("castVARCHAR", {}, DataTypeVector{float32(), int64()}, utf8(),
kResultNullIfNull, "gdv_fn_castVARCHAR_float32_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("castVARCHAR", {}, DataTypeVector{float64(), int64()}, utf8(),
kResultNullIfNull, "gdv_fn_castVARCHAR_float64_int64",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("castVARCHAR", {}, DataTypeVector{decimal128(), int64()}, utf8(),
kResultNullIfNull, "castVARCHAR_decimal128_int64",
NativeFunction::kNeedsContext),
Expand Down
119 changes: 119 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@
#include <string>
#include <vector>

#include "arrow/util/formatting.h"
#include "arrow/util/value_parsing.h"
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"
#include "gandiva/formatting_utils.h"
#include "gandiva/hash_utils.h"
#include "gandiva/in_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/precompiled/types.h"
#include "gandiva/random_generator_holder.h"
#include "gandiva/to_date_holder.h"

Expand Down Expand Up @@ -303,6 +306,86 @@ CAST_NUMERIC_FROM_STRING(float, arrow::FloatType, FLOAT4)
CAST_NUMERIC_FROM_STRING(double, arrow::DoubleType, FLOAT8)

#undef CAST_NUMERIC_FROM_STRING

// Defines gdv_fn_castVARCHAR_<IN_TYPE>_int64(context, value, len, out_len): formats the
// integer `value` with arrow's StringFormatter<ARROW_TYPE> and returns at most `len`
// bytes of the resulting text.
//
// - `len` < 0 : an error is set on `context`, *out_len = 0 and "" is returned.
// - `len` == 0: *out_len = 0 and "" is returned (no error).
// - otherwise : the text, truncated to `len` bytes, is copied into a buffer obtained
//               from gdv_fn_context_arena_malloc; *out_len is the number of bytes
//               copied. The buffer is NOT null-terminated.
//
// The buffer is sized from the formatted text (min(len, text size)) instead of from
// `len`: narrowing a large int64 `len` to the allocator's int32 size could otherwise
// request a negative or too-small buffer while more bytes are copied into it, and
// allocating `len` bytes per value wastes arena memory for large target lengths.
#define GDV_FN_CAST_VARCHAR_INTEGER(IN_TYPE, ARROW_TYPE) \
  GANDIVA_EXPORT \
  const char* gdv_fn_castVARCHAR_##IN_TYPE##_int64(int64_t context, gdv_##IN_TYPE value, \
                                                   int64_t len, int32_t * out_len) { \
    *out_len = 0; \
    if (len < 0) { \
      gdv_fn_context_set_error_msg(context, "Buffer length can not be negative"); \
      return ""; \
    } \
    if (len == 0) { \
      return ""; \
    } \
    arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter; \
    char* ret = nullptr; \
    bool alloc_failed = false; \
    arrow::Status status = formatter(value, [&](arrow::util::string_view v) { \
      /* copy_len <= v.size(), so the narrowing to int32 below is always safe */ \
      const int64_t size = static_cast<int64_t>(v.size()); \
      const int32_t copy_len = static_cast<int32_t>(len < size ? len : size); \
      ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, copy_len)); \
      if (ret == nullptr) { \
        alloc_failed = true; \
        return arrow::Status::OutOfMemory("Could not allocate memory"); \
      } \
      memcpy(ret, v.data(), copy_len); \
      *out_len = copy_len; \
      return arrow::Status::OK(); \
    }); \
    if (alloc_failed) { \
      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
      *out_len = 0; \
      return ""; \
    } \
    if (!status.ok()) { \
      std::string err = "Could not cast " + std::to_string(value) + " to string"; \
      gdv_fn_context_set_error_msg(context, err.c_str()); \
      *out_len = 0; \
      return ""; \
    } \
    return ret; \
  }

// Defines gdv_fn_castVARCHAR_<IN_TYPE>_int64(context, value, len, out_len): formats the
// floating point `value` with gandiva::GdvStringFormatter<ARROW_TYPE> and returns at
// most `len` bytes of the resulting text.
//
// - `len` < 0 : an error is set on `context`, *out_len = 0 and "" is returned.
// - `len` == 0: *out_len = 0 and "" is returned (no error).
// - otherwise : the text, truncated to `len` bytes, is copied into a buffer obtained
//               from gdv_fn_context_arena_malloc; *out_len is the number of bytes
//               copied. The buffer is NOT null-terminated.
//
// The buffer is sized from the formatted text (min(len, text size)) instead of from
// `len`: narrowing a large int64 `len` to the allocator's int32 size could otherwise
// request a negative or too-small buffer while more bytes are copied into it, and
// allocating `len` bytes per value wastes arena memory for large target lengths.
#define GDV_FN_CAST_VARCHAR_REAL(IN_TYPE, ARROW_TYPE) \
  GANDIVA_EXPORT \
  const char* gdv_fn_castVARCHAR_##IN_TYPE##_int64(int64_t context, gdv_##IN_TYPE value, \
                                                   int64_t len, int32_t * out_len) { \
    *out_len = 0; \
    if (len < 0) { \
      gdv_fn_context_set_error_msg(context, "Buffer length can not be negative"); \
      return ""; \
    } \
    if (len == 0) { \
      return ""; \
    } \
    gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter; \
    char* ret = nullptr; \
    bool alloc_failed = false; \
    arrow::Status status = formatter(value, [&](arrow::util::string_view v) { \
      /* copy_len <= v.size(), so the narrowing to int32 below is always safe */ \
      const int64_t size = static_cast<int64_t>(v.size()); \
      const int32_t copy_len = static_cast<int32_t>(len < size ? len : size); \
      ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, copy_len)); \
      if (ret == nullptr) { \
        alloc_failed = true; \
        return arrow::Status::OutOfMemory("Could not allocate memory"); \
      } \
      memcpy(ret, v.data(), copy_len); \
      *out_len = copy_len; \
      return arrow::Status::OK(); \
    }); \
    if (alloc_failed) { \
      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
      *out_len = 0; \
      return ""; \
    } \
    if (!status.ok()) { \
      std::string err = "Could not cast " + std::to_string(value) + " to string"; \
      gdv_fn_context_set_error_msg(context, err.c_str()); \
      *out_len = 0; \
      return ""; \
    } \
    return ret; \
  }

// Instantiate the castVARCHAR stubs: one exported function per numeric input type
// (gdv_fn_castVARCHAR_{int32,int64,float32,float64}_int64).
GDV_FN_CAST_VARCHAR_INTEGER(int32, Int32Type)
GDV_FN_CAST_VARCHAR_INTEGER(int64, Int64Type)
GDV_FN_CAST_VARCHAR_REAL(float32, FloatType)
GDV_FN_CAST_VARCHAR_REAL(float64, DoubleType)

// The generator macros are not needed past this point.
#undef GDV_FN_CAST_VARCHAR_INTEGER
#undef GDV_FN_CAST_VARCHAR_REAL
}

namespace gandiva {
Expand Down Expand Up @@ -471,6 +554,42 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_utf8", types->double_type(), args,
reinterpret_cast<void*>(gdv_fn_castFLOAT8_utf8));

// gdv_fn_castVARCHAR_int32_int64
args = {types->i64_type(), // int64_t execution_context
types->i32_type(), // int32_t value
types->i64_type(), // int64_t len
types->i32_ptr_type()}; // int32_t* out_len
engine->AddGlobalMappingForFunc(
"gdv_fn_castVARCHAR_int32_int64", types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_castVARCHAR_int32_int64));

// gdv_fn_castVARCHAR_int64_int64
args = {types->i64_type(), // int64_t execution_context
types->i64_type(), // int64_t value
types->i64_type(), // int64_t len
types->i32_ptr_type()}; // int32_t* out_len
engine->AddGlobalMappingForFunc(
"gdv_fn_castVARCHAR_int64_int64", types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_castVARCHAR_int64_int64));

// gdv_fn_castVARCHAR_float32_int64
args = {types->i64_type(), // int64_t execution_context
types->float_type(), // float value
types->i64_type(), // int64_t len
types->i32_ptr_type()}; // int32_t* out_len
engine->AddGlobalMappingForFunc(
"gdv_fn_castVARCHAR_float32_int64", types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_castVARCHAR_float32_int64));

// gdv_fn_castVARCHAR_float64_int64
args = {types->i64_type(), // int64_t execution_context
types->double_type(), // double value
types->i64_type(), // int64_t len
types->i32_ptr_type()}; // int32_t* out_len
engine->AddGlobalMappingForFunc(
"gdv_fn_castVARCHAR_float64_int64", types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_castVARCHAR_float64_int64));

// gdv_fn_sha1_int8
args = {
types->i64_type(), // context
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,17 @@ float gdv_fn_castFLOAT4_utf8(int64_t context, const char* data, int32_t data_len

GANDIVA_EXPORT
double gdv_fn_castFLOAT8_utf8(int64_t context, const char* data, int32_t data_len);

// castVARCHAR(numeric, len) stubs. Each one formats `value` as text and returns at
// most `len` bytes of it; the number of bytes returned is written to `*out_len` and
// the returned buffer is not null-terminated. A negative `len` sets an error on
// `context` and yields an empty result (*out_len == 0).

// int32 -> varchar
GANDIVA_EXPORT
const char* gdv_fn_castVARCHAR_int32_int64(int64_t context, int32_t value, int64_t len,
int32_t* out_len);
// int64 -> varchar
GANDIVA_EXPORT
const char* gdv_fn_castVARCHAR_int64_int64(int64_t context, int64_t value, int64_t len,
int32_t* out_len);
// float32 -> varchar
GANDIVA_EXPORT
const char* gdv_fn_castVARCHAR_float32_int64(int64_t context, float value, int64_t len,
int32_t* out_len);
// float64 -> varchar
GANDIVA_EXPORT
const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int64_t len,
int32_t* out_len);
}

0 comments on commit 15137e2

Please sign in to comment.