Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,31 @@ void ParquetPlainValuesReader<ColumnString>::readBatch(
);
}

/// Appends `num_values` entries to `col_ptr`, decoding each present value from the
/// bit stream at a width of exactly one bit.
///
/// @param col_ptr    destination column; must actually be a `TColumn` (enforced by assert_cast).
/// @param null_map   forwarded to the definition-level reader
///                   (presumably filled in for NULL positions -- confirm against RleValuesReader).
/// @param num_values number of entries to append after the column's current size.
template <typename TColumn>
void ParquetBitPlainReader<TColumn>::readBatch(
    MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values)
{
    /// New entries are written starting at the column's current size.
    auto cursor = col_ptr->size();
    auto * column_data = getResizedPrimitiveData(*assert_cast<TColumn *>(col_ptr.get()), cursor + num_values);

    def_level_reader->visitNullableValues(
        cursor,
        num_values,
        max_def_level,
        null_map,
        /* individual_visitor */ [&](size_t nest_cursor)
        {
            /// Zero-initialised on purpose: the result of GetValue() is not checked here, and if the
            /// reader cannot supply a bit (exhausted / truncated page) `byte` would otherwise be
            /// copied into the column with an indeterminate value.
            uint8_t byte = 0;
            bit_reader->GetValue(1, &byte);
            column_data[nest_cursor] = byte;
        },
        /* repeated_visitor */ [&](size_t nest_cursor, UInt32 count)
        {
            /// Decode `count` consecutive one-bit values straight into the column buffer.
            bit_reader->GetBatch(1, &column_data[nest_cursor], count);
        }
    );
}


template <>
void ParquetPlainValuesReader<ColumnDecimal<DateTime64>, ParquetReaderTypes::TimestampInt96>::readBatch(
Expand Down Expand Up @@ -561,6 +586,9 @@ template class ParquetPlainValuesReader<ColumnDecimal<Decimal32>>;
template class ParquetPlainValuesReader<ColumnDecimal<Decimal64>>;
template class ParquetPlainValuesReader<ColumnDecimal<DateTime64>>;
template class ParquetPlainValuesReader<ColumnString>;
template class ParquetPlainValuesReader<ColumnUInt8>;

template class ParquetBitPlainReader<ColumnUInt8>;

template class ParquetFixedLenPlainReader<ColumnDecimal<Decimal128>>;
template class ParquetFixedLenPlainReader<ColumnDecimal<Decimal256>>;
Expand All @@ -569,6 +597,7 @@ template class ParquetRleLCReader<ColumnUInt8>;
template class ParquetRleLCReader<ColumnUInt16>;
template class ParquetRleLCReader<ColumnUInt32>;

template class ParquetRleDictReader<ColumnUInt8>;
template class ParquetRleDictReader<ColumnInt32>;
template class ParquetRleDictReader<ColumnUInt32>;
template class ParquetRleDictReader<ColumnInt64>;
Expand Down
21 changes: 21 additions & 0 deletions src/Processors/Formats/Impl/Parquet/ParquetDataValuesReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,27 @@ class ParquetPlainValuesReader : public ParquetDataValuesReader
ParquetDataBuffer plain_data_buffer;
};

template <typename TColumn>
class ParquetBitPlainReader : public ParquetDataValuesReader
{
public:
ParquetBitPlainReader(
Int32 max_def_level_,
std::unique_ptr<RleValuesReader> def_level_reader_,
std::unique_ptr<arrow::bit_util::BitReader> bit_reader_)
: max_def_level(max_def_level_)
, def_level_reader(std::move(def_level_reader_))
, bit_reader(std::move(bit_reader_))
{}

void readBatch(MutableColumnPtr & col_ptr, LazyNullMap & null_map, UInt32 num_values) override;

private:
Int32 max_def_level;
std::unique_ptr<RleValuesReader> def_level_reader;
std::unique_ptr<arrow::bit_util::BitReader> bit_reader;
};

/**
* The data and definition level encoding are same as ParquetPlainValuesReader.
* But the element size is const and bigger than primitive data type.
Expand Down
38 changes: 29 additions & 9 deletions src/Processors/Formats/Impl/Parquet/ParquetLeafColReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -458,16 +458,29 @@ void ParquetLeafColReader<TColumn>::readPageV1(const parquet::DataPageV1 & page)
degradeDictionary();
}

ParquetDataBuffer parquet_buffer = [&]()
if (col_descriptor.physical_type() == parquet::Type::BOOLEAN)
{
if constexpr (!std::is_same_v<ColumnDecimal<DateTime64>, TColumn>)
return ParquetDataBuffer(buffer, max_size);

auto scale = assert_cast<const DataTypeDateTime64 &>(*base_data_type).getScale();
return ParquetDataBuffer(buffer, max_size, scale);
}();
data_values_reader = createPlainReader<TColumn>(
col_descriptor, std::move(def_level_reader), std::move(parquet_buffer));
if constexpr (std::is_same_v<TColumn, ColumnUInt8>)
{
auto bit_reader = std::make_unique<arrow::bit_util::BitReader>(buffer, max_size);
data_values_reader = std::make_unique<ParquetBitPlainReader<ColumnUInt8>>(col_descriptor.max_definition_level(),
std::move(def_level_reader),
std::move(bit_reader));
}
}
else
{
ParquetDataBuffer parquet_buffer = [&]()
{
if constexpr (!std::is_same_v<ColumnDecimal<DateTime64>, TColumn>)
return ParquetDataBuffer(buffer, max_size);

auto scale = assert_cast<const DataTypeDateTime64 &>(*base_data_type).getScale();
return ParquetDataBuffer(buffer, max_size, scale);
}();
data_values_reader = createPlainReader<TColumn>(
col_descriptor, std::move(def_level_reader), std::move(parquet_buffer));
}
break;
}
case parquet::Encoding::RLE_DICTIONARY:
Expand Down Expand Up @@ -518,6 +531,12 @@ std::unique_ptr<ParquetDataValuesReader> ParquetLeafColReader<TColumn>::createDi
});
return res;
}

if (col_descriptor.physical_type() == parquet::Type::type::BOOLEAN)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Dictionary encoding for booleans is not supported");
}

return std::make_unique<ParquetRleDictReader<TColumn>>(
col_descriptor.max_definition_level(),
std::move(def_level_reader),
Expand All @@ -526,6 +545,7 @@ std::unique_ptr<ParquetDataValuesReader> ParquetLeafColReader<TColumn>::createDi
}


template class ParquetLeafColReader<ColumnUInt8>;
template class ParquetLeafColReader<ColumnInt32>;
template class ParquetLeafColReader<ColumnUInt32>;
template class ParquetLeafColReader<ColumnInt64>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ std::unique_ptr<ParquetColumnReader> ColReaderFactory::makeReader()
switch (col_descriptor.physical_type())
{
case parquet::Type::BOOLEAN:
break;
return makeLeafReader<DataTypeUInt8>();
case parquet::Type::INT32:
return fromInt32();
case parquet::Type::INT64:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
0 false
1 \N
2 false
3 \N
4 false
5 \N
6 false
7 \N
8 true
9 \N
0 false
1 \N
2 false
3 \N
4 false
5 \N
6 false
7 \N
8 true
9 \N
21 changes: 21 additions & 0 deletions tests/queries/0_stateless/03254_parquet_bool_native_reader.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest

# Reads a Parquet file containing a nullable boolean column twice -- once with
# input_format_parquet_use_native_reader disabled and once with it enabled --
# and prints the rows of both runs so they can be compared against the reference.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Querying a file that does not exist makes the client print an Exception that mentions the
# full path it tried to open; field 9 of that line, with "/nonexist.txt" stripped, is the
# directory that file() resolves paths against.
USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Per-test subdirectory so the copied data file does not collide with other tests.
WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"

mkdir -p "${WORKING_DIR}"

DATA_FILE="${CUR_DIR}/data_parquet/nullbool.parquet"

DATA_FILE_USER_PATH="${WORKING_DIR}/nullbool.parquet"

# Quoted so that paths containing spaces or glob characters are copied verbatim.
cp "${DATA_FILE}" "${DATA_FILE_USER_PATH}"

# Same query with the native reader off, then on.
${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=false;"
${CLICKHOUSE_CLIENT} --query="select id, bool from file('${DATA_FILE_USER_PATH}', Parquet) order by id SETTINGS input_format_parquet_use_native_reader=true;"
Binary file not shown.
Loading