Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support orc filter push down (file + stripe + rowgroup level) #55330

Merged
merged 36 commits into from Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
20d1eb4
support orc filter push down
taiyang-li Oct 8, 2023
bd51d21
update orc lib version
taiyang-li Oct 8, 2023
bd011a3
replace setqueryinfo with setkeycondition
taiyang-li Oct 8, 2023
c482ad8
fix issue https://github.com/ClickHouse/ClickHouse/issues/53536
taiyang-li Oct 9, 2023
31bd247
refactor source with key condition
taiyang-li Oct 9, 2023
2f5f2ba
fix building error
taiyang-li Oct 9, 2023
0d3213d
remove std::cout
taiyang-li Oct 9, 2023
7b328df
update orc
taiyang-li Oct 9, 2023
507620b
update orc version
taiyang-li Oct 10, 2023
f05511b
fix bugs
taiyang-li Oct 10, 2023
7acefea
improve code
taiyang-li Oct 10, 2023
aa7d89f
upgrade orc lib
taiyang-li Oct 10, 2023
d7b1257
fix code style
taiyang-li Oct 11, 2023
1bb2a22
change as requested
taiyang-li Oct 13, 2023
72c11c4
add performance tests for orc filter push down
taiyang-li Oct 13, 2023
c01a206
add performance tests for orc filter push down
taiyang-li Oct 13, 2023
592b30a
fix all bugs
taiyang-li Oct 13, 2023
57d714d
fix default as null issue
taiyang-li Oct 16, 2023
1c6df04
add uts for null as default issues
taiyang-li Oct 16, 2023
2cebb06
upgrade orc lib
taiyang-li Oct 16, 2023
048eb13
merge master and solve conflict
taiyang-li Oct 17, 2023
de22fdc
fix failed orc lib uts and fix typo
taiyang-li Oct 17, 2023
d950dc1
fix failed uts
taiyang-li Oct 18, 2023
5b47432
fix failed uts
taiyang-li Oct 18, 2023
1c4530d
fix ast fuzzer tests
taiyang-li Oct 18, 2023
6e9ca51
Merge branch 'master' into ch_orc_filter_push_down
taiyang-li Oct 18, 2023
be39d23
fix bug of uint64 overflow in https://s3.amazonaws.com/clickhouse-tes…
taiyang-li Oct 18, 2023
fab596d
fix asan fatal caused by reused column vector batch in native orc inp…
taiyang-li Oct 19, 2023
bb0a5a9
fix wrong performance tests
taiyang-li Oct 19, 2023
21f1db6
disable 02892_orc_filter_pushdown on aarch64. https://s3.amazonaws.co…
taiyang-li Oct 19, 2023
d3e89f7
add some comments
taiyang-li Oct 19, 2023
d36e3e1
add some comments
taiyang-li Oct 19, 2023
822965d
Merge branch 'master' into ch_orc_filter_push_down
taiyang-li Oct 19, 2023
0f4a3c8
inline range::equals and range::less
taiyang-li Oct 20, 2023
eacc41e
fix data race of key condition
taiyang-li Oct 24, 2023
ef4b5d5
trigger ci
taiyang-li Oct 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Core/Settings.h
Expand Up @@ -895,6 +895,7 @@ class IColumn;
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
M(Bool, input_format_orc_filter_push_down, true, "When reading ORC files, skip whole stripes or row groups based on the WHERE/PREWHERE expressions, min/max statistics or bloom filter in the ORC metadata.", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatFactory.cpp
Expand Up @@ -193,6 +193,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string;
format_settings.orc.output_compression_method = settings.output_format_orc_compression_method;
format_settings.orc.use_fast_decoder = settings.input_format_orc_use_fast_decoder;
format_settings.orc.filter_push_down = settings.input_format_orc_filter_push_down;
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference;
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatSettings.h
Expand Up @@ -362,6 +362,7 @@ struct FormatSettings
bool output_string_as_string = false;
ORCCompression output_compression_method = ORCCompression::NONE;
bool use_fast_decoder = true;
bool filter_push_down = true;
} orc;

/// For capnProto format we should determine how to
Expand Down
4 changes: 3 additions & 1 deletion src/Interpreters/Set.h
Expand Up @@ -193,7 +193,7 @@ using FunctionPtr = std::shared_ptr<IFunction>;
*/
struct FieldValue
{
FieldValue(MutableColumnPtr && column_) : column(std::move(column_)) {}
explicit FieldValue(MutableColumnPtr && column_) : column(std::move(column_)) {}
void update(const Field & x);

bool isNormal() const { return !value.isPositiveInfinity() && !value.isNegativeInfinity(); }
Expand Down Expand Up @@ -225,6 +225,8 @@ class MergeTreeSetIndex

size_t size() const { return ordered_set.at(0)->size(); }

const Columns & getOrderedSet() const { return ordered_set; }

bool hasMonotonicFunctionsChain() const;

BoolMask checkInRange(const std::vector<Range> & key_ranges, const DataTypes & data_types, bool single_point = false) const;
Expand Down
2 changes: 1 addition & 1 deletion src/Processors/Formats/IInputFormat.cpp
Expand Up @@ -6,7 +6,7 @@ namespace DB
{

IInputFormat::IInputFormat(Block header, ReadBuffer * in_)
: ISource(std::move(header)), in(in_)
: SourceWithKeyCondition(std::move(header)), in(in_)
{
column_mapping = std::make_shared<ColumnMapping>();
}
Expand Down
13 changes: 5 additions & 8 deletions src/Processors/Formats/IInputFormat.h
@@ -1,10 +1,11 @@
#pragma once

#include <Processors/Formats/InputFormatErrorsLogger.h>
#include <Processors/ISource.h>
#include <Formats/ColumnMapping.h>
#include <IO/ReadBuffer.h>
#include <Interpreters/Context.h>
#include <Formats/ColumnMapping.h>
#include <Processors/Formats/InputFormatErrorsLogger.h>
#include <Processors/SourceWithKeyCondition.h>
#include <Storages/MergeTree/KeyCondition.h>


namespace DB
Expand All @@ -16,7 +17,7 @@ using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;

/** Input format is a source that reads data from a ReadBuffer.
*/
class IInputFormat : public ISource
class IInputFormat : public SourceWithKeyCondition
{
protected:

Expand All @@ -26,10 +27,6 @@ class IInputFormat : public ISource
/// ReadBuffer can be nullptr for random-access formats.
IInputFormat(Block header, ReadBuffer * in_);

/// If the format is used by a SELECT query, this method may be called.
/// The format may use it for filter pushdown.
virtual void setQueryInfo(const SelectQueryInfo &, ContextPtr) {}

/** In some use cases (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
* Recreating the parser for each small stream takes too long, so we introduce a method
* resetParser() which allows resetting the state of the parser to continue reading of
Expand Down