Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
110 commits
Select commit Hold shift + click to select a range
88e530a
wip, fix hashing
arthurpassos Apr 21, 2024
bbc4677
shitty serialize impl, missing arrays
arthurpassos Apr 24, 2024
6719364
wip
arthurpassos Apr 25, 2024
7e05934
undo something
arthurpassos Apr 25, 2024
2b6adf5
delete unnecessary stuff
arthurpassos Apr 25, 2024
4bbf242
fix tests
arthurpassos Apr 25, 2024
2c0badb
remove unused variable
arthurpassos Apr 25, 2024
0bbfa91
progress
arthurpassos Apr 25, 2024
547ae00
style
arthurpassos Apr 25, 2024
d9f3ccd
try to fix defines
arthurpassos Apr 25, 2024
1a33e5a
add if use parquet
arthurpassos Apr 25, 2024
ff37c19
more complex test
arthurpassos Apr 25, 2024
ad78591
update docs
arthurpassos Apr 25, 2024
7b7c4d6
add new setting to settingshistory
arthurpassos Apr 26, 2024
afb35cf
fix
arthurpassos Apr 26, 2024
9bb0ddc
add suport for in check
arthurpassos Apr 26, 2024
f3e02ef
add some tests for in cehck
arthurpassos Apr 26, 2024
1db4eda
try chassert
arthurpassos Apr 26, 2024
ae07174
fix dumb shit
arthurpassos Apr 26, 2024
af546f3
make test file reproducible, add bloom filter to the array column and…
arthurpassos Apr 27, 2024
0163ed5
fix existing tests
arthurpassos Apr 29, 2024
f0586e3
add hasall support
arthurpassos Apr 29, 2024
d56c666
update test files so array column has higher cardinality
arthurpassos Apr 30, 2024
e2da88c
simplify tests and add tests for has array operations
arthurpassos Apr 30, 2024
c6a5205
simplify code by reducing the scope for now. Also add test for equali…
arthurpassos May 1, 2024
f998987
add test for nested function
arthurpassos May 2, 2024
0a6ac88
optimize things a bit
arthurpassos May 2, 2024
d59f8b2
hash only once approach
arthurpassos May 10, 2024
8021318
minor update
arthurpassos May 22, 2024
eda321b
extract rpn build process into separate class
arthurpassos May 23, 2024
74a6b62
do not get parquet reader from arrow
arthurpassos May 27, 2024
87e902f
early return for no row groups
arthurpassos May 28, 2024
4dfa16d
remove a.ref
arthurpassos May 28, 2024
4a4e3b4
specify a few types instead of auto
arthurpassos Jun 10, 2024
f6911dd
add missing file
arthurpassos Jun 10, 2024
9bde5f4
settings history..
arthurpassos Jul 5, 2024
133249b
try to use keycondition instead
arthurpassos Jul 18, 2024
368d556
rmv debug var
arthurpassos Jul 18, 2024
7dc41da
extern bad arguments
arthurpassos Jul 19, 2024
8e8cc43
style check
arthurpassos Jul 23, 2024
6813b7c
minor typo
arthurpassos Jul 23, 2024
7c8f7e0
minor fix
arthurpassos Jul 23, 2024
52a77d1
drop support for array types
arthurpassos Jul 23, 2024
b11b87b
move parquet bf stuff out of keycondition
arthurpassos Jul 23, 2024
df0f81a
error codes
arthurpassos Jul 23, 2024
3847ff3
few fixes
arthurpassos Jul 24, 2024
a7f5cdf
remove unnecessary code
arthurpassos Jul 24, 2024
0d93b15
delete unused files
arthurpassos Jul 24, 2024
17d3ac2
progress on re-using keycondition and implementing in_set
arthurpassos Aug 2, 2024
1c67418
minor stuff
arthurpassos Aug 3, 2024
b8fdb1b
evaluate rpn only once
arthurpassos Aug 3, 2024
a487337
fix issue when in_set columns do not have bf
arthurpassos Aug 12, 2024
ea6118b
only get bloom filters for filtering columns
arthurpassos Aug 17, 2024
73e985e
get bf reader in non optimal way
arthurpassos Aug 17, 2024
a976f26
only calculate filtering columns if key_condition and bloom fliter is…
arthurpassos Aug 19, 2024
baf4386
get bf reader in optimal way
arthurpassos Aug 19, 2024
9d82991
Delete
arthurpassos Aug 19, 2024
7d4a169
some improvements, altho code looks odd
arthurpassos Aug 21, 2024
ac13d5d
Update SettingsChangesHistory.cpp
arthurpassos Aug 22, 2024
d6a64c8
function_unknown instead of function_true in some cases
arthurpassos Aug 22, 2024
7097e0c
put reader properties in a variable
arthurpassos Aug 22, 2024
450d34c
process in_set even if one of the columns do not have bf
arthurpassos Aug 22, 2024
bc76d06
simplify index mapping a bit
arthurpassos Aug 23, 2024
68315c7
vector instead of map.. need to remember order of insertion is important
arthurpassos Aug 28, 2024
ad21e64
safe guard around bf across row groups
arthurpassos Aug 28, 2024
3d5eeac
progress
arthurpassos Aug 29, 2024
bf09d39
simplify getfilteringcolumns
arthurpassos Aug 29, 2024
559057a
remove unused type alias
arthurpassos Aug 30, 2024
c411cbd
support only basic types with no special encodingwq
arthurpassos Sep 13, 2024
26d3d2c
small fixes, I need to improve testing
arthurpassos Sep 15, 2024
fb89ce6
tmp
arthurpassos Sep 17, 2024
a6b909b
handle nullable on tuple
arthurpassos Sep 18, 2024
0a9d143
progress
arthurpassos Sep 19, 2024
2488537
fix conflicts
arthurpassos Sep 20, 2024
2e2ccfa
fix steyle
arthurpassos Sep 20, 2024
385b9cf
tryconvertfieldtotype
arthurpassos Sep 20, 2024
9509c19
update tests to new dataset
arthurpassos Sep 23, 2024
fa0d11e
grab Field from ordered set columns and rely on convertfieldtotype
arthurpassos Sep 25, 2024
c86ad1d
tmp
arthurpassos Sep 25, 2024
27686aa
update tests
arthurpassos Sep 25, 2024
b91bf3b
add proper uint64 tests
arthurpassos Sep 26, 2024
96a8ed8
add uint8 and uuid tests
arthurpassos Sep 26, 2024
bc5c397
simplify logic to use vector of uint64 instead of columnptr. Fixes ms…
arthurpassos Sep 27, 2024
68132a8
undo hashstring modification
arthurpassos Sep 27, 2024
8442c0b
address some coments
arthurpassos Oct 1, 2024
a9ef415
simplify maybeTrueOnBloomFilter
arthurpassos Oct 1, 2024
bef2412
progress
arthurpassos Oct 4, 2024
6b7ee88
safe guards around index mapping
arthurpassos Oct 4, 2024
080fd53
some structural changes
arthurpassos Oct 4, 2024
0af5d54
progress
arthurpassos Oct 7, 2024
5a7c27b
move setting to 24.10
arthurpassos Oct 8, 2024
65a3a17
add test for ipv6
arthurpassos Oct 8, 2024
2ec54ed
rmv ssh key
arthurpassos Oct 8, 2024
45fb149
add new test file
arthurpassos Oct 8, 2024
a847b9b
minor improvements
arthurpassos Oct 9, 2024
0111428
few comments adressed
arthurpassos Oct 11, 2024
bc81f7d
assert parquet column size is 16 bytes when hashing ipv6
arthurpassos Oct 11, 2024
6bec838
simplify parquet bf rpn by removing function_equals
arthurpassos Oct 11, 2024
0f2eca4
fix in with invalid conversion
arthurpassos Oct 15, 2024
3446260
docs change
arthurpassos Oct 15, 2024
009ae77
f_unknown if monotonic_functions not empty
arthurpassos Oct 15, 2024
e76924c
updt
arthurpassos Dec 12, 2024
c153fdc
updt
arthurpassos Dec 12, 2024
65af160
updt
arthurpassos Dec 12, 2024
7240a19
updt
arthurpassos Dec 12, 2024
dfb68e2
updt
arthurpassos Dec 12, 2024
3d1b1ba
updt
arthurpassos Dec 12, 2024
8c7b87a
updt
arthurpassos Dec 12, 2024
0a66ca0
Update FormatFactory.cpp
arthurpassos Dec 12, 2024
d30eaeb
Update ParquetBlockInputFormat.cpp
arthurpassos Dec 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Core/Settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,7 @@ class IColumn;
M(Bool, input_format_parquet_preserve_order, false, "Avoid reordering rows when reading from Parquet files. Usually makes it much slower.", 0) \
M(Bool, input_format_parquet_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.", 0) \
M(Bool, input_format_parquet_use_native_reader, false, "When reading Parquet files, to use native reader instead of arrow reader.", 0) \
M(Bool, input_format_parquet_bloom_filter_push_down, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata.", 0) \
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, true, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_orc_use_fast_decoder, true, "Use a faster ORC decoder implementation.", 0) \
Expand Down
1 change: 1 addition & 0 deletions src/Core/SettingsChangesHistory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
},
{"24.8",
{
{"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
{"enable_named_columns_in_function_tuple", false, false, "Retroactively disabled by default due to critical bugs."},
{"rows_before_aggregation", false, false, "Provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation"},
{"restore_replace_external_table_functions_to_null", false, false, "New setting."},
Expand Down
4 changes: 4 additions & 0 deletions src/DataTypes/IDataType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,13 +322,17 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \
bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \
bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \
bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \
bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \
bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \
bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \
bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \
\
bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \
bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \
bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \
bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \
bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \
bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \
bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \
bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \
\
Expand Down
6 changes: 5 additions & 1 deletion src/DataTypes/IDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,14 +459,18 @@ struct WhichDataType
bool isUInt8(TYPE data_type); \
bool isUInt16(TYPE data_type); \
bool isUInt32(TYPE data_type); \
bool isUInt64(TYPE data_type); \
bool isUInt64(TYPE data_type);\
bool isUInt128(TYPE data_type);\
bool isUInt256(TYPE data_type); \
bool isNativeUInt(TYPE data_type); \
bool isUInt(TYPE data_type); \
\
bool isInt8(TYPE data_type); \
bool isInt16(TYPE data_type); \
bool isInt32(TYPE data_type); \
bool isInt64(TYPE data_type); \
bool isInt128(TYPE data_type); \
bool isInt256(TYPE data_type); \
bool isNativeInt(TYPE data_type); \
bool isInt(TYPE data_type); \
\
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.parquet.case_insensitive_column_matching = settings.input_format_parquet_case_insensitive_column_matching;
format_settings.parquet.preserve_order = settings.input_format_parquet_preserve_order;
format_settings.parquet.filter_push_down = settings.input_format_parquet_filter_push_down;
format_settings.parquet.bloom_filter_push_down = settings.input_format_parquet_bloom_filter_push_down;
format_settings.parquet.use_native_reader = settings.input_format_parquet_use_native_reader;
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings.input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference;
Expand Down
1 change: 1 addition & 0 deletions src/Formats/FormatSettings.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ struct FormatSettings
bool case_insensitive_column_matching = false;
bool filter_push_down = true;
bool use_native_reader = false;
bool bloom_filter_push_down = true;
std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
Expand Down
2 changes: 2 additions & 0 deletions src/Interpreters/Set.h
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ class MergeTreeSetIndex

/// Read-only access to the `ordered_set` member; no copy is made.
const Columns & getOrderedSet() const
{
    return ordered_set;
}

/// Read-only access to the `indexes_mapping` member; no copy is made.
const std::vector<KeyTuplePositionMapping> & getIndexesMapping() const
{
    return indexes_mapping;
}

private:
// If all arguments in tuple are key columns, we can optimize NOT IN when there is only one element.
bool has_all_keys;
Expand Down
36 changes: 27 additions & 9 deletions src/Processors/Formats/Impl/ArrowFieldIndexUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <arrow/type_fwd.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <Common/Exception.h>
#include <parquet/metadata.h>


namespace arrow
Expand Down Expand Up @@ -65,11 +66,22 @@ class ArrowFieldIndexUtil
return result;
}

/// Associates one column of the ClickHouse header with the Parquet column indexes collected for it.
/// For a parquet schema {x: {i: int, j: int}}, this should be populated as follows:
/// clickhouse_index = 0, parquet_indexes = {0, 1}
struct ClickHouseIndexToParquetIndex
{
    /// Position of the column in the ClickHouse header.
    /// Zero-initialized so a default-constructed mapping never holds an indeterminate value.
    std::size_t clickhouse_index = 0;
    /// Parquet column indexes gathered for that header column.
    std::vector<int> parquet_indexes;
};

/// Only collect the indices of the required fields. E.g. when only one field of a struct is read,
/// there is no need to collect the indices of every field in that struct.
std::vector<int> findRequiredIndices(const Block & header, const arrow::Schema & schema)
std::vector<ClickHouseIndexToParquetIndex> findRequiredIndices(
const Block & header,
const arrow::Schema & schema,
const parquet::FileMetaData & file)
{
std::vector<int> required_indices;
std::vector<ClickHouseIndexToParquetIndex> required_indices;
std::unordered_set<int> added_indices;
/// Flat all named fields' index information into a map.
auto fields_indices = calculateFieldIndices(schema);
Expand All @@ -79,7 +91,7 @@ class ArrowFieldIndexUtil
std::string col_name = named_col.name;
if (ignore_case)
boost::to_lower(col_name);
findRequiredIndices(col_name, named_col.type, fields_indices, added_indices, required_indices);
findRequiredIndices(col_name, i, named_col.type, fields_indices, added_indices, required_indices, file);
}
return required_indices;
}
Expand Down Expand Up @@ -169,10 +181,12 @@ class ArrowFieldIndexUtil

void findRequiredIndices(
const String & name,
std::size_t header_index,
DataTypePtr data_type,
const std::unordered_map<std::string, std::pair<int, int>> & field_indices,
std::unordered_set<int> & added_indices,
std::vector<int> & required_indices)
std::vector<ClickHouseIndexToParquetIndex> & required_indices,
const parquet::FileMetaData & file)
{
auto nested_type = removeNullable(data_type);
if (const DB::DataTypeTuple * type_tuple = typeid_cast<const DB::DataTypeTuple *>(nested_type.get()))
Expand All @@ -187,20 +201,20 @@ class ArrowFieldIndexUtil
if (ignore_case)
boost::to_lower(field_name);
const auto & field_type = field_types[i];
findRequiredIndices(Nested::concatenateName(name, field_name), field_type, field_indices, added_indices, required_indices);
findRequiredIndices(Nested::concatenateName(name, field_name), header_index, field_type, field_indices, added_indices, required_indices, file);
}
return;
}
}
else if (const auto * type_array = typeid_cast<const DB::DataTypeArray *>(nested_type.get()))
{
findRequiredIndices(name, type_array->getNestedType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_array->getNestedType(), field_indices, added_indices, required_indices, file);
return;
}
else if (const auto * type_map = typeid_cast<const DB::DataTypeMap *>(nested_type.get()))
{
findRequiredIndices(name, type_map->getKeyType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, type_map->getValueType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_map->getKeyType(), field_indices, added_indices, required_indices, file);
findRequiredIndices(name, header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file);
return;
}
auto it = field_indices.find(name);
Expand All @@ -211,14 +225,18 @@ class ArrowFieldIndexUtil
}
else
{
ClickHouseIndexToParquetIndex index_mapping;
index_mapping.clickhouse_index = header_index;
for (int j = 0; j < it->second.second; ++j)
{
auto index = it->second.first + j;
if (added_indices.insert(index).second)
{
required_indices.emplace_back(index);
index_mapping.parquet_indexes.emplace_back(index);
}
}

required_indices.emplace_back(index_mapping);
}
}
};
Expand Down
Loading
Loading