Add setting output_format_parquet_compliant_nested_types to produce more compatible Parquet files
al13n321 committed May 19, 2023
1 parent 8745b81 commit 6fd5d8e
Showing 5 changed files with 15 additions and 4 deletions.
1 change: 1 addition & 0 deletions src/Core/Settings.h
@@ -913,6 +913,7 @@ class IColumn;
M(Bool, output_format_parquet_fixed_string_as_fixed_byte_array, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary for FixedString columns.", 0) \
M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \
M(ParquetCompression, output_format_parquet_compression_method, "lz4", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \
M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \
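For context on what the new setting controls underneath: the Arrow/Parquet C++ writer exposes this choice through parquet::ArrowWriterProperties, whose enable_compliant_nested_types()/disable_compliant_nested_types() decide whether a list column's inner field is named 'element' (the name the Parquet spec uses) or Arrow's legacy 'item'. A minimal standalone sketch of that API follows; it is not part of this commit, and the helper name, chunk size, and output path are illustrative only.

#include <memory>
#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/writer.h>
#include <parquet/properties.h>

// Illustrative helper (not from the ClickHouse sources): write an Arrow table,
// choosing how the Parquet schema names list elements.
arrow::Status WriteListTable(const arrow::Table & table, bool compliant_nested_types)
{
    parquet::ArrowWriterProperties::Builder arrow_props;
    if (compliant_nested_types)
        arrow_props.enable_compliant_nested_types();   // inner field named "element"
    else
        arrow_props.disable_compliant_nested_types();  // legacy Arrow name "item"

    ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open("lists.parquet"));
    return parquet::arrow::WriteTable(
        table,
        arrow::default_memory_pool(),
        sink,
        /*chunk_size=*/1024,
        parquet::default_writer_properties(),
        arrow_props.build());
}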
5 changes: 3 additions & 2 deletions src/Core/SettingsChangesHistory.h
@@ -80,8 +80,9 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"23.5", {{"input_format_parquet_preserve_order", true, false, "Allow Parquet reade to reorder rows for better parallelism."},
{"parallelize_output_from_storages", false, true, "Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows."}}},
{"23.5", {{"input_format_parquet_preserve_order", true, false, "Allow Parquet reader to reorder rows for better parallelism."},
{"parallelize_output_from_storages", false, true, "Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows."},
{"output_format_parquet_compliant_nested_types", false, true, "Change an internal field name in output Parquet file schema."}}},
{"23.4", {{"allow_suspicious_indices", true, false, "If true, index can defined with identical expressions"},
{"connect_timeout_with_failover_ms", 50, 1000, "Increase default connect timeout because of async connect"},
{"connect_timeout_with_failover_secure_ms", 100, 1000, "Increase default secure connect timeout because of async connect"},
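The comment above this map explains its purpose: each entry records, per release, a setting's previous default, its new default, and the reason, so that the `compatibility` setting can roll newer defaults back for users pinned to an older version. A rough sketch of that idea is below; the helper and field names (applyCompatibility, change.name, change.previous_value) are illustrative, not the actual ClickHouse implementation.

// Hypothetical sketch: restore the previous default of every setting whose
// change was introduced in a version newer than the requested compatibility
// level. With compatibility = "23.4", the 23.5 entry above would flip
// output_format_parquet_compliant_nested_types back to false.
void applyCompatibility(const ClickHouseVersion & compatibility, Settings & settings)
{
    for (const auto & [version, changes] : settings_changes_history)
        if (compatibility < version)
            for (const auto & change : changes)
                settings.set(change.name, change.previous_value);
}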
1 change: 1 addition & 0 deletions src/Formats/FormatFactory.cpp
@@ -122,6 +122,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.output_fixed_string_as_fixed_byte_array = settings.output_format_parquet_fixed_string_as_fixed_byte_array;
format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size;
format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method;
format_settings.parquet.output_compliant_nested_types = settings.output_format_parquet_compliant_nested_types;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
1 change: 1 addition & 0 deletions src/Formats/FormatSettings.h
@@ -220,6 +220,7 @@ struct FormatSettings
UInt64 max_block_size = 8192;
ParquetVersion output_version;
ParquetCompression output_compression_method = ParquetCompression::SNAPPY;
bool output_compliant_nested_types = true;
} parquet;

struct Pretty
11 changes: 9 additions & 2 deletions src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp
@@ -155,12 +155,19 @@ void ParquetBlockOutputFormat::write(Chunk chunk, size_t row_group_size)
parquet::WriterProperties::Builder builder;
builder.version(getParquetVersion(format_settings));
builder.compression(getParquetCompression(format_settings.parquet.output_compression_method));
auto props = builder.build();

parquet::ArrowWriterProperties::Builder writer_props_builder;
if (format_settings.parquet.output_compliant_nested_types)
writer_props_builder.enable_compliant_nested_types();
else
writer_props_builder.disable_compliant_nested_types();

auto result = parquet::arrow::FileWriter::Open(
*arrow_table->schema(),
arrow::default_memory_pool(),
sink,
props);
builder.build(),
writer_props_builder.build());
if (!result.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while opening a table: {}", result.status().ToString());
file_writer = std::move(result.ValueOrDie());
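To check the effect on a file that ClickHouse produced, one option (a hedged sketch, not part of this commit; the helper name is illustrative) is to dump the leaf column paths with Arrow's low-level Parquet reader: an Array column appears as e.g. 'arr.list.element' with the compliant naming and as 'arr.list.item' with the legacy one.

#include <iostream>
#include <memory>
#include <string>
#include <parquet/file_reader.h>
#include <parquet/schema.h>

// Illustrative helper: print each leaf column's dotted path from the file schema.
void printLeafColumnPaths(const std::string & path)
{
    std::unique_ptr<parquet::ParquetFileReader> reader = parquet::ParquetFileReader::OpenFile(path);
    const parquet::SchemaDescriptor * schema = reader->metadata()->schema();
    for (int i = 0; i < schema->num_columns(); ++i)
        std::cout << schema->Column(i)->path()->ToDotString() << '\n';
}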
