Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry pick #54513 to 23.7: Fix possible parsing error in WithNames formats with disabled input_format_with_names_use_header #54597

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Analyzer/Passes/QueryAnalysisPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6321,9 +6321,9 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
{
/// For input function we should check if input format supports reading subset of columns.
if (table_function_ptr->getName() == "input")
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat());
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat(), scope.context);
else
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(scope.context);
}

if (use_columns_from_insert_query)
Expand Down
17 changes: 13 additions & 4 deletions src/Formats/FormatFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,10 +678,18 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na

void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name)
{
auto & target = dict[name].supports_subset_of_columns;
auto & target = dict[name].subset_of_columns_support_checker;
if (target)
throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
target = true;
target = [](const FormatSettings &){ return true; };
}

void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker)
{
auto & target = dict[name].subset_of_columns_support_checker;
if (target)
throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
target = std::move(subset_of_columns_support_checker);
}

void FormatFactory::markFormatSupportsSubcolumns(const String & name)
Expand All @@ -706,10 +714,11 @@ bool FormatFactory::checkIfFormatSupportsSubcolumns(const String & name) const
return target.supports_subcolumns;
}

bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const String & name) const
bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const DB::String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_) const
{
const auto & target = getCreators(name);
return target.supports_subset_of_columns;
auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
return target.subset_of_columns_support_checker && target.subset_of_columns_support_checker(format_settings);
}

void FormatFactory::registerAdditionalInfoForSchemaCacheGetter(
Expand Down
14 changes: 10 additions & 4 deletions src/Formats/FormatFactory.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ class FormatFactory final : private boost::noncopyable
/// and the name of the message.
using AdditionalInfoForSchemaCacheGetter = std::function<String(const FormatSettings & settings)>;

/// Some formats can support reading subset of columns depending on settings.
/// The checker should return true if format support append.
using SubsetOfColumnsSupportChecker = std::function<bool(const FormatSettings & settings)>;

struct Creators
{
InputCreator input_creator;
Expand All @@ -134,13 +138,13 @@ class FormatFactory final : private boost::noncopyable
FileSegmentationEngine file_segmentation_engine;
SchemaReaderCreator schema_reader_creator;
ExternalSchemaReaderCreator external_schema_reader_creator;
bool supports_parallel_formatting{false};
bool supports_subcolumns{false};
bool supports_subset_of_columns{false};
bool supports_parallel_formatting{false};
bool prefers_large_blocks{false};
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
AppendSupportChecker append_support_checker;
AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter;
SubsetOfColumnsSupportChecker subset_of_columns_support_checker;
};

using FormatsDictionary = std::unordered_map<String, Creators>;
Expand Down Expand Up @@ -229,10 +233,12 @@ class FormatFactory final : private boost::noncopyable
void markOutputFormatSupportsParallelFormatting(const String & name);
void markOutputFormatPrefersLargeBlocks(const String & name);
void markFormatSupportsSubcolumns(const String & name);
void markFormatSupportsSubsetOfColumns(const String & name);

bool checkIfFormatSupportsSubcolumns(const String & name) const;
bool checkIfFormatSupportsSubsetOfColumns(const String & name) const;

void markFormatSupportsSubsetOfColumns(const String & name);
void registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker);
bool checkIfFormatSupportsSubsetOfColumns(const String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_ = std::nullopt) const;

bool checkIfFormatHasSchemaReader(const String & name) const;
bool checkIfFormatHasExternalSchemaReader(const String & name) const;
Expand Down
5 changes: 3 additions & 2 deletions src/Formats/registerWithNamesAndTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWit

void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory)
{
factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNames");
factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNamesAndTypes");
auto setting_checker = [](const FormatSettings & settings){ return settings.with_names_use_header; };
factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNames", setting_checker);
factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNamesAndTypes", setting_checker);
}

}
4 changes: 2 additions & 2 deletions src/Interpreters/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1703,9 +1703,9 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const
{
/// For input function we should check if input format supports reading subset of columns.
if (table_function_ptr->getName() == "input")
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat());
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat(), shared_from_this());
else
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(shared_from_this());
}

if (use_columns_from_insert_query)
Expand Down
6 changes: 3 additions & 3 deletions src/Storages/HDFS/StorageHDFS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -719,9 +719,9 @@ class PartitionedHDFSSink : public PartitionedSink
};


bool StorageHDFS::supportsSubsetOfColumns() const
bool StorageHDFS::supportsSubsetOfColumns(const ContextPtr & context_) const
{
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context_);
}

Pipe StorageHDFS::read(
Expand Down Expand Up @@ -770,7 +770,7 @@ Pipe StorageHDFS::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(context_))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/HDFS/StorageHDFS.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class StorageHDFS final : public IStorage, WithContext
/// Is is useful because column oriented formats could effectively skip unknown columns
/// So we can create a header of only required columns in read method and ask
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context_) const;

static ColumnsDescription getTableStructureFromData(
const String & format,
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/Hive/StorageHive.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class StorageHive final : public IStorage, WithContext

NamesAndTypesList getVirtuals() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns() const;

std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr context_) const override;
Expand Down
2 changes: 0 additions & 2 deletions src/Storages/IStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -619,8 +619,6 @@ class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromo
/// NOTE: write-once also does not support INSERTs/merges/... for MergeTree
virtual bool isStaticStorage() const;

virtual bool supportsSubsetOfColumns() const { return false; }

/// If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it.
/// Used for:
/// - Simple count() optimization
Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageAzureBlob.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,7 @@ Pipe StorageAzureBlob::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down Expand Up @@ -767,9 +767,9 @@ bool StorageAzureBlob::supportsSubcolumns() const
return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format);
}

bool StorageAzureBlob::supportsSubsetOfColumns() const
bool StorageAzureBlob::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings);
}

bool StorageAzureBlob::prefersLargeBlocks() const
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageAzureBlob.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ class StorageAzureBlob : public IStorage

bool supportsSubcolumns() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,9 +526,9 @@ ColumnsDescription StorageFile::getTableStructureFromFile(
return columns;
}

bool StorageFile::supportsSubsetOfColumns() const
bool StorageFile::supportsSubsetOfColumns(const ContextPtr & context) const
{
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings);
}

bool StorageFile::prefersLargeBlocks() const
Expand Down Expand Up @@ -949,7 +949,7 @@ Pipe StorageFile::read(
{
ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class StorageFile final : public IStorage
/// Is is useful because such formats could effectively skip unknown columns
/// So we can create a header of only required columns in read method and ask
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageS3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,9 +1002,9 @@ bool StorageS3::supportsSubcolumns() const
return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format);
}

bool StorageS3::supportsSubsetOfColumns() const
bool StorageS3::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings);
}

bool StorageS3::prefersLargeBlocks() const
Expand Down Expand Up @@ -1047,7 +1047,7 @@ Pipe StorageS3::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageS3.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ class StorageS3 : public IStorage

bool supportsSubcolumns() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
8 changes: 4 additions & 4 deletions src/Storages/StorageURL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,9 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData(
return columns;
}

bool IStorageURLBase::supportsSubsetOfColumns() const
bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings);
}

bool IStorageURLBase::prefersLargeBlocks() const
Expand All @@ -697,7 +697,7 @@ Pipe IStorageURLBase::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
Expand Down Expand Up @@ -800,7 +800,7 @@ Pipe StorageURLWithFailover::read(
{
ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageURL.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class IStorageURLBase : public IStorage
QueryProcessingStage::Enum & processed_stage,
size_t max_block_size) const;

bool supportsSubsetOfColumns() const override;
virtual bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageXDBC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMet
compression_method);
}

bool StorageXDBC::supportsSubsetOfColumns() const
bool StorageXDBC::supportsSubsetOfColumns(const ContextPtr &) const
{
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageXDBC.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class StorageXDBC : public IStorageURLBase

Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr &) const override;
};

}
2 changes: 1 addition & 1 deletion src/TableFunctions/ITableFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class ITableFunction : public std::enable_shared_from_this<ITableFunction>
/// because we cannot determine which column from table correspond to this virtual column.
virtual std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const { return {}; }

virtual bool supportsReadingSubsetOfColumns() { return true; }
virtual bool supportsReadingSubsetOfColumns(const ContextPtr &) { return true; }

/// Create storage according to the query.
StoragePtr
Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/ITableFunctionFileLike.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ String ITableFunctionFileLike::getFormatFromFirstArgument()
return FormatFactory::instance().getFormatFromFileName(filename, true);
}

bool ITableFunctionFileLike::supportsReadingSubsetOfColumns()
bool ITableFunctionFileLike::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context);
}

void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context)
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/ITableFunctionFileLike.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ITableFunctionFileLike : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

static size_t getMaxNumberOfArguments() { return 4; }

Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/TableFunctionAzureBlobStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,9 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex
return parseColumnsListFromString(configuration.structure, context);
}

bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns()
bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context);
}

StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/TableFunctionAzureBlobStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class TableFunctionAzureBlobStorage : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const override
{
Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/TableFunctionS3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,9 +325,9 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context)
return parseColumnsListFromString(configuration.structure, context);
}

bool TableFunctionS3::supportsReadingSubsetOfColumns()
bool TableFunctionS3::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context);
}

StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/TableFunctionS3.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class TableFunctionS3 : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const override
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

echo -e "a,b,c\n1,2,3" > $CLICKHOUSE_TEST_UNIQUE_NAME.csvwithnames

$CLICKHOUSE_LOCAL -q "select b from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csvwithnames') settings input_format_with_names_use_header=0"

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
1,2,3