Skip to content

Commit

Permalink
Merge pull request #54634 from ClickHouse/backport/23.6/54513
Browse files Browse the repository at this point in the history
Backport #54513 to 23.6: Fix possible parsing error in WithNames formats with disabled input_format_with_names_use_header
  • Loading branch information
Avogar committed Sep 15, 2023
2 parents d5ba2a4 + fe2ddd2 commit fe9cb2b
Show file tree
Hide file tree
Showing 29 changed files with 76 additions and 49 deletions.
4 changes: 2 additions & 2 deletions src/Analyzer/Passes/QueryAnalysisPass.cpp
Expand Up @@ -6317,9 +6317,9 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
{
/// For input function we should check if input format supports reading subset of columns.
if (table_function_ptr->getName() == "input")
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat());
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(scope.context->getInsertFormat(), scope.context);
else
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(scope.context);
}

if (use_columns_from_insert_query)
Expand Down
17 changes: 13 additions & 4 deletions src/Formats/FormatFactory.cpp
Expand Up @@ -670,10 +670,18 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na

void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name)
{
auto & target = dict[name].supports_subset_of_columns;
auto & target = dict[name].subset_of_columns_support_checker;
if (target)
throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
target = true;
target = [](const FormatSettings &){ return true; };
}

void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker)
{
auto & target = dict[name].subset_of_columns_support_checker;
if (target)
throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name);
target = std::move(subset_of_columns_support_checker);
}

void FormatFactory::markFormatSupportsSubcolumns(const String & name)
Expand All @@ -698,10 +706,11 @@ bool FormatFactory::checkIfFormatSupportsSubcolumns(const String & name) const
return target.supports_subcolumns;
}

bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const String & name) const
bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const DB::String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_) const
{
const auto & target = getCreators(name);
return target.supports_subset_of_columns;
auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
return target.subset_of_columns_support_checker && target.subset_of_columns_support_checker(format_settings);
}

void FormatFactory::registerAdditionalInfoForSchemaCacheGetter(
Expand Down
12 changes: 9 additions & 3 deletions src/Formats/FormatFactory.h
Expand Up @@ -126,6 +126,10 @@ class FormatFactory final : private boost::noncopyable
/// and the name of the message.
using AdditionalInfoForSchemaCacheGetter = std::function<String(const FormatSettings & settings)>;

/// Some formats can support reading subset of columns depending on settings.
/// The checker should return true if format support append.
using SubsetOfColumnsSupportChecker = std::function<bool(const FormatSettings & settings)>;

struct Creators
{
InputCreator input_creator;
Expand All @@ -136,11 +140,11 @@ class FormatFactory final : private boost::noncopyable
ExternalSchemaReaderCreator external_schema_reader_creator;
bool supports_parallel_formatting{false};
bool supports_subcolumns{false};
bool supports_subset_of_columns{false};
bool prefers_large_blocks{false};
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
AppendSupportChecker append_support_checker;
AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter;
SubsetOfColumnsSupportChecker subset_of_columns_support_checker;
};

using FormatsDictionary = std::unordered_map<String, Creators>;
Expand Down Expand Up @@ -229,10 +233,12 @@ class FormatFactory final : private boost::noncopyable
void markOutputFormatSupportsParallelFormatting(const String & name);
void markOutputFormatPrefersLargeBlocks(const String & name);
void markFormatSupportsSubcolumns(const String & name);
void markFormatSupportsSubsetOfColumns(const String & name);

bool checkIfFormatSupportsSubcolumns(const String & name) const;
bool checkIfFormatSupportsSubsetOfColumns(const String & name) const;

void markFormatSupportsSubsetOfColumns(const String & name);
void registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker);
bool checkIfFormatSupportsSubsetOfColumns(const String & name, const ContextPtr & context, const std::optional<FormatSettings> & format_settings_ = std::nullopt) const;

bool checkIfFormatHasSchemaReader(const String & name) const;
bool checkIfFormatHasExternalSchemaReader(const String & name) const;
Expand Down
5 changes: 3 additions & 2 deletions src/Formats/registerWithNamesAndTypes.cpp
Expand Up @@ -12,8 +12,9 @@ void registerWithNamesAndTypes(const std::string & base_format_name, RegisterWit

void markFormatWithNamesAndTypesSupportsSamplingColumns(const std::string & base_format_name, FormatFactory & factory)
{
factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNames");
factory.markFormatSupportsSubsetOfColumns(base_format_name + "WithNamesAndTypes");
auto setting_checker = [](const FormatSettings & settings){ return settings.with_names_use_header; };
factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNames", setting_checker);
factory.registerSubsetOfColumnsSupportChecker(base_format_name + "WithNamesAndTypes", setting_checker);
}

}
4 changes: 2 additions & 2 deletions src/Interpreters/Context.cpp
Expand Up @@ -1612,9 +1612,9 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const
{
/// For input function we should check if input format supports reading subset of columns.
if (table_function_ptr->getName() == "input")
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat());
use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat(), shared_from_this());
else
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(shared_from_this());
}

if (use_columns_from_insert_query)
Expand Down
6 changes: 3 additions & 3 deletions src/Storages/HDFS/StorageHDFS.cpp
Expand Up @@ -600,9 +600,9 @@ class PartitionedHDFSSink : public PartitionedSink
};


bool StorageHDFS::supportsSubsetOfColumns() const
bool StorageHDFS::supportsSubsetOfColumns(const ContextPtr & context_) const
{
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context_);
}

Pipe StorageHDFS::read(
Expand Down Expand Up @@ -651,7 +651,7 @@ Pipe StorageHDFS::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(context_))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/HDFS/StorageHDFS.h
Expand Up @@ -72,7 +72,7 @@ class StorageHDFS final : public IStorage, WithContext
/// Is is useful because column oriented formats could effectively skip unknown columns
/// So we can create a header of only required columns in read method and ask
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context_) const;

static ColumnsDescription getTableStructureFromData(
const String & format,
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/Hive/StorageHive.h
Expand Up @@ -65,7 +65,7 @@ class StorageHive final : public IStorage, WithContext

NamesAndTypesList getVirtuals() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns() const;

std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr context_) const override;
Expand Down
2 changes: 0 additions & 2 deletions src/Storages/IStorage.h
Expand Up @@ -615,8 +615,6 @@ class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromo
/// NOTE: write-once also does not support INSERTs/merges/... for MergeTree
virtual bool isStaticStorage() const;

virtual bool supportsSubsetOfColumns() const { return false; }

/// If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it.
/// Used for:
/// - Simple count() optimization
Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageAzureBlob.cpp
Expand Up @@ -642,7 +642,7 @@ Pipe StorageAzureBlob::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down Expand Up @@ -768,9 +768,9 @@ bool StorageAzureBlob::supportsSubcolumns() const
return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format);
}

bool StorageAzureBlob::supportsSubsetOfColumns() const
bool StorageAzureBlob::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings);
}

bool StorageAzureBlob::prefersLargeBlocks() const
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageAzureBlob.h
Expand Up @@ -95,7 +95,7 @@ class StorageAzureBlob : public IStorage

bool supportsSubcolumns() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageFile.cpp
Expand Up @@ -442,9 +442,9 @@ ColumnsDescription StorageFile::getTableStructureFromFile(
return columns;
}

bool StorageFile::supportsSubsetOfColumns() const
bool StorageFile::supportsSubsetOfColumns(const ContextPtr & context) const
{
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return format_name != "Distributed" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings);
}

bool StorageFile::prefersLargeBlocks() const
Expand Down Expand Up @@ -881,7 +881,7 @@ Pipe StorageFile::read(
{
ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageFile.h
Expand Up @@ -73,7 +73,7 @@ class StorageFile final : public IStorage
/// Is is useful because such formats could effectively skip unknown columns
/// So we can create a header of only required columns in read method and ask
/// format to read only them. Note: this hack cannot be done with ordinary formats like TSV.
bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
6 changes: 3 additions & 3 deletions src/Storages/StorageS3.cpp
Expand Up @@ -1031,9 +1031,9 @@ bool StorageS3::supportsSubcolumns() const
return FormatFactory::instance().checkIfFormatSupportsSubcolumns(configuration.format);
}

bool StorageS3::supportsSubsetOfColumns() const
bool StorageS3::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context, format_settings);
}

bool StorageS3::prefersLargeBlocks() const
Expand Down Expand Up @@ -1076,7 +1076,7 @@ Pipe StorageS3::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
auto fetch_columns = column_names;
const auto & virtuals = getVirtuals();
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageS3.h
Expand Up @@ -362,7 +362,7 @@ class StorageS3 : public IStorage

bool supportsSubcolumns() const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
8 changes: 4 additions & 4 deletions src/Storages/StorageURL.cpp
Expand Up @@ -687,9 +687,9 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData(
return columns;
}

bool IStorageURLBase::supportsSubsetOfColumns() const
bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings);
}

bool IStorageURLBase::prefersLargeBlocks() const
Expand All @@ -715,7 +715,7 @@ Pipe IStorageURLBase::read(

ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
Expand Down Expand Up @@ -818,7 +818,7 @@ Pipe StorageURLWithFailover::read(
{
ColumnsDescription columns_description;
Block block_for_format;
if (supportsSubsetOfColumns())
if (supportsSubsetOfColumns(local_context))
{
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageURL.h
Expand Up @@ -105,7 +105,7 @@ class IStorageURLBase : public IStorage
QueryProcessingStage::Enum & processed_stage,
size_t max_block_size) const;

bool supportsSubsetOfColumns() const override;
virtual bool supportsSubsetOfColumns(const ContextPtr & context) const;

bool prefersLargeBlocks() const override;

Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageXDBC.cpp
Expand Up @@ -145,7 +145,7 @@ SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMet
compression_method);
}

bool StorageXDBC::supportsSubsetOfColumns() const
bool StorageXDBC::supportsSubsetOfColumns(const ContextPtr &) const
{
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageXDBC.h
Expand Up @@ -68,7 +68,7 @@ class StorageXDBC : public IStorageURLBase

Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override;

bool supportsSubsetOfColumns() const override;
bool supportsSubsetOfColumns(const ContextPtr &) const override;
};

}
2 changes: 1 addition & 1 deletion src/TableFunctions/ITableFunction.h
Expand Up @@ -76,7 +76,7 @@ class ITableFunction : public std::enable_shared_from_this<ITableFunction>
/// because we cannot determine which column from table correspond to this virtual column.
virtual std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const { return {}; }

virtual bool supportsReadingSubsetOfColumns() { return true; }
virtual bool supportsReadingSubsetOfColumns(const ContextPtr &) { return true; }

/// Create storage according to the query.
StoragePtr
Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/ITableFunctionFileLike.cpp
Expand Up @@ -32,9 +32,9 @@ String ITableFunctionFileLike::getFormatFromFirstArgument()
return FormatFactory::instance().getFormatFromFileName(filename, true);
}

bool ITableFunctionFileLike::supportsReadingSubsetOfColumns()
bool ITableFunctionFileLike::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context);
}

void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context)
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/ITableFunctionFileLike.h
Expand Up @@ -27,7 +27,7 @@ class ITableFunctionFileLike : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

static size_t getMaxNumberOfArguments() { return 4; }

Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/TableFunctionAzureBlobStorage.cpp
Expand Up @@ -208,9 +208,9 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex
return parseColumnsListFromString(configuration.structure, context);
}

bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns()
bool TableFunctionAzureBlobStorage::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context);
}

StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/TableFunctionAzureBlobStorage.h
Expand Up @@ -39,7 +39,7 @@ class TableFunctionAzureBlobStorage : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const override
{
Expand Down
4 changes: 2 additions & 2 deletions src/TableFunctions/TableFunctionS3.cpp
Expand Up @@ -304,9 +304,9 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context)
return parseColumnsListFromString(configuration.structure, context);
}

bool TableFunctionS3::supportsReadingSubsetOfColumns()
bool TableFunctionS3::supportsReadingSubsetOfColumns(const ContextPtr & context)
{
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format, context);
}

StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
Expand Down
2 changes: 1 addition & 1 deletion src/TableFunctions/TableFunctionS3.h
Expand Up @@ -47,7 +47,7 @@ class TableFunctionS3 : public ITableFunction

void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

bool supportsReadingSubsetOfColumns() override;
bool supportsReadingSubsetOfColumns(const ContextPtr & context) override;

std::unordered_set<String> getVirtualsToCheckBeforeUsingStructureHint() const override
{
Expand Down
@@ -0,0 +1 @@
2
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

echo -e "a,b,c\n1,2,3" > $CLICKHOUSE_TEST_UNIQUE_NAME.csvwithnames

$CLICKHOUSE_LOCAL -q "select b from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csvwithnames') settings input_format_with_names_use_header=0"

@@ -0,0 +1,2 @@
a,b,c
1,2,3

0 comments on commit fe9cb2b

Please sign in to comment.