From 5e2a022979462d009c72136ee948855563cca132 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Fri, 18 Apr 2025 16:38:04 +0200 Subject: [PATCH 1/4] Merge pull request #742 from Altinity/feature/lazy_load_metadata Make DataLake metadata more lazy --- src/Disks/ObjectStorages/IObjectStorage.cpp | 14 ++++++++++++++ src/Disks/ObjectStorages/IObjectStorage.h | 2 ++ .../DataLakes/IDataLakeMetadata.cpp | 12 ++++++++---- .../ObjectStorage/ReadBufferIterator.cpp | 8 +++----- .../ObjectStorage/StorageObjectStorageSource.cpp | 16 +--------------- 5 files changed, 28 insertions(+), 24 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 32a607445891..40be617fda9d 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -148,4 +148,18 @@ std::string RelativePathWithMetadata::CommandInTaskResponse::to_string() const return oss.str(); } + +void RelativePathWithMetadata::loadMetadata(ObjectStoragePtr object_storage) +{ + if (!metadata) + { + const auto & path = isArchive() ? getPathToArchive() : getPath(); + + if (query_settings.ignore_non_existent_file) + metadata = object_storage->tryGetObjectMetadata(path); + else + metadata = object_storage->getObjectMetadata(path); + } +} + } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 7a3a050903fa..654e890d4b08 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -174,6 +174,8 @@ struct RelativePathWithMetadata std::optional getFileMetaInfo() const { return file_meta_info; } const CommandInTaskResponse & getCommand() const { return command; } + + void loadMetadata(ObjectStoragePtr object_storage); }; struct ObjectKeyWithMetadata diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index f524d946a8af..3205da746054 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -50,12 +50,16 @@ class KeysIterator : public IObjectIterator return nullptr; auto key = data_files[current_index]; - auto object_metadata = object_storage->getObjectMetadata(key); if (callback) - callback(FileProgress(0, object_metadata.size_bytes)); - - return std::make_shared(key, std::move(object_metadata)); + { + /// Too expencive to load size for metadata always + /// because it requires API call to external storage. + /// In many cases only keys are needed. + callback(FileProgress(0, 1)); + } + + return std::make_shared(key, std::nullopt); } } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index d1828633e227..131b73b343ae 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -76,10 +76,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - if (!object_info->metadata) - object_info->metadata = object_storage->tryGetObjectMetadata(path); - + object_info->loadMetadata(object_storage); return object_info->metadata ? std::optional(object_info->metadata->last_modified.epochTime()) : std::nullopt; @@ -151,7 +148,6 @@ std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { auto context = getContext(); - const auto & path = current_object_info->isArchive() ? current_object_info->getPathToArchive() : current_object_info->getPath(); auto impl = createReadBuffer(*current_object_info, object_storage, context, getLogger("ReadBufferIterator")); const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->getCompressionMethod()); @@ -250,6 +246,8 @@ ReadBufferIterator::Data ReadBufferIterator::next() prev_read_keys_size = read_keys.size(); } + current_object_info->loadMetadata(object_storage); + if (query_settings.skip_empty_files && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) continue; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 0819d4abb2a7..77d2c25ab9a5 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -511,21 +511,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (object_info->getPath().empty()) return {}; - if (!object_info->metadata) - { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - - if (query_settings.ignore_non_existent_file) - { - auto metadata = object_storage->tryGetObjectMetadata(path); - if (!metadata) - return {}; - - object_info->metadata = metadata; - } - else - object_info->metadata = object_storage->getObjectMetadata(path); - } + object_info->loadMetadata(object_storage, query_settings.ignore_non_existent_file); } while (not_a_path || (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0)); From a10a84dd7966539ce8dcce5335e79b26f82ab500 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 21 Oct 2025 01:20:54 +0200 Subject: [PATCH 2/4] Fix build --- src/Disks/ObjectStorages/IObjectStorage.cpp | 4 ++-- src/Disks/ObjectStorages/IObjectStorage.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 40be617fda9d..fbd022a6e69d 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -149,13 +149,13 @@ std::string RelativePathWithMetadata::CommandInTaskResponse::to_string() const } -void RelativePathWithMetadata::loadMetadata(ObjectStoragePtr object_storage) +void RelativePathWithMetadata::loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file) { if (!metadata) { const auto & path = isArchive() ? getPathToArchive() : getPath(); - if (query_settings.ignore_non_existent_file) + if (ignore_non_existent_file) metadata = object_storage->tryGetObjectMetadata(path); else metadata = object_storage->getObjectMetadata(path); diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 654e890d4b08..440f5482e783 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -175,7 +175,7 @@ struct RelativePathWithMetadata const CommandInTaskResponse & getCommand() const { return command; } - void loadMetadata(ObjectStoragePtr object_storage); + void loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file = true); }; struct ObjectKeyWithMetadata From 17e867d86024bf7c689afead2a455b33fa206ac5 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 21 Oct 2025 11:07:25 +0200 Subject: [PATCH 3/4] Fix distributedWriteIntoReplicatedMergeTreeFromClusterStorage --- src/Interpreters/InterpreterInsertQuery.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 5b52cdbb9920..0a118070c5e6 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -772,6 +772,9 @@ InterpreterInsertQuery::distributedWriteIntoReplicatedMergeTreeFromClusterStorag if (!src_storage_cluster) return {}; + if (src_storage_cluster->getOriginalClusterName().empty()) + return {}; + if (!isInsertSelectTrivialEnoughForDistributedExecution(query)) return {}; From 76484ca6bac4cbafcbe734652608252662e3789c Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 21 Oct 2025 11:52:30 +0200 Subject: [PATCH 4/4] Fix fast tests --- .../03572_export_merge_tree_part_to_object_storage.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh index 6f2d6415f4e8..7af8db09f8d4 100755 --- a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh @@ -1,4 +1,7 @@ #!/usr/bin/env bash +# Tags: no-fasttest +# Tag no-fasttest: requires s3 storage + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh