diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp index 20b6c5a919ed..fb45ee42b793 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskFull.cpp @@ -95,11 +95,20 @@ UInt32 DataPartStorageOnDiskFull::getRefCount(const String & file_name) const return volume->getDisk()->getRefCount(fs::path(root_path) / part_dir / file_name); } -std::string DataPartStorageOnDiskFull::getRemotePath(const std::string & file_name) const +std::string DataPartStorageOnDiskFull::getRemotePath(const std::string & file_name, bool if_exists) const { - auto objects = volume->getDisk()->getStorageObjects(fs::path(root_path) / part_dir / file_name); + const std::string path = fs::path(root_path) / part_dir / file_name; + auto objects = volume->getDisk()->getStorageObjects(path); + + if (objects.empty() && if_exists) + return ""; + if (objects.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "One file must be mapped to one object on blob storage in MergeTree tables"); + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "One file must be mapped to one object on blob storage by path {} in MergeTree tables, have {}.", + path, objects.size()); + } return objects[0].remote_path; } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskFull.h b/src/Storages/MergeTree/DataPartStorageOnDiskFull.h index 5d70404fcfad..f194e1a2f98e 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskFull.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskFull.h @@ -23,7 +23,7 @@ class DataPartStorageOnDiskFull final : public DataPartStorageOnDiskBase Poco::Timestamp getFileLastModified(const String & file_name) const override; size_t getFileSize(const std::string & file_name) const override; UInt32 getRefCount(const std::string & file_name) const override; - std::string getRemotePath(const std::string & file_name) const override; + std::string getRemotePath(const std::string & file_name, bool if_exists) const override; String getUniqueId() const override; std::unique_ptr readFile( diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 79aeec002d7c..0d564a0ff488 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -127,7 +127,7 @@ class IDataPartStorage : public boost::noncopyable virtual UInt32 getRefCount(const std::string & file_name) const = 0; /// Get path on remote filesystem from file name on local filesystem. - virtual std::string getRemotePath(const std::string & file_name) const = 0; + virtual std::string getRemotePath(const std::string & file_name, bool if_exists) const = 0; virtual UInt64 calculateTotalSizeOnDisk() const = 0; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6bb5231f998d..d8c21e8b520e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5308,18 +5308,48 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( backup_entries_from_part, &temp_dirs); - auto projection_parts = part->getProjectionParts(); - for (const auto & [projection_name, projection_part] : projection_parts) + auto backup_projection = [&](IDataPartStorage & storage, IMergeTreeDataPart & projection_part) { - projection_part->getDataPartStorage().backup( - projection_part->checksums, - projection_part->getFileNamesWithoutChecksums(), + storage.backup( + projection_part.checksums, + projection_part.getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, backup_settings, read_settings, make_temporary_hard_links, backup_entries_from_part, &temp_dirs); + }; + + auto projection_parts = part->getProjectionParts(); + std::string proj_suffix = ".proj"; + std::unordered_set defined_projections; + + for (const auto & [projection_name, projection_part] : projection_parts) + { + defined_projections.emplace(projection_name); + backup_projection(projection_part->getDataPartStorage(), *projection_part); + } + + /// It is possible that the part has a written but not loaded projection, + /// e.g. it is written to parent part's checksums.txt and exists on disk, + /// but does not exist in table's projections definition. + /// Such a part can appear server was restarted after DROP PROJECTION but before old part was removed. + /// In this case, the old part will load only projections from metadata. + /// See 031145_non_loaded_projection_backup.sh. + for (const auto & [name, _] : part->checksums.files) + { + auto projection_name = fs::path(name).stem().string(); + if (endsWith(name, proj_suffix) && !defined_projections.contains(projection_name)) + { + auto projection_storage = part->getDataPartStorage().getProjection(projection_name + proj_suffix); + if (projection_storage->exists("checksums.txt")) + { + auto projection_part = const_cast(*part).getProjectionPartBuilder( + projection_name, /* is_temp_projection */false).withPartFormatFromDisk().build(); + backup_projection(projection_part->getDataPartStorage(), *projection_part); + } + } } if (hold_storage_and_part_ptrs) diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index a327ca17609e..ee104468ce04 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -289,8 +289,17 @@ IMergeTreeDataPart::Checksums checkDataPart( auto file_name = it->name(); if (!data_part_storage.isDirectory(file_name)) { - auto remote_path = data_part_storage.getRemotePath(file_name); - cache.removePathIfExists(remote_path); + const bool is_projection_part = data_part->isProjectionPart(); + auto remote_path = data_part_storage.getRemotePath(file_name, /* if_exists */is_projection_part); + if (remote_path.empty()) + { + chassert(is_projection_part); + throw Exception( + ErrorCodes::BROKEN_PROJECTION, + "Remote path for {} does not exist for projection path. Projection {} is broken", + file_name, data_part->name); + } + cache.removePathIfExists(remote_path, FileCache::getCommonUser().user_id); } } diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference b/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference new file mode 100644 index 000000000000..a11ee210e629 --- /dev/null +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.reference @@ -0,0 +1,7 @@ +7 +Found unexpected projection directories: pp.proj +BACKUP_CREATED +RESTORED +7 +Found unexpected projection directories: pp.proj +0 diff --git a/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh new file mode 100755 index 000000000000..b542c9fff9a3 --- /dev/null +++ b/tests/queries/0_stateless/03145_non_loaded_projection_backup.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " +drop table if exists tp_1; +create table tp_1 (x Int32, y Int32, projection p (select x, y order by x)) engine = MergeTree order by y partition by intDiv(y, 100); +system stop merges tp_1; +insert into tp_1 select number, number from numbers(3); + +set mutations_sync = 2; + +alter table tp_1 add projection pp (select x, count() group by x); +insert into tp_1 select number, number from numbers(4); +select count() from tp_1; + +-- Here we have a part with written projection pp +alter table tp_1 detach partition '0'; +-- Move part to detached +alter table tp_1 clear projection pp; +-- Remove projection from table metadata +alter table tp_1 drop projection pp; +-- Now, we don't load projection pp for attached part, but it is written on disk +alter table tp_1 attach partition '0'; +" + +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found unexpected projection directories: pp.proj" + +backup_id="$CLICKHOUSE_TEST_UNIQUE_NAME" +$CLICKHOUSE_CLIENT -q " +backup table tp_1 to Disk('backups', '$backup_id'); +" | grep -o "BACKUP_CREATED" + +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +drop table tp_1; +restore table tp_1 from Disk('backups', '$backup_id'); +" | grep -o "RESTORED" + +$CLICKHOUSE_CLIENT -q "select count() from tp_1;" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1 settings check_query_single_value_result = 0;" | grep -o "Found unexpected projection directories: pp.proj" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +check table tp_1" +$CLICKHOUSE_CLIENT -nm -q " +set send_logs_level='fatal'; +drop table tp_1"