A better alternative to #46344 #46921

Merged (2 commits, Feb 27, 2023)

src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp: 0 additions, 6 deletions

@@ -684,12 +684,6 @@ void DataPartStorageOnDiskBase::clearDirectory(
request.emplace_back(fs::path(dir) / "delete-on-destroy.txt", true);
request.emplace_back(fs::path(dir) / "txn_version.txt", true);

/// Inverted index
request.emplace_back(fs::path(dir) / "skp_idx_af.gin_dict", true);
request.emplace_back(fs::path(dir) / "skp_idx_af.gin_post", true);
request.emplace_back(fs::path(dir) / "skp_idx_af.gin_seg", true);
request.emplace_back(fs::path(dir) / "skp_idx_af.gin_sid", true);

disk->removeSharedFiles(request, !can_remove_shared_data, names_not_to_remove);
disk->removeDirectory(dir);
}

src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp: 4 additions, 0 deletions

@@ -75,6 +75,10 @@ void MergeTreeDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums & r
     {
         const String & name = it.first;
 
+        /// Exclude files written by inverted index from check. No correct checksums are available for them currently.
+        if (name.ends_with(".gin_dict") || name.ends_with(".gin_post") || name.ends_with(".gin_seg") || name.ends_with(".gin_sid"))
+            continue;
+
         auto jt = rhs.files.find(name);
         if (jt == rhs.files.end())
             throw Exception(ErrorCodes::NO_FILE_IN_DATA_PART, "No file {} in data part", name);
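
The four-extension check added here is repeated verbatim in checkDataPart.cpp further down. As a purely illustrative sketch (the helper name and its placement are hypothetical, not part of this PR), the test could be factored into one shared predicate:

```cpp
// Hypothetical helper, not part of this PR: MergeTreeDataPartChecksum.cpp and
// checkDataPart.cpp both repeat the same four ends_with() calls for the files
// produced by an inverted index; a shared predicate would avoid the duplication.
#include <array>
#include <string>
#include <string_view>

bool isGinIndexFile(const std::string & file_name)
{
    static constexpr std::array<std::string_view, 4> gin_extensions
        = {".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"};
    for (std::string_view extension : gin_extensions)
        if (file_name.ends_with(extension))
            return true;
    return false;
}
```

Both exclusion sites would then reduce to `if (isGinIndexFile(name)) continue;`.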

src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp: 18 additions, 6 deletions

@@ -208,26 +208,26 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices()
     auto ast = parseQuery(codec_parser, "(" + Poco::toUpper(settings.marks_compression_codec) + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
     CompressionCodecPtr marks_compression_codec = CompressionCodecFactory::instance().get(ast, nullptr);
 
-    for (const auto & index_helper : skip_indices)
+    for (const auto & skip_index : skip_indices)
     {
-        String stream_name = index_helper->getFileName();
+        String stream_name = skip_index->getFileName();
         skip_indices_streams.emplace_back(
                 std::make_unique<MergeTreeDataPartWriterOnDisk::Stream>(
                         stream_name,
                         data_part->getDataPartStoragePtr(),
-                        stream_name, index_helper->getSerializedFileExtension(),
+                        stream_name, skip_index->getSerializedFileExtension(),
                         stream_name, marks_file_extension,
                         default_codec, settings.max_compress_block_size,
                         marks_compression_codec, settings.marks_compress_block_size,
                         settings.query_write_settings));
 
         GinIndexStorePtr store = nullptr;
-        if (dynamic_cast<const MergeTreeIndexInverted *>(&*index_helper) != nullptr)
+        if (typeid_cast<const MergeTreeIndexInverted *>(&*skip_index) != nullptr)
         {
             store = std::make_shared<GinIndexStore>(stream_name, data_part->getDataPartStoragePtr(), data_part->getDataPartStoragePtr(), storage.getSettings()->max_digestion_size_per_segment);
             gin_index_stores[stream_name] = store;
         }
-        skip_indices_aggregators.push_back(index_helper->createIndexAggregatorForPart(store));
+        skip_indices_aggregators.push_back(skip_index->createIndexAggregatorForPart(store));
         skip_index_accumulated_marks.push_back(0);
     }
 }
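
The loop above now detects an inverted index with typeid_cast rather than dynamic_cast. ClickHouse ships its own typeid_cast utility; the sketch below is not that implementation, only a minimal standalone illustration of the semantic it relies on: the cast succeeds only on an exact dynamic-type match, whereas dynamic_cast also accepts derived classes, and for a leaf class such as MergeTreeIndexInverted the exact comparison is sufficient and cheaper.

```cpp
// Standalone illustration (not ClickHouse code) of an exact-type cast vs. dynamic_cast.
#include <cassert>
#include <type_traits>
#include <typeinfo>

template <typename To, typename From>
To typeid_cast_sketch(From * from)
{
    // Succeeds only if the dynamic type of *from is exactly the requested class.
    if (from && typeid(*from) == typeid(std::remove_pointer_t<To>))
        return static_cast<To>(from);
    return nullptr;
}

struct IIndex { virtual ~IIndex() = default; };
struct InvertedIndex : IIndex {};
struct SpecialInvertedIndex : InvertedIndex {};

int main()
{
    SpecialInvertedIndex derived;
    IIndex * base = &derived;

    // dynamic_cast also matches the further-derived class ...
    assert(dynamic_cast<InvertedIndex *>(base) != nullptr);
    // ... while the exact-type cast does not.
    assert(typeid_cast_sketch<InvertedIndex *>(base) == nullptr);

    // It matches only when the dynamic type is exactly the requested one.
    InvertedIndex exact;
    base = &exact;
    assert(typeid_cast_sketch<InvertedIndex *>(base) != nullptr);
    return 0;
}
```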

@@ -284,7 +284,7 @@ void MergeTreeDataPartWriterOnDisk::calculateAndSerializeSkipIndices(const Block
         WriteBuffer & marks_out = stream.compress_marks ? stream.marks_compressed_hashing : stream.marks_hashing;
 
         GinIndexStorePtr store;
-        if (dynamic_cast<const MergeTreeIndexInverted *>(&*index_helper) != nullptr)
+        if (typeid_cast<const MergeTreeIndexInverted *>(&*index_helper) != nullptr)
         {
             String stream_name = index_helper->getFileName();
             auto it = gin_index_stores.find(stream_name);

@@ -388,6 +388,18 @@ void MergeTreeDataPartWriterOnDisk::fillSkipIndicesChecksums(MergeTreeData::Data
         auto & stream = *skip_indices_streams[i];
         if (!skip_indices_aggregators[i]->empty())
             skip_indices_aggregators[i]->getGranuleAndReset()->serializeBinary(stream.compressed_hashing);
+
+        /// Register additional files written only by the inverted index. Required because otherwise DROP TABLE complains about unknown
+        /// files. Note that the provided actual checksums are bogus. The problem is that at this point the file writes happened already and
+        /// we'd need to re-open + hash the files (fixing this is TODO). For now, CHECK TABLE skips these four files.
+        if (typeid_cast<const MergeTreeIndexInverted *>(&*skip_indices[i]) != nullptr)
+        {
+            String filename_without_extension = skip_indices[i]->getFileName();
+            checksums.files[filename_without_extension + ".gin_dict"] = MergeTreeDataPartChecksums::Checksum();
+            checksums.files[filename_without_extension + ".gin_post"] = MergeTreeDataPartChecksums::Checksum();
+            checksums.files[filename_without_extension + ".gin_seg"] = MergeTreeDataPartChecksums::Checksum();
+            checksums.files[filename_without_extension + ".gin_sid"] = MergeTreeDataPartChecksums::Checksum();
+        }
     }
 
     for (auto & stream : skip_indices_streams)
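
The comment in the hunk above notes that the registered checksums are placeholders, and that computing real ones would mean re-opening and hashing the already-written .gin_* files. The following is only a rough, standalone illustration of that TODO; it assumes nothing about ClickHouse's actual checksum format, and the FNV-1a hash and the function name are stand-ins for the example.

```cpp
// Illustrative only, not ClickHouse code: re-open a finished file and derive a
// (size, hash) pair from its contents, which is roughly what storing real
// checksums for the .gin_* files would require. FNV-1a is used as a stand-in hash.
#include <cstdint>
#include <fstream>
#include <string>
#include <utility>

std::pair<uint64_t, uint64_t> hashWrittenFile(const std::string & path)
{
    std::ifstream in(path, std::ios::binary);
    uint64_t size = 0;
    uint64_t hash = 1469598103934665603ULL;   // FNV-1a offset basis
    for (char c; in.get(c); ++size)
    {
        hash ^= static_cast<unsigned char>(c);
        hash *= 1099511628211ULL;             // FNV-1a prime
    }
    return {size, hash};
}
```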

src/Storages/MergeTree/checkDataPart.cpp: 8 additions, 4 deletions

@@ -157,25 +157,29 @@ IMergeTreeDataPart::Checksums checkDataPart(
     }
 
     NameSet projections_on_disk;
-    const auto & checksum_files_txt = checksums_txt.files;
+    const auto & checksums_txt_files = checksums_txt.files;
     for (auto it = data_part_storage.iterate(); it->isValid(); it->next())
     {
         auto file_name = it->name();
 
         /// We will check projections later.
-        if (data_part_storage.isDirectory(file_name) && endsWith(file_name, ".proj"))
+        if (data_part_storage.isDirectory(file_name) && file_name.ends_with(".proj"))
         {
             projections_on_disk.insert(file_name);
             continue;
         }
 
+        /// Exclude files written by inverted index from check. No correct checksums are available for them currently.
+        if (file_name.ends_with(".gin_dict") || file_name.ends_with(".gin_post") || file_name.ends_with(".gin_seg") || file_name.ends_with(".gin_sid"))
+            continue;
+
         auto checksum_it = checksums_data.files.find(file_name);
 
         /// Skip files that we already calculated. Also skip metadata files that are not checksummed.
         if (checksum_it == checksums_data.files.end() && !files_without_checksums.contains(file_name))
         {
-            auto txt_checksum_it = checksum_files_txt.find(file_name);
-            if (txt_checksum_it == checksum_files_txt.end() || txt_checksum_it->second.uncompressed_size == 0)
+            auto txt_checksum_it = checksums_txt_files.find(file_name);
+            if (txt_checksum_it == checksums_txt_files.end() || txt_checksum_it->second.uncompressed_size == 0)
             {
                 /// The file is not compressed.
                 checksum_file(file_name);

tests/queries/0_stateless/02346_full_text_search.reference: 1 addition, 0 deletions

@@ -1,4 +1,5 @@
 af inverted
+1
 101 Alick a01
 1
 101 Alick a01

tests/queries/0_stateless/02346_full_text_search.sql: 3 additions, 0 deletions

@@ -15,6 +15,9 @@ INSERT INTO tab VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'
 -- check inverted index was created
 SELECT name, type FROM system.data_skipping_indices WHERE table =='tab' AND database = currentDatabase() LIMIT 1;
 
+-- throw in a random consistency check
+CHECK TABLE tab;
+
 -- search inverted index with ==
 SELECT * FROM tab WHERE s == 'Alick a01';
 