Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a639ed2
improve perf of order by single column
taiyang-li Mar 3, 2025
5856391
commit again
taiyang-li Mar 3, 2025
de073b8
fix conflict
taiyang-li Mar 3, 2025
97b37d8
opt2: binary search
taiyang-li Mar 4, 2025
648bc91
improve radix sort
taiyang-li Mar 4, 2025
c1da119
Revert "improve radix sort" for no improvement
taiyang-li Mar 4, 2025
a3d064b
fix perf error
taiyang-li Mar 5, 2025
1446130
commit again
taiyang-li Mar 5, 2025
697f21f
improve radix sort
taiyang-li Mar 5, 2025
a168495
remove comments
taiyang-li Mar 5, 2025
db9ad81
change as request
taiyang-li Mar 12, 2025
98f6190
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Mar 14, 2025
d01f8fb
split prs
taiyang-li Mar 18, 2025
5a3b8a2
Merge branch 'opt_single_order_by' of https://github.com/bigo-sg/Clic…
taiyang-li Mar 18, 2025
6f0b51a
Merge branch 'master' into opt_single_order_by
taiyang-li Jul 3, 2025
ab38d6d
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Jul 3, 2025
0364ef6
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Jul 8, 2025
1ddc3ad
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Jul 28, 2025
59fd82d
Merge remote-tracking branch 'origin/master' into opt_single_order_by
taiyang-li Jul 28, 2025
f459c30
revert files
taiyang-li Jul 28, 2025
92f6117
Merge branch 'opt_single_order_by' of https://github.com/bigo-sg/clic…
taiyang-li Jul 28, 2025
59abf58
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Aug 5, 2025
45e9fd1
fix building
taiyang-li Aug 8, 2025
ebf72fd
Merge remote-tracking branch 'origin/master' into opt_single_order_by
taiyang-li Aug 20, 2025
cbafd91
improve performance
taiyang-li Aug 20, 2025
faff07b
add prefetch distance
taiyang-li Aug 21, 2025
246272e
improve radix sort
taiyang-li Aug 21, 2025
2b927d1
revert allocation alignment
taiyang-li Aug 21, 2025
fe47e13
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Aug 22, 2025
18081a3
Merge branch 'ClickHouse:master' into opt_single_order_by
taiyang-li Aug 25, 2025
c04bcdd
minimize CountType according to actual rows
taiyang-li Sep 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 50 additions & 18 deletions src/Common/RadixSort.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#pragma once


#include <string.h>
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD)
#include <malloc.h>
Expand All @@ -13,11 +12,11 @@
#include <type_traits>
#include <memory>

#include <Core/Defines.h>
#include <base/bit_cast.h>
#include <base/extended_types.h>
#include <base/sort.h>
#include <Core/Defines.h>

#include <Common/PODArray.h>

/** Radix sort, has the following functionality:
*
Expand Down Expand Up @@ -225,6 +224,7 @@ struct RadixSort
static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
static constexpr size_t KEY_BITS = sizeof(Key) * 8;
static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;
static constexpr size_t PREFETCH_DISTANCE = std::max(1UL, 64 / sizeof(Element) / 2);


static KeyBits keyToBits(Key x) { return bit_cast<KeyBits>(x); }
Expand Down Expand Up @@ -276,8 +276,18 @@ struct RadixSort
}
}


/// Chooses the counter type used by radixSortLSDInternal from the number of elements:
/// UInt8 when `size` fits into UInt8, UInt16 when it fits into UInt16, UInt32 for everything else.
/// All arguments are forwarded unchanged.
/// NOTE(review): sizes beyond the UInt32 range are also routed to the UInt32 variant - confirm callers never pass such sizes.
template <bool DIRECT_WRITE_TO_DESTINATION>
static NO_INLINE void radixSortLSDInternalHelper(Element * arr, size_t size, bool reverse, Result * destination)
{
    constexpr size_t max_for_uint8 = std::numeric_limits<UInt8>::max();
    constexpr size_t max_for_uint16 = std::numeric_limits<UInt16>::max();

    if (size > max_for_uint16)
    {
        radixSortLSDInternal<DIRECT_WRITE_TO_DESTINATION, UInt32>(arr, size, reverse, destination);
        return;
    }

    if (size > max_for_uint8)
    {
        radixSortLSDInternal<DIRECT_WRITE_TO_DESTINATION, UInt16>(arr, size, reverse, destination);
        return;
    }

    radixSortLSDInternal<DIRECT_WRITE_TO_DESTINATION, UInt8>(arr, size, reverse, destination);
}

template <bool DIRECT_WRITE_TO_DESTINATION, typename CountType>
static NO_INLINE void radixSortLSDInternal(Element * arr, size_t size, bool reverse, Result * destination)
{
/// If the array is smaller than 256, then it is better to use another algorithm.
Expand All @@ -304,15 +314,15 @@ struct RadixSort
}

{
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions.
CountType sums[NUM_PASSES] = {0};

for (size_t i = 0; i < HISTOGRAM_SIZE; ++i)
{
for (size_t pass = 0; pass < NUM_PASSES; ++pass)
{
CountType tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
histograms[pass * HISTOGRAM_SIZE + i] = sums[pass] - 1;
histograms[pass * HISTOGRAM_SIZE + i] = sums[pass];
sums[pass] = tmp;
}
}
Expand All @@ -326,15 +336,23 @@ struct RadixSort

for (size_t i = 0; i < size; ++i)
{
size_t pos = extractPart(pass, reader[i]);
auto element = reader[i];
size_t pos = extractPart(pass, element);
if (i + PREFETCH_DISTANCE < size) [[likely]]
{
size_t next_pos = extractPart(pass, reader[i + PREFETCH_DISTANCE]);
__builtin_prefetch(&writer[histograms[pass * HISTOGRAM_SIZE + next_pos]], 1);
}

/// Place the element on the next free position.
auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
dest = reader[i];
auto & dest = writer[histograms[pass * HISTOGRAM_SIZE + pos]];
dest = element;

/// On the last pass, we do the reverse transformation.
if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(element))));

histograms[pass * HISTOGRAM_SIZE + pos]++;
}
}

Expand All @@ -348,16 +366,30 @@ struct RadixSort
{
for (size_t i = 0; i < size; ++i)
{
size_t pos = extractPart(pass, reader[i]);
writer[size - 1 - (++histograms[pass * HISTOGRAM_SIZE + pos])] = Traits::extractResult(reader[i]);
auto element = reader[i];
size_t pos = extractPart(pass, element);
if (i + PREFETCH_DISTANCE < size) [[likely]]
{
size_t next_pos = extractPart(pass, reader[i + PREFETCH_DISTANCE]);
__builtin_prefetch(&writer[size - 1 - histograms[pass * HISTOGRAM_SIZE + next_pos]], 1);
}

writer[size - 1 - (histograms[pass * HISTOGRAM_SIZE + pos]++)] = Traits::extractResult(element);
}
}
else
{
for (size_t i = 0; i < size; ++i)
{
size_t pos = extractPart(pass, reader[i]);
writer[++histograms[pass * HISTOGRAM_SIZE + pos]] = Traits::extractResult(reader[i]);
auto element = reader[i];
size_t pos = extractPart(pass, element);
if (i + PREFETCH_DISTANCE < size)
{
size_t next_pos = extractPart(pass, reader[i + PREFETCH_DISTANCE]);
__builtin_prefetch(&writer[histograms[pass * HISTOGRAM_SIZE + next_pos]], 1);
}

writer[histograms[pass * HISTOGRAM_SIZE + pos]++] = Traits::extractResult(element);
}
}
}
Expand Down Expand Up @@ -551,7 +583,7 @@ struct RadixSort
return;
}

radixSortLSDInternal<DIRECT_WRITE_TO_DESTINATION>(arr, size, reverse, destination);
radixSortLSDInternalHelper<DIRECT_WRITE_TO_DESTINATION>(arr, size, reverse, destination);
}

public:
Expand All @@ -560,12 +592,12 @@ struct RadixSort
*/
static void executeLSD(Element * arr, size_t size)
{
radixSortLSDInternal<false>(arr, size, false, nullptr);
radixSortLSDInternalHelper<false>(arr, size, false, nullptr);
}

static void executeLSD(Element * arr, size_t size, bool reverse)
{
radixSortLSDInternal<false>(arr, size, reverse, nullptr);
radixSortLSDInternalHelper<false>(arr, size, reverse, nullptr);
}

/** This function will start to sort inplace (modify 'arr')
Expand All @@ -576,7 +608,7 @@ struct RadixSort
*/
static void executeLSD(Element * arr, size_t size, bool reverse, Result * destination)
{
radixSortLSDInternal<true>(arr, size, reverse, destination);
radixSortLSDInternalHelper<true>(arr, size, reverse, destination);
}

/** Tries to fast sort elements for common sorting patterns (unstable).
Expand Down
5 changes: 5 additions & 0 deletions src/Common/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ target_link_libraries (orc_string_dictionary PRIVATE
ch_contrib::gbenchmark_all
dbms)

# Benchmark executable built from radix_sort.cpp; links Google Benchmark and dbms.
clickhouse_add_executable(benchmark_radix_sort radix_sort.cpp)
target_link_libraries (benchmark_radix_sort PRIVATE
ch_contrib::gbenchmark_all
dbms)

clickhouse_add_executable(wrap_in_nullable wrap_in_nullable.cpp)
target_link_libraries (wrap_in_nullable PRIVATE
ch_contrib::gbenchmark_all
Expand Down
72 changes: 72 additions & 0 deletions src/Common/benchmarks/radix_sort.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Storages/StorageGenerateRandom.h>
#include <benchmark/benchmark.h>
#include "pcg_random.hpp"

using namespace DB;

static void BM_RadixSort_UInt8(benchmark::State & state)
{
pcg64 rng;
UInt64 limit = DEFAULT_BLOCK_SIZE;
auto type = std::make_shared<DataTypeUInt8>();
auto column = fillColumnWithRandomData(type, limit, 0, 0, rng, nullptr);

for (auto _ : state)
{
IColumn::Permutation res;
column->getPermutation(IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Unstable, 0, 0, res);
benchmark::DoNotOptimize(res);
}
}

static void BM_RadixSort_Int16(benchmark::State & state)
{
pcg64 rng;
UInt64 limit = DEFAULT_BLOCK_SIZE;
auto type = std::make_shared<DataTypeInt16>();
auto column = fillColumnWithRandomData(type, limit, 0, 0, rng, nullptr);

for (auto _ : state)
{
IColumn::Permutation res;
column->getPermutation(IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Unstable, 0, 0, res);
benchmark::DoNotOptimize(res);
}
}

static void BM_RadixSort_Int32(benchmark::State & state)
{
pcg64 rng;
UInt64 limit = DEFAULT_BLOCK_SIZE;
auto type = std::make_shared<DataTypeInt32>();
auto column = fillColumnWithRandomData(type, limit, 0, 0, rng, nullptr);

for (auto _ : state)
{
IColumn::Permutation res;
column->getPermutation(IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Unstable, 0, 0, res);
benchmark::DoNotOptimize(res);
}
}

static void BM_RadixSort_UInt64(benchmark::State & state)
{
pcg64 rng;
UInt64 limit = DEFAULT_BLOCK_SIZE;
auto type = std::make_shared<DataTypeUInt64>();
auto column = fillColumnWithRandomData(type, limit, 0, 0, rng, nullptr);

for (auto _ : state)
{
IColumn::Permutation res;
column->getPermutation(IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Unstable, 0, 0, res);
benchmark::DoNotOptimize(res);
}
}

/// Register each benchmark function defined in this file with Google Benchmark.
BENCHMARK(BM_RadixSort_UInt8);
BENCHMARK(BM_RadixSort_Int16);
BENCHMARK(BM_RadixSort_Int32);
BENCHMARK(BM_RadixSort_UInt64);
Loading