Open
Description
We should improve the LLVM IR generated for std::bit_floor
from libstdc++. Specifically, when we compile libstdc++'s std::bit_floor,
the resulting LLVM IR should be as good as what we generate for std::bit_floor
from our own libc++.
#include <bit>
unsigned my_bit_floor(unsigned x) {
return std::bit_floor(x);
}
libstdc++
$ clang -march=skylake -std=c++20 -O2 -S -emit-llvm bit_floor.cc
%cmp.i.i = icmp eq i32 %X, 0
%shr.i.i = lshr i32 %X, 1
%0 = tail call i32 @llvm.ctlz.i32(i32 %shr.i.i, i1 false), !range !5
%sub.i.i = sub nuw nsw i32 32, %0
%shl.i.i = shl nuw i32 1, %sub.i.i
%retval.0.i.i = select i1 %cmp.i.i, i32 0, i32 %shl.i.i
ret i32 %retval.0.i.i
libc++
$ clang -march=skylake -std=c++20 -stdlib=libc++ -nostdinc++ -I/usr/lib/llvm-14/include/c++/v1 -O2 -S -emit-llvm bit_floor.cc
%cmp.i = icmp eq i32 %X, 0
%0 = tail call i32 @llvm.ctlz.i32(i32 %X, i1 false), !range !5
%shl.i = lshr i32 -2147483648, %0
%cond.i = select i1 %cmp.i, i32 0, i32 %shl.i
ret i32 %cond.i
Here are the values after each LLVM IR instruction for a few sample inputs:
input 0 1 2 0x40000000 0x80000000
--------------------------------------------
libstdc++
shr 0 0 1 0x20000000 0x40000000
ctlz 32 32 31 2 1
sub 0 0 1 30 31
shl 1 1 2 0x40000000 0x80000000
sel 0 1 2 0x40000000 0x80000000
libc++
ctlz 32 31 30 1 0
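shr poison 1 2 0x40000000 0x80000000
sel 0 1 2 0x40000000 0x80000000
(For input 0, the lshr shifts an i32 by 32, which is out of range and yields poison; the select does not pick that operand, so the final result is still a well-defined 0.)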
FWIW, here is the x86 assembly:
libstdc++
89 f8 mov %edi,%eax ; 25 bytes, critical path length: 6 instructions
b9 01 00 00 00 mov $0x1,%ecx
d1 e8 shr %eax
f3 0f bd c0 lzcnt %eax,%eax
f6 d8 neg %al
85 ff test %edi,%edi
c4 e2 79 f7 c1 shlx %eax,%ecx,%eax
0f 44 c7 cmove %edi,%eax
libc++
f3 0f bd c7 lzcnt %edi,%eax ; 17 bytes, critical path length: 3 instructions
b9 00 00 00 80 mov $0x80000000,%ecx
c4 e2 7b f7 c1 shrx %eax,%ecx,%eax
0f 42 c7 cmovb %edi,%eax