Description
This was originally reduced from an AArch64 miscompile, but I believe it applies to most if not all targets. The example below uses x86_64:
```c
typedef short short3 __attribute__((ext_vector_type(3)));
typedef struct { short s[4]; } short4;

short3 f1(short s) {
  return (short3){s, s, s};
}

short4 f2(short s) {
  short3 x = f1(s);
  short4 y;
  __builtin_memcpy(&y, &x, sizeof x);
  return y;
}
```
Compiling this to LLVM IR produces:
```
$ bin/clang -emit-llvm -S -O3 -o - test.c
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable
define dso_local double @f1(i16 noundef signext %s) local_unnamed_addr #0 {
entry:
  %retval.sroa.0 = alloca double, align 8
  %vecinit = insertelement <3 x i16> poison, i16 %s, i64 0
  %vecinit2 = shufflevector <3 x i16> %vecinit, <3 x i16> poison, <3 x i32> zeroinitializer
  store <3 x i16> %vecinit2, ptr %retval.sroa.0, align 8
  %retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0. = load double, ptr %retval.sroa.0, align 8
  ret double %retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0.retval.sroa.0.0.
}

; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable
define dso_local i64 @f2(i16 noundef signext %s) local_unnamed_addr #0 {
entry:
  %vecinit.i = insertelement <3 x i16> poison, i16 %s, i64 0
  %extractVec1 = shufflevector <3 x i16> %vecinit.i, <3 x i16> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
  %0 = bitcast <4 x i16> %extractVec1 to i64
  ret i64 %0
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }

!llvm.module.flags = !{!0, !1, !2, !3}
!llvm.ident = !{!4}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 8, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{!"clang version 21.0.0git (https://github.com/llvm/llvm-project 2692c3aa6760f1e4ea015f906926f63ec7dce044)"}
```
The definition of `f2` first does a `shufflevector`, producing `%extractVec1 = <4 x i16> <i16 %s, i16 %s, i16 %s, i16 poison>`. It then `bitcast`s the result to `i64`, and because `poison`, unlike `undef`, always applies to all bits, this potentially throws away the well-defined first three elements of the vector-copied-to-array, depending on exactly which passes run at exactly which time and how they choose to interpret `poison`.
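
For illustration, here is a minimal sketch of IR that would sidestep the issue, assuming a `freeze` is inserted before the bitcast. The function name `@f2_frozen` is hypothetical and this is not a proposed fix, just a demonstration of the semantics: `freeze` pins the poison lane to an arbitrary but fixed value, so the low 48 bits of the `i64` stay well defined.

```llvm
; Hypothetical variant of f2, for illustration only: freeze stops the
; poison lane from contaminating the whole scalar after the bitcast.
define dso_local i64 @f2_frozen(i16 noundef signext %s) local_unnamed_addr {
entry:
  %vecinit.i = insertelement <3 x i16> poison, i16 %s, i64 0
  %extractVec1 = shufflevector <3 x i16> %vecinit.i, <3 x i16> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
  ; freeze yields a value whose poison lane is replaced by an arbitrary
  ; but fixed value; the three defined lanes pass through unchanged
  %frozen = freeze <4 x i16> %extractVec1
  %0 = bitcast <4 x i16> %frozen to i64
  ret i64 %0
}
```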
In a larger example, I saw this go wrong in a context where SROA chose to split up the individual elements of `y` and then reconstruct the scalar with bitwise operations; constant folding then kicked in for the constant `poison` element of the vector (a sketch of this follows below). But I believe this heavily reduced example already shows that the Clang-to-LLVM-IR translation is going wrong, even if the LLVM-IR-to-machine-code translation happens to produce the expected machine code so far.
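
To make that failure mode concrete, here is a minimal sketch of the kind of lane-by-lane reconstruction SROA can produce. The function `@reconstruct` and the exact instruction sequence are my illustration, not the IR from the larger example:

```llvm
; Hypothetical lane-by-lane reconstruction of the i64; the fourth lane
; came from the vector's poison element.
define i64 @reconstruct(i16 %a, i16 %b, i16 %c) {
  %a64 = zext i16 %a to i64
  %b64 = zext i16 %b to i64
  %c64 = zext i16 %c to i64
  %b.sh = shl i64 %b64, 16
  %c.sh = shl i64 %c64, 32
  %t0 = or i64 %a64, %b.sh
  %t1 = or i64 %t0, %c.sh
  ; "or X, poison" simplifies to poison, so constant folding collapses
  ; the whole chain and discards the well-defined low 48 bits as well
  %t2 = or i64 %t1, poison
  ret i64 %t2
}
```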