rustc: Use LLVM's new saturating float-to-int intrinsics
This commit updates rustc, with an applicable LLVM version, to use
LLVM's new `llvm.fpto{u,s}i.sat.*.*` intrinsics to implement saturating
floating-point-to-int conversions. This results in slightly tighter
codegen for x86/x86_64, but the main purpose is to prepare for
upcoming changes to LLVM's WebAssembly backend, where wasm's
saturating float-to-int instructions will be implemented with these
intrinsics.
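
As a quick illustration (not part of this commit), the saturating
semantics these intrinsics implement are the same ones Rust's `as`
casts have guaranteed since 1.45:

    fn main() {
        // Out-of-range and non-finite inputs clamp instead of being UB:
        assert_eq!(f32::INFINITY as i32, i32::MAX);
        assert_eq!(-1e10_f64 as i32, i32::MIN);
        assert_eq!(f64::NAN as i32, 0); // NaN saturates to zero
        assert_eq!(300.0_f32 as u8, u8::MAX); // 255
    }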

This change allows simplifying a good deal of surrounding code, namely
removing a lot of wasm-specific behavior. WebAssembly no longer has any
special-casing of saturating arithmetic instructions, the need for
`fptosui_may_trap` is gone, and all handling code for it is now
removed. This means that the only wasm-specific logic left is in the
`fpto{s,u}i` instructions, which are only used for casts where
out-of-bounds values are undefined behavior. It does mean that for the
WebAssembly target specifically the Rust compiler will no longer be
100% compatible with pre-LLVM 12 versions, but it seems unlikely that
too many folks are relying on that.

Note that this change does immediately regress the codegen of saturating
float-to-int casts on WebAssembly, because the specialization of the LLVM
intrinsic is not present in our LLVM fork just yet. I'll be following up
with an LLVM update to pull in those patches, but that update affects a
few other in-flight SIMD changes for WebAssembly, so I wanted to keep
this change separate.

Eventually the entire `cast_float_to_int` function can be removed once
LLVM 12 is the minimum version, but that will require sinking its
complexity into other backends such as Cranelift.
alexcrichton committed Apr 21, 2021
1 parent 6265286 commit de2a460
Showing 5 changed files with 86 additions and 207 deletions.
93 changes: 28 additions & 65 deletions compiler/rustc_codegen_llvm/src/builder.rs
@@ -2,6 +2,7 @@ use crate::common::Funclet;
use crate::context::CodegenCx;
use crate::llvm::{self, BasicBlock, False};
use crate::llvm::{AtomicOrdering, AtomicRmwBinOp, SynchronizationScope};
use crate::llvm_util;
use crate::type_::Type;
use crate::type_of::LayoutLlvmExt;
use crate::value::Value;
@@ -16,7 +17,7 @@ use rustc_data_structures::small_c_str::SmallCStr;
use rustc_hir::def_id::DefId;
use rustc_middle::ty::layout::TyAndLayout;
use rustc_middle::ty::{self, Ty, TyCtxt};
use rustc_span::{sym, Span};
use rustc_span::Span;
use rustc_target::abi::{self, Align, Size};
use rustc_target::spec::{HasTargetSpec, Target};
use std::borrow::Cow;
@@ -669,81 +670,47 @@ impl BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
}

fn fptoui_sat(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> Option<&'ll Value> {
// WebAssembly has saturating floating point to integer casts if the
// `nontrapping-fptoint` target feature is activated. We'll use those if
// they are available.
if self.sess().target.arch == "wasm32"
&& self.sess().target_features.contains(&sym::nontrapping_dash_fptoint)
{
if llvm_util::get_version() >= (12, 0, 0) {
let src_ty = self.cx.val_ty(val);
let float_width = self.cx.float_width(src_ty);
let int_width = self.cx.int_width(dest_ty);
let name = match (int_width, float_width) {
(32, 32) => Some("llvm.wasm.trunc.saturate.unsigned.i32.f32"),
(32, 64) => Some("llvm.wasm.trunc.saturate.unsigned.i32.f64"),
(64, 32) => Some("llvm.wasm.trunc.saturate.unsigned.i64.f32"),
(64, 64) => Some("llvm.wasm.trunc.saturate.unsigned.i64.f64"),
_ => None,
};
if let Some(name) = name {
let intrinsic = self.get_intrinsic(name);
return Some(self.call(intrinsic, &[val], None));
}
let name = format!("llvm.fptoui.sat.i{}.f{}", int_width, float_width);
let intrinsic = self.get_intrinsic(&name);
return Some(self.call(intrinsic, &[val], None));
}

None
}

fn fptosi_sat(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> Option<&'ll Value> {
// WebAssembly has saturating floating point to integer casts if the
// `nontrapping-fptoint` target feature is activated. We'll use those if
// they are available.
if self.sess().target.arch == "wasm32"
&& self.sess().target_features.contains(&sym::nontrapping_dash_fptoint)
{
if llvm_util::get_version() >= (12, 0, 0) {
let src_ty = self.cx.val_ty(val);
let float_width = self.cx.float_width(src_ty);
let int_width = self.cx.int_width(dest_ty);
let name = match (int_width, float_width) {
(32, 32) => Some("llvm.wasm.trunc.saturate.signed.i32.f32"),
(32, 64) => Some("llvm.wasm.trunc.saturate.signed.i32.f64"),
(64, 32) => Some("llvm.wasm.trunc.saturate.signed.i64.f32"),
(64, 64) => Some("llvm.wasm.trunc.saturate.signed.i64.f64"),
_ => None,
};
if let Some(name) = name {
let intrinsic = self.get_intrinsic(name);
return Some(self.call(intrinsic, &[val], None));
}
let name = format!("llvm.fptosi.sat.i{}.f{}", int_width, float_width);
let intrinsic = self.get_intrinsic(&name);
return Some(self.call(intrinsic, &[val], None));
}
None
}

fn fptosui_may_trap(&self, val: &'ll Value, dest_ty: &'ll Type) -> bool {
// Most of the time we'll be generating the `fptosi` or `fptoui`
// instruction for floating-point-to-integer conversions. These
// instructions by definition in LLVM do not trap. For the WebAssembly
// target, however, we'll lower in some cases to intrinsic calls instead
// which may trap. If we detect that this is a situation where we'll be
// using the intrinsics then we report that the call may trap, which
// callers might need to handle.
if !self.wasm_and_missing_nontrapping_fptoint() {
return false;
}
let src_ty = self.cx.val_ty(val);
let float_width = self.cx.float_width(src_ty);
let int_width = self.cx.int_width(dest_ty);
matches!((int_width, float_width), (32, 32) | (32, 64) | (64, 32) | (64, 64))
None
}

fn fptoui(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> &'ll Value {
// When we can, use the native wasm intrinsics which have tighter
// codegen. Note that this has a semantic difference in that the
// intrinsic can trap whereas `fptoui` never traps. That difference,
// however, is handled by `fptosui_may_trap` above.
// On WebAssembly the `fptoui` and `fptosi` instructions currently have
// poor codegen. The reason for this is that the corresponding wasm
// instructions, `i32.trunc_f32_s` for example, will trap when the float
// is out of bounds, infinite, or NaN. This means that LLVM
// automatically inserts control flow around `fptoui` and `fptosi`
// because the LLVM instruction `fptoui` is defined as producing a
// poison value, not having UB on out-of-bounds values.
//
// Note that we skip the wasm intrinsics for vector types where `fptoui`
// must be used instead.
if self.wasm_and_missing_nontrapping_fptoint() {
// This method, however, is only used with non-saturating casts that
// have UB on out-of-bounds values. This means it's ok to use the raw
// wasm instruction, since out-of-bounds values may behave however we
// like. To ensure that LLVM picks the right instruction we call the
// raw wasm intrinsic functions, which keeps LLVM from inserting all
// the other control flow automatically.
if self.sess().target.arch == "wasm32" {
let src_ty = self.cx.val_ty(val);
if self.cx.type_kind(src_ty) != TypeKind::Vector {
let float_width = self.cx.float_width(src_ty);
@@ -765,7 +732,8 @@ impl BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
}

fn fptosi(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> &'ll Value {
if self.wasm_and_missing_nontrapping_fptoint() {
// see `fptoui` above for why wasm is different here
if self.sess().target.arch == "wasm32" {
let src_ty = self.cx.val_ty(val);
if self.cx.type_kind(src_ty) != TypeKind::Vector {
let float_width = self.cx.float_width(src_ty);
@@ -1419,9 +1387,4 @@ impl Builder<'a, 'll, 'tcx> {
llvm::LLVMAddIncoming(phi, &val, &bb, 1 as c_uint);
}
}

fn wasm_and_missing_nontrapping_fptoint(&self) -> bool {
self.sess().target.arch == "wasm32"
&& !self.sess().target_features.contains(&sym::nontrapping_dash_fptoint)
}
}
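
A condensed sketch of the naming scheme used by `fptoui_sat` and
`fptosi_sat` above (`sat_intrinsic_name` is a hypothetical helper for
illustration, not something added by this commit):

    // The saturating intrinsic is fully determined by signedness and
    // the two bit widths, mirroring the `format!` calls in the hunk.
    fn sat_intrinsic_name(signed: bool, int_width: u64, float_width: u64) -> String {
        let op = if signed { "fptosi" } else { "fptoui" };
        format!("llvm.{}.sat.i{}.f{}", op, int_width, float_width)
    }

    fn main() {
        // A signed f64 -> i32 saturating cast, for example:
        assert_eq!(sat_intrinsic_name(true, 32, 64), "llvm.fptosi.sat.i32.f64");
    }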
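At the Rust surface, the split between the saturating and the
may-be-UB lowerings above corresponds roughly to the following (a
sketch; the exact instruction selection depends on the target and the
LLVM version):

    fn demo(x: f32) -> (i32, i32) {
        // `as` casts are always defined and saturate, so they can use
        // the fptoui_sat/fptosi_sat paths on LLVM 12+.
        let saturated = x as i32;
        // `to_int_unchecked` makes out-of-range inputs UB, so the plain
        // fptosi/fptoui path (on wasm, the raw trunc intrinsics) suffices.
        // SAFETY: the caller must guarantee `x` is in range for i32.
        let unchecked = unsafe { x.to_int_unchecked::<i32>() };
        (saturated, unchecked)
    }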
30 changes: 22 additions & 8 deletions compiler/rustc_codegen_llvm/src/context.rs
@@ -503,14 +503,6 @@ impl CodegenCx<'b, 'tcx> {
let t_f32 = self.type_f32();
let t_f64 = self.type_f64();

ifn!("llvm.wasm.trunc.saturate.unsigned.i32.f32", fn(t_f32) -> t_i32);
ifn!("llvm.wasm.trunc.saturate.unsigned.i32.f64", fn(t_f64) -> t_i32);
ifn!("llvm.wasm.trunc.saturate.unsigned.i64.f32", fn(t_f32) -> t_i64);
ifn!("llvm.wasm.trunc.saturate.unsigned.i64.f64", fn(t_f64) -> t_i64);
ifn!("llvm.wasm.trunc.saturate.signed.i32.f32", fn(t_f32) -> t_i32);
ifn!("llvm.wasm.trunc.saturate.signed.i32.f64", fn(t_f64) -> t_i32);
ifn!("llvm.wasm.trunc.saturate.signed.i64.f32", fn(t_f32) -> t_i64);
ifn!("llvm.wasm.trunc.saturate.signed.i64.f64", fn(t_f64) -> t_i64);
ifn!("llvm.wasm.trunc.unsigned.i32.f32", fn(t_f32) -> t_i32);
ifn!("llvm.wasm.trunc.unsigned.i32.f64", fn(t_f64) -> t_i32);
ifn!("llvm.wasm.trunc.unsigned.i64.f32", fn(t_f32) -> t_i64);
@@ -520,6 +512,28 @@ impl CodegenCx<'b, 'tcx> {
ifn!("llvm.wasm.trunc.signed.i64.f32", fn(t_f32) -> t_i64);
ifn!("llvm.wasm.trunc.signed.i64.f64", fn(t_f64) -> t_i64);

ifn!("llvm.fptosi.sat.i8.f32", fn(t_f32) -> t_i8);
ifn!("llvm.fptosi.sat.i16.f32", fn(t_f32) -> t_i16);
ifn!("llvm.fptosi.sat.i32.f32", fn(t_f32) -> t_i32);
ifn!("llvm.fptosi.sat.i64.f32", fn(t_f32) -> t_i64);
ifn!("llvm.fptosi.sat.i128.f32", fn(t_f32) -> t_i128);
ifn!("llvm.fptosi.sat.i8.f64", fn(t_f64) -> t_i8);
ifn!("llvm.fptosi.sat.i16.f64", fn(t_f64) -> t_i16);
ifn!("llvm.fptosi.sat.i32.f64", fn(t_f64) -> t_i32);
ifn!("llvm.fptosi.sat.i64.f64", fn(t_f64) -> t_i64);
ifn!("llvm.fptosi.sat.i128.f64", fn(t_f64) -> t_i128);

ifn!("llvm.fptoui.sat.i8.f32", fn(t_f32) -> t_i8);
ifn!("llvm.fptoui.sat.i16.f32", fn(t_f32) -> t_i16);
ifn!("llvm.fptoui.sat.i32.f32", fn(t_f32) -> t_i32);
ifn!("llvm.fptoui.sat.i64.f32", fn(t_f32) -> t_i64);
ifn!("llvm.fptoui.sat.i128.f32", fn(t_f32) -> t_i128);
ifn!("llvm.fptoui.sat.i8.f64", fn(t_f64) -> t_i8);
ifn!("llvm.fptoui.sat.i16.f64", fn(t_f64) -> t_i16);
ifn!("llvm.fptoui.sat.i32.f64", fn(t_f64) -> t_i32);
ifn!("llvm.fptoui.sat.i64.f64", fn(t_f64) -> t_i64);
ifn!("llvm.fptoui.sat.i128.f64", fn(t_f64) -> t_i128);

ifn!("llvm.trap", fn() -> void);
ifn!("llvm.debugtrap", fn() -> void);
ifn!("llvm.frameaddress", fn(t_i32) -> i8p);
168 changes: 36 additions & 132 deletions compiler/rustc_codegen_ssa/src/mir/rvalue.rs
@@ -11,7 +11,7 @@ use rustc_apfloat::{ieee, Float, Round, Status};
use rustc_hir::lang_items::LangItem;
use rustc_middle::mir;
use rustc_middle::ty::cast::{CastTy, IntTy};
use rustc_middle::ty::layout::{HasTyCtxt, TyAndLayout};
use rustc_middle::ty::layout::HasTyCtxt;
use rustc_middle::ty::{self, adjustment::PointerCast, Instance, Ty, TyCtxt};
use rustc_span::source_map::{Span, DUMMY_SP};
use rustc_span::symbol::sym;
@@ -385,10 +385,10 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
bx.inttoptr(usize_llval, ll_t_out)
}
(CastTy::Float, CastTy::Int(IntTy::I)) => {
cast_float_to_int(&mut bx, true, llval, ll_t_in, ll_t_out, cast)
cast_float_to_int(&mut bx, true, llval, ll_t_in, ll_t_out)
}
(CastTy::Float, CastTy::Int(_)) => {
cast_float_to_int(&mut bx, false, llval, ll_t_in, ll_t_out, cast)
cast_float_to_int(&mut bx, false, llval, ll_t_in, ll_t_out)
}
_ => bug!("unsupported cast: {:?} to {:?}", operand.layout.ty, cast.ty),
};
@@ -790,7 +790,6 @@ fn cast_float_to_int<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
x: Bx::Value,
float_ty: Bx::Type,
int_ty: Bx::Type,
int_layout: TyAndLayout<'tcx>,
) -> Bx::Value {
if let Some(false) = bx.cx().sess().opts.debugging_opts.saturating_float_casts {
return if signed { bx.fptosi(x, int_ty) } else { bx.fptoui(x, int_ty) };
@@ -891,134 +890,39 @@ fn cast_float_to_int<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>(
let int_min = bx.cx().const_uint_big(int_ty, int_min(signed, int_width) as u128);
let zero = bx.cx().const_uint(int_ty, 0);

// The codegen here differs quite a bit depending on whether our builder's
// `fptosi` and `fptoui` instructions may trap for out-of-bounds values. If
// they don't trap then we can start doing everything inline with a
// `select` instruction because it's ok to execute `fptosi` and `fptoui`
// even if we don't use the results.
if !bx.fptosui_may_trap(x, int_ty) {
// Step 1 ...
let fptosui_result = if signed { bx.fptosi(x, int_ty) } else { bx.fptoui(x, int_ty) };
let less_or_nan = bx.fcmp(RealPredicate::RealULT, x, f_min);
let greater = bx.fcmp(RealPredicate::RealOGT, x, f_max);

// Step 2: We use two comparisons and two selects, with %s1 being the
// result:
// %less_or_nan = fcmp ult %x, %f_min
// %greater = fcmp ogt %x, %f_max
// %s0 = select %less_or_nan, int_ty::MIN, %fptosi_result
// %s1 = select %greater, int_ty::MAX, %s0
// Note that %less_or_nan uses an *unordered* comparison. This
// comparison is true if the operands are not comparable (i.e., if x is
// NaN). The unordered comparison ensures that s1 becomes int_ty::MIN if
// x is NaN.
//
// Performance note: Unordered comparison can be lowered to a "flipped"
// comparison and a negation, and the negation can be merged into the
// select. Therefore, it is not necessarily any more expensive than an
// ordered ("normal") comparison. Whether these optimizations will be
// performed is ultimately up to the backend, but at least x86 does
// perform them.
let s0 = bx.select(less_or_nan, int_min, fptosui_result);
let s1 = bx.select(greater, int_max, s0);

// Step 3: NaN replacement.
// For unsigned types, the above step already yielded int_ty::MIN == 0 if x is NaN.
// Therefore we only need to execute this step for signed integer types.
if signed {
// LLVM has no isNaN predicate, so we use (x == x) instead
let cmp = bx.fcmp(RealPredicate::RealOEQ, x, x);
bx.select(cmp, s1, zero)
} else {
s1
}
// Step 1 ...
let fptosui_result = if signed { bx.fptosi(x, int_ty) } else { bx.fptoui(x, int_ty) };
let less_or_nan = bx.fcmp(RealPredicate::RealULT, x, f_min);
let greater = bx.fcmp(RealPredicate::RealOGT, x, f_max);

// Step 2: We use two comparisons and two selects, with %s1 being the
// result:
// %less_or_nan = fcmp ult %x, %f_min
// %greater = fcmp ogt %x, %f_max
// %s0 = select %less_or_nan, int_ty::MIN, %fptosi_result
// %s1 = select %greater, int_ty::MAX, %s0
// Note that %less_or_nan uses an *unordered* comparison. This
// comparison is true if the operands are not comparable (i.e., if x is
// NaN). The unordered comparison ensures that s1 becomes int_ty::MIN if
// x is NaN.
//
// Performance note: Unordered comparison can be lowered to a "flipped"
// comparison and a negation, and the negation can be merged into the
// select. Therefore, it is not necessarily any more expensive than an
// ordered ("normal") comparison. Whether these optimizations will be
// performed is ultimately up to the backend, but at least x86 does
// perform them.
let s0 = bx.select(less_or_nan, int_min, fptosui_result);
let s1 = bx.select(greater, int_max, s0);

// Step 3: NaN replacement.
// For unsigned types, the above step already yielded int_ty::MIN == 0 if x is NaN.
// Therefore we only need to execute this step for signed integer types.
if signed {
// LLVM has no isNaN predicate, so we use (x == x) instead
let cmp = bx.fcmp(RealPredicate::RealOEQ, x, x);
bx.select(cmp, s1, zero)
} else {
// In this case we cannot execute `fptosi` or `fptoui` and then later
// discard the result. The builder is telling us that these instructions
// will trap on out-of-bounds values, so we need to use basic blocks and
// control flow to avoid executing the `fptosi` and `fptoui`
// instructions.
//
// The general idea of what we're constructing here is, for f64 -> i32:
//
// ;; block so far... %0 is the argument
// %result = alloca i32, align 4
// %inbound_lower = fcmp oge double %0, 0xC1E0000000000000
// %inbound_upper = fcmp ole double %0, 0x41DFFFFFFFC00000
// ;; match (inbound_lower, inbound_upper) {
// ;; (true, true) => %0 can be converted without trapping
// ;; (false, false) => %0 is a NaN
// ;; (true, false) => %0 is too large
// ;; (false, true) => %0 is too small
// ;; }
// ;;
// ;; Check for the (true, true) case and go to %convert if so.
// %inbounds = and i1 %inbound_lower, %inbound_upper
// br i1 %inbounds, label %convert, label %specialcase
//
// convert:
// %cvt = call i32 @llvm.wasm.trunc.signed.i32.f64(double %0)
// store i32 %cvt, i32* %result, align 4
// br label %done
//
// specialcase:
// ;; Handle the cases where the number is NaN, too large or too small
//
// ;; Either (true, false) or (false, true)
// %is_not_nan = or i1 %inbound_lower, %inbound_upper
// ;; Figure out which saturated value we are interested in if not `NaN`
// %saturated = select i1 %inbound_lower, i32 2147483647, i32 -2147483648
// ;; Choose between the saturated and NaN representations
// %result_nan = select i1 %is_not_nan, i32 %saturated, i32 0
// store i32 %result_nan, i32* %result, align 4
// br label %done
//
// done:
// %r = load i32, i32* %result, align 4
// ;; ...
let done = bx.build_sibling_block("float_cast_done");
let mut convert = bx.build_sibling_block("float_cast_convert");
let mut specialcase = bx.build_sibling_block("float_cast_specialcase");

let result = PlaceRef::alloca(bx, int_layout);
result.storage_live(bx);

// Use control flow to figure out whether we can execute `fptosi` in a
// basic block, or whether we go to a different basic block to implement
// the saturating logic.
let inbound_lower = bx.fcmp(RealPredicate::RealOGE, x, f_min);
let inbound_upper = bx.fcmp(RealPredicate::RealOLE, x, f_max);
let inbounds = bx.and(inbound_lower, inbound_upper);
bx.cond_br(inbounds, convert.llbb(), specialcase.llbb());

// Translation of the `convert` basic block
let cvt = if signed { convert.fptosi(x, int_ty) } else { convert.fptoui(x, int_ty) };
convert.store(cvt, result.llval, result.align);
convert.br(done.llbb());

// Translation of the `specialcase` basic block. Note that like above
// we try to be a bit clever here for unsigned conversions. In those
// cases the `int_min` is zero so we don't need two select instructions,
// just one to choose whether we need `int_max` or not. If
// `inbound_lower` is true then we're guaranteed to not be `NaN` and
// since we're greater than zero we must be saturating to `int_max`. If
// `inbound_lower` is false then we're either NaN or less than zero, so
// we saturate to zero.
let result_nan = if signed {
let is_not_nan = specialcase.or(inbound_lower, inbound_upper);
let saturated = specialcase.select(inbound_lower, int_max, int_min);
specialcase.select(is_not_nan, saturated, zero)
} else {
specialcase.select(inbound_lower, int_max, int_min)
};
specialcase.store(result_nan, result.llval, result.align);
specialcase.br(done.llbb());

// Translation of the `done` basic block, positioning ourselves to
// continue from that point as well.
*bx = done;
let ret = bx.load(result.llval, result.align);
result.storage_dead(bx);
ret
s1
}
}
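
A compact plain-Rust model of the two-compare/two-select sequence that
remains after this change (a sketch for f64 -> i32, assuming both
bounds are exactly representable; not the compiler's actual code):

    fn saturating_f64_to_i32(x: f64) -> i32 {
        let f_min = i32::MIN as f64;
        let f_max = i32::MAX as f64;
        // %less_or_nan = fcmp ult %x, %f_min -- unordered, so true for NaN too
        let less_or_nan = !(x >= f_min);
        // %greater = fcmp ogt %x, %f_max
        let greater = x > f_max;
        // Stand-in for the raw fptosi result; only meaningful when in range.
        let conv = if less_or_nan || greater { 0 } else { x as i32 };
        // The two selects:
        let s0 = if less_or_nan { i32::MIN } else { conv };
        let s1 = if greater { i32::MAX } else { s0 };
        // Step 3: NaN replacement -- (x == x) is false only for NaN.
        if x == x { s1 } else { 0 }
    }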
