diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs
index e1bbdf4a7ae41..a054e41b2084a 100644
--- a/src/libcore/lib.rs
+++ b/src/libcore/lib.rs
@@ -61,7 +61,9 @@
 #![cfg_attr(not(stage0), deny(warnings))]
 
 #![feature(allow_internal_unstable)]
+#![feature(asm)]
 #![feature(associated_type_defaults)]
+#![feature(cfg_target_feature)]
 #![feature(concat_idents)]
 #![feature(const_fn)]
 #![feature(cfg_target_has_atomic)]
diff --git a/src/libcore/num/dec2flt/algorithm.rs b/src/libcore/num/dec2flt/algorithm.rs
index e33c2814bf2f4..d64ad449e9abf 100644
--- a/src/libcore/num/dec2flt/algorithm.rs
+++ b/src/libcore/num/dec2flt/algorithm.rs
@@ -32,19 +32,58 @@ fn power_of_ten(e: i16) -> Fp {
     Fp { f: sig, e: exp }
 }
 
+#[cfg(any(not(target_arch="x86"), target_feature="sse2"))]
+mod fpu_precision {
+    /// Nothing to adjust: this configuration is not x86-without-SSE2, the only case in
+    /// which the fast path needs the x87 precision changed.
+    pub fn set_precision<T>() { }
+}
+
+#[cfg(all(target_arch="x86", not(target_feature="sse2")))]
+mod fpu_precision {
+    use mem::size_of;
+    use ops::Drop;
+
+    /// Holds the x87 control word that was in effect before `set_precision` changed it;
+    /// dropping the value loads that control word back.
+    pub struct FPUControlWord(u16);
+
+    /// Loads `cw` into the x87 control word register.
+    fn set_cw(cw: u16) {
+        unsafe { asm!("fldcw $0" :: "m" (cw) :: "volatile") }
+    }
+
+    /// Sets the precision control field to match the size of `T` and returns a guard
+    /// that restores the previous control word when dropped.
+    pub fn set_precision<T>() -> FPUControlWord {
+        let cw = 0u16;
+        let cw_precision = match size_of::<T>() {
+            4 => 0x0000, // 32 bits
+            8 => 0x0200, // 64 bits
+            _ => 0x0300, // default, 80 bits
+        };
+        // Save the current control word so that it can be restored on drop.
+        unsafe { asm!("fnstcw $0" : "=*m" (&cw) ::: "volatile") }
+        // Clear the old precision bits (mask 0x0300) and install the new ones.
+        set_cw((cw & 0xFCFF) | cw_precision);
+        FPUControlWord(cw)
+    }
+
+    impl Drop for FPUControlWord {
+        fn drop(&mut self) {
+            set_cw(self.0)
+        }
+    }
+}
+
 /// The fast path of Bellerophon using machine-sized integers and floats.
 ///
 /// This is extracted into a separate function so that it can be attempted before constructing
 /// a bignum.
 ///
 /// The fast path crucially depends on arithmetic being correctly rounded, so on x86
-/// without SSE or SSE2 it will be **wrong** (as in, off by one ULP occasionally), because the x87
-/// FPU stack will round to 80 bit first before rounding to 64/32 bit. However, as such hardware
-/// is extremely rare nowadays and in fact all in-tree target triples assume an SSE2-capable
-/// microarchitecture, there is little incentive to deal with that. There's a test that will fail
-/// when SSE or SSE2 is disabled, so people building their own non-SSE copy will get a heads up.
-///
-/// FIXME: It would nevertheless be nice if we had a good way to detect and deal with x87.
+/// without SSE or SSE2 it requires the precision of the x87 FPU stack to be changed
+/// so that it directly rounds to 64/32 bit.
 pub fn fast_path(integral: &[u8], fractional: &[u8], e: i64) -> Option {
     let num_digits = integral.len() + fractional.len();
     // log_10(f64::max_sig) ~ 15.95. We compare the exact value to max_sig near the end,
@@ -60,6 +99,10 @@ pub fn fast_path(integral: &[u8], fractional: &[u8], e: i64) -> Opt
     if f > T::max_sig() {
         return None;
     }
+
+    // Bound to a name so the guard is dropped only at the end of the enclosing scope.
+    let _cw = fpu_precision::set_precision::<T>();
+
     // The case e < 0 cannot be folded into the other branch. Negative powers result in
     // a repeating fractional part in binary, which are rounded, which causes real
     // (and occasioally quite significant!) errors in the final result.