Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
431 lines (357 sloc) 11.1 KB
/**
* Implement IEEE 754 half-precision binary floating point format binary16.
*
* This a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a
* 10 bit significand.
* All operations on HalfFloat are CTFE'able.
*
* References:
* $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia)
* Copyright: Copyright Digital Mars 2012-2014
* License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors: $(WEB digitalmars.com, Walter Bright)
* Source: $(SARGONSRC src/sargon/_halffloat.d)
* Macros:
* WIKI=Phobos/StdHalffloat
*/
module sargon.halffloat;
/**
* The half precision floating point type.
*
* The only operations are:
* $(UL
* $(LI explicit conversion of float to HalfFloat)
* $(LI implicit conversion of HalfFloat to float)
* )
* It operates in an analogous manner to shorts, which are converted to ints
* before performing any operations, and explicitly cast back to shorts.
* The half float is considered essentially a storage type, not a computation type.
* Example:
* ---
HalfFloat h = hf!27.2f;
HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
HalfFloat f = HalfFloat(0.0f);
* ---
* Bugs:
* The only rounding mode currently supported is Round To Nearest.
* The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown.
*/
struct HalfFloat {
/* Provide implicit conversion of HalfFloat to float
*/
@property float toFloat() { return shortToFloat(s); }
alias toFloat this;
/* Done as a template in order to prevent implicit conversion
* of argument to float.
*/
this(T : float)(T f)
{
static assert(is(T == float));
s = floatToShort(f);
}
/* These are done as properties to avoid
* circular reference problems.
*/
///
static @property HalfFloat min_normal() { HalfFloat hf = void; hf.s = 0x0400; return hf; }
unittest { assert(min_normal == hf!0x1p-14); }
///
static @property HalfFloat max() { HalfFloat hf = void; hf.s = 0x7BFF; return hf; }
unittest { assert(max == hf!0x1.FFCp+15); }
///
static @property HalfFloat nan() { HalfFloat hf = void; hf.s = EXPMASK | 1; return hf; }
unittest { assert(nan != hf!(float.nan)); }
///
static @property HalfFloat infinity() { HalfFloat hf = void; hf.s = EXPMASK; return hf; }
unittest { assert(infinity == hf!(float.infinity)); }
///
static @property HalfFloat epsilon() { HalfFloat hf = void; hf.s = 0x1400; return hf; }
unittest { assert(epsilon == hf!0x1p-10); }
enum dig = 3; ///
enum mant_dig = 11; ///
enum max_10_exp = 5; ///
enum max_exp = 16; ///
enum min_10_exp = -5; ///
enum min_exp = -14; ///
private:
ushort s = EXPMASK | 1; // .init is HalfFloat.nan
}
/********************
* User defined literal for Half Float.
* Example:
* ---
* auto h = hf!1.3f;
* ---
*/
template hf(float v)
{
enum hf = HalfFloat(v);
}
private:
// Half float values
enum SIGNMASK = 0x8000;
enum EXPMASK = 0x7C00;
enum MANTMASK = 0x03FF;
enum HIDDENBIT = 0x0400;
// float values
enum FSIGNMASK = 0x80000000;
enum FEXPMASK = 0x7F800000;
enum FMANTMASK = 0x007FFFFF;
enum FHIDDENBIT = 0x00800000;
// Rounding mode
enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO };
enum ROUNDMODE = ROUND.TONEAREST;
ushort floatToShort(float f)
{
/* If the target CPU has a conversion instruction, this code could be
* replaced with inline asm or a compiler intrinsic, but leave this
* as the CTFE path so CTFE can work on it.
*/
/* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW,
* but is marked where those would go.
*/
uint s = *cast(uint*)&f;
ushort u = (s & FSIGNMASK) ? SIGNMASK : 0;
int exp = s & FEXPMASK;
if (exp == FEXPMASK) // if nan or infinity
{
if ((s & FMANTMASK) == 0) // if infinity
{
u |= EXPMASK;
}
else // else nan
{
u |= EXPMASK | 1;
}
return u;
}
uint significand = s & FMANTMASK;
if (exp == 0) // if subnormal or zero
{
if (significand == 0) // if zero
return u;
/* A subnormal float is going to give us a zero result anyway,
* so just set UNDERFLOW and INEXACT and return +-0.
*/
return u;
}
else // else normal
{
// normalize exponent and remove bias
exp = (exp >> 23) - 127;
significand |= FHIDDENBIT;
}
exp += 15; // bias the exponent
bool guard = false; // guard bit
bool sticky = false; // sticky bit
uint shift = 13; // lop off rightmost 13 bits
if (exp <= 0) // if subnormal
{ shift += -exp + 1; // more bits to lop off
exp = 0;
}
if (shift > 23)
{
// Set UNDERFLOW, INEXACT, return +-0
return u;
}
//printf("exp = x%x significand = x%x\n", exp, significand);
// Lop off rightmost 13 bits, but save guard and sticky bits
guard = (significand & (1 << (shift - 1))) != 0;
sticky = (significand & ((1 << (shift - 1)) - 1)) != 0;
significand >>= shift;
//printf("guard = %d, sticky = %d\n", guard, sticky);
//printf("significand = x%x\n", significand);
if (guard || sticky)
{
// Lost some bits, so set INEXACT and round the result
switch (ROUNDMODE)
{
case ROUND.TONEAREST:
if (guard && (sticky || (significand & 1)))
++significand;
break;
case ROUND.UPWARD:
if (!(s & FSIGNMASK))
++significand;
break;
case ROUND.DOWNWARD:
if (s & FSIGNMASK)
++significand;
break;
case ROUND.TOZERO:
break;
default:
assert(0);
}
if (exp == 0) // if subnormal
{
if (significand & HIDDENBIT) // and not a subnormal no more
++exp;
}
else if (significand & (HIDDENBIT << 1))
{
significand >>= 1;
++exp;
}
}
if (exp > 30)
{ // Set OVERFLOW and INEXACT, return +-infinity
return u | EXPMASK;
}
/* Add exponent and significand into result.
*/
u |= exp << 10; // exponent
u |= (significand & ~HIDDENBIT); // significand
return u;
}
unittest
{
static struct S { ushort u; float f; }
static S[] tests =
[
{ 0x3C00, 1.0f },
{ 0x3C01, 1.0009765625f },
{ 0xC000, -2.0f },
{ 0x7BFF, 65504.0f },
{ 0x0400, 6.10352e-5f },
{ 0x03FF, 6.09756e-5f },
{ 0x0001, 5.9604644775e-8f },
{ 0x0000, 0.0f },
{ 0x8000, -0.0f },
{ 0x7C00, float.infinity },
{ 0xFC00, -float.infinity },
{ 0x3555, 0.333252f },
{ 0x7C01, float.nan },
{ 0xFC01, -float.nan },
{ 0x0000, 1.0e-8f },
{ 0x8000, -1.0e-8f },
{ 0x7C00, 1.0e31f },
{ 0xFC00, -1.0e31f },
{ 0x0000, 1.0e-37f / 10.0f }, // subnormal float
{ 0x8000, -1.0e-37f / 10.0f },
{ 0x6800, 0x1002p-1 }, // guard
{ 0x6801, 0x1003p-1 }, // guard && sticky
{ 0x6802, 0x1006p-1 }, // guard && (significand & 1)
{ 0x6802, 0x1007p-1 }, // guard && sticky && (significand & 1)
{ 0x0400, 0x1FFFp-27 }, // round up subnormal to normal
{ 0x0800, 0x3FFFp-27 }, // lose bit, add one to exp
//{ , },
];
foreach (i, s; tests)
{
ushort u = floatToShort(s.f);
if (u != s.u)
{
printf("[%d] %g %04x expected %04x\n", i, s.f, u, s.u);
assert(0);
}
}
}
float shortToFloat(ushort s)
{
/* If the target CPU has a conversion instruction, this code could be
* replaced with inline asm or a compiler intrinsic, but leave this
* as the CTFE path so CTFE can work on it.
*/
/* This one is fairly easy because there are no possible errors
* and no necessary rounding.
*/
int exp = s & EXPMASK;
if (exp == EXPMASK) // if nan or infinity
{
float f;
if ((s & MANTMASK) == 0) // if infinity
{
f = float.infinity;
}
else // else nan
{
f = float.nan;
}
return (s & SIGNMASK) ? -f : f;
}
uint significand = s & MANTMASK;
if (exp == 0) // if subnormal or zero
{
if (significand == 0) // if zero
return (s & SIGNMASK) ? -0.0f : 0.0f;
// Normalize by shifting until the hidden bit is 1
while (!(significand & HIDDENBIT))
{
significand <<= 1;
--exp;
}
significand &= ~HIDDENBIT; // hidden bit is, well, hidden
exp -= 14;
}
else // else normal
{
// normalize exponent and remove bias
exp = (exp >> 10) - 15;
}
/* Assemble sign, exponent, and significand into float.
* Don't have to deal with overflow, inexact, or subnormal
* because the range of floats is big enough.
*/
assert(-126 <= exp && exp <= 127); // just to be sure
//printf("exp = %d, significand = x%x\n", exp, significand);
uint u = (s & SIGNMASK) << 16; // sign bit
u |= (exp + 127) << 23; // bias the exponent and shift into position
u |= significand << (23 - 10);
return *cast(float*)&u;
}
unittest
{
static struct S { ushort u; float f; }
static S[] tests =
[
{ 0x3C00, 1.0f },
{ 0xC000, -2.0f },
{ 0x7BFF, 65504f },
{ 0x0000, 0.0f },
{ 0x8000, -0.0f },
{ 0x7C00, float.infinity},
{ 0xFC00, -float.infinity},
//{ , },
];
foreach (i, s; tests)
{
float f = shortToFloat(s.u);
if (f != s.f)
{
printf("[%d] %04x %g expected %g\n", i, s.u, f, s.f);
assert(0);
}
}
}
version (unittest) import std.stdio;
unittest
{
HalfFloat h = hf!27.2;
HalfFloat j = cast(HalfFloat)( hf!3.5 + hf!5 );
HalfFloat f = HalfFloat(0.0f);
float k = j + h;
f.s = 0x1400;
writeln("1.0009765625 ", 1.0f + f);
assert(f == HalfFloat.epsilon);
f.s = 0x0400;
writeln("6.10352e-5 ", cast(float)f);
assert(f == HalfFloat.min_normal);
f.s = 0x03FF;
writeln("6.09756e-5 ", cast(float)f);
f.s = 1;
writefln("5.96046e-8 %.10e", cast(float)f);
f.s = 0;
writeln("0 ", cast(float)f);
assert(f == 0.0f);
f.s = 0x8000;
writeln("-0 ", cast(float)f);
assert(f == -0.0f);
f.s = 0x3555;
writeln("0.33325 ", cast(float)f);
f = HalfFloat.nan();
assert(f.s == 0x7C01);
float fl = f;
writefln("%x", *cast(uint*)&fl);
assert(*cast(uint*)&fl == 0x7FC0_0000);
}