Conversation

@mcfi (Contributor) commented Sep 23, 2025

Do one iteration of Newton-Raphson refinement for FP16 inv
FP16 only has 10 explicitly stored bits of mantissa, so one
iteration of refinement should be precise enough. Numerical
results showed both one and two iterations of
Newton-Raphson refinement had a worst-case ULP of 1.

@gunes-arm self-requested a review September 29, 2025 12:14

@gunes-arm (Contributor) left a comment:

Hi Ben! Thanks a lot for your contributions. It's a very interesting patch.

The CI pipeline wasn't able to run all the tests because it failed prematurely due to some style-related issues.

Please also have a look at our latest contribution guidelines here: https://github.com/ARM-software/ComputeLibrary/blob/main/docs/contributor_guide/contribution_guidelines.dox#L479

I'll separate some of the topics below:


The commit type and sign-off are missing from your commit message. Please see the contribution guideline above for the commit message format. Also, a reminder that your future fixes should be squashed, as mentioned in https://github.com/ARM-software/ComputeLibrary/blob/main/docs/contributor_guide/contribution_guidelines.dox#L502.


Your commit message says 11 mantissa bits for FP16. It should be 10.


I wrote a simple program to validate your hypothesis:

#include <algorithm>  // for std::min/std::max
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>  // for std::memcpy
#include <iostream>
#include <limits>
#include <vector>

#include <arm_neon.h>

inline float16x8_t vinvq_f16_with_it(float16x8_t x, int it)
{
    float16x8_t recip = vrecpeq_f16(x);

    while(it > 0)
    {
        recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
        --it;
    }

    return recip;
}

inline uint16_t fp16_to_bits(__fp16 x) {
    uint16_t out;
    std::memcpy(&out, &x, sizeof(out));
    return out;
}

inline __fp16 bits_to_fp16(uint16_t bits) {
    __fp16 val;
    std::memcpy(&val, &bits, sizeof(val));
    return val;
}

inline double fp16_to_f64(__fp16 val) {
    return static_cast<double>(val);
}

inline float16x8_t vinvsqrtq_f16_with_it(float16x8_t x, int it)
{
    float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);

    while(it > 0)
    {
        sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
        --it;
    }

    return sqrt_reciprocal;
}

__fp16 my_reciprocal_fp16(__fp16 x, int it) {
    float16x8_t input_vec = vdupq_n_f16(x);
    float16x8_t output_vec = vinvq_f16_with_it(input_vec, it);
    __fp16 result[8];
    vst1q_f16(result, output_vec);
    return result[0];
}

double my_reciprocal_fp64(__fp16 x, int it)
{
    return fp16_to_f64(my_reciprocal_fp16(x, it));
}

__fp16 my_invsqrt_fp16(__fp16 x, int it) {
    float16x8_t input_vec = vdupq_n_f16(x);
    float16x8_t output_vec = vinvsqrtq_f16_with_it(input_vec, it);
    __fp16 result[8];
    vst1q_f16(result, output_vec);
    return result[0];
}

double my_invsqrt_fp64(__fp16 x, int it)
{
    return fp16_to_f64(my_invsqrt_fp16(x, it));
}

bool is_fp16_finite(uint16_t x) {
    return (x & 0x7C00) != 0x7C00;
}

int main(int argc, char **argv) {
    if(argc != 3) {
        printf("Usage is ./main <mode> <num_iterations>\n\tmode = 0: Inverse/Reciprocal\n\tmode = 1: SqrtInv\n");
        exit(1);
    }

    int mode = atoi(argv[1]);
    int iter = atoi(argv[2]);

    assert(mode == 0 || mode == 1);
    assert(iter >= 0);

    int total = 0;

    double max_rel_err = 0.0;
    double max_abs_err = 0.0;

    int worst_ulp = 0;

    const int max_ulp = 64;  // or 128 if you expect worse cases
    std::vector<int> ulp_histogram(max_ulp, 0);

    for (uint32_t bits = 0; bits <= 0xFFFF; ++bits) {
        __fp16 x = bits_to_fp16(static_cast<uint16_t>(bits));
        double xd  = fp16_to_f64(x);

        // Skip NaNs and zero (division by zero)
        if (std::isnan(xd) || xd  == 0.0f || std::isinf(xd)) continue;

        // Skip negative values for Sqrt
        if(mode == 1 && xd  < 0) continue;

        double ref = 0.f;

        if(mode == 0) ref = 1.0f / xd ;
        else if(mode == 1) ref = 1.0f / std::sqrt(xd);

        double est = 0.f;

        if(mode == 0) est = my_reciprocal_fp64 (x, iter);
        else if(mode == 1) est = my_invsqrt_fp64 (x, iter);

        __fp16 ref_fp16 = static_cast<__fp16>(ref);
        __fp16 est_fp16 = 0.f;

        if(mode == 0) est_fp16 = my_reciprocal_fp16(x, iter);
        else if(mode == 1) est_fp16 = my_invsqrt_fp16(x, iter);

        uint16_t ref_bits = fp16_to_bits(ref_fp16);
        uint16_t est_bits = fp16_to_bits(est_fp16);

        // Ensure both are finite and not NaN/Inf
        if(!is_fp16_finite(ref_bits) || !is_fp16_finite(est_bits)) continue;

        int ulp_error = std::abs(int(ref_bits) - int(est_bits));

        if (ulp_error > worst_ulp) {
            worst_ulp = ulp_error;
        }

        ulp_histogram[std::min(ulp_error, max_ulp-1)]++;

        double abs_err = std::abs(est - ref);
        double rel_err = std::abs(abs_err / ref);

        assert(std::isfinite(rel_err));
        assert(std::isfinite(abs_err));

        max_rel_err = std::max(max_rel_err, rel_err);
        max_abs_err = std::max(max_abs_err, abs_err);

        ++total;
    }

    printf("Checked %d total FP16 values (excluding NaN/0/Inf)\n", total);
    printf("Max absolute error: %.8f\n", max_abs_err);
    printf("Max relative error: %.8f\n", max_rel_err);
    printf("Worst ULP: %d\n", worst_ulp);

    printf("\nULP Breakdown:\n");
    for (int i = 0; i < max_ulp; ++i) {
        if (ulp_histogram[i] > 0) {
            printf("ULP = %2d : %d cases\n", i, ulp_histogram[i]);
        }
    }

    return 0;
}

Build command:

g++ -std=c++14 -O3 example.cpp -o main

(On toolchains that don't enable FP16 vector arithmetic by default, you may also need something like -march=armv8.2-a+fp16.)

Although the error distribution improves for Inverse, you're right: the worst-case ULP doesn't change:

./main 0 1                            
Checked 62974 total FP16 values (excluding NaN/0/Inf)
Max absolute error: 40.43321300
Max relative error: 0.00239944
Worst ULP: 1

ULP Breakdown:
ULP =  0 : 48014 cases
ULP =  1 : 14960 cases

./main 0 2
Checked 62974 total FP16 values (excluding NaN/0/Inf)
Max absolute error: 30.30463576
Max relative error: 0.00230217
Worst ULP: 1

ULP Breakdown:
ULP =  0 : 53512 cases
ULP =  1 : 9462 cases

But, for InvSqrt, as far as I see, it decreases from 2 to 1:

./main 1 1
Checked 31743 total FP16 values (excluding NaN/0/Inf)
Max absolute error: 0.99046739
Max relative error: 0.00091767
Worst ULP: 2

ULP Breakdown:
ULP =  0 : 23080 cases
ULP =  1 : 8648 cases
ULP =  2 : 15 cases

./main 1 2
Checked 31743 total FP16 values (excluding NaN/0/Inf)
Max absolute error: 0.82670260
Max relative error: 0.00087251
Worst ULP: 1

ULP Breakdown:
ULP =  0 : 25161 cases
ULP =  1 : 6582 cases

Let me know what you think.

@mcfi (Contributor, Author) commented Sep 30, 2025

Thanks for getting back, @gunes-arm, appreciated.

For inv, our results were the same. Yes, two iterations would get us slightly more precise results, but the worst case stays the same as with one iteration. The question here is whether the extra iteration is worth it. My take is that we should aim for the fewest iterations that get us a worst-case ULP of 1, because the whole point of a fast reciprocal is to be fast with reasonably good precision. Otherwise, why not do 2+ iterations? :)

For invsqrt, our results were different because I used the following for the reference values, which involves two roundings and is therefore slightly less precise than what you did by computing the double result and then rounding back to FP16. As your results showed, only 15 out of 31743 cases had ULP == 2, while the rest have ULP <= 1. I would argue that the extra iteration for such a small precision gain is unnecessary, but I wouldn't insist.

inline float16x4_t vinvsqrt_f16_div(float16x4_t x)
{
    float16x4_t recip = vdiv_f16(vdup_n_f16(1.0f), vsqrt_f16(x));
    return recip;
}

Does it sound good to you if I revert the invsqrt changes while keeping the inv changes?

@gunes-arm (Contributor) replied:
Hi @mcfi, my pleasure. I think we can certainly accept the contribution for the Inverse operation. You have a valid point there. Unfortunately, the library does not set solid rules on numerical precision, but if we had set such rules, I think worst-case ULP would have been one of them.

@mcfi mcfi changed the title Do one iteration of newton-raphson refinement for FP16 invsqrt and inv Do only one iteration of newton-raphson refinement for FP16 inv Oct 1, 2025
@mcfi (Contributor, Author) commented Oct 1, 2025

Sounds great, please see my latest iteration, thanks.

@gunes-arm (Contributor) replied:
Hi Ben, the CI failed with commit type check. It should follow https://www.conventionalcommits.org/en/v1.0.0/ style (https://github.com/ARM-software/ComputeLibrary/blob/main/docs/contributor_guide/contribution_guidelines.dox#L340).

I believe perf is suitable. See our past commit messages for examples.

FP16 only has 10 explicitly stored bits of mantissa, so one iteration
of newton-raphson refinement should be precise enough. Numerical
results showed both one iteration and two iterations of newton-raphson
refinement had worst ULP being 1.

Signed-off-by: Ben Niu <benniu@meta.com>
@mcfi (Contributor, Author) commented Oct 2, 2025

Oh, I see. Please see my latest commit message. I installed the pre-commit hook, and it seems all the pre-commit checks have passed. Not sure why the Jenkins pre-commit job still failed; I can't see any details since I don't have access to it...

@gunes-arm (Contributor) commented Oct 3, 2025

Thanks. We trigger CI manually for external contributions. I've triggered it now. Let's see.

@gunes-arm gunes-arm self-requested a review October 3, 2025 10:53
@gunes-arm gunes-arm merged commit c66c4d3 into ARM-software:main Oct 3, 2025
2 checks passed
@mcfi mcfi deleted the patch-1 branch October 3, 2025 16:38
@mcfi (Contributor, Author) commented Oct 3, 2025

Thanks, Gunes!
