Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

f32 += f32 * u32 is faster in a loop than f32 += f32; can be defeated (a little bit) with #[cold] annotation? #138953

Open
nabijaczleweli opened this issue Mar 25, 2025 · 3 comments
Labels
A-codegen Area: Code generation C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such E-needs-mcve Call for participation: This issue has a repro, but needs a Minimal Complete and Verifiable Example I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

@nabijaczleweli
Copy link
Contributor

nabijaczleweli commented Mar 25, 2025

I tried this code:

#![feature(new_zeroed_alloc)]

trait Square {
    fn sqr(self) -> Self;
}
impl Square for f32 {
    fn sqr(self) -> Self {
        self * self
    }
}

#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)]
struct Lab {
    l: f32,
    a: f32,
    b: f32,
}
impl Eq for Lab {}
impl Ord for Lab {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.partial_cmp(other).unwrap()
    }
}

fn msc_iter_mf(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;

        loop {
            let (newcenter_count, newcenter) = known_rgbs.iter()
                .filter(|(_, lab, _)| {
                    let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
                    distance_squared <= radiussy_squared
                })
                .fold((0,
                       Lab {
                           l: 0f32,
                           a: 0f32,
                           b: 0f32,
                       }),
                      |(cnt, acc), (_, lab, _freq)| {
                    (cnt + _freq,
                     Lab {
                         l: acc.l + lab.l * (*_freq as f32),
                         a: acc.a + lab.a * (*_freq as f32),
                         b: acc.b + lab.b * (*_freq as f32),
                     })
                });

            let newcenter = Lab {
                l: newcenter.l / newcenter_count as f32,
                a: newcenter.a / newcenter_count as f32,
                b: newcenter.b / newcenter_count as f32,
            };
            if newcenter == center {
                break;
            }
            center = newcenter;
        }


        outputs[*rgb as usize] = center;
    }
}

fn msc_iter_p(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;

        loop {
            let (newcenter_count, newcenter) = known_rgbs.iter()
                .filter(|(_, lab, _)| {
                    let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
                    distance_squared <= radiussy_squared
                })
                .fold((0,
                       Lab {
                           l: 0f32,
                           a: 0f32,
                           b: 0f32,
                       }),
                      |(cnt, acc), (_, lab, _freq)| {
                    (cnt + 1,
                     Lab {
                         l: acc.l + lab.l,
                         a: acc.a + lab.a,
                         b: acc.b + lab.b,
                     })
                });

            let newcenter = Lab {
                l: newcenter.l / newcenter_count as f32,
                a: newcenter.a / newcenter_count as f32,
                b: newcenter.b / newcenter_count as f32,
            };
            if newcenter == center {
                break;
            }
            center = newcenter;
        }


        outputs[*rgb as usize] = center;
    }
}

fn msc_for_mf(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0,
                                                        Lab {
                                                            l: 0f32,
                                                            a: 0f32,
                                                            b: 0f32,
                                                        });
            for (_, lab, freq) in known_rgbs {
                let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
                if distance_squared <= radiussy_squared {
                    newcenter_count += *freq;
                    newcenter.l += lab.l * (*freq as f32);
                    newcenter.a += lab.a * (*freq as f32);
                    newcenter.b += lab.b * (*freq as f32);
                }
            }

            newcenter.l /= newcenter_count as f32;
            newcenter.a /= newcenter_count as f32;
            newcenter.b /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

fn msc_for_p(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0,
                                                        Lab {
                                                            l: 0f32,
                                                            a: 0f32,
                                                            b: 0f32,
                                                        });
            for (_, lab, _) in known_rgbs {
                let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
                if distance_squared <= radiussy_squared {
                    newcenter_count += 1;
                    newcenter.l += lab.l;
                    newcenter.a += lab.a;
                    newcenter.b += lab.b;
                }
            }

            newcenter.l /= newcenter_count as f32;
            newcenter.a /= newcenter_count as f32;
            newcenter.b /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

fn msc_for_p_cold(outputs: &mut [Lab; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, Lab, u32)>, radiussy_squared: f32) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0,
                                                        Lab {
                                                            l: 0f32,
                                                            a: 0f32,
                                                            b: 0f32,
                                                        });
            for (_, lab, _) in known_rgbs {
                let distance_squared = (lab.l - center.l).sqr() + (lab.a - center.a).sqr() + (lab.b - center.b).sqr();
                if distance_squared <= radiussy_squared {
                    #[cold]
                    fn cold() {}
                    cold();
                    newcenter_count += 1;
                    newcenter.l += lab.l;
                    newcenter.a += lab.a;
                    newcenter.b += lab.b;
                }
            }

            newcenter.l /= newcenter_count as f32;
            newcenter.a /= newcenter_count as f32;
            newcenter.b /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

fn main() {
    use std::io::BufRead;
    let mut outputs = unsafe { Box::<[Lab; 0xFFFFFF + 1]>::new_zeroed().assume_init() };
    let mut known_rgbs = vec![];
    for l in std::io::BufReader::new(std::fs::File::open("kr_").unwrap()).lines().map(Result::unwrap) {
        let mut iter = l.split_whitespace();
        known_rgbs.push((iter.next().unwrap().parse::<u32>().unwrap(),
                         Lab {
                             l: iter.next().unwrap().parse::<f32>().unwrap(),
                             a: iter.next().unwrap().parse::<f32>().unwrap(),
                             b: iter.next().unwrap().parse::<f32>().unwrap(),
                         },
                         1));
    }
    macro_rules! one {
        ($f:ident) => {
            let start = std::time::Instant::now();
            $f(&mut *outputs, &known_rgbs, 0.02f32 * 0.02f32);
            let end = std::time::Instant::now();
            println!("{}:\t{:?}", stringify!($f), end - start);
        }
    }
    one!(msc_iter_mf);
    one!(msc_iter_p);
    one!(msc_for_mf);
    one!(msc_for_p);
    one!(msc_for_p_cold);
}

kr_.gz (real, non-synthetic, data; shouldn't really matter though)

This is five identical implementations (if freq is fixed at 1, which it is).

I expected to see this happen: *_p variants are faster-or-at-worst-identical to *_mf.

Instead, this happened: *_mf is 30% (iter) or 21% (for) faster than *_p despite being more computationally complex.

@zopsicle analysed this godbolt of the fors as "linear_core_p does the addition unconditionally, then conditionally stores the result. linear_core_mf preserves the original control flow. If you put #[cold] fn cold() {} cold(); inside the if statement it will preserve the control flow." and they were right, but the _cold variant is still 7% worse than *_mf.

This is obviously wrong.

Measurements, for me, on a i7-2600:

msc_iter_mf:    16.235556598
msc_iter_p:     21.680596694
msc_for_mf:     15.856701278
msc_for_p:      19.5196018094
msc_for_p_cold: 14.865306603

Meta

rustc --version --verbose:

rustc 1.85.1 (4eb161250 2025-03-15)
binary: rustc
commit-hash: 4eb161250e340c8f48f66e2b929ef4a5bed7c181
commit-date: 2025-03-15
host: x86_64-pc-windows-gnu
release: 1.85.1
LLVM version: 19.1.7

Building in release mode in cargo and --codegen opt-level=3 out cargo.

Please note that I am not interested in making the code faster overall, just in the pessimisation that leads to the relative difference between the variants. (You know how programmers are.)

@nabijaczleweli nabijaczleweli added the C-bug Category: This is a bug. label Mar 25, 2025
@rustbot rustbot added the needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. label Mar 25, 2025
@moxian
Copy link
Contributor

moxian commented Mar 26, 2025

That's a lot of harness code. Can this be minimized to make it easier to analyze maybe?

@rustbot label: -C-bug +C-optimization +I-slow +E-needs-mcve -needs-triage +T-compiler +A-codegen

@rustbot rustbot added C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such E-needs-mcve Call for participation: This issue has a repro, but needs a Minimal Complete and Verifiable Example I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. A-codegen Area: Code generation and removed C-bug Category: This is a bug. needs-triage This issue may need triage. Remove it if it has been sufficiently triaged. labels Mar 26, 2025
@nabijaczleweli
Copy link
Contributor Author

nabijaczleweli commented Mar 26, 2025

I reduced this a bit, but going below.... some? complexity level (as in dot product of three floats vs of just one) in the condition makes the iter variants be within 4%:

msc_iter_mf:    24.006395524s
msc_iter_p:     23.1620648s
msc_for_mf:     23.60311804s
msc_for_p:      42.233192545s
msc_for_p_cold: 21.665853454s

so idk, there's something there that wants the full non-reduced condition to trigger.

fn msc_iter_mf(outputs: &mut [f32; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, f32, u32)>) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;

        loop {
            let (newcenter_count, newcenter) = known_rgbs.iter()
                .filter(|(_, lab, _)| (lab - center).abs() <= 0.02f32)
                .fold((0, 0f32), |(cnt, acc), (_, lab, freq)| (cnt + freq, acc + lab * (*freq as f32)));

            let newcenter = newcenter / newcenter_count as f32;
            if newcenter == center {
                break;
            }
            center = newcenter;
        }


        outputs[*rgb as usize] = center;
    }
}

fn msc_iter_p(outputs: &mut [f32; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, f32, u32)>) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;

        loop {
            let (newcenter_count, newcenter) = known_rgbs.iter()
                .filter(|(_, lab, _)| (lab - center).abs() <= 0.02f32)
                .fold((0, 0f32), |(cnt, acc), (_, lab, _)| (cnt + 1, acc + lab));

            let newcenter = newcenter / newcenter_count as f32;
            if newcenter == center {
                break;
            }
            center = newcenter;
        }


        outputs[*rgb as usize] = center;
    }
}

fn msc_for_mf(outputs: &mut [f32; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, f32, u32)>) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0, 0f32);
            for (_, lab, freq) in known_rgbs {
                if (lab - center).abs() <= 0.02f32 {
                    newcenter_count += *freq;
                    newcenter += lab * (*freq as f32);
                }
            }

            newcenter /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

fn msc_for_p(outputs: &mut [f32; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, f32, u32)>) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0, 0f32);
            for (_, lab, _) in known_rgbs {
                if (lab - center).abs() <= 0.02f32 {
                    newcenter_count += 1;
                    newcenter += lab;
                }
            }

            newcenter /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

fn msc_for_p_cold(outputs: &mut [f32; 0xFFFFFF + 1], known_rgbs: &Vec<(u32, f32, u32)>) {
    for (rgb, lab, _) in known_rgbs {
        let mut center = *lab;
        loop {
            let (mut newcenter_count, mut newcenter) = (0, 0f32);
            for (_, lab, _) in known_rgbs {
                if (lab - center).abs() <= 0.02f32 {
                    #[cold]
                    fn cold() {}
                    cold();
                    newcenter_count += 1;
                    newcenter += lab;
                }
            }

            newcenter /= newcenter_count as f32;
            if newcenter == center {
                outputs[*rgb as usize] = center;
                break;
            }
            center = newcenter;
        }
    }
}

static mut OUTPUTS: [f32; 0xFFFFFF + 1] = [0f32; 0xFFFFFF + 1];
fn main() {
    use std::io::BufRead;
    let mut known_rgbs = vec![];
    for l in std::io::BufReader::new(std::fs::File::open("kr_").unwrap()).lines().map(Result::unwrap) {
        let mut iter = l.split_whitespace();
        known_rgbs.push((iter.next().unwrap().parse::<u32>().unwrap(), iter.next().unwrap().parse::<f32>().unwrap(), 1));
    }
    macro_rules! one {
        ($f:ident) => {
            let start = std::time::Instant::now();
            $f(unsafe { &mut OUTPUTS }, &known_rgbs);
            let end = std::time::Instant::now();
            println!("{}:\t{:?}", stringify!($f), end - start);
        }
    }
    one!(msc_iter_mf);
    one!(msc_iter_p);
    one!(msc_for_mf);
    one!(msc_for_p);
    one!(msc_for_p_cold);
}

@nabijaczleweli
Copy link
Contributor Author

nabijaczleweli commented Mar 26, 2025

Here's some perf annotates on a E5645 which exhibits the same characteristics:

mf2-msc_for_mf

Percent│370:   movss    0x4(%rsi),%xmm2
       │     ↓ jmp      399
       │       nop
  0.01 │380:   mov      %r8d,%edi
       │       xorps    %xmm4,%xmm4
  0.00 │       cvtsi2ss %rdi,%xmm4
  0.01 │       divss    %xmm4,%xmm3
  0.00 │       ucomiss  %xmm2,%xmm3
       │       movaps   %xmm3,%xmm2
  0.00 │     ↓ jne      399
       │     ↓ jnp      3f0
       │399:   xor      %edi,%edi
       │       xorps    %xmm3,%xmm3
       │       xor      %r8d,%r8d
  0.00 │     ↓ jmp      3b9
       │       data16   data16 data16 cs nopw 0x0(%rax,%rax,1)
  3.56 │3b0:   add      $0xc,%rdi
  0.22 │       cmp      %rdi,%rdx
       │     ↑ je       380
  1.05 │3b9:   movss    0x4(%rax,%rdi,1),%xmm4
 19.67 │       movaps   %xmm4,%xmm5
  0.74 │       subss    %xmm2,%xmm5
  0.65 │       andps    %xmm0,%xmm5
 12.76 │       ucomiss  %xmm5,%xmm1
 45.39 │     ↑ jb       3b0
  5.78 │       mov      0x8(%rax,%rdi,1),%r9d
  0.00 │       xorps    %xmm5,%xmm5
  4.41 │       cvtsi2ss %r9,%xmm5
  0.00 │       add      %r9d,%r8d
  1.18 │       mulss    %xmm5,%xmm4
  1.56 │       addss    %xmm4,%xmm3
  2.99 │     ↑ jmp      3b0
       │       nop
  0.00 │3f0:   mov      (%rsi),%edi

mf2-msc_for_p

Percent│370:   movss    0x4(%rsi),%xmm2
       │     ↓ jmp      396
       │       nop
  0.00 │380:   xorps    %xmm4,%xmm4
  0.00 │       cvtsi2ss %r8d,%xmm4
  0.02 │       divss    %xmm4,%xmm3
  0.00 │       ucomiss  %xmm2,%xmm3
       │       movaps   %xmm3,%xmm2
  0.00 │     ↓ jne      396
       │     ↓ jnp      3d0
       │396:   xor      %edi,%edi
       │       xorps    %xmm3,%xmm3
       │       xor      %r8d,%r8d
       │     ↓ jmp      3a9
 11.12 │3a0:   add      $0xc,%rdi
  0.83 │       cmp      %rdi,%rdx
       │     ↑ je       380
  3.31 │3a9:   movss    0x4(%rax,%rdi,1),%xmm4
 17.19 │       movaps   %xmm4,%xmm5
  2.99 │       subss    %xmm2,%xmm5
  1.03 │       andps    %xmm0,%xmm5
  1.49 │       ucomiss  %xmm5,%xmm1
 40.16 │       sbb      $0xffffffff,%r8d
  0.01 │       ucomiss  %xmm5,%xmm1
 14.04 │     ↑ jb       3a0
  5.32 │       addss    %xmm4,%xmm3
  2.48 │     ↑ jmp      3a0
       │       nop
  0.00 │3d0:   mov      (%rsi),%edi

mf2-msc_for_p_cold

Percent│370:   movss    0x4(%rsi),%xmm2
       │     ↓ jmp      396
       │       nop
  0.00 │380:   xorps    %xmm4,%xmm4
  0.00 │       cvtsi2ss %r8d,%xmm4
  0.02 │       divss    %xmm4,%xmm3
  0.00 │       ucomiss  %xmm2,%xmm3
       │       movaps   %xmm3,%xmm2
  0.00 │     ↓ jne      396
       │     ↓ jnp      3e0
       │396:   xor      %edi,%edi
       │       xorps    %xmm3,%xmm3
       │       xor      %r8d,%r8d
       │       xchg     %ax,%ax
  3.06 │3a0:   movss    0x4(%rax,%rdi,1),%xmm4
  1.55 │       movaps   %xmm4,%xmm5
  1.61 │       subss    %xmm2,%xmm5
 21.11 │       andps    %xmm0,%xmm5
  6.85 │       ucomiss  %xmm5,%xmm1
 36.19 │     ↓ jae      3c0
  3.34 │       add      $0xc,%rdi
 17.82 │       cmp      %rdi,%rdx
       │     ↑ jne      3a0
  0.01 │     ↑ jmp      380
  4.46 │3c0:   inc      %r8d
  0.58 │       addss    %xmm4,%xmm3
       │       add      $0xc,%rdi
  3.39 │       cmp      %rdi,%rdx
       │     ↑ jne      3a0
       │     ↑ jmp      380
       │       data16   data16 data16 data16 cs nopw 0x0(%rax,%rax,1)
  0.00 │3e0:   mov      (%rsi),%edi

What jumps out to me in mf2-msc_for_p is that it spends 40% of its run-time in sbb $0xffffffff,%r8d

I've reproduced the performance differential on a i5-1235U as well so this is not a decroded-uarch problem:
[orig]

msc_iter_mf:    6.206259814s
msc_iter_p:     7.821287191s
msc_for_mf:     6.20635699s
msc_for_p:      6.535377248s
msc_for_p_cold: 5.811809458s

[reduced]

msc_for_mf:     9.094245507s
msc_for_p:      13.308971712s
msc_for_p_cold: 9.897744601s

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
A-codegen Area: Code generation C-optimization Category: An issue highlighting optimization opportunities or PRs implementing such E-needs-mcve Call for participation: This issue has a repro, but needs a Minimal Complete and Verifiable Example I-slow Issue: Problems and improvements with respect to performance of generated code. T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.
Projects
None yet
Development

No branches or pull requests

3 participants