Auto merge of #83357 - saethlin:vec-reserve-inlining, r=dtolnay · rust-lang/rust@32d3276

Commit

Auto merge of #83357 - saethlin:vec-reserve-inlining, r=dtolnay

Reduce the impact of Vec::reserve calls that do not cause any allocation

I think a lot of callers expect `Vec::reserve` to be nearly free when no resizing is required, but unfortunately that isn't the case. LLVM makes remarkably poor inlining choices (along the path from `Vec::reserve` to `RawVec::grow_amortized`), so depending on the surrounding context you either get a huge blob of `RawVec`'s resizing logic inlined into some seemingly-unrelated function, or not enough inlining happens and/or the actual check in `needs_to_grow` ends up behind a function call. My goal is to make the codegen for `Vec::reserve` match the mental that callers seem to have: It's reliably just a `sub cmp ja` if there is already sufficient capacity.

This patch has the following impact on the serde_json benchmarks: https://github.com/serde-rs/json-benchmark/tree/ca3efde8a5b75ff59271539b67452911860248c7 run with `cargo +stage1 run --release -- -n 1024`

Before:
```
                                DOM                  STRUCT
======= serde_json ======= parse|stringify ===== parse|stringify ====
data/canada.json         340 MB/s   490 MB/s   630 MB/s   370 MB/s
data/citm_catalog.json   460 MB/s   540 MB/s  1010 MB/s   550 MB/s
data/twitter.json        330 MB/s   840 MB/s   640 MB/s   630 MB/s

======= json-rust ======== parse|stringify ===== parse|stringify ====
data/canada.json         580 MB/s   990 MB/s
data/citm_catalog.json   720 MB/s   660 MB/s
data/twitter.json        570 MB/s   960 MB/s
```

After:
```
                                DOM                  STRUCT
======= serde_json ======= parse|stringify ===== parse|stringify ====
data/canada.json         330 MB/s   510 MB/s   610 MB/s   380 MB/s
data/citm_catalog.json   450 MB/s   640 MB/s   970 MB/s   830 MB/s
data/twitter.json        330 MB/s   880 MB/s   670 MB/s   960 MB/s

======= json-rust ======== parse|stringify ===== parse|stringify ====
data/canada.json         560 MB/s  1130 MB/s
data/citm_catalog.json   710 MB/s   880 MB/s
data/twitter.json        530 MB/s  1230 MB/s

```

That's approximately a one-third increase in throughput on two of the benchmarks, and no effect on one (The benchmark suite has sufficient jitter that I could pick a run where there are no regressions, so I'm not convinced they're meaningful here).

This also produces perf increases on the order of 3-5% in a few other microbenchmarks that I'm tracking. It might be useful to see if this has a cascading effect on inlining choices in some large codebases.

Compiling this simple program demonstrates the change in codegen that causes the perf impact:
```rust
fn main() {
    reserve(&mut Vec::new());
}

#[inline(never)]
fn reserve(v: &mut Vec<u8>) {
    v.reserve(1234);
}
```

Before:
```rust
00000000000069b0 <scratch::reserve>:
    69b0:       53                      push   %rbx
    69b1:       48 83 ec 30             sub    $0x30,%rsp
    69b5:       48 8b 47 08             mov    0x8(%rdi),%rax
    69b9:       48 8b 4f 10             mov    0x10(%rdi),%rcx
    69bd:       48 89 c2                mov    %rax,%rdx
    69c0:       48 29 ca                sub    %rcx,%rdx
    69c3:       48 81 fa d1 04 00 00    cmp    $0x4d1,%rdx
    69ca:       77 73                   ja     6a3f <scratch::reserve+0x8f>
    69cc:       48 81 c1 d2 04 00 00    add    $0x4d2,%rcx
    69d3:       72 75                   jb     6a4a <scratch::reserve+0x9a>
    69d5:       48 89 fb                mov    %rdi,%rbx
    69d8:       48 8d 14 00             lea    (%rax,%rax,1),%rdx
    69dc:       48 39 ca                cmp    %rcx,%rdx
    69df:       48 0f 47 ca             cmova  %rdx,%rcx
    69e3:       48 83 f9 08             cmp    $0x8,%rcx
    69e7:       be 08 00 00 00          mov    $0x8,%esi
    69ec:       48 0f 47 f1             cmova  %rcx,%rsi
    69f0:       48 85 c0                test   %rax,%rax
    69f3:       74 17                   je     6a0c <scratch::reserve+0x5c>
    69f5:       48 8b 0b                mov    (%rbx),%rcx
    69f8:       48 89 0c 24             mov    %rcx,(%rsp)
    69fc:       48 89 44 24 08          mov    %rax,0x8(%rsp)
    6a01:       48 c7 44 24 10 01 00    movq   $0x1,0x10(%rsp)
    6a08:       00 00
    6a0a:       eb 08                   jmp    6a14 <scratch::reserve+0x64>
    6a0c:       48 c7 04 24 00 00 00    movq   $0x0,(%rsp)
    6a13:       00
    6a14:       48 8d 7c 24 18          lea    0x18(%rsp),%rdi
    6a19:       48 89 e1                mov    %rsp,%rcx
    6a1c:       ba 01 00 00 00          mov    $0x1,%edx
    6a21:       e8 9a fe ff ff          call   68c0 <alloc::raw_vec::finish_grow>
    6a26:       48 8b 7c 24 20          mov    0x20(%rsp),%rdi
    6a2b:       48 8b 74 24 28          mov    0x28(%rsp),%rsi
    6a30:       48 83 7c 24 18 01       cmpq   $0x1,0x18(%rsp)
    6a36:       74 0d                   je     6a45 <scratch::reserve+0x95>
    6a38:       48 89 3b                mov    %rdi,(%rbx)
    6a3b:       48 89 73 08             mov    %rsi,0x8(%rbx)
    6a3f:       48 83 c4 30             add    $0x30,%rsp
    6a43:       5b                      pop    %rbx
    6a44:       c3                      ret
    6a45:       48 85 f6                test   %rsi,%rsi
    6a48:       75 08                   jne    6a52 <scratch::reserve+0xa2>
    6a4a:       ff 15 38 c4 03 00       call   *0x3c438(%rip)        # 42e88 <_GLOBAL_OFFSET_TABLE_+0x490>
    6a50:       0f 0b                   ud2
    6a52:       ff 15 f0 c4 03 00       call   *0x3c4f0(%rip)        # 42f48 <_GLOBAL_OFFSET_TABLE_+0x550>
    6a58:       0f 0b                   ud2
    6a5a:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
```

After:
```asm
0000000000006910 <scratch::reserve>:
    6910:       48 8b 47 08             mov    0x8(%rdi),%rax
    6914:       48 8b 77 10             mov    0x10(%rdi),%rsi
    6918:       48 29 f0                sub    %rsi,%rax
    691b:       48 3d d1 04 00 00       cmp    $0x4d1,%rax
    6921:       77 05                   ja     6928 <scratch::reserve+0x18>
    6923:       e9 e8 fe ff ff          jmp    6810 <alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle>
    6928:       c3                      ret
    6929:       0f 1f 80 00 00 00 00    nopl   0x0(%rax)
```

Loading branch information

bors committed Mar 30, 2021

2 parents 902ca44 + 73d7734 commit 32d3276

library/alloc/src/raw_vec.rs

-Original file line number
+Diff line change
@@ Expand Up / @@ -315,8 +315,24 @@ impl<T, A: Allocator> RawVec<T, A> { @@
         /// #   vector.push_all(&[1, 3, 5, 7, 9]);
         /// # }
         /// ```
+        #[inline]
         pub fn reserve(&mut self, len: usize, additional: usize) {
-            handle_reserve(self.try_reserve(len, additional));
+            // Callers expect this function to be very cheap when there is already sufficient capacity.
+            // Therefore, we move all the resizing and error-handling logic from grow_amortized and
+            // handle_reserve behind a call, while making sure that the this function is likely to be
+            // inlined as just a comparison and a call if the comparison fails.
+            #[cold]
+            fn do_reserve_and_handle<T, A: Allocator>(
+                slf: &mut RawVec<T, A>,
+                len: usize,
+                additional: usize,
+            ) {
+                handle_reserve(slf.grow_amortized(len, additional));
+            }
+            if self.needs_to_grow(len, additional) {
+                do_reserve_and_handle(self, len, additional);
+            }
         }
         /// The same as `reserve`, but returns on errors instead of panicking or aborting.
@@ Expand Down @@

0 comments on commit `32d3276`

Please sign in to comment.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Commit

There are no files selected for viewing

0 comments on commit `32d3276`

Commit

There are no files selected for viewing

0 comments on commit 32d3276

0 comments on commit `32d3276`