Skip to content

Commit e2ad5fc

Browse files
authored
Arm64: Implement region write barriers (#111636)
Extend the Arm64 writebarrier function to support regions and use the WriteBarrierManager, similar to Amd64. This results in 10 different versions of the JIT_WriteBarrier, with the WriteBarrierManager deciding on which version to use. Pseudo code for the writebarrier is included in GC-write-barriers.md. This is expected to make the writebarrier slower, but improve the performance of the GC. DOTNET_GCWriteBarrier=3 can be used to give the same functionality as before this change. The behavior of the writebarrier is: Before the PR: check ephemeral bounds, update a byte in the card table, mark the card bundle After the PR: DOTNET_GCWriteBarrier=1 (default, bit region write barriers): check ephemeral bounds, check regions, update a bit in the card table, mark the card bundle DOTNET_GCWriteBarrier=2 (byte region write barriers): check ephemeral bounds, check regions, update a byte in the card table, mark the card bundle DOTNET_GCWriteBarrier=3 (server write barriers): check ephemeral bounds, update a byte in the card table, mark the card bundle. This is the same as before the PR. DOTNET_gcServer=1: update a byte in the card table, mark the card bundle. Test results on an 8 core Cobalt 100. Ephemeral test (dotnet/performance) WB_nonephemeral : -20% WB_ephemeral: -16% WKS GC is calculating the generation of regions in addition to comparing with g_ephemeral_low/high. So while it might set fewer cards, it is more expensive and it shows. With DOTNET_GCWriteBarrier=3: WB_nonephemeral : +15% WB_ephemeral: +1% SVR GC WB also became more expensive but it sets way fewer cards (for nonephemeral it should set almost no cards). GCPerfsim Flags: -tc 2 -tagb 200 -tlgb 2 -lohpi 0 -sohsi 50 -ramb 20 -rlmb 0.2 -sohpi 0 No environment variables set: Gen0 pause: -21.06%. Gen1 pause -14.25% DOTNET_GCWriteBarrier=2: Gen0 pause: -6.7%. Gen1 pause -2.78% DOTNET_GCWriteBarrier=3 : Gen0 pause: -1.37%. Gen1 pause -1.26% DOTNET_gcServer=1 DOTNET_GCHeapCount=8: Gen0 pause: -7.24%. 
Gen1 pause -3.49% Above are linux numbers. On windows for no env var set we are seeing not as much but still quite noticeable pause improvements around 8% to 10% for this config of GCPerfSim.
1 parent 7dc2819 commit e2ad5fc

14 files changed

+1512
-910
lines changed
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# GC write barriers
2+
3+
The GC write barrier function (JIT_WriteBarrier) is generally the hottest function in CoreCLR and is written in assembly. The full pseudo code for the function is as follows:
4+
5+
6+
````
7+
JIT_WriteBarrier(Object **dst, Object *ref)
8+
Set *dst = ref
9+
10+
// Shadow Heap update
11+
ifdef WRITE_BARRIER_CHECK: // Only set in DEBUG mode
12+
if g_GCShadow != 0:
13+
long *shadow_dst = g_GCShadow + (dst - g_lowest_address)
14+
// Check shadow heap location is within shadow heap
15+
if shadow_dst < g_GCShadowEnd:
16+
*shadow_dst = ref
17+
atomic: wait for stores to complete
18+
if *dst != ref:
19+
*shadow_dst = INVALIDGCVALUE
20+
21+
// Update the write watch table, if it's in use
22+
ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP:
23+
if g_sw_ww_table != 0:
24+
char *ww_table_dst = g_sw_ww_table + (dst>>11)
25+
if *ww_table_dst == 0:
26+
*ww_table_dst = 0xff
27+
28+
// Return if the reference is not in ephemeral generations
29+
if ref < g_ephemeral_low || ref >= g_ephemeral_high:
30+
return
31+
32+
// Region Checks
33+
if g_region_to_generation_table != 0:
34+
35+
// Calculate region generations
36+
char reg_loc_dst = *((dst >> g_region_shr) + g_region_to_generation_table)
37+
char reg_loc_ref = *((ref >> g_region_shr) + g_region_to_generation_table)
38+
39+
// Return if the region we're storing into is Gen 0
40+
if reg_loc_dst == 0:
41+
return
42+
43+
// Return if the new reference is not from old to young
44+
if reg_loc_ref >= reg_loc_dst:
45+
return
46+
47+
// Bitwise write barriers only
48+
if g_region_use_bitwise_write_barrier:
49+
50+
char *card_table_dst = (dst >> 11) + g_card_table
51+
char dst_bit = 1 << ((dst >> 8) & 7)
52+
53+
// Check if we need to update the card table
54+
if (*card_table_dst & dst_bit) != 0:
55+
return
56+
57+
// Atomically update the card table
58+
lock: *card_table_dst |= dst_bit
59+
60+
goto CardBundle
61+
62+
// Check if we need to update the card table
63+
char *card_table_dst = (dst >> 11) + g_card_table
64+
if *card_table_dst == 0xff:
65+
return
66+
67+
// Update the card table
68+
*card_table_dst = 0xff
69+
70+
CardBundle:
71+
72+
// Mark the card bundle table as dirty
73+
ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES:
74+
char *card_bundle_dst = (dst >> 21) + g_card_bundle_table
75+
if *card_bundle_dst != 0xff:
76+
*card_bundle_dst = 0xff
77+
78+
````
79+
80+
The Checked Write Barrier has additional checks:
81+
82+
````
83+
JIT_CheckedWriteBarrier(Object **dst, Object *ref)
84+
85+
// Return if the destination is not on the heap
86+
if dst < g_lowest_address || dst >= g_highest_address:
87+
return
88+
89+
return JIT_WriteBarrier(dst, ref)
90+
````
91+
92+
## WriteBarrierManager
93+
94+
On AMD64 and Arm64, there are several different implementations of the write barrier function. Each version is a subset of the `JIT_WriteBarrier` above, assuming different state, meaning most `if` checks can be skipped. The actual write barrier that is called is a copy of one of these implementations.
95+
96+
The WriteBarrierManager keeps track of which implementation is currently being used. As internal state changes, the WriteBarrierManager updates the copy to the correct implementation. In practice, most of the internal state is fixed on startup, with only changes to/from use of write watch barriers changing during runtime.
97+
98+
`WRITE_BARRIER_CHECK` is only set in `DEBUG` mode. On Arm64 `WRITE_BARRIER_CHECK` checks exist at the top of each version of the function when `DEBUG` mode is enabled. On `Amd64` these checks do not exist. Instead, a special `JIT_WriteBarrier_Debug` version of the function exists, which contains most of the functionality of `JIT_WriteBarrier` pseudo code and is used exclusively when `DEBUG` mode is enabled.
99+
100+
On Arm64, `g_region_use_bitwise_write_barrier` is only set if LSE atomics are present on the hardware, as only LSE provides a single instruction to atomically update a byte via a bitwise OR.
101+

src/coreclr/pal/inc/unixasmmacrosarm64.inc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ C_FUNC(\Name):
3737
C_FUNC(\Name):
3838
.endm
3939

40+
// On MacOS, local labels cannot be used in arithmetic expressions.
41+
#if defined(__APPLE__)
42+
#define FIXUP_LABEL(name) name
43+
#else
44+
#define FIXUP_LABEL(name) .L##name
45+
#endif
46+
4047
.macro LEAF_ENTRY Name, Section
4148
.global C_FUNC(\Name)
4249
#if defined(__APPLE__)

src/coreclr/vm/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,11 +785,11 @@ if(CLR_CMAKE_TARGET_ARCH_AMD64)
785785
)
786786

787787
set(VM_SOURCES_WKS_ARCH
788-
${ARCH_SOURCES_DIR}/jitinterfaceamd64.cpp
789788
${ARCH_SOURCES_DIR}/profiler.cpp
790789
exceptionhandling.cpp
791790
gcinfodecoder.cpp
792791
jitinterfacegen.cpp
792+
writebarriermanager.cpp
793793
)
794794

795795
set(VM_HEADERS_WKS_ARCH
@@ -856,6 +856,7 @@ elseif(CLR_CMAKE_TARGET_ARCH_ARM64)
856856
set(VM_SOURCES_WKS_ARCH
857857
${ARCH_SOURCES_DIR}/profiler.cpp
858858
gcinfodecoder.cpp
859+
writebarriermanager.cpp
859860
)
860861

861862
if(CLR_CMAKE_HOST_UNIX)

src/coreclr/vm/arm64/asmhelpers.S

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -180,83 +180,6 @@ PATCH_LABEL ThePreStubPatchLabel
180180
ret lr
181181
LEAF_END ThePreStubPatch, _TEXT
182182

183-
// void JIT_UpdateWriteBarrierState(bool skipEphemeralCheck, size_t writeableOffset)
184-
//
185-
// Update shadow copies of the various state info required for barrier
186-
//
187-
// State info is contained in a literal pool at the end of the function
188-
// Placed in text section so that it is close enough to use ldr literal and still
189-
// be relocatable. Eliminates need for PREPARE_EXTERNAL_VAR in hot code.
190-
//
191-
// Align and group state info together so it fits in a single cache line
192-
// and each entry can be written atomically
193-
//
194-
LEAF_ENTRY JIT_UpdateWriteBarrierState, _TEXT
195-
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -16
196-
197-
// x0-x7, x10 will contain intended new state
198-
// x8 will preserve skipEphemeralCheck
199-
// x12 will be used for pointers
200-
201-
mov x8, x0
202-
mov x9, x1
203-
204-
PREPARE_EXTERNAL_VAR g_card_table, x12
205-
ldr x0, [x12]
206-
207-
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
208-
PREPARE_EXTERNAL_VAR g_card_bundle_table, x12
209-
ldr x1, [x12]
210-
#endif
211-
212-
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
213-
PREPARE_EXTERNAL_VAR g_write_watch_table, x12
214-
ldr x2, [x12]
215-
#endif
216-
217-
PREPARE_EXTERNAL_VAR g_ephemeral_low, x12
218-
ldr x3, [x12]
219-
220-
PREPARE_EXTERNAL_VAR g_ephemeral_high, x12
221-
ldr x4, [x12]
222-
223-
cbz x8, LOCAL_LABEL(EphemeralCheckEnabled)
224-
movz x3, #0
225-
movn x4, #0
226-
LOCAL_LABEL(EphemeralCheckEnabled):
227-
228-
PREPARE_EXTERNAL_VAR g_lowest_address, x12
229-
ldr x5, [x12]
230-
231-
PREPARE_EXTERNAL_VAR g_highest_address, x12
232-
ldr x6, [x12]
233-
234-
#ifdef WRITE_BARRIER_CHECK
235-
PREPARE_EXTERNAL_VAR g_GCShadow, x12
236-
ldr x7, [x12]
237-
238-
PREPARE_EXTERNAL_VAR g_GCShadowEnd, x12
239-
ldr x10, [x12]
240-
#endif
241-
242-
// Update wbs state
243-
PREPARE_EXTERNAL_VAR JIT_WriteBarrier_Table_Loc, x12
244-
ldr x12, [x12]
245-
add x12, x12, x9
246-
247-
stp x0, x1, [x12], 16
248-
stp x2, x3, [x12], 16
249-
stp x4, x5, [x12], 16
250-
str x6, [x12], 8
251-
#ifdef WRITE_BARRIER_CHECK
252-
stp x7, x10, [x12], 16
253-
#endif
254-
255-
256-
EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, 16
257-
EPILOG_RETURN
258-
LEAF_END JIT_UpdateWriteBarrierState
259-
260183
// ------------------------// ------------------------------------------------------------------
261184
// __declspec(naked) void F_CALL_CONV JIT_WriteBarrier_Callable(Object **dst, Object* val)
262185
LEAF_ENTRY JIT_WriteBarrier_Callable, _TEXT

src/coreclr/vm/arm64/asmhelpers.asm

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -248,84 +248,6 @@ ThePreStubPatchLabel
248248
ret lr
249249
LEAF_END
250250

251-
;-----------------------------------------------------------------------------
252-
; void JIT_UpdateWriteBarrierState(bool skipEphemeralCheck, size_t writeableOffset)
253-
;
254-
; Update shadow copies of the various state info required for barrier
255-
;
256-
; State info is contained in a literal pool at the end of the function
257-
; Placed in text section so that it is close enough to use ldr literal and still
258-
; be relocatable. Eliminates need for PREPARE_EXTERNAL_VAR in hot code.
259-
;
260-
; Align and group state info together so it fits in a single cache line
261-
; and each entry can be written atomically
262-
;
263-
LEAF_ENTRY JIT_UpdateWriteBarrierState
264-
PROLOG_SAVE_REG_PAIR fp, lr, #-16!
265-
266-
; x0-x7, x10 will contain intended new state
267-
; x8 will preserve skipEphemeralCheck
268-
; x12 will be used for pointers
269-
270-
mov x8, x0
271-
mov x9, x1
272-
273-
adrp x12, g_card_table
274-
ldr x0, [x12, g_card_table]
275-
276-
#ifdef FEATURE_MANUALLY_MANAGED_CARD_BUNDLES
277-
adrp x12, g_card_bundle_table
278-
ldr x1, [x12, g_card_bundle_table]
279-
#endif
280-
281-
#ifdef FEATURE_USE_SOFTWARE_WRITE_WATCH_FOR_GC_HEAP
282-
adrp x12, g_write_watch_table
283-
ldr x2, [x12, g_write_watch_table]
284-
#endif
285-
286-
adrp x12, g_ephemeral_low
287-
ldr x3, [x12, g_ephemeral_low]
288-
289-
adrp x12, g_ephemeral_high
290-
ldr x4, [x12, g_ephemeral_high]
291-
292-
; Check skipEphemeralCheck
293-
cbz x8, EphemeralCheckEnabled
294-
movz x3, #0
295-
movn x4, #0
296-
297-
EphemeralCheckEnabled
298-
adrp x12, g_lowest_address
299-
ldr x5, [x12, g_lowest_address]
300-
301-
adrp x12, g_highest_address
302-
ldr x6, [x12, g_highest_address]
303-
304-
#ifdef WRITE_BARRIER_CHECK
305-
adrp x12, $g_GCShadow
306-
ldr x7, [x12, $g_GCShadow]
307-
308-
adrp x12, $g_GCShadowEnd
309-
ldr x10, [x12, $g_GCShadowEnd]
310-
#endif
311-
312-
; Update wbs state
313-
adrp x12, JIT_WriteBarrier_Table_Loc
314-
ldr x12, [x12, JIT_WriteBarrier_Table_Loc]
315-
add x12, x12, x9
316-
stp x0, x1, [x12], 16
317-
stp x2, x3, [x12], 16
318-
stp x4, x5, [x12], 16
319-
str x6, [x12], 8
320-
#ifdef WRITE_BARRIER_CHECK
321-
stp x7, x10, [x12], 16
322-
#endif
323-
324-
EPILOG_RESTORE_REG_PAIR fp, lr, #16!
325-
EPILOG_RETURN
326-
327-
LEAF_END JIT_UpdateWriteBarrierState
328-
329251
#ifdef FEATURE_COMINTEROP
330252

331253
; ------------------------------------------------------------------

0 commit comments

Comments
 (0)