@@ -226,7 +226,8 @@ struct Slab {
     // We attempt to place these next to each other.
     // TODO: We should coalesce these bits and use the result of `fetch_or` to
     //       search for free bits in parallel.
-    for (uint64_t mask = ~0ull; mask; mask = gpu::ballot(lane_mask, !result)) {
+    for (uint64_t mask = lane_mask; mask;
+         mask = gpu::ballot(lane_mask, !result)) {
       uint32_t id = impl::lane_count(uniform & mask);
       uint32_t index =
           (gpu::broadcast_value(lane_mask, impl::xorshift32(state)) + id) %
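
For context, a minimal CUDA sketch of the ballot-driven retry loop this hunk adjusts: the loop is seeded with the active lane mask (the patch replaces ~0ull with lane_mask for the same reason) and every iteration re-votes so only lanes whose claim failed keep searching. This is an illustration under assumptions, not the patched code: it uses 32-lane warps and plain CUDA intrinsics (__ballot_sync, __shfl_sync, __popc, atomicCAS) in place of the 64-bit masks, gpu::ballot, gpu::broadcast_value, impl::lane_count, and the slab bitfield; slots, try_claim, xorshift32, and claim_slots are hypothetical names.

#include <cstdint>
#include <cstdio>

__device__ uint32_t slots[32] = {};  // 0 = free, 1 = taken (illustrative)

__device__ bool try_claim(uint32_t index) {
  // Atomically flip a free slot to taken; true only for the winning lane.
  return atomicCAS(&slots[index], 0u, 1u) == 0u;
}

__device__ uint32_t xorshift32(uint32_t &state) {
  state ^= state << 13;
  state ^= state >> 17;
  state ^= state << 5;
  return state;
}

__global__ void claim_slots(uint32_t seed) {
  uint32_t lane = threadIdx.x % 32;
  uint32_t lane_mask = __activemask();
  uint32_t state = seed;
  bool result = false;
  uint32_t index = 0;
  // Seed with the active lanes, then re-ballot: lanes that already
  // succeeded drop out of the search mask but keep converging with the rest.
  for (uint32_t mask = lane_mask; mask;
       mask = __ballot_sync(lane_mask, !result)) {
    // Rank of this lane among the lanes still searching, so each one probes
    // a distinct slot relative to a random start broadcast from lane 0.
    uint32_t id = __popc(mask & ((1u << lane) - 1));
    uint32_t start = __shfl_sync(lane_mask, xorshift32(state), 0);
    uint32_t candidate = (start + id) % 32;
    if (!result && (result = try_claim(candidate)))
      index = candidate;
  }
  if (lane == 0)
    printf("lane 0 claimed slot %u\n", index);
}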
@@ -334,17 +335,17 @@ template <typename T> struct GuardPtr {
                                       cpp::MemoryOrder::RELAXED,
                                       cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      T *mem = reinterpret_cast<T *>(impl::rpc_allocate(sizeof(T)));
+      void *mem = impl::rpc_allocate(sizeof(T));
       if (!mem)
         return nullptr;
       new (mem) T(cpp::forward<Args>(args)...);

       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
+      ptr.store(reinterpret_cast<T *>(mem), cpp::MemoryOrder::RELAXED);
       cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
       if (!ref.acquire(n, count))
         ref.reset(n, count);
-      return mem;
+      return reinterpret_cast<T *>(mem);
     }

     if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
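
The second hunk keeps the allocation as a raw void * until the object has been constructed, and only casts to T * when the pointer is published and returned. A standalone C++ sketch of that pattern follows; it is an assumption-laden illustration, with std::malloc standing in for impl::rpc_allocate, std::atomic with a release store standing in for the cpp:: atomics and fences, and allocate_raw and construct_and_publish as hypothetical names.

#include <atomic>
#include <cstdlib>
#include <new>
#include <utility>

// Stand-in for impl::rpc_allocate: anything that returns raw, untyped
// storage of at least `size` bytes.
static void *allocate_raw(std::size_t size) { return std::malloc(size); }

template <typename T, typename... Args>
T *construct_and_publish(std::atomic<T *> &ptr, Args &&...args) {
  // Keep the allocation as void *: no T object exists at this address yet,
  // so there is nothing meaningful to cast to.
  void *mem = allocate_raw(sizeof(T));
  if (!mem)
    return nullptr;
  // Placement-new constructs the T in the raw storage and yields the T *.
  T *obj = new (mem) T(std::forward<Args>(args)...);
  // Publish the typed pointer only after construction has finished.
  ptr.store(obj, std::memory_order_release);
  return obj;
}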