diff --git a/.gitattributes b/.gitattributes index 4ac00cad84a5..d0981b45e6a3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -72,6 +72,9 @@ tools/mantis2gh_stripped.csv typo.missing-header .depend typo.prune /.depend.menhir typo.prune +# These can be fixed at some point +/tools/*.py typo.long-line + # Makefiles may contain tabs Makefile* typo.makefile-whitespace=may @@ -183,6 +186,7 @@ tools/ocamlsize text eol=lf tools/pre-commit-githook text eol=lf tools/markdown-add-pr-links.sh text eol=lf runtime/caml/compatibility.h typo.long-line=may +runtime/caml/sizeclasses.h typo.missing-header typo.white-at-eol # These are all Perl scripts, so may not actually require this manual/tools/caml-tex text eol=lf diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 338a60b17093..e1cf4a516d50 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,91 +3,101 @@ name: Build on: [push, pull_request] jobs: - no-naked-pointers: - runs-on: ubuntu-latest + + build: + name: 'linux' + runs-on: 'ubuntu-latest' steps: - - name: Checkout - uses: actions/checkout@v2 - - name: configure tree - run: ./configure --disable-naked-pointers --disable-stdlib-manpages --disable-dependency-generation --enable-ocamltest - - name: Build - run: | - make -j world.opt - - name: Run the testsuite - run: | - make -C testsuite USE_RUNTIME=d all - i386-static: - runs-on: ubuntu-latest + - name: Checkout + uses: actions/checkout@v2 + - name: configure tree + run: | + MAKE_ARG=-j XARCH=x64 bash -xe tools/ci/actions/runner.sh configure + - name: Build + run: | + MAKE_ARG=-j bash -xe tools/ci/actions/runner.sh build + - name: Prepare Artifact + run: | + tar -czf /tmp/sources.tar.gz . 
+ - uses: actions/upload-artifact@v2 + with: + name: compiler + path: /tmp/sources.tar.gz + retention-days: 1 + + build-misc: + name: ${{ matrix.name }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - name: linux-O0 + os: ubuntu-latest + config_arg: CFLAGS='-O0' + - name: linux-debug + os: ubuntu-latest + env: OCAMLRUNPARAM=v=0,V=1 USE_RUNTIME=d + - name: macos + os: macos-latest steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Packages - run: | - sudo apt-get update -y && sudo apt-get install -y gcc-multilib gfortran-multilib - - name: configure tree - run: | - XARCH=i386 CONFIG_ARG='--disable-stdlib-manpages --disable-shared --enable-cmm-invariants' bash -xe tools/ci/actions/runner.sh configure - - name: Build - run: | - bash -xe tools/ci/actions/runner.sh build - - name: Run the testsuite - run: | - bash -xe tools/ci/actions/runner.sh test - - name: Install - run: | - bash -xe tools/ci/actions/runner.sh install - - name: Other checks - run: | - bash -xe tools/ci/actions/runner.sh other-checks - full-flambda: + - name: Checkout + uses: actions/checkout@v2 + - name: configure tree + run: | + CONFIG_ARG=${{ matrix.config_arg }} MAKE_ARG=-j XARCH=x64 bash -xe tools/ci/actions/runner.sh configure + - name: Build + run: | + MAKE_ARG=-j bash -xe tools/ci/actions/runner.sh build + - name: Run the testsuite + if: ${{ matrix.name != 'linux-O0' }} + run: | + bash -c 'SHOW_TIMINGS=1 tools/ci/actions/runner.sh test' + - name: Run the testsuite (linux-O0, parallel) + if: ${{ matrix.name == 'linux-O0' }} + env: + OCAMLRUNPARAM: v=0,V=1 + USE_RUNTIME: d + run: | + bash -xe tools/ci/actions/runner.sh test_multicore 1 "parallel" "lib-threads" "lib-systhreads" + + testsuite: + needs: build + # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#jobsjob_idneeds strategy: runs-on: ubuntu-latest + strategy: + matrix: + id: + - debug-s4096 + - taskset + - normal + - super steps: - - name: Checkout - uses: actions/checkout@v2 
- with: - fetch-depth: 50 - - name: Packages - run: | - sudo apt-get update -y && sudo apt-get install -y texlive-latex-extra texlive-fonts-recommended hevea sass - # Ensure that make distclean can be run from an empty tree - - name: distclean - run: | - MAKE_ARG=-j make distclean - - name: configure tree - run: | - MAKE_ARG=-j XARCH=x64 CONFIG_ARG='--enable-flambda --enable-cmm-invariants --enable-dependency-generation --enable-native-toplevel' OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh configure - - name: Build - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh build - - name: Run the testsuite - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh test - - name: Build API Documentation - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh api-docs - - name: Install - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh install - - name: Check for manual changes - id: manual - run: >- - tools/ci/actions/check-manual-modified.sh - '${{ github.ref }}' - '${{ github.event_name }}' - '${{ github.event.pull_request.base.ref }}' - '${{ github.event.pull_request.base.sha }}' - '${{ github.event.pull_request.head.ref }}' - '${{ github.event.pull_request.head.sha }}' - '${{ github.event.ref }}' - '${{ github.event.before }}' - '${{ github.event.ref }}' - '${{ github.event.after }}' - '${{ github.event.repository.full_name }}' - - name: Build the manual - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh manual - # Temporarily disabled 23-Apr-2021 while Dune isn't building - if: steps.manual.outputs.changed == 'disabled' - - name: Other checks - run: | - MAKE_ARG=-j OCAMLRUNPARAM=b,v=0 bash -xe tools/ci/actions/runner.sh other-checks + - uses: actions/download-artifact@v2 + with: + name: compiler + - name: Unpack Artifact + run: | + tar xf sources.tar.gz + - name: Run the testsuite + if: ${{ matrix.id == 'normal' }} + run: | + bash 
-xe tools/ci/actions/runner.sh test + - name: Run the testsuite (Super Charged) + if: ${{ matrix.id == 'super' }} + run: | + bash -xe tools/ci/actions/runner.sh test_multicore 3 "parallel" \ + "callback" "gc-roots" "lib-threads" "lib-systhreads" \ + "weak-ephe-final" + - name: Run the testsuite (s=4096, debug runtime) + env: + OCAMLRUNPARAM: s=4096,v=0 + USE_RUNTIME: d + if: ${{ matrix.id == 'debug-s4096' }} + run: | + bash -xe tools/ci/actions/runner.sh test_multicore 1 "parallel" \ + "lib-threads" "lib-systhreads" "weak-ephe-final" + - name: Run the testsuite (taskset -c 0) + if: ${{ matrix.id == 'taskset' }} + run: | + bash -xe tools/ci/actions/runner.sh test_multicore 1 "parallel" \ + "lib-threads" "lib-systhreads" "weak-ephe-final" diff --git a/Changes b/Changes index 2768fd17d703..4e81c55c08e9 100644 --- a/Changes +++ b/Changes @@ -4,6 +4,8 @@ Working version ### Language features: ### Runtime system: +- #10831: Multicore OCaml + (The multicore team, caml-devel and more) ### Code generation and optimizations: diff --git a/Makefile.config.in b/Makefile.config.in index eb3d85eb1df5..63bdc810a4dc 100644 --- a/Makefile.config.in +++ b/Makefile.config.in @@ -181,6 +181,7 @@ OTHERLIBRARIES=@otherlibraries@ # Needed for the "systhreads" package PTHREAD_LIBS=@PTHREAD_LIBS@ PTHREAD_CAML_LIBS=$(addprefix -cclib ,$(PTHREAD_LIBS)) +PTHREAD_CFLAGS=@PTHREAD_CFLAGS@ UNIX_OR_WIN32=@unix_or_win32@ UNIXLIB=@unixlib@ @@ -236,11 +237,15 @@ ASM_CFI_SUPPORTED=@asm_cfi_supported@ WITH_FRAME_POINTERS=@frame_pointers@ WITH_PROFINFO=@profinfo@ PROFINFO_WIDTH=@profinfo_width@ +LIBUNWIND_AVAILABLE=@libunwind_available@ +LIBUNWIND_INCLUDE_FLAGS=@libunwind_include_flags@ +LIBUNWIND_LINK_FLAGS=@libunwind_link_flags@ WITH_FPIC=@fpic@ TARGET=@target@ HOST=@host@ FLAMBDA=@flambda@ WITH_FLAMBDA_INVARIANTS=@flambda_invariants@ +FORCE_INSTRUMENTED_RUNTIME=@force_instrumented_runtime@ WITH_CMM_INVARIANTS=@cmm_invariants@ FORCE_SAFE_STRING=@force_safe_string@ 
DEFAULT_SAFE_STRING=@default_safe_string@ @@ -250,7 +255,7 @@ FLAT_FLOAT_ARRAY=@flat_float_array@ FUNCTION_SECTIONS=@function_sections@ AWK=@AWK@ STDLIB_MANPAGES=@stdlib_manpages@ -NAKED_POINTERS=@naked_pointers@ +NAKED_POINTERS=false ### Native command to build ocamlrun.exe diff --git a/api_docgen/Makefile.docfiles b/api_docgen/Makefile.docfiles index 14a3b1832208..09c24557b582 100644 --- a/api_docgen/Makefile.docfiles +++ b/api_docgen/Makefile.docfiles @@ -35,8 +35,8 @@ str_MLIS := str.mli unix_MLIS := unix.mli unixLabels.mli dynlink_MLIS := dynlink.mli thread_MLIS := \ - thread.mli condition.mli mutex.mli event.mli \ - threadUnix.mli semaphore.mli + thread.mli event.mli \ + threadUnix.mli STDLIB=$(filter-out stdlib__Pervasives, $(STDLIB_MODULES)) diff --git a/appveyor.yml b/appveyor.yml index 719be706760d..9da84eeeaba8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -31,16 +31,17 @@ environment: FORCE_CYGWIN_UPGRADE: 0 BUILD_MODE: world.opt matrix: - - PORT: mingw32 + - PORT: mingw64 BOOTSTRAP_FLEXDLL: true - - PORT: msvc64 - BOOTSTRAP_FLEXDLL: false - BUILD_MODE: steps - - PORT: msvc32 - BOOTSTRAP_FLEXDLL: false - BUILD_MODE: C - SDK: |- - "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x86 +# OCaml 5.00 does not yet support MSVC +# - PORT: msvc64 +# BOOTSTRAP_FLEXDLL: false +# BUILD_MODE: steps +# - PORT: msvc32 +# BOOTSTRAP_FLEXDLL: false +# BUILD_MODE: C +# SDK: |- +# "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x86 matrix: fast_finish: true diff --git a/asmcomp/CSEgen.ml b/asmcomp/CSEgen.ml index 2f54a35cc741..4275de4ac9df 100644 --- a/asmcomp/CSEgen.ml +++ b/asmcomp/CSEgen.ml @@ -236,9 +236,9 @@ method class_of_operation op = | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf | Ifloatofint | Iintoffloat -> Op_pure | Ispecific _ -> Op_other + | Idls_get -> Op_load Mutable (* Operations that are so cheap that it isn't worth factoring them. 
*) - method is_cheap_operation op = match op with | Iconst_int _ -> true diff --git a/asmcomp/afl_instrument.ml b/asmcomp/afl_instrument.ml index c493a2505ab3..3cd716fa64d3 100644 --- a/asmcomp/afl_instrument.ml +++ b/asmcomp/afl_instrument.ml @@ -42,13 +42,19 @@ let rec with_afl_logging b dbg = let afl_area = V.create_local "shared_mem" in let op oper args = Cop (oper, args, dbg) in Clet(VP.create afl_area, - op (Cload (Word_int, Asttypes.Mutable)) [afl_area_ptr dbg], - Clet(VP.create cur_pos, op Cxor [op (Cload (Word_int, Asttypes.Mutable)) + op (Cload ({memory_chunk=Word_int; + mutability=Asttypes.Mutable; + is_atomic=false})) [afl_area_ptr dbg], + Clet(VP.create cur_pos, op Cxor [op (Cload {memory_chunk=Word_int; + mutability=Asttypes.Mutable; + is_atomic=false}) [afl_prev_loc dbg]; Cconst_int (cur_location, dbg)], Csequence( op (Cstore(Byte_unsigned, Assignment)) [op Cadda [Cvar afl_area; Cvar cur_pos]; - op Cadda [op (Cload (Byte_unsigned, Asttypes.Mutable)) + op Cadda [op (Cload {memory_chunk=Byte_unsigned; + mutability=Asttypes.Mutable; + is_atomic=false}) [op Cadda [Cvar afl_area; Cvar cur_pos]]; Cconst_int (1, dbg)]], op (Cstore(Word_int, Assignment)) diff --git a/asmcomp/amd64/emit.mlp b/asmcomp/amd64/emit.mlp index eb38efbbe238..ef768b11d870 100644 --- a/asmcomp/amd64/emit.mlp +++ b/asmcomp/amd64/emit.mlp @@ -63,11 +63,22 @@ let cfi_endproc () = let cfi_adjust_cfa_offset n = if Config.asm_cfi_supported then D.cfi_adjust_cfa_offset n +let cfi_remember_state () = + if Config.asm_cfi_supported then D.cfi_remember_state () + +let cfi_restore_state () = + if Config.asm_cfi_supported then D.cfi_restore_state () + +let cfi_def_cfa_register reg = + if Config.asm_cfi_supported then D.cfi_def_cfa_register reg + let emit_debug_info dbg = emit_debug_info_gen dbg D.file D.loc let fp = Config.with_frame_pointers +let stack_threshold_size = Config.stack_threshold * 8 (* bytes *) + let frame_size env = (* includes return address *) if env.f.fun_frame_required then 
begin let sz = @@ -525,26 +536,35 @@ let emit_instr env fallthrough i = end end end - | Lop(Iextcall { func; alloc; }) -> + | Lop(Iextcall { func; alloc; stack_ofs }) -> add_used_symbol func; - if alloc then begin + let base_stack_size = + if Arch.win64 then + 32 (* Windows x64 rcx+rdx+r8+r9 shadow stack *) + else + 0 in + if stack_ofs > base_stack_size then begin + I.lea (mem64 QWORD base_stack_size RSP) r13; + I.lea (mem64 QWORD stack_ofs RSP) r12; + load_symbol_addr func rax; + emit_call "caml_c_call_stack_args"; + record_frame env i.live (Dbg_other i.dbg); + end else if alloc then begin load_symbol_addr func rax; emit_call "caml_c_call"; record_frame env i.live (Dbg_other i.dbg); - if system <> S_win64 then begin - - (* In amd64.S, "caml_c_call" tail-calls the C function (in order to - produce nicer backtraces), so we need to restore r15 manually after - it returns (note that this increases code size). - - In amd64nt.asm (used for Win64), "caml_c_call" invokes the C - function via a regular call, and restores r15 itself, thus avoiding - the code size increase. *) - - I.mov (domain_field Domainstate.Domain_young_ptr) r15 - end end else begin - emit_call func + I.mov rsp rbp; + cfi_remember_state (); + cfi_def_cfa_register "rbp"; + (* NB: gdb has asserts on contiguous stacks that mean it + will not unwind through this unless we were to tag this + calling frame with cfi_signal_frame in it's definition. 
*) + I.mov (domain_field Domainstate.Domain_c_stack) rsp; + + emit_call func; + I.mov rbp rsp; + cfi_restore_state (); end | Lop(Istackoffset n) -> if n < 0 @@ -720,6 +740,8 @@ let emit_instr env fallthrough i = I.movsxd (arg32 i 0) (res i 0) | Lop(Ispecific(Izextend32)) -> I.mov (arg32 i 0) (res32 i 0) + | Lop (Idls_get) -> + I.mov (domain_field Domainstate.Domain_dls_root) (res i 0) | Lreloadretaddr -> () | Lreturn -> @@ -820,12 +842,12 @@ let emit_instr env fallthrough i = load_label_addr lbl_handler r11; I.push r11; cfi_adjust_cfa_offset 8; - I.push (domain_field Domainstate.Domain_exception_pointer); + I.push (domain_field Domainstate.Domain_exn_handler); cfi_adjust_cfa_offset 8; - I.mov rsp (domain_field Domainstate.Domain_exception_pointer); + I.mov rsp (domain_field Domainstate.Domain_exn_handler); env.stack_offset <- env.stack_offset + 16; | Lpoptrap -> - I.pop (domain_field Domainstate.Domain_exception_pointer); + I.pop (domain_field Domainstate.Domain_exn_handler); cfi_adjust_cfa_offset (-8); I.add (int 8) rsp; cfi_adjust_cfa_offset (-8); @@ -833,15 +855,14 @@ let emit_instr env fallthrough i = | Lraise k -> begin match k with | Lambda.Raise_regular -> - I.mov (int 0) (domain_field Domainstate.Domain_backtrace_pos); emit_call "caml_raise_exn"; record_frame env Reg.Set.empty (Dbg_raise i.dbg) | Lambda.Raise_reraise -> - emit_call "caml_raise_exn"; + emit_call "caml_reraise_exn"; record_frame env Reg.Set.empty (Dbg_raise i.dbg) | Lambda.Raise_notrace -> - I.mov (domain_field Domainstate.Domain_exception_pointer) rsp; - I.pop (domain_field Domainstate.Domain_exception_pointer); + I.mov (domain_field Domainstate.Domain_exn_handler) rsp; + I.pop (domain_field Domainstate.Domain_exn_handler); I.pop r11; I.jmp r11 end @@ -855,6 +876,30 @@ let rec emit_all env fallthrough i = let all_functions = ref [] +type preproc_fun_result = + { max_stack_size : int; + contains_nontail_calls : bool } + +let preproc_fun env fun_body _fun_name = + let rec proc_instr r s i = + 
if i.desc = Lend then r else + let upd_size r delta = + {r with max_stack_size = max r.max_stack_size (s+delta)} + in + let (r',s') = match i.desc with + | Lop (Istackoffset n) -> (upd_size r n, s+n) + | Lpushtrap _ -> (upd_size r 16, s+16) + | Lpoptrap -> (r, s-16) + | Lop (Icall_ind | Icall_imm _ ) -> + ({r with contains_nontail_calls = true}, s) + | _ -> (r, s) + in + proc_instr r' s' i.next + in + let fs = frame_size env in + let r = {max_stack_size = fs; contains_nontail_calls = false} in + proc_instr r fs fun_body + (* Emission of a function declaration *) let fundecl fundecl = @@ -873,9 +918,38 @@ let fundecl fundecl = D.label (emit_symbol fundecl.fun_name); emit_debug_info fundecl.fun_dbg; cfi_startproc (); + if !Clflags.runtime_variant = "d" then + emit_call "caml_assert_stack_invariants"; + let { max_stack_size; contains_nontail_calls} = + preproc_fun env fundecl.fun_body fundecl.fun_name + in + let handle_overflow = ref None in + if contains_nontail_calls || max_stack_size >= stack_threshold_size then begin + let overflow = new_label () and ret = new_label () in + let threshold_offset = Domainstate.stack_ctx_words * 8 + stack_threshold_size in + I.lea (mem64 NONE (-(max_stack_size + threshold_offset)) RSP) r10; + I.cmp (domain_field Domainstate.Domain_current_stack) r10; + I.jb (label overflow); + def_label ret; + handle_overflow := Some (overflow, ret) + end; emit_all env true fundecl.fun_body; List.iter emit_call_gc env.call_gc_sites; emit_call_bound_errors env; + begin match !handle_overflow with + | None -> () + | Some (overflow,ret) -> begin + def_label overflow; + (* Pass the desired stack size on the stack, since all of the + argument-passing registers may be in use. 
+ Also serves to align the stack properly before the call *) + I.push (int (Config.stack_threshold + max_stack_size / 8)); + (* measured in words *) + emit_call "caml_call_realloc_stack"; + I.pop r10; (* ignored *) + I.jmp (label ret) + end + end; if fundecl.fun_frame_required then begin let n = (frame_size env) - 8 - (if fp then 8 else 0) in if n <> 0 diff --git a/asmcomp/amd64/proc.ml b/asmcomp/amd64/proc.ml index b7047a1ead98..64b05a95c84f 100644 --- a/asmcomp/amd64/proc.ml +++ b/asmcomp/amd64/proc.ml @@ -291,15 +291,18 @@ let regs_are_volatile _rs = false (* Registers destroyed by operations *) let destroyed_at_c_call = + (* C calling conventions preserve rbp, but it is clobbered + by the code sequence used for C calls in emit.mlp, so it + is marked as destroyed. *) if win64 then - (* Win64: rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 preserved *) + (* Win64: rbx, rsi, rdi, r12-r15, xmm6-xmm15 preserved *) Array.of_list(List.map phys_reg - [0;4;5;6;7;10;11; + [0;4;5;6;7;10;11;12; 100;101;102;103;104;105]) else - (* Unix: rbp, rbx, r12-r15 preserved *) + (* Unix: rbx, r12-r15 preserved *) Array.of_list(List.map phys_reg - [0;2;3;4;5;6;7;10;11; + [0;2;3;4;5;6;7;10;11;12; 100;101;102;103;104;105;106;107; 108;109;110;111;112;113;114;115]) @@ -310,9 +313,12 @@ let destroyed_at_alloc_or_poll = [| r11 |] let destroyed_at_oper = function - Iop(Icall_ind | Icall_imm _ | Iextcall { alloc = true; }) -> - all_phys_regs - | Iop(Iextcall { alloc = false; }) -> destroyed_at_c_call + Iop(Icall_ind | Icall_imm _) -> + all_phys_regs + | Iop(Iextcall {alloc; stack_ofs; }) -> + assert (stack_ofs >= 0); + if alloc || stack_ofs > 0 then all_phys_regs + else destroyed_at_c_call | Iop(Iintop(Idiv | Imod)) | Iop(Iintop_imm((Idiv | Imod), _)) -> [| rax; rdx |] | Iop(Istore(Single, _, _)) -> [| rxmm15 |] diff --git a/asmcomp/amd64/selection.ml b/asmcomp/amd64/selection.ml index 47e566b9f5ae..0ace071d1560 100644 --- a/asmcomp/amd64/selection.ml +++ b/asmcomp/amd64/selection.ml @@ -203,7 
+203,7 @@ method! select_operation op args dbg = self#select_floatarith false Idivf Ifloatdiv args | Cextcall("sqrt", _, _, false) -> begin match args with - [Cop(Cload ((Double as chunk), _), [loc], _dbg)] -> + [Cop(Cload {memory_chunk=(Double as chunk)}, [loc], _dbg)] -> let (addr, arg) = self#select_addressing chunk loc in (Ispecific(Ifloatsqrtf addr), [arg]) | [arg] -> @@ -251,11 +251,11 @@ method! select_operation op args dbg = method select_floatarith commutative regular_op mem_op args = match args with - [arg1; Cop(Cload ((Double as chunk), _), [loc2], _)] -> + [arg1; Cop(Cload {memory_chunk=(Double as chunk)}, [loc2], _)] -> let (addr, arg2) = self#select_addressing chunk loc2 in (Ispecific(Ifloatarithmem(mem_op, addr)), [arg1; arg2]) - | [Cop(Cload ((Double as chunk), _), [loc1], _); arg2] + | [Cop(Cload {memory_chunk=(Double as chunk)}, [loc1], _); arg2] when commutative -> let (addr, arg1) = self#select_addressing chunk loc1 in (Ispecific(Ifloatarithmem(mem_op, addr)), diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml index 12ea5808ca38..bff9a83a1111 100644 --- a/asmcomp/arm/selection.ml +++ b/asmcomp/arm/selection.ml @@ -53,6 +53,7 @@ exception Use_default let r1 = phys_reg 1 let r6 = phys_reg 6 let r7 = phys_reg 7 +let r12 = phys_reg 8 let pseudoregs_for_operation op arg res = match op with @@ -267,9 +268,11 @@ method private select_operation_softfp op args dbg = [Cop(Cextcall(func, typ_int, [XFloat;XFloat], false), args, dbg)]) (* Add coercions around loads and stores of 32-bit floats *) - | (Cload (Single, mut), args) -> + | (Cload {memory_chunk=Single; mutability; is_atomic=false}, args) -> (self#iextcall "__aeabi_f2d" typ_float [XInt], - [Cop(Cload (Word_int, mut), args, dbg)]) + [Cop(Cload {memory_chunk=Word_int; + mutability; + is_atomic=false}, args, dbg)]) | (Cstore (Single, init), [arg1; arg2]) -> let arg2' = Cop(Cextcall("__aeabi_d2f", typ_int, [XFloat], false), diff --git a/asmcomp/cmm.ml b/asmcomp/cmm.ml index 
ae62b4fd9d1d..fa7701296d35 100644 --- a/asmcomp/cmm.ml +++ b/asmcomp/cmm.ml @@ -150,7 +150,10 @@ type memory_chunk = and operation = Capply of machtype | Cextcall of string * machtype * exttype list * bool - | Cload of memory_chunk * Asttypes.mutable_flag + | Cload of + { memory_chunk: memory_chunk + ; mutability: Asttypes.mutable_flag + ; is_atomic: bool } | Calloc | Cstore of memory_chunk * Lambda.initialization_or_assignment | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi @@ -165,6 +168,7 @@ and operation = | Craise of Lambda.raise_kind | Ccheckbound | Copaque + | Cdls_get type expression = Cconst_int of int * Debuginfo.t diff --git a/asmcomp/cmm.mli b/asmcomp/cmm.mli index f37aef03e175..a92a60c04b4b 100644 --- a/asmcomp/cmm.mli +++ b/asmcomp/cmm.mli @@ -144,7 +144,10 @@ and operation = (** The [machtype] is the machine type of the result. The [exttype list] describes the unboxing types of the arguments. An empty list means "all arguments are machine words [XInt]". *) - | Cload of memory_chunk * Asttypes.mutable_flag + | Cload of + { memory_chunk: memory_chunk + ; mutability: Asttypes.mutable_flag + ; is_atomic: bool } | Calloc | Cstore of memory_chunk * Lambda.initialization_or_assignment | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi @@ -163,6 +166,7 @@ and operation = It results in a bounds error if the index is greater than or equal to the bound. *) | Copaque (* Sys.opaque_identity *) + | Cdls_get (** Every basic block should have a corresponding [Debuginfo.t] for its beginning. *) diff --git a/asmcomp/cmm_helpers.ml b/asmcomp/cmm_helpers.ml index 7ad42ceaea6a..4e0b39133839 100644 --- a/asmcomp/cmm_helpers.ml +++ b/asmcomp/cmm_helpers.ml @@ -569,9 +569,13 @@ let unbox_float dbg = | Some (Uconst_float x) -> Cconst_float (x, dbg) (* or keep _dbg? 
*) | _ -> - Cop(Cload (Double, Immutable), [cmm], dbg) + Cop(Cload {memory_chunk=Double; mutability=Immutable; + is_atomic=false}, + [cmm], dbg) end - | cmm -> Cop(Cload (Double, Immutable), [cmm], dbg) + | cmm -> Cop(Cload {memory_chunk=Double; mutability=Immutable; + is_atomic=false}, + [cmm], dbg) ) (* Complex *) @@ -579,10 +583,12 @@ let unbox_float dbg = let box_complex dbg c_re c_im = Cop(Calloc, [alloc_floatarray_header 2 dbg; c_re; c_im], dbg) -let complex_re c dbg = Cop(Cload (Double, Immutable), [c], dbg) -let complex_im c dbg = Cop(Cload (Double, Immutable), - [Cop(Cadda, [c; Cconst_int (size_float, dbg)], dbg)], - dbg) +let complex_re c dbg = + Cop(Cload {memory_chunk=Double; mutability=Immutable; is_atomic=false}, + [c], dbg) +let complex_im c dbg = + Cop(Cload {memory_chunk=Double; mutability=Immutable; is_atomic=false}, + [Cop(Cadda, [c; Cconst_int (size_float, dbg)], dbg)], dbg) (* Unit *) @@ -619,13 +625,17 @@ let rec remove_unit = function (* Access to block fields *) +let mk_load_mut memory_chunk = + Cload {memory_chunk; mutability=Mutable; is_atomic=false} + let field_address ptr n dbg = if n = 0 then ptr else Cop(Cadda, [ptr; Cconst_int(n * size_addr, dbg)], dbg) -let get_field_gen mut ptr n dbg = - Cop(Cload (Word_val, mut), [field_address ptr n dbg], dbg) +let get_field_gen mutability ptr n dbg = + Cop(Cload {memory_chunk=Word_val; mutability; is_atomic=false}, + [field_address ptr n dbg], dbg) let set_field ptr n newval init dbg = Cop(Cstore (Word_val, init), [field_address ptr n dbg; newval], dbg) @@ -638,7 +648,7 @@ let non_profinfo_mask = let get_header ptr dbg = (* We cannot deem this as [Immutable] due to the presence of [Obj.truncate] and [Obj.set_tag]. 
*) - Cop(Cload (Word_int, Mutable), + Cop(mk_load_mut Word_int, [Cop(Cadda, [ptr; Cconst_int(-size_int, dbg)], dbg)], dbg) let get_header_without_profinfo ptr dbg = @@ -655,7 +665,7 @@ let get_tag ptr dbg = Cop(Cand, [get_header ptr dbg; Cconst_int (255, dbg)], dbg) else (* If byte loads are efficient *) (* Same comment as [get_header] above *) - Cop(Cload (Byte_unsigned, Mutable), + Cop(mk_load_mut Byte_unsigned, [Cop(Cadda, [ptr; Cconst_int(tag_offset, dbg)], dbg)], dbg) let get_size ptr dbg = @@ -722,13 +732,13 @@ let array_indexing ?typ log2size ptr ofs dbg = Cconst_int((-1) lsl (log2size - 1), dbg)], dbg) let addr_array_ref arr ofs dbg = - Cop(Cload (Word_val, Mutable), + Cop(mk_load_mut Word_val, [array_indexing log2_size_addr arr ofs dbg], dbg) let int_array_ref arr ofs dbg = - Cop(Cload (Word_int, Mutable), + Cop(mk_load_mut Word_int, [array_indexing log2_size_addr arr ofs dbg], dbg) let unboxed_float_array_ref arr ofs dbg = - Cop(Cload (Double, Mutable), + Cop(mk_load_mut Double, [array_indexing log2_size_float arr ofs dbg], dbg) let float_array_ref arr ofs dbg = box_float dbg (unboxed_float_array_ref arr ofs dbg) @@ -763,11 +773,11 @@ let string_length exp dbg = dbg), Cop(Csubi, [Cvar tmp_var; - Cop(Cload (Byte_unsigned, Mutable), + Cop(mk_load_mut Byte_unsigned, [Cop(Cadda, [str; Cvar tmp_var], dbg)], dbg)], dbg))) let bigstring_length ba dbg = - Cop(Cload (Word_int, Mutable), [field_address ba 5 dbg], dbg) + Cop(mk_load_mut Word_int, [field_address ba 5 dbg], dbg) (* Message sending *) @@ -779,7 +789,7 @@ let lookup_tag obj tag dbg = let lookup_label obj lab dbg = bind "lab" lab (fun lab -> - let table = Cop (Cload (Word_val, Mutable), [obj], dbg) in + let table = Cop (mk_load_mut Word_val, [obj], dbg) in addr_array_ref table lab dbg) let call_cached_method obj tag cache pos args dbg = @@ -872,7 +882,7 @@ let bigarray_indexing unsafe elt_kind layout b args dbg = bind "idx" arg (fun idx -> (* Load the untagged int bound for the given dimension *) let 
bound = - Cop(Cload (Word_int, Mutable), + Cop(mk_load_mut Word_int, [field_address b dim_ofs dbg], dbg) in let idxn = untag_int idx dbg in @@ -883,7 +893,7 @@ let bigarray_indexing unsafe elt_kind layout b args dbg = let rem = ba_indexing (dim_ofs + delta_ofs) delta_ofs argl in (* Load the untagged int bound for the given dimension *) let bound = - Cop(Cload (Word_int, Mutable), + Cop(mk_load_mut Word_int, [field_address b dim_ofs dbg], dbg) in if unsafe then add_int (mul_int (decr_int rem dbg) bound dbg) arg1 dbg @@ -910,7 +920,7 @@ let bigarray_indexing unsafe elt_kind layout b args dbg = bigarray_elt_size elt_kind in (* [array_indexing] can simplify the given expressions *) array_indexing ~typ:Addr (Misc.log2 elt_size) - (Cop(Cload (Word_int, Mutable), + (Cop(mk_load_mut Word_int, [field_address b 1 dbg], dbg)) offset dbg let bigarray_word_kind : Lambda.bigarray_kind -> memory_chunk = function @@ -937,13 +947,13 @@ let bigarray_get unsafe elt_kind layout b args dbg = bind "addr" (bigarray_indexing unsafe elt_kind layout b args dbg) (fun addr -> bind "reval" - (Cop(Cload (kind, Mutable), [addr], dbg)) (fun reval -> + (Cop(mk_load_mut kind, [addr], dbg)) (fun reval -> bind "imval" - (Cop(Cload (kind, Mutable), + (Cop(mk_load_mut kind, [Cop(Cadda, [addr; Cconst_int (sz, dbg)], dbg)], dbg)) (fun imval -> box_complex dbg reval imval))) | _ -> - Cop(Cload (bigarray_word_kind elt_kind, Mutable), + Cop(mk_load_mut (bigarray_word_kind elt_kind), [bigarray_indexing unsafe elt_kind layout b args dbg], dbg)) @@ -1025,8 +1035,8 @@ let split_int64_for_32bit_target arg dbg = bind "split_int64" arg (fun arg -> let first = Cop (Cadda, [Cconst_int (size_int, dbg); arg], dbg) in let second = Cop (Cadda, [Cconst_int (2 * size_int, dbg); arg], dbg) in - Ctuple [Cop (Cload (Thirtytwo_unsigned, Mutable), [first], dbg); - Cop (Cload (Thirtytwo_unsigned, Mutable), [second], dbg)]) + Ctuple [Cop (mk_load_mut Thirtytwo_unsigned, [first], dbg); + Cop (mk_load_mut Thirtytwo_unsigned, 
[second], dbg)]) let alloc_matches_boxed_int bi ~hdr ~ops = match (bi : Primitive.boxed_integer), hdr, ops with @@ -1046,9 +1056,11 @@ let unbox_int dbg bi = if size_int = 4 && bi = Primitive.Pint64 then split_int64_for_32bit_target arg dbg else + let memory_chunk = if bi = Primitive.Pint32 + then Thirtytwo_signed else Word_int + in Cop( - Cload((if bi = Primitive.Pint32 then Thirtytwo_signed else Word_int), - Immutable), + Cload {memory_chunk; mutability=Immutable; is_atomic=false}, [Cop(Cadda, [arg; Cconst_int (size_addr, dbg)], dbg)], dbg) in map_tail @@ -1103,11 +1115,11 @@ let make_unsigned_int bi arg dbg = let unaligned_load_16 ptr idx dbg = if Arch.allow_unaligned_access - then Cop(Cload (Sixteen_unsigned, Mutable), [add_int ptr idx dbg], dbg) + then Cop(mk_load_mut Sixteen_unsigned, [add_int ptr idx dbg], dbg) else let cconst_int i = Cconst_int (i, dbg) in - let v1 = Cop(Cload (Byte_unsigned, Mutable), [add_int ptr idx dbg], dbg) in - let v2 = Cop(Cload (Byte_unsigned, Mutable), + let v1 = Cop(mk_load_mut Byte_unsigned, [add_int ptr idx dbg], dbg) in + let v2 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 1) dbg], dbg) in let b1, b2 = if Arch.big_endian then v1, v2 else v2, v1 in Cop(Cor, [lsl_int b1 (cconst_int 8) dbg; b2], dbg) @@ -1132,17 +1144,17 @@ let unaligned_set_16 ptr idx newval dbg = let unaligned_load_32 ptr idx dbg = if Arch.allow_unaligned_access - then Cop(Cload (Thirtytwo_unsigned, Mutable), [add_int ptr idx dbg], dbg) + then Cop(mk_load_mut Thirtytwo_unsigned, [add_int ptr idx dbg], dbg) else let cconst_int i = Cconst_int (i, dbg) in - let v1 = Cop(Cload (Byte_unsigned, Mutable), [add_int ptr idx dbg], dbg) in - let v2 = Cop(Cload (Byte_unsigned, Mutable), + let v1 = Cop(mk_load_mut Byte_unsigned, [add_int ptr idx dbg], dbg) in + let v2 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 1) dbg], dbg) in - let v3 = Cop(Cload (Byte_unsigned, Mutable), + let v3 = Cop(mk_load_mut 
Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 2) dbg], dbg) in - let v4 = Cop(Cload (Byte_unsigned, Mutable), + let v4 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 3) dbg], dbg) in let b1, b2, b3, b4 = @@ -1194,23 +1206,23 @@ let unaligned_set_32 ptr idx newval dbg = let unaligned_load_64 ptr idx dbg = assert(size_int = 8); if Arch.allow_unaligned_access - then Cop(Cload (Word_int, Mutable), [add_int ptr idx dbg], dbg) + then Cop(mk_load_mut Word_int, [add_int ptr idx dbg], dbg) else let cconst_int i = Cconst_int (i, dbg) in - let v1 = Cop(Cload (Byte_unsigned, Mutable), [add_int ptr idx dbg], dbg) in - let v2 = Cop(Cload (Byte_unsigned, Mutable), + let v1 = Cop(mk_load_mut Byte_unsigned, [add_int ptr idx dbg], dbg) in + let v2 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 1) dbg], dbg) in - let v3 = Cop(Cload (Byte_unsigned, Mutable), + let v3 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 2) dbg], dbg) in - let v4 = Cop(Cload (Byte_unsigned, Mutable), + let v4 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 3) dbg], dbg) in - let v5 = Cop(Cload (Byte_unsigned, Mutable), + let v5 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 4) dbg], dbg) in - let v6 = Cop(Cload (Byte_unsigned, Mutable), + let v6 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 5) dbg], dbg) in - let v7 = Cop(Cload (Byte_unsigned, Mutable), + let v7 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 6) dbg], dbg) in - let v8 = Cop(Cload (Byte_unsigned, Mutable), + let v8 = Cop(mk_load_mut Byte_unsigned, [add_int (add_int ptr idx dbg) (cconst_int 7) dbg], dbg) in let b1, b2, b3, b4, b5, b6, b7, b8 = if Arch.big_endian @@ -1738,7 +1750,7 @@ let cache_public_method meths tag cache dbg = Clet_mut ( VP.create li, typ_int, cconst_int 3, Clet_mut ( - VP.create hi, typ_int, Cop(Cload (Word_int, 
Mutable), [meths], dbg), + VP.create hi, typ_int, Cop(mk_load_mut Word_int, [meths], dbg), Csequence( ccatch (raise_num, [], @@ -1754,7 +1766,7 @@ let cache_public_method meths tag cache dbg = Cifthenelse (Cop (Ccmpi Clt, [tag; - Cop(Cload (Word_int, Mutable), + Cop(mk_load_mut Word_int, [Cop(Cadda, [meths; lsl_const (Cvar mi) log2_size_addr dbg], dbg)], @@ -1845,12 +1857,12 @@ let send_function arity = let cached_pos = Cvar cached in let tag_pos = Cop(Cadda, [Cop (Cadda, [cached_pos; Cvar meths], dbg ()); cconst_int(3*size_addr-1)], dbg ()) in - let tag' = Cop(Cload (Word_int, Mutable), [tag_pos], dbg ()) in + let tag' = Cop(mk_load_mut Word_int, [tag_pos], dbg ()) in Clet ( - VP.create meths, Cop(Cload (Word_val, Mutable), [obj], dbg ()), + VP.create meths, Cop(mk_load_mut Word_val, [obj], dbg ()), Clet ( VP.create cached, - Cop(Cand, [Cop(Cload (Word_int, Mutable), [cache], dbg ()); mask], + Cop(Cand, [Cop(mk_load_mut Word_int, [cache], dbg ()); mask], dbg ()), Clet ( VP.create real, @@ -1860,7 +1872,7 @@ let send_function arity = dbg (), cached_pos, dbg ()), - Cop(Cload (Word_val, Mutable), + Cop(mk_load_mut Word_val, [Cop(Cadda, [Cop (Cadda, [Cvar real; Cvar meths], dbg ()); cconst_int(2*size_addr-1)], dbg ())], dbg ())))) @@ -2106,7 +2118,7 @@ let generic_functions shared units = type unary_primitive = expression -> Debuginfo.t -> expression let floatfield n ptr dbg = - Cop(Cload (Double, Mutable), + Cop(mk_load_mut Double, [if n = 0 then ptr else Cop(Cadda, [ptr; Cconst_int(n * size_float, dbg)], dbg)], dbg) @@ -2131,7 +2143,7 @@ let offsetref n arg dbg = (bind "ref" arg (fun arg -> Cop(Cstore (Word_int, Assignment), [arg; - add_const (Cop(Cload (Word_int, Mutable), [arg], dbg)) + add_const (Cop(mk_load_mut Word_int, [arg], dbg)) (n lsl 1) dbg], dbg))) @@ -2188,18 +2200,19 @@ let assignment_kind (init: Lambda.initialization_or_assignment) = match init, ptr with | Assignment, Pointer -> Caml_modify - | Heap_initialization, Pointer -> Caml_initialize + | 
Heap_initialization, Pointer + | Root_initialization, Pointer -> Caml_initialize | Assignment, Immediate | Heap_initialization, Immediate - | Root_initialization, (Immediate | Pointer) -> Simple + | Root_initialization, Immediate -> Simple let setfield n ptr init arg1 arg2 dbg = match assignment_kind ptr init with | Caml_modify -> return_unit dbg (Cop(Cextcall("caml_modify", typ_void, [], false), - [field_address arg1 n dbg; arg2], - dbg)) + [field_address arg1 n dbg; arg2], + dbg)) | Caml_initialize -> return_unit dbg (Cop(Cextcall("caml_initialize", typ_void, [], false), @@ -2278,7 +2291,7 @@ let int_comp_caml cmp arg1 arg2 dbg = [arg1; arg2], dbg)) dbg let stringref_unsafe arg1 arg2 dbg = - tag_int(Cop(Cload (Byte_unsigned, Mutable), + tag_int(Cop(mk_load_mut Byte_unsigned, [add_int arg1 (untag_int arg2 dbg) dbg], dbg)) dbg @@ -2288,7 +2301,7 @@ let stringref_safe arg1 arg2 dbg = bind "str" arg1 (fun str -> Csequence( make_checkbound dbg [string_length str dbg; idx], - Cop(Cload (Byte_unsigned, Mutable), + Cop(mk_load_mut Byte_unsigned, [add_int str idx dbg], dbg))))) dbg let string_load size unsafe arg1 arg2 dbg = @@ -2301,10 +2314,10 @@ let string_load size unsafe arg1 arg2 dbg = let bigstring_load size unsafe arg1 arg2 dbg = box_sized size dbg - (bind "index" (untag_int arg2 dbg) (fun idx -> - bind "ba" arg1 (fun ba -> - bind "ba_data" - (Cop(Cload (Word_int, Mutable), [field_address ba 1 dbg], dbg)) + (bind "index" (untag_int arg2 dbg) (fun idx -> + bind "ba" arg1 (fun ba -> + bind "ba_data" + (Cop(mk_load_mut Word_int, [field_address ba 1 dbg], dbg)) (fun ba_data -> check_bound unsafe size dbg (bigstring_length ba dbg) @@ -2509,7 +2522,7 @@ let bigstring_set size unsafe arg1 arg2 arg3 dbg = bind "index" (untag_int arg2 dbg) (fun idx -> bind "ba" arg1 (fun ba -> bind "ba_data" - (Cop(Cload (Word_int, Mutable), [field_address ba 1 dbg], dbg)) + (Cop(mk_load_mut Word_int, [field_address ba 1 dbg], dbg)) (fun ba_data -> check_bound unsafe size dbg 
(bigstring_length ba dbg) idx (unaligned_set size ba_data idx newval dbg)))))) @@ -2585,7 +2598,7 @@ let entry_point namelist = let incr_global_inited () = Cop(Cstore (Word_int, Assignment), [cconst_symbol "caml_globals_inited"; - Cop(Caddi, [Cop(Cload (Word_int, Mutable), + Cop(Caddi, [Cop(mk_load_mut Word_int, [cconst_symbol "caml_globals_inited"], dbg ()); cconst_int 1], dbg ())], dbg ()) in let body = diff --git a/asmcomp/cmm_helpers.mli b/asmcomp/cmm_helpers.mli index 4fe47dd7a90a..3c87e9c0d3c6 100644 --- a/asmcomp/cmm_helpers.mli +++ b/asmcomp/cmm_helpers.mli @@ -188,6 +188,7 @@ val return_unit : Debuginfo.t -> expression -> expression val remove_unit : expression -> expression (** Blocks *) +val mk_load_mut : memory_chunk -> operation (** [field_address ptr n dbg] returns an expression for the address of the [n]th field of the block pointed to by [ptr] *) diff --git a/asmcomp/cmmgen.ml b/asmcomp/cmmgen.ml index a004702da1f4..9c660de7a84d 100644 --- a/asmcomp/cmmgen.ml +++ b/asmcomp/cmmgen.ml @@ -521,7 +521,7 @@ let rec transl env e = dbg) | (Pbigarraydim(n), [b]) -> let dim_ofs = 4 + n in - tag_int (Cop(Cload (Word_int, Mutable), + tag_int (Cop(mk_load_mut Word_int, [field_address (transl env b) dim_ofs dbg], dbg)) dbg | (p, [arg]) -> @@ -537,6 +537,10 @@ let rec transl env e = -> fatal_error "Cmmgen.transl:prim, wrong arity" | ((Pfield_computed|Psequand + | Prunstack | Pperform | Presume | Preperform + | Pdls_get + | Patomic_load _ | Patomic_exchange + | Patomic_cas | Patomic_fetch_add | Psequor | Pnot | Pnegint | Paddint | Psubint | Pmulint | Pandint | Porint | Pxorint | Plslint | Plsrint | Pasrint | Pintoffloat | Pfloatofint @@ -679,7 +683,7 @@ let rec transl env e = end | Uunreachable -> let dbg = Debuginfo.none in - Cop(Cload (Word_int, Mutable), [Cconst_int (0, dbg)], dbg) + Cop(mk_load_mut Word_int, [Cconst_int (0, dbg)], dbg) and transl_catch env nfail ids body handler dbg = let ids = List.map (fun (id, kind) -> (id, kind, ref No_result)) ids in @@ 
-795,7 +799,7 @@ and transl_prim_1 env p arg dbg = Popaque -> opaque (transl env arg) dbg (* Heap operations *) - | Pfield n -> + | Pfield(n, _, _) -> get_field env (transl env arg) n dbg | Pfloatfield n -> let ptr = transl env arg in @@ -852,7 +856,20 @@ and transl_prim_1 env p arg dbg = | Pbswap16 -> tag_int (bswap16 (ignore_high_bit_int (untag_int (transl env arg) dbg)) dbg) dbg + | Pperform -> + let cont = make_alloc dbg Obj.cont_tag [int_const dbg 0] in + Cop(Capply typ_val, + [Cconst_symbol ("caml_perform", dbg); transl env arg; cont], + dbg) + | Pdls_get -> + Cop(Cdls_get, [transl env arg], dbg) + | Patomic_load {immediate_or_pointer = Immediate} -> + Cop(mk_load_mut Word_int, [transl env arg], dbg) + | Patomic_load {immediate_or_pointer = Pointer} -> + Cop(mk_load_mut Word_val, [transl env arg], dbg) | (Pfield_computed | Psequand | Psequor + | Prunstack | Presume | Preperform + | Patomic_exchange | Patomic_cas | Patomic_fetch_add | Paddint | Psubint | Pmulint | Pandint | Porint | Pxorint | Plslint | Plsrint | Pasrint | Paddfloat | Psubfloat | Pmulfloat | Pdivfloat @@ -1036,6 +1053,14 @@ and transl_prim_2 env p arg1 arg2 dbg = tag_int (Cop(Ccmpi cmp, [transl_unbox_int dbg env bi arg1; transl_unbox_int dbg env bi arg2], dbg)) dbg + | Patomic_exchange -> + Cop (Cextcall ("caml_atomic_exchange", typ_val, [], false), + [transl env arg1; transl env arg2], dbg) + | Patomic_fetch_add -> + Cop (Cextcall ("caml_atomic_fetch_add", typ_int, [], false), + [transl env arg1; transl env arg2], dbg) + | Prunstack | Pperform | Presume | Preperform | Pdls_get + | Patomic_cas | Patomic_load _ | Pnot | Pnegint | Pintoffloat | Pfloatofint | Pnegfloat | Pabsfloat | Pstringlength | Pbyteslength | Pbytessetu | Pbytessets | Pisint | Pbswap16 | Pint_as_pointer | Popaque | Pread_symbol _ @@ -1087,6 +1112,31 @@ and transl_prim_3 env p arg1 arg2 arg3 dbg = bigstring_set size unsafe (transl env arg1) (transl env arg2) (transl_unbox_sized size dbg env arg3) dbg + | Patomic_cas -> + Cop 
(Cextcall ("caml_atomic_cas", typ_int, [], false), + [transl env arg1; transl env arg2; transl env arg3], dbg) + + (* Effects *) + | Presume -> + Cop (Capply typ_val, + [Cconst_symbol ("caml_resume", dbg); + transl env arg1; transl env arg2; transl env arg3], + dbg) + + | Prunstack -> + Cop (Capply typ_val, + [Cconst_symbol ("caml_runstack", dbg); + transl env arg1; transl env arg2; transl env arg3], + dbg) + + | Preperform -> + Cop (Capply typ_val, + [Cconst_symbol ("caml_reperform", dbg); + transl env arg1; transl env arg2; transl env arg3], + dbg) + + | Pperform | Pdls_get + | Patomic_exchange | Patomic_fetch_add | Patomic_load _ | Pfield_computed | Psequand | Psequor | Pnot | Pnegint | Paddint | Psubint | Pmulint | Pandint | Porint | Pxorint | Plslint | Plsrint | Pasrint | Pintoffloat | Pfloatofint | Pnegfloat | Pabsfloat | Paddfloat | Psubfloat diff --git a/asmcomp/emitaux.ml b/asmcomp/emitaux.ml index d3587c1a6ef6..10d2ab9d1c78 100644 --- a/asmcomp/emitaux.ml +++ b/asmcomp/emitaux.ml @@ -312,12 +312,25 @@ let cfi_endproc () = if is_cfi_enabled () then emit_string "\t.cfi_endproc\n" +let cfi_remember_state () = + if is_cfi_enabled () then + emit_string "\t.cfi_remember_state\n" + +let cfi_restore_state () = + if is_cfi_enabled () then + emit_string "\t.cfi_restore_state\n" + let cfi_adjust_cfa_offset n = if is_cfi_enabled () then begin emit_string "\t.cfi_adjust_cfa_offset\t"; emit_int n; emit_string "\n"; end +let cfi_def_cfa_offset n = + if is_cfi_enabled () then begin + emit_string "\t.cfi_def_cfa_offset\t"; emit_int n; emit_string "\n"; + end + let cfi_offset ~reg ~offset = if is_cfi_enabled () then begin emit_string "\t.cfi_offset "; diff --git a/asmcomp/emitaux.mli b/asmcomp/emitaux.mli index df0b0197ab59..22100f196988 100644 --- a/asmcomp/emitaux.mli +++ b/asmcomp/emitaux.mli @@ -70,6 +70,9 @@ val cfi_startproc : unit -> unit val cfi_endproc : unit -> unit val cfi_adjust_cfa_offset : int -> unit val cfi_offset : reg:int -> offset:int -> unit +val 
cfi_def_cfa_offset : int -> unit +val cfi_remember_state : unit -> unit +val cfi_restore_state : unit -> unit val binary_backend_available: bool ref (** Is a binary backend available. If yes, we don't need diff --git a/asmcomp/linear.ml b/asmcomp/linear.ml index 1773f4d436f6..ec2fbe9939ab 100644 --- a/asmcomp/linear.ml +++ b/asmcomp/linear.ml @@ -49,6 +49,7 @@ let has_fallthrough = function type fundecl = { fun_name: string; + fun_args: Reg.Set.t; fun_body: instruction; fun_fast: bool; fun_dbg : Debuginfo.t; diff --git a/asmcomp/linear.mli b/asmcomp/linear.mli index 2f52c209548d..ad5647693490 100644 --- a/asmcomp/linear.mli +++ b/asmcomp/linear.mli @@ -50,6 +50,7 @@ val invert_test: Mach.test -> Mach.test type fundecl = { fun_name: string; + fun_args: Reg.Set.t; fun_body: instruction; fun_fast: bool; fun_dbg : Debuginfo.t; diff --git a/asmcomp/linearize.ml b/asmcomp/linearize.ml index bf983ae061ea..1d2b3fab4b4e 100644 --- a/asmcomp/linearize.ml +++ b/asmcomp/linearize.ml @@ -327,6 +327,7 @@ let fundecl f = fun_prologue_required in { fun_name = f.Mach.fun_name; + fun_args = Reg.set_of_array f.Mach.fun_args; fun_body; fun_fast = not (List.mem Cmm.Reduce_code_size f.Mach.fun_codegen_options); fun_dbg = f.Mach.fun_dbg; diff --git a/asmcomp/mach.ml b/asmcomp/mach.ml index f3a43e2978a5..803b079d6540 100644 --- a/asmcomp/mach.ml +++ b/asmcomp/mach.ml @@ -49,7 +49,8 @@ type operation = | Itailcall_imm of { func : string; } | Iextcall of { func : string; ty_res : Cmm.machtype; ty_args : Cmm.exttype list; - alloc : bool; } + alloc : bool; + stack_ofs : int; } | Istackoffset of int | Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag | Istore of Cmm.memory_chunk * Arch.addressing_mode * bool @@ -61,6 +62,7 @@ type operation = | Iopaque | Ispecific of Arch.specific_operation | Ipoll of { return_label: Cmm.label option } + | Idls_get type instruction = { desc: instruction_desc; @@ -148,6 +150,7 @@ let rec instr_iter f i = let operation_is_pure = function | 
Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ | Iextcall _ | Istackoffset _ | Istore _ | Ialloc _ | Ipoll _ + | Idls_get | Iintop(Icheckbound) | Iintop_imm(Icheckbound, _) | Iopaque -> false | Ispecific sop -> Arch.operation_is_pure sop | _ -> true diff --git a/asmcomp/mach.mli b/asmcomp/mach.mli index 866a21fe2c50..a530c7df6f14 100644 --- a/asmcomp/mach.mli +++ b/asmcomp/mach.mli @@ -49,7 +49,8 @@ type operation = | Itailcall_imm of { func : string; } | Iextcall of { func : string; ty_res : Cmm.machtype; ty_args : Cmm.exttype list; - alloc : bool; } + alloc : bool; + stack_ofs : int; } | Istackoffset of int | Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag | Istore of Cmm.memory_chunk * Arch.addressing_mode * bool @@ -62,6 +63,7 @@ type operation = | Iopaque | Ispecific of Arch.specific_operation | Ipoll of { return_label: Cmm.label option } + | Idls_get type instruction = { desc: instruction_desc; diff --git a/asmcomp/polling.ml b/asmcomp/polling.ml index b2efb27dcfa8..c8ca125ac6b7 100644 --- a/asmcomp/polling.ml +++ b/asmcomp/polling.ml @@ -259,7 +259,7 @@ let find_poll_alloc_or_calls instr = Iconst_symbol _ | Iextcall { alloc = false } | Istackoffset _ | Iload _ | Istore _ | Iintop _ | Iintop_imm _ | Ifloatofint | Iintoffloat | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf | - Iopaque | Ispecific _)-> None + Iopaque | Ispecific _ | Idls_get) -> None | Iend | Ireturn | Iifthenelse _ | Iswitch _ | Icatch _ | Iexit _ | Itrywith _ | Iraise _ -> None in diff --git a/asmcomp/printcmm.ml b/asmcomp/printcmm.ml index b36a912c22bd..078f92e5e9ab 100644 --- a/asmcomp/printcmm.ml +++ b/asmcomp/printcmm.ml @@ -117,8 +117,10 @@ let operation d = function | Capply _ty -> "app" ^ location d | Cextcall(lbl, _ty_res, _ty_args, _alloc) -> Printf.sprintf "extcall \"%s\"%s" lbl (location d) - | Cload (c, Asttypes.Immutable) -> Printf.sprintf "load %s" (chunk c) - | Cload (c, Asttypes.Mutable) -> Printf.sprintf "load_mut %s" (chunk c) + | Cload 
{memory_chunk; mutability} -> ( + match mutability with + | Asttypes.Immutable -> Printf.sprintf "load %s" (chunk memory_chunk) + | Asttypes.Mutable -> Printf.sprintf "load_mut %s" (chunk memory_chunk)) | Calloc -> "alloc" ^ location d | Cstore (c, init) -> let init = @@ -156,6 +158,7 @@ let operation d = function | Craise k -> Lambda.raise_kind k ^ location d | Ccheckbound -> "checkbound" ^ location d | Copaque -> "opaque" + | Cdls_get -> "dls_get" let rec expr ppf = function | Cconst_int (n, _dbg) -> fprintf ppf "%i" n diff --git a/asmcomp/printmach.ml b/asmcomp/printmach.ml index 656f95118532..f87b329d60b9 100644 --- a/asmcomp/printmach.ml +++ b/asmcomp/printmach.ml @@ -150,6 +150,7 @@ let operation op arg ppf res = | Iopaque -> fprintf ppf "opaque %a" reg arg.(0) | Ispecific op -> Arch.print_specific_operation reg op ppf arg + | Idls_get -> fprintf ppf "dls_get" | Ipoll { return_label } -> fprintf ppf "poll call"; match return_label with diff --git a/asmcomp/schedgen.ml b/asmcomp/schedgen.ml index 942e35e8bac3..9e6a926c6c44 100644 --- a/asmcomp/schedgen.ml +++ b/asmcomp/schedgen.ml @@ -388,6 +388,7 @@ method schedule_fundecl f = let new_body = schedule f.fun_body 0 in clear_code_dag(); { fun_name = f.fun_name; + fun_args = f.fun_args; fun_body = new_body; fun_fast = f.fun_fast; fun_dbg = f.fun_dbg; diff --git a/asmcomp/selectgen.ml b/asmcomp/selectgen.ml index 6a56cc2a2769..d3be5f682087 100644 --- a/asmcomp/selectgen.ml +++ b/asmcomp/selectgen.ml @@ -67,14 +67,15 @@ let env_empty = { let oper_result_type = function Capply ty -> ty | Cextcall(_s, ty_res, _ty_args, _alloc) -> ty_res - | Cload (c, _) -> - begin match c with + | Cload {memory_chunk} -> + begin match memory_chunk with | Word_val -> typ_val | Single | Double -> typ_float | _ -> typ_int end | Calloc -> typ_val | Cstore (_c, _) -> typ_void + | Cdls_get -> typ_val | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi | Cand | Cor | Cxor | Clsl | Clsr | Casr | Ccmpi _ | Ccmpa _ | Ccmpf _ -> typ_int @@ -328,7 
+329,8 @@ method is_simple_expr = function | Cload _ | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi | Cand | Cor | Cxor | Clsl | Clsr | Casr | Ccmpi _ | Caddv | Cadda | Ccmpa _ | Cnegf | Cabsf | Caddf | Csubf | Cmulf | Cdivf | Cfloatofint | Cintoffloat - | Ccmpf _ | Ccheckbound -> List.for_all self#is_simple_expr args + | Ccmpf _ | Ccheckbound | Cdls_get -> + List.for_all self#is_simple_expr args end | Cassign _ | Cifthenelse _ | Cswitch _ | Ccatch _ | Cexit _ | Ctrywith _ -> false @@ -366,8 +368,9 @@ method effects_of exp = | Calloc -> EC.none | Cstore _ -> EC.effect_only Effect.Arbitrary | Craise _ | Ccheckbound -> EC.effect_only Effect.Raise - | Cload (_, Asttypes.Immutable) -> EC.none - | Cload (_, Asttypes.Mutable) -> EC.coeffect_only Coeffect.Read_mutable + | Cload {mutability = Asttypes.Immutable} -> EC.none + | Cload {mutability = Asttypes.Mutable} | Cdls_get -> + EC.coeffect_only Coeffect.Read_mutable | Caddi | Csubi | Cmuli | Cmulhi | Cdivi | Cmodi | Cand | Cor | Cxor | Clsl | Clsr | Casr | Ccmpi _ | Caddv | Cadda | Ccmpa _ | Cnegf | Cabsf | Caddf | Csubf | Cmulf | Cdivf | Cfloatofint | Cintoffloat | Ccmpf _ -> @@ -425,8 +428,8 @@ method mark_instr = function | Lambda.Raise_regular | Lambda.Raise_reraise -> (* PR#6239 *) - (* caml_stash_backtrace; we #mark_call rather than - #mark_c_tailcall to get a good stack backtrace *) + (* caml_stash_backtrace; we #mark_call rather than + #mark_c_tailcall to get a good stack backtrace *) self#mark_call end | Itrywith _ -> @@ -442,10 +445,10 @@ method select_operation op args _dbg = | (Capply _, _) -> (Icall_ind, args) | (Cextcall(func, ty_res, ty_args, alloc), _) -> - Iextcall { func; ty_res; ty_args; alloc; }, args - | (Cload (chunk, mut), [arg]) -> - let (addr, eloc) = self#select_addressing chunk arg in - (Iload(chunk, addr, mut), [eloc]) + Iextcall { func; alloc; ty_res; ty_args; stack_ofs = -1}, args + | (Cload {memory_chunk; mutability}, [arg]) -> + let (addr, eloc) = self#select_addressing memory_chunk arg 
in + (Iload(memory_chunk, addr, mutability), [eloc]) | (Cstore (chunk, init), [arg1; arg2]) -> let (addr, eloc) = self#select_addressing chunk arg1 in let is_assign = @@ -461,6 +464,7 @@ method select_operation op args _dbg = (Istore(chunk, addr, is_assign), [arg2; eloc]) (* Inversion addr/datum in Istore *) end + | (Cdls_get, _) -> Idls_get, args | (Calloc, _) -> (Ialloc {bytes = 0; dbginfo = []}), args | (Caddi, _) -> self#select_arith_comm Iadd args | (Csubi, _) -> self#select_arith Isub args @@ -712,12 +716,13 @@ method emit_expr (env:environment) exp = self#insert_debug env (Iop new_op) dbg loc_arg loc_res; self#insert_move_results env loc_res rd stack_ofs; Some rd - | Iextcall { ty_args; _} -> + | Iextcall r -> let (loc_arg, stack_ofs) = - self#emit_extcall_args env ty_args new_args in + self#emit_extcall_args env r.ty_args new_args in let rd = self#regs_for ty in let loc_res = - self#insert_op_debug env new_op dbg + self#insert_op_debug env + (Iextcall {r with stack_ofs = stack_ofs}) dbg loc_arg (Proc.loc_external_results (Reg.typv rd)) in self#insert_move_results env loc_res rd stack_ofs; Some rd diff --git a/asmcomp/strmatch.ml b/asmcomp/strmatch.ml index 63d8407e57c4..f92547a38a55 100644 --- a/asmcomp/strmatch.ml +++ b/asmcomp/strmatch.ml @@ -76,7 +76,9 @@ module Make(I:I) = struct let mk_let_cell id str ind body = let dbg = Debuginfo.none in let cell = - Cop(Cload (Word_int, Asttypes.Mutable), + Cop(Cload {memory_chunk=Word_int; + mutability=Asttypes.Mutable; + is_atomic=false}, [Cop(Cadda,[str;Cconst_int(Arch.size_int*ind, dbg)], dbg)], dbg) in Clet(id, cell, body) diff --git a/asmcomp/x86_ast.mli b/asmcomp/x86_ast.mli index 96d87bebb04f..d3355bdf5b48 100644 --- a/asmcomp/x86_ast.mli +++ b/asmcomp/x86_ast.mli @@ -208,6 +208,10 @@ type asm_line = | Cfi_adjust_cfa_offset of int | Cfi_endproc | Cfi_startproc + | Cfi_remember_state + | Cfi_restore_state + | Cfi_def_cfa_register of string + | Cfi_def_cfa_offset of int | File of int * string (* (file_num, 
file_name) *) | Indirect_symbol of string | Loc of int * int * int (* (file_num, line, col) *) diff --git a/asmcomp/x86_dsl.ml b/asmcomp/x86_dsl.ml index e647f66c67ac..18fecddf248d 100644 --- a/asmcomp/x86_dsl.ml +++ b/asmcomp/x86_dsl.ml @@ -48,8 +48,10 @@ let ah = Reg8H AH let cl = Reg8L RCX let ax = Reg16 RAX let rax = Reg64 RAX +let rdx = Reg64 RDX let r10 = Reg64 R10 let r11 = Reg64 R11 +let r12 = Reg64 R12 let r13 = Reg64 R13 let r14 = Reg64 R14 let r15 = Reg64 R15 @@ -84,6 +86,10 @@ module D = struct let cfi_adjust_cfa_offset n = directive (Cfi_adjust_cfa_offset n) let cfi_endproc () = directive Cfi_endproc let cfi_startproc () = directive Cfi_startproc + let cfi_remember_state () = directive Cfi_remember_state + let cfi_restore_state () = directive Cfi_restore_state + let cfi_def_cfa_register reg = directive (Cfi_def_cfa_register reg) + let cfi_def_cfa_offset n = directive (Cfi_def_cfa_offset n) let comment s = directive (Comment s) let data () = section [ ".data" ] None [] let extrn s ptr = directive (External (s, ptr)) diff --git a/asmcomp/x86_dsl.mli b/asmcomp/x86_dsl.mli index 080331fcee2f..c587e8fac11c 100644 --- a/asmcomp/x86_dsl.mli +++ b/asmcomp/x86_dsl.mli @@ -37,8 +37,10 @@ val ah: arg val cl: arg val ax: arg val rax: arg +val rdx: arg val r10: arg val r11: arg +val r12: arg val r13: arg val r14: arg val r15: arg @@ -74,6 +76,10 @@ module D : sig val cfi_adjust_cfa_offset: int -> unit val cfi_endproc: unit -> unit val cfi_startproc: unit -> unit + val cfi_remember_state: unit -> unit + val cfi_restore_state: unit -> unit + val cfi_def_cfa_register: string -> unit + val cfi_def_cfa_offset: int -> unit val comment: string -> unit val data: unit -> unit val extrn: string -> data_type -> unit diff --git a/asmcomp/x86_gas.ml b/asmcomp/x86_gas.ml index 6d2363a76bf7..4576e2a5123c 100644 --- a/asmcomp/x86_gas.ml +++ b/asmcomp/x86_gas.ml @@ -279,6 +279,10 @@ let print_line b = function | Cfi_adjust_cfa_offset n -> bprintf b "\t.cfi_adjust_cfa_offset %d" n | 
Cfi_endproc -> bprintf b "\t.cfi_endproc" | Cfi_startproc -> bprintf b "\t.cfi_startproc" + | Cfi_remember_state -> bprintf b "\t.cfi_remember_state" + | Cfi_restore_state -> bprintf b "\t.cfi_restore_state" + | Cfi_def_cfa_register reg -> bprintf b "\t.cfi_def_cfa_register %%%s" reg + | Cfi_def_cfa_offset n -> bprintf b "\t.cfi_def_cfa_offset %d" n | File (file_num, file_name) -> bprintf b "\t.file\t%d\t\"%s\"" file_num (X86_proc.string_of_string_literal file_name) diff --git a/asmcomp/x86_masm.ml b/asmcomp/x86_masm.ml index eb010b8b7066..aa74e0c93389 100644 --- a/asmcomp/x86_masm.ml +++ b/asmcomp/x86_masm.ml @@ -239,6 +239,10 @@ let print_line b = function | Cfi_adjust_cfa_offset _ | Cfi_endproc | Cfi_startproc + | Cfi_def_cfa_register _ + | Cfi_def_cfa_offset _ + | Cfi_remember_state + | Cfi_restore_state | File _ | Indirect_symbol _ | Loc _ diff --git a/boot/ocamlc b/boot/ocamlc index b65bd9a15ce5..d431dbd1f41d 100755 Binary files a/boot/ocamlc and b/boot/ocamlc differ diff --git a/boot/ocamllex b/boot/ocamllex index b6efefde1b79..b35488cadc2e 100755 Binary files a/boot/ocamllex and b/boot/ocamllex differ diff --git a/bytecomp/bytegen.ml b/bytecomp/bytegen.ml index 27e170ffce29..418ebca83132 100644 --- a/bytecomp/bytegen.ml +++ b/bytecomp/bytegen.ml @@ -109,7 +109,8 @@ let rec is_tailcall = function from the tail call optimization? 
*) let preserve_tailcall_for_prim = function - | Popaque | Psequor | Psequand -> + | Popaque | Psequor | Psequand + | Prunstack | Pperform | Presume | Preperform -> true | Pbytes_to_string | Pbytes_of_string | Pignore | Pgetglobal _ | Psetglobal _ | Pmakeblock _ | Pfield _ | Pfield_computed | Psetfield _ @@ -130,7 +131,9 @@ let preserve_tailcall_for_prim = function | Pbytes_load_32 _ | Pbytes_load_64 _ | Pbytes_set_16 _ | Pbytes_set_32 _ | Pbytes_set_64 _ | Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_64 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _ - | Pbigstring_set_64 _ | Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer -> + | Pbigstring_set_64 _ | Pctconst _ | Pbswap16 | Pbbswap _ | Pint_as_pointer + | Patomic_exchange | Patomic_cas | Patomic_fetch_add | Patomic_load _ + | Pdls_get -> false (* Add a Kpop N instruction in front of a continuation *) @@ -369,6 +372,8 @@ let compunit_name = ref "" let max_stack_used = ref 0 +let check_stack sz = + if sz > !max_stack_used then max_stack_used := sz (* Sequence of string tests *) @@ -383,7 +388,8 @@ let comp_bint_primitive bi suff args = | Pint64 -> "caml_int64_" in Kccall(pref ^ suff, List.length args) -let comp_primitive p args = +let comp_primitive p sz args = + check_stack sz; match p with Pgetglobal id -> Kgetglobal id | Psetglobal id -> Ksetglobal id @@ -391,13 +397,16 @@ let comp_primitive p args = | Pcompare_ints -> Kccall("caml_int_compare", 2) | Pcompare_floats -> Kccall("caml_float_compare", 2) | Pcompare_bints bi -> comp_bint_primitive bi "compare" args - | Pfield n -> Kgetfield n + | Pfield(n, _ptr, _mut) -> Kgetfield n | Pfield_computed -> Kgetvectitem | Psetfield(n, _ptr, _init) -> Ksetfield n | Psetfield_computed(_ptr, _init) -> Ksetvectitem | Psetfloatfield (n, _init) -> Ksetfloatfield n | Pduprecord _ -> Kccall("caml_obj_dup", 1) | Pccall p -> Kccall(p.prim_name, p.prim_arity) + | Pperform -> + check_stack (sz + 4); + Kperform | Pnegint -> Knegint | Paddint -> Kaddint | Psubint -> 
Ksubint @@ -508,9 +517,15 @@ let comp_primitive p args = | Pint_as_pointer -> Kccall("caml_int_as_pointer", 1) | Pbytes_to_string -> Kccall("caml_string_of_bytes", 1) | Pbytes_of_string -> Kccall("caml_bytes_of_string", 1) + | Patomic_load _ -> Kccall("caml_atomic_load", 1) + | Patomic_exchange -> Kccall("caml_atomic_exchange", 2) + | Patomic_cas -> Kccall("caml_atomic_cas", 3) + | Patomic_fetch_add -> Kccall("caml_atomic_fetch_add", 2) + | Pdls_get -> Kccall("caml_domain_dls_get", 1) (* The cases below are handled in [comp_expr] before the [comp_primitive] call (in the order in which they appear below), so they should never be reached in this function. *) + | Prunstack | Presume | Preperform | Pignore | Popaque | Pnot | Psequand | Psequor | Praise _ @@ -538,7 +553,7 @@ module Storer = Result = list of instructions that evaluate exp, then perform cont. *) let rec comp_expr env exp sz cont = - if sz > !max_stack_used then max_stack_used := sz; + check_stack sz; match exp with Lvar id | Lmutvar id -> begin try @@ -754,6 +769,25 @@ let rec comp_expr env exp sz cont = (Kmakeblock(List.length args, 0) :: Kccall("caml_make_array", 1) :: cont) end + | Lprim((Presume|Prunstack), args, _) -> + let nargs = List.length args - 1 in + assert (nargs = 2); + (* Resume itself only pushes 3 words, but perform adds another *) + check_stack (sz + 4); + if is_tailcall cont then + comp_args env args sz + (Kresumeterm(sz + nargs) :: discard_dead_code cont) + else + comp_args env args sz (Kresume :: cont) + | Lprim(Preperform, args, _) -> + let nargs = List.length args - 1 in + assert (nargs = 2); + check_stack (sz + 3); + if is_tailcall cont then + comp_args env args sz + (Kreperformterm(sz + nargs) :: discard_dead_code cont) + else + fatal_error "Reperform used in non-tail position" | Lprim (Pduparray (kind, mutability), [Lprim (Pmakearray (kind',_),args,_)], loc) -> assert (kind = kind'); @@ -769,7 +803,8 @@ let rec comp_expr env exp sz cont = | Lprim (Pintcomp c, [arg ; (Lconst _ as 
k)], _) -> let p = Pintcomp (swap_integer_comparison c) and args = [k ; arg] in - comp_args env args sz (comp_primitive p args :: cont) + let nargs = List.length args - 1 in + comp_args env args sz (comp_primitive p (sz + nargs - 1) args :: cont) | Lprim (Pfloatcomp cmp, args, _) -> let cont = match cmp with @@ -792,7 +827,8 @@ let rec comp_expr env exp sz cont = let cont = add_pseudo_event loc !compunit_name cont in comp_args env args sz (Kgetfloatfield n :: cont) | Lprim(p, args, _) -> - comp_args env args sz (comp_primitive p args :: cont) + let nargs = List.length args - 1 in + comp_args env args sz (comp_primitive p (sz + nargs - 1) args :: cont) | Lstaticcatch (body, (i, vars) , handler) -> let vars = List.map fst vars in let nvars = List.length vars in diff --git a/bytecomp/emitcode.ml b/bytecomp/emitcode.ml index 8eefde578c8a..ea423e50b708 100644 --- a/bytecomp/emitcode.ml +++ b/bytecomp/emitcode.ml @@ -300,6 +300,10 @@ let emit_instr = function | Kgetpubmet tag -> out opGETPUBMET; out_int tag; out_int 0 | Kgetdynmet -> out opGETDYNMET | Kevent ev -> record_event ev + | Kperform -> out opPERFORM + | Kresume -> out opRESUME + | Kresumeterm n -> out opRESUMETERM; out_int n + | Kreperformterm n -> out opREPERFORMTERM; out_int n | Kstop -> out opSTOP (* Emission of a list of instructions. Include some peephole optimization. 
*) diff --git a/bytecomp/instruct.ml b/bytecomp/instruct.ml index c684bedf99f0..b790999e059e 100644 --- a/bytecomp/instruct.ml +++ b/bytecomp/instruct.ml @@ -104,6 +104,10 @@ type instruction = | Kgetpubmet of int | Kgetdynmet | Kevent of debug_event + | Kperform + | Kresume + | Kresumeterm of int + | Kreperformterm of int | Kstop let immed_min = -0x40000000 diff --git a/bytecomp/instruct.mli b/bytecomp/instruct.mli index e1cae776b0aa..fa875a4a653d 100644 --- a/bytecomp/instruct.mli +++ b/bytecomp/instruct.mli @@ -124,6 +124,10 @@ type instruction = | Kgetpubmet of int | Kgetdynmet | Kevent of debug_event + | Kperform + | Kresume + | Kresumeterm of int + | Kreperformterm of int | Kstop val immed_min: int diff --git a/bytecomp/printinstr.ml b/bytecomp/printinstr.ml index 6b3754cbfc10..564252373c80 100644 --- a/bytecomp/printinstr.ml +++ b/bytecomp/printinstr.ml @@ -100,6 +100,10 @@ let instruction ppf = function | Kgetmethod -> fprintf ppf "\tgetmethod" | Kgetpubmet n -> fprintf ppf "\tgetpubmet %i" n | Kgetdynmet -> fprintf ppf "\tgetdynmet" + | Kperform -> fprintf ppf "\tperform" + | Kresume -> fprintf ppf "\tresume" + | Kresumeterm n -> fprintf ppf "\tresumeterm %i" n + | Kreperformterm n -> fprintf ppf "\treperformterm %i" n | Kstop -> fprintf ppf "\tstop" | Kevent ev -> fprintf ppf "\tevent \"%s\" %i-%i" ev.ev_loc.Location.loc_start.Lexing.pos_fname diff --git a/configure b/configure index 6fa3a11416cf..80b4ad5667ec 100755 --- a/configure +++ b/configure @@ -747,8 +747,7 @@ build_os build_vendor build_cpu build -naked_pointers_checker -naked_pointers +force_instrumented_runtime compute_deps stdlib_manpages PACKLD @@ -902,7 +901,6 @@ enable_ocamltest enable_native_toplevel enable_frame_pointers enable_naked_pointers -enable_naked_pointers_checker enable_spacetime enable_cfi enable_imprecise_c99_float_ops @@ -911,6 +909,7 @@ enable_installing_bytecode_programs enable_native_compiler enable_flambda enable_flambda_invariants +enable_force_instrumented_runtime 
enable_cmm_invariants with_target_bindir enable_reserved_header_bits @@ -1576,10 +1575,6 @@ Optional Features: --enable-native-toplevel build the native toplevel --enable-frame-pointers use frame pointers in runtime and generated code - --disable-naked-pointers - do not allow naked pointers - --enable-naked-pointers-checker - enable the naked pointers checker --disable-cfi disable the CFI directives in assembly files --enable-imprecise-c99-float-ops enables potentially imprecise replacement @@ -1594,6 +1589,8 @@ Optional Features: --enable-flambda enable flambda optimizations --enable-flambda-invariants enable invariants checks in flambda + --force-instrumented-runtime + force the usage of the instrumented runtime --enable-cmm-invariants enable invariants checks in Cmm --enable-reserved-header-bits=BITS reserve BITS (between 0 and 31) bits in block @@ -2809,7 +2806,8 @@ toolchain="cc" profinfo=false profinfo_width=0 extralibs= -instrumented_runtime=false +instrumented_runtime=true +force_instrumented_runtime=false instrumented_runtime_libs="" bootstrapping_flexdll=false @@ -3246,13 +3244,10 @@ fi # Check whether --enable-naked-pointers was given. if test "${enable_naked_pointers+set}" = set; then : - enableval=$enable_naked_pointers; -fi - + enableval=$enable_naked_pointers; as_fn_error $? "Naked pointers are not allowed in OCaml Multicore." "$LINENO" 5 +else + $as_echo "#define NO_NAKED_POINTERS 1" >>confdefs.h -# Check whether --enable-naked-pointers-checker was given. -if test "${enable_naked_pointers_checker+set}" = set; then : - enableval=$enable_naked_pointers_checker; fi @@ -3303,6 +3298,12 @@ if test "${enable_flambda_invariants+set}" = set; then : fi +# Check whether --enable-force-instrumented-runtime was given. +if test "${enable_force_instrumented_runtime+set}" = set; then : + enableval=$enable_force_instrumented_runtime; +fi + + # Check whether --enable-cmm-invariants was given. 
if test "${enable_cmm_invariants+set}" = set; then : enableval=$enable_cmm_invariants; @@ -12651,7 +12652,8 @@ case $ocaml_cv_cc_vendor in #( msvc-*) : CPP="$CC -nologo -EP" ;; #( *) : - ;; + # TODO: why can we not use $CPP in multicore, fix this? + CPP="$CC -E -P" ;; esac # Libraries to build depending on the host @@ -13475,6 +13477,22 @@ fi +ac_fn_c_check_header_mongrel "$LINENO" "stdatomic.h" "ac_cv_header_stdatomic_h" "$ac_includes_default" +if test "x$ac_cv_header_stdatomic_h" = xyes; then : + $as_echo "#define HAS_STDATOMIC_H 1" >>confdefs.h + +fi + + + +ac_fn_c_check_header_mongrel "$LINENO" "sys/mman.h" "ac_cv_header_sys_mman_h" "$ac_includes_default" +if test "x$ac_cv_header_sys_mman_h" = xyes; then : + $as_echo "#define HAS_SYS_MMAN_H 1" >>confdefs.h + +fi + + + # Checks for types ## off_t @@ -15000,6 +15018,11 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext ;; esac +if test "x$enable_force_instrumented_runtime" = "xyes"; then : + force_instrumented_runtime=true + +fi + # The instrumented runtime is built by default # if the proper clock source is found. # If asked via --enable-instrumented-runtime, configuration fails if the proper @@ -16648,6 +16671,7 @@ $as_echo "$as_me: the Win32/POSIX threads library is disabled" >&6;} ;; #( *-*-mingw32|*-pc-windows) : systhread_support=true otherlibraries="$otherlibraries systhreads" + PTHREAD_LIBS="-lpthread" { $as_echo "$as_me:${as_lineno-$LINENO}: the Win32 threads library is supported" >&5 $as_echo "$as_me: the Win32 threads library is supported" >&6;} ;; #( *) : @@ -17524,38 +17548,6 @@ $as_echo "$as_me: not using frame pointers" >&6;} frame_pointers=false fi -## No naked pointers - -if test x"$enable_naked_pointers" = "xno" ; then : - naked_pointers=false - $as_echo "#define NO_NAKED_POINTERS 1" >>confdefs.h - -else - naked_pointers=true -fi - -if test x"$enable_naked_pointers_checker" = "xyes" ; then : - if test x"$enable_naked_pointers" = "xno" ; then : - as_fn_error $? 
"--enable-naked-pointers-checker and --disable-naked-pointers are incompatible" "$LINENO" 5 -fi - case "$arch","$system" in #( - amd64,linux|amd64,macosx \ - |amd64,openbsd|amd64,win64 \ - |amd64,freebsd|amd64,solaris \ - |arm64,linux|arm64,macosx) : - naked_pointers_checker=true - $as_echo "#define NAKED_POINTERS_CHECKER 1" >>confdefs.h - ;; #( - *) : - as_fn_error $? "naked pointers checker not supported on this platform" "$LINENO" 5 - ;; #( - *) : - ;; -esac -else - naked_pointers_checker=false -fi - ## Check for mmap support for huge pages and contiguous heap { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mmap supports huge pages" >&5 @@ -17789,16 +17781,57 @@ ocamlc_cflags="$common_cflags $sharedlib_cflags \$(CFLAGS)" ocamlc_cppflags="$common_cppflags \$(CPPFLAGS)" cclibs="$cclibs $mathlib" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for backtrace in -lexecinfo" >&5 +$as_echo_n "checking for backtrace in -lexecinfo... " >&6; } +if ${ac_cv_lib_execinfo_backtrace+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lexecinfo $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char backtrace (); +int +main () +{ +return backtrace (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_execinfo_backtrace=yes +else + ac_cv_lib_execinfo_backtrace=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_execinfo_backtrace" >&5 +$as_echo "$ac_cv_lib_execinfo_backtrace" >&6; } +if test "x$ac_cv_lib_execinfo_backtrace" = xyes; then : + cclibs="$cclibs -lexecinfo" +fi + + case $host in #( *-*-mingw32) : - bytecclibs="-lws2_32 -lversion" - nativecclibs="-lws2_32 -lversion" ;; #( + bytecclibs="-lws2_32 -lversion -lpthread -lgcc_eh -lDbgHelp" + nativecclibs="-lws2_32 -lversion -lpthread -lgcc_eh -lDbgHelp" ;; #( *-pc-windows) : bytecclibs="advapi32.lib ws2_32.lib version.lib" nativecclibs="advapi32.lib ws2_32.lib version.lib" ;; #( *) : bytecclibs="$cclibs $DLLIBS $PTHREAD_LIBS" - nativecclibs="$cclibs $DLLIBS" ;; + nativecclibs="$cclibs $DLLIBS $PTHREAD_LIBS" ;; esac if test x"$libdir" = x'${exec_prefix}/lib'; then : diff --git a/configure.ac b/configure.ac index 9d49d213c447..c999dab3951f 100644 --- a/configure.ac +++ b/configure.ac @@ -55,7 +55,8 @@ toolchain="cc" profinfo=false profinfo_width=0 extralibs= -instrumented_runtime=false +instrumented_runtime=true +force_instrumented_runtime=false instrumented_runtime_libs="" bootstrapping_flexdll=false @@ -168,8 +169,7 @@ AC_SUBST([flexlink_flags]) AC_SUBST([PACKLD]) AC_SUBST([stdlib_manpages]) AC_SUBST([compute_deps]) -AC_SUBST([naked_pointers]) -AC_SUBST([naked_pointers_checker]) +AC_SUBST([force_instrumented_runtime]) ## Generated files @@ -300,13 +300,9 @@ AC_ARG_ENABLE([frame-pointers], [AS_HELP_STRING([--enable-frame-pointers], [use frame pointers in runtime and generated code])]) -AC_ARG_ENABLE([naked-pointers], - [AS_HELP_STRING([--disable-naked-pointers], - [do not allow naked pointers])]) - 
-AC_ARG_ENABLE([naked-pointers-checker], - [AS_HELP_STRING([--enable-naked-pointers-checker], - [enable the naked pointers checker])]) +AC_ARG_ENABLE([naked-pointers], [], + [AC_MSG_ERROR([Naked pointers are not allowed in OCaml Multicore.])], + [AC_DEFINE([NO_NAKED_POINTERS])]) AC_ARG_ENABLE([spacetime], [], [AC_MSG_ERROR([spacetime profiling was deleted in OCaml 4.12.])], @@ -340,6 +336,10 @@ AC_ARG_ENABLE([flambda-invariants], [AS_HELP_STRING([--enable-flambda-invariants], [enable invariants checks in flambda])]) +AC_ARG_ENABLE([force-instrumented-runtime], + [AS_HELP_STRING([--force-instrumented-runtime], + [force the usage of the instrumented runtime])]) + AC_ARG_ENABLE([cmm-invariants], [AS_HELP_STRING([--enable-cmm-invariants], [enable invariants checks in Cmm])]) @@ -526,7 +526,9 @@ AS_CASE([$ocaml_cv_cc_vendor], [sunc-*], [CPP="$CC -E -Qn"], # suppress generation of Sun PRO ident string [msvc-*], - [CPP="$CC -nologo -EP"]) + [CPP="$CC -nologo -EP"], +# TODO: why can we not use $CPP in multicore, fix this? + [CPP="$CC -E -P"]) # Libraries to build depending on the host @@ -872,6 +874,10 @@ AC_CHECK_HEADER([dirent.h], [AC_DEFINE([HAS_DIRENT])], [], AC_CHECK_HEADER([sys/select.h], [AC_DEFINE([HAS_SYS_SELECT_H])], [], [#include ]) +AC_CHECK_HEADER([stdatomic.h], [AC_DEFINE([HAS_STDATOMIC_H])]) + +AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAS_SYS_MMAN_H])]) + # Checks for types ## off_t @@ -1348,6 +1354,10 @@ AS_CASE([$host], ] ) +AS_IF([test "x$enable_force_instrumented_runtime" = "xyes"], + [force_instrumented_runtime=true] +) + # The instrumented runtime is built by default # if the proper clock source is found. 
# If asked via --enable-instrumented-runtime, configuration fails if the proper @@ -1801,6 +1811,7 @@ AS_CASE([$enable_systhreads,$enable_unix_lib], [*-*-mingw32|*-pc-windows], [systhread_support=true otherlibraries="$otherlibraries systhreads" + PTHREAD_LIBS="-lpthread" AC_MSG_NOTICE([the Win32 threads library is supported])], [AX_PTHREAD( [systhread_support=true @@ -1842,30 +1853,6 @@ AS_IF([test x"$enable_frame_pointers" = "xyes"], [AC_MSG_NOTICE([not using frame pointers]) frame_pointers=false]) -## No naked pointers - -AS_IF([test x"$enable_naked_pointers" = "xno" ], - [naked_pointers=false - AC_DEFINE([NO_NAKED_POINTERS])], - [naked_pointers=true]) - -AS_IF([test x"$enable_naked_pointers_checker" = "xyes" ], - [AS_IF([test x"$enable_naked_pointers" = "xno" ], - [AC_MSG_ERROR(m4_normalize([ - --enable-naked-pointers-checker and --disable-naked-pointers - are incompatible]))]) - AS_CASE(["$arch","$system"], - [amd64,linux|amd64,macosx \ - |amd64,openbsd|amd64,win64 \ - |amd64,freebsd|amd64,solaris \ - |arm64,linux|arm64,macosx], - [naked_pointers_checker=true - AC_DEFINE([NAKED_POINTERS_CHECKER])], - [*], - [AC_MSG_ERROR([naked pointers checker not supported on this platform])] - )], - [naked_pointers_checker=false]) - ## Check for mmap support for huge pages and contiguous heap OCAML_MMAP_SUPPORTS_HUGE_PAGES @@ -1971,15 +1958,17 @@ ocamlc_cflags="$common_cflags $sharedlib_cflags \$(CFLAGS)" ocamlc_cppflags="$common_cppflags \$(CPPFLAGS)" cclibs="$cclibs $mathlib" +AC_CHECK_LIB(execinfo, backtrace, cclibs="$cclibs -lexecinfo",[]) + AS_CASE([$host], [*-*-mingw32], - [bytecclibs="-lws2_32 -lversion" - nativecclibs="-lws2_32 -lversion"], + [bytecclibs="-lws2_32 -lversion -lpthread -lgcc_eh -lDbgHelp" + nativecclibs="-lws2_32 -lversion -lpthread -lgcc_eh -lDbgHelp"], [*-pc-windows], [bytecclibs="advapi32.lib ws2_32.lib version.lib" nativecclibs="advapi32.lib ws2_32.lib version.lib"], [bytecclibs="$cclibs $DLLIBS $PTHREAD_LIBS" - nativecclibs="$cclibs $DLLIBS"]) + 
nativecclibs="$cclibs $DLLIBS $PTHREAD_LIBS"]) AS_IF([test x"$libdir" = x'${exec_prefix}/lib'], [libdir="$libdir"/ocaml]) diff --git a/lambda/lambda.ml b/lambda/lambda.ml index 44d11aebb9dc..fdc5331054f6 100644 --- a/lambda/lambda.ml +++ b/lambda/lambda.ml @@ -48,14 +48,18 @@ type primitive = | Psetglobal of Ident.t (* Operations on heap blocks *) | Pmakeblock of int * mutable_flag * block_shape - | Pfield of int + | Pfield of int * immediate_or_pointer * mutable_flag | Pfield_computed | Psetfield of int * immediate_or_pointer * initialization_or_assignment | Psetfield_computed of immediate_or_pointer * initialization_or_assignment | Pfloatfield of int | Psetfloatfield of int * initialization_or_assignment | Pduprecord of Types.record_representation * int - (* Force lazy values *) + (* Context switches *) + | Prunstack + | Pperform + | Presume + | Preperform (* External call *) | Pccall of Primitive.description (* Exceptions *) @@ -138,8 +142,15 @@ type primitive = | Pbbswap of boxed_integer (* Integer to external pointer *) | Pint_as_pointer + (* Atomic operations *) + | Patomic_load of {immediate_or_pointer : immediate_or_pointer} + | Patomic_exchange + | Patomic_cas + | Patomic_fetch_add (* Inhibition of optimisation *) | Popaque + (* Fetching domain-local state *) + | Pdls_get and integer_comparison = Ceq | Cne | Clt | Cgt | Cle | Cge @@ -649,7 +660,8 @@ let rec transl_address loc = function then Lprim(Pgetglobal id, [], loc) else Lvar id | Env.Adot(addr, pos) -> - Lprim(Pfield pos, [transl_address loc addr], loc) + Lprim(Pfield(pos, Pointer, Immutable), + [transl_address loc addr], loc) let transl_path find loc env path = match find path env with diff --git a/lambda/lambda.mli b/lambda/lambda.mli index 9b25034b9170..d8d0d4dee207 100644 --- a/lambda/lambda.mli +++ b/lambda/lambda.mli @@ -54,13 +54,18 @@ type primitive = | Psetglobal of Ident.t (* Operations on heap blocks *) | Pmakeblock of int * mutable_flag * block_shape - | Pfield of int + | Pfield of int * 
immediate_or_pointer * mutable_flag | Pfield_computed | Psetfield of int * immediate_or_pointer * initialization_or_assignment | Psetfield_computed of immediate_or_pointer * initialization_or_assignment | Pfloatfield of int | Psetfloatfield of int * initialization_or_assignment | Pduprecord of Types.record_representation * int + (* Context switches *) + | Prunstack + | Pperform + | Presume + | Preperform (* External call *) | Pccall of Primitive.description (* Exceptions *) @@ -147,8 +152,15 @@ type primitive = | Pbbswap of boxed_integer (* Integer to external pointer *) | Pint_as_pointer + (* Atomic operations *) + | Patomic_load of {immediate_or_pointer : immediate_or_pointer} + | Patomic_exchange + | Patomic_cas + | Patomic_fetch_add (* Inhibition of optimisation *) | Popaque + (* Fetching domain-local state *) + | Pdls_get and integer_comparison = Ceq | Cne | Clt | Cgt | Cle | Cge diff --git a/lambda/matching.ml b/lambda/matching.ml index 7e7fe143350d..43bccf6e2c95 100644 --- a/lambda/matching.ml +++ b/lambda/matching.ml @@ -1766,7 +1766,8 @@ let get_expr_args_constr ~scopes head (arg, _mut) rem = if pos > last_pos then argl else - (Lprim (Pfield pos, [ arg ], loc), binding_kind) :: make_args (pos + 1) + (Lprim (Pfield (pos, Pointer, Immutable), [ arg ], loc), + binding_kind) :: make_args (pos + 1) in make_args first_pos in @@ -1794,7 +1795,7 @@ let get_expr_args_variant_constant = drop_expr_arg let get_expr_args_variant_nonconst ~scopes head (arg, _mut) rem = let loc = head_loc ~scopes head in - (Lprim (Pfield 1, [ arg ], loc), Alias) :: rem + (Lprim (Pfield (1, Pointer, Immutable), [ arg ], loc), Alias) :: rem let divide_variant ~scopes row ctx { cases = cl; args; default = def } = let rec divide = function @@ -1876,12 +1877,12 @@ let get_mod_field modname field = let code_force_lazy_block = get_mod_field "CamlinternalLazy" "force_lazy_block" -let code_force_lazy = get_mod_field "CamlinternalLazy" "force" +let code_force_lazy = get_mod_field 
"CamlinternalLazy" "force_gen" (* inline_lazy_force inlines the beginning of the code of Lazy.force. When the value argument is tagged as: - forward, take field 0 - - lazy, call the primitive that forces (without testing again the tag) + - lazy || forcing, call the primitive that forces - anything else, return it Using Lswitch below relies on the fact that the GC does not shortcut @@ -1892,8 +1893,11 @@ let inline_lazy_force_cond arg loc = let idarg = Ident.create_local "lzarg" in let varg = Lvar idarg in let tag = Ident.create_local "tag" in - let tag_var = Lvar tag in let force_fun = Lazy.force code_force_lazy_block in + let test_tag t = + Lprim(Pintcomp Ceq, [Lvar tag; Lconst(Const_base(Const_int t))], loc) + in + Llet ( Strict, Pgenval, @@ -1905,18 +1909,16 @@ let inline_lazy_force_cond arg loc = tag, Lprim (Pccall prim_obj_tag, [ varg ], loc), Lifthenelse - (* if (tag == Obj.forward_tag) then varg.(0) else ... *) - ( Lprim - ( Pintcomp Ceq, - [ tag_var; Lconst (Const_base (Const_int Obj.forward_tag)) ], - loc ), - Lprim (Pfield 0, [ varg ], loc), + ( (* if (tag == Obj.forward_tag) then varg.(0) else ... *) + test_tag Obj.forward_tag, + Lprim (Pfield (0, Pointer, Mutable), [ varg ], loc), Lifthenelse - (* if (tag == Obj.lazy_tag) then Lazy.force varg else ... *) - ( Lprim - ( Pintcomp Ceq, - [ tag_var; Lconst (Const_base (Const_int Obj.lazy_tag)) ], - loc ), + ( + (* ... if tag == Obj.lazy_tag || tag == Obj.forcing_tag then + Lazy.force varg + else ... 
*) + Lprim (Psequor, + [test_tag Obj.lazy_tag; test_tag Obj.forcing_tag], loc), Lapply { ap_tailcall = Default_tailcall; ap_loc = loc; @@ -1941,14 +1943,26 @@ let inline_lazy_force_switch arg loc = ( Lprim (Pisint, [ varg ], loc), varg, Lswitch - ( varg, - { sw_numconsts = 0; - sw_consts = []; - sw_numblocks = 256; + ( Lprim (Pccall prim_obj_tag, [ varg ], loc), + { sw_numblocks = 0; + sw_blocks = []; + sw_numconsts = 256; (* PR#6033 - tag ranges from 0 to 255 *) - sw_blocks = - [ (Obj.forward_tag, Lprim (Pfield 0, [ varg ], loc)); - ( Obj.lazy_tag, + sw_consts = + [ (Obj.forward_tag, Lprim (Pfield(0, Pointer, Mutable), + [ varg ], loc)); + + (Obj.lazy_tag, + Lapply + { ap_tailcall = Default_tailcall; + ap_loc = loc; + ap_func = force_fun; + ap_args = [varg]; + ap_inlined = Default_inline; + ap_specialised = Default_specialise + } ); + + (Obj.forcing_tag, Lapply { ap_tailcall = Default_tailcall; ap_loc = loc; @@ -1972,7 +1986,7 @@ let inline_lazy_force arg loc = { ap_tailcall = Default_tailcall; ap_loc = loc; ap_func = Lazy.force code_force_lazy; - ap_args = [ arg ]; + ap_args = [ Lconst (Const_base (Const_int 0)); arg ]; ap_inlined = Default_inline; ap_specialised = Default_specialise } @@ -2009,7 +2023,8 @@ let get_expr_args_tuple ~scopes head (arg, _mut) rem = if pos >= arity then rem else - (Lprim (Pfield pos, [ arg ], loc), Alias) :: make_args (pos + 1) + (Lprim (Pfield (pos, Pointer, Immutable), [ arg ], loc), + Alias) :: make_args (pos + 1) in make_args 0 @@ -2049,14 +2064,16 @@ let get_expr_args_record ~scopes head (arg, _mut) rem = rem else let lbl = all_labels.(pos) in + let ptr = Typeopt.maybe_pointer_type head.pat_env lbl.lbl_arg in let access = match lbl.lbl_repres with | Record_regular | Record_inlined _ -> - Lprim (Pfield lbl.lbl_pos, [ arg ], loc) + Lprim (Pfield (lbl.lbl_pos, ptr, lbl.lbl_mut), [ arg ], loc) | Record_unboxed _ -> arg | Record_float -> Lprim (Pfloatfield lbl.lbl_pos, [ arg ], loc) - | Record_extension _ -> Lprim (Pfield (lbl.lbl_pos 
+ 1), [ arg ], loc) + | Record_extension _ -> + Lprim (Pfield (lbl.lbl_pos + 1, ptr, lbl.lbl_mut), [ arg ], loc) in let str = match lbl.lbl_mut with @@ -2802,7 +2819,8 @@ let combine_constructor loc arg pat_env cstr partial ctx def (Lprim (Pintcomp Ceq, [ Lvar tag; ext ], loc), act, rem)) nonconsts default in - Llet (Alias, Pgenval, tag, Lprim (Pfield 0, [ arg ], loc), tests) + Llet (Alias, Pgenval, tag, + Lprim (Pfield (0, Pointer, Immutable), [ arg ], loc), tests) in List.fold_right (fun (path, act) rem -> @@ -2897,7 +2915,7 @@ let call_switcher_variant_constr loc fail arg int_lambda_list = ( Alias, Pgenval, v, - Lprim (Pfield 0, [ arg ], loc), + Lprim (Pfield (0, Pointer, Immutable), [ arg ], loc), call_switcher loc fail (Lvar v) min_int max_int int_lambda_list ) let combine_variant loc row arg partial ctx def (tag_lambda_list, total1, _pats) diff --git a/lambda/printlambda.ml b/lambda/printlambda.ml index 769df2d51fd2..9f16612e2c22 100644 --- a/lambda/printlambda.ml +++ b/lambda/printlambda.ml @@ -156,7 +156,14 @@ let primitive ppf = function fprintf ppf "makeblock %i%a" tag block_shape shape | Pmakeblock(tag, Mutable, shape) -> fprintf ppf "makemutable %i%a" tag block_shape shape - | Pfield n -> fprintf ppf "field %i" n + | Pfield(n, ptr, mut) -> + let instr = + match ptr, mut with + | Immediate, _ -> "field_int " + | Pointer, Mutable -> "field_mut " + | Pointer, Immutable -> "field_imm " + in + fprintf ppf "%s%i" instr n | Pfield_computed -> fprintf ppf "field_computed" | Psetfield(n, ptr, init) -> let instr = @@ -194,6 +201,10 @@ let primitive ppf = function in fprintf ppf "setfloatfield%s %i" init n | Pduprecord (rep, size) -> fprintf ppf "duprecord %a %i" record_rep rep size + | Prunstack -> fprintf ppf "runstack" + | Pperform -> fprintf ppf "perform" + | Presume -> fprintf ppf "resume" + | Preperform -> fprintf ppf "reperform" | Pccall p -> fprintf ppf "%s" p.prim_name | Praise k -> fprintf ppf "%s" (Lambda.raise_kind k) | Psequand -> fprintf ppf "&&" @@ 
-266,14 +277,14 @@ let primitive ppf = function | Paddbint bi -> print_boxed_integer "add" ppf bi | Psubbint bi -> print_boxed_integer "sub" ppf bi | Pmulbint bi -> print_boxed_integer "mul" ppf bi - | Pdivbint { size = bi; is_safe = Safe } -> - print_boxed_integer "div" ppf bi - | Pdivbint { size = bi; is_safe = Unsafe } -> - print_boxed_integer "div_unsafe" ppf bi - | Pmodbint { size = bi; is_safe = Safe } -> - print_boxed_integer "mod" ppf bi - | Pmodbint { size = bi; is_safe = Unsafe } -> - print_boxed_integer "mod_unsafe" ppf bi + | Pdivbint { size; is_safe = Safe } -> + print_boxed_integer "div" ppf size + | Pdivbint { size; is_safe = Unsafe } -> + print_boxed_integer "div_unsafe" ppf size + | Pmodbint { size; is_safe = Safe } -> + print_boxed_integer "mod" ppf size + | Pmodbint { size; is_safe = Unsafe } -> + print_boxed_integer "mod_unsafe" ppf size | Pandbint bi -> print_boxed_integer "and" ppf bi | Porbint bi -> print_boxed_integer "or" ppf bi | Pxorbint bi -> print_boxed_integer "xor" ppf bi @@ -339,7 +350,15 @@ let primitive ppf = function | Pbswap16 -> fprintf ppf "bswap16" | Pbbswap(bi) -> print_boxed_integer "bswap" ppf bi | Pint_as_pointer -> fprintf ppf "int_as_pointer" + | Patomic_load {immediate_or_pointer} -> + (match immediate_or_pointer with + | Immediate -> fprintf ppf "atomic_load_imm" + | Pointer -> fprintf ppf "atomic_load_ptr") + | Patomic_exchange -> fprintf ppf "atomic_exchange" + | Patomic_cas -> fprintf ppf "atomic_cas" + | Patomic_fetch_add -> fprintf ppf "atomic_fetch_add" | Popaque -> fprintf ppf "opaque" + | Pdls_get -> fprintf ppf "dls_get" let name_of_primitive = function | Pbytes_of_string -> "Pbytes_of_string" @@ -442,7 +461,19 @@ let name_of_primitive = function | Pbswap16 -> "Pbswap16" | Pbbswap _ -> "Pbbswap" | Pint_as_pointer -> "Pint_as_pointer" + | Patomic_load {immediate_or_pointer} -> + (match immediate_or_pointer with + | Immediate -> "atomic_load_imm" + | Pointer -> "atomic_load_ptr") + | Patomic_exchange -> 
"Patomic_exchange" + | Patomic_cas -> "Patomic_cas" + | Patomic_fetch_add -> "Patomic_fetch_add" | Popaque -> "Popaque" + | Prunstack -> "Prunstack" + | Presume -> "Presume" + | Pperform -> "Pperform" + | Preperform -> "Preperform" + | Pdls_get -> "Pdls_get" let function_attribute ppf t = if t.is_a_functor then diff --git a/lambda/simplif.ml b/lambda/simplif.ml index f6fbdce982ac..f301ed2872d7 100644 --- a/lambda/simplif.ml +++ b/lambda/simplif.ml @@ -42,7 +42,7 @@ let rec eliminate_ref id = function | Lletrec(idel, e2) -> Lletrec(List.map (fun (v, e) -> (v, eliminate_ref id e)) idel, eliminate_ref id e2) - | Lprim(Pfield 0, [Lvar v], _) when Ident.same v id -> + | Lprim(Pfield (0, _, _), [Lvar v], _) when Ident.same v id -> Lmutvar id | Lprim(Psetfield(0, _, _), [Lvar v; e], _) when Ident.same v id -> Lassign(id, eliminate_ref id e) diff --git a/lambda/tmc.ml b/lambda/tmc.ml index f96bfb98840d..a40864734550 100644 --- a/lambda/tmc.ml +++ b/lambda/tmc.ml @@ -869,6 +869,12 @@ let rec choice ctx t = | Pignore | Pcompare_ints | Pcompare_floats | Pcompare_bints _ + (* we don't handle effect or DLS primitives *) + | Prunstack | Pperform | Presume | Preperform | Pdls_get + + (* we don't handle atomic primitives *) + | Patomic_exchange | Patomic_cas | Patomic_fetch_add | Patomic_load _ + (* we don't handle array indices as destinations yet *) | (Pmakearray _ | Pduparray _) diff --git a/lambda/translclass.ml b/lambda/translclass.ml index 29547d520170..b4b682a1d7fb 100644 --- a/lambda/translclass.ml +++ b/lambda/translclass.ml @@ -62,7 +62,8 @@ let mkappl (func, args) = let lsequence l1 l2 = if l2 = lambda_unit then l1 else Lsequence(l1, l2) -let lfield v i = Lprim(Pfield i, [Lvar v], Loc_unknown) +let lfield v i = Lprim(Pfield (i, Pointer, Mutable), + [Lvar v], Loc_unknown) let transl_label l = share (Const_immstring l) @@ -133,7 +134,7 @@ let rec build_object_init ~scopes cl_table obj params inh_init obj_init cl = let env = match envs with None -> [] | Some envs -> - 
[Lprim(Pfield (List.length inh_init + 1), + [Lprim(Pfield (List.length inh_init + 1, Pointer, Mutable), [Lvar envs], Loc_unknown)] in @@ -278,8 +279,10 @@ let rec build_class_init ~scopes cla cstr super inh_init cl_init msubst top cl = | (_, path_lam, obj_init)::inh_init -> (inh_init, Llet (Strict, Pgenval, obj_init, - mkappl(Lprim(Pfield 1, [path_lam], Loc_unknown), Lvar cla :: - if top then [Lprim(Pfield 3, [path_lam], Loc_unknown)] + mkappl(Lprim(Pfield (1, Pointer, Mutable), + [path_lam], Loc_unknown), Lvar cla :: + if top then [Lprim(Pfield (3, Pointer, Mutable), + [path_lam], Loc_unknown)] else []), bind_super cla super cl_init)) | _ -> @@ -544,7 +547,7 @@ let rec builtin_meths self env env2 body = | p when const_path p -> "const", [p] | Lprim(Parrayrefu _, [Lvar s; Lvar n], _) when List.mem s self -> "var", [Lvar n] - | Lprim(Pfield n, [Lvar e], _) when Ident.same e env -> + | Lprim(Pfield(n, _, _), [Lvar e], _) when Ident.same e env -> "env", [Lvar env2; Lconst(const_int n)] | Lsend(Self, met, Lvar s, [], _) when List.mem s self -> "meth", [met] @@ -841,7 +844,8 @@ let transl_class ~scopes ids cl_id pub_meths cl vflag = Loc_unknown) and linh_envs = List.map - (fun (_, path_lam, _) -> Lprim(Pfield 3, [path_lam], Loc_unknown)) + (fun (_, path_lam, _) -> + Lprim(Pfield (3, Pointer, Mutable), [path_lam], Loc_unknown)) (List.rev inh_init) in let make_envs lam = @@ -861,7 +865,8 @@ let transl_class ~scopes ids cl_id pub_meths cl vflag = in let inh_keys = List.map - (fun (_, path_lam, _) -> Lprim(Pfield 1, [path_lam], Loc_unknown)) + (fun (_, path_lam, _) -> + Lprim(Pfield (1, Pointer, Mutable), [path_lam], Loc_unknown)) inh_paths in let lclass lam = diff --git a/lambda/translcore.ml b/lambda/translcore.ml index f718ba0c6a00..1914bf8e097a 100644 --- a/lambda/translcore.ml +++ b/lambda/translcore.ml @@ -373,14 +373,14 @@ and transl_exp0 ~in_new_scope ~scopes e = let targ = transl_exp ~scopes arg in begin match lbl.lbl_repres with Record_regular | Record_inlined _ 
-> - Lprim (Pfield lbl.lbl_pos, [targ], + Lprim (Pfield (lbl.lbl_pos, maybe_pointer e, lbl.lbl_mut), [targ], of_location ~scopes e.exp_loc) | Record_unboxed _ -> targ | Record_float -> Lprim (Pfloatfield lbl.lbl_pos, [targ], of_location ~scopes e.exp_loc) | Record_extension _ -> - Lprim (Pfield (lbl.lbl_pos + 1), [targ], + Lprim (Pfield (lbl.lbl_pos + 1, maybe_pointer e, lbl.lbl_mut), [targ], of_location ~scopes e.exp_loc) end | Texp_setfield(arg, _, lbl, newval) -> @@ -489,7 +489,8 @@ and transl_exp0 ~in_new_scope ~scopes e = Lapply{ ap_loc=loc; ap_func= - Lprim(Pfield 0, [transl_class_path loc e.exp_env cl], loc); + Lprim(Pfield (0, Pointer, Mutable), + [transl_class_path loc e.exp_env cl], loc); ap_args=[lambda_unit]; ap_tailcall=Default_tailcall; ap_inlined=Default_inline; @@ -622,7 +623,7 @@ and transl_exp0 ~in_new_scope ~scopes e = let body, _ = List.fold_left (fun (body, pos) id -> Llet(Alias, Pgenval, id, - Lprim(Pfield pos, [Lvar oid], + Lprim(Pfield (pos, Pointer, Mutable), [Lvar oid], of_location ~scopes od.open_loc), body), pos + 1 ) (transl_exp ~scopes e, 0) @@ -657,7 +658,7 @@ and transl_guard ~scopes guard rhs = (Lifthenelse(transl_exp ~scopes cond, expr, staticfail)) and transl_case ~scopes {c_lhs; c_guard; c_rhs} = - c_lhs, transl_guard ~scopes c_guard c_rhs + (c_lhs, transl_guard ~scopes c_guard c_rhs) and transl_cases ~scopes cases = let cases = @@ -951,13 +952,15 @@ and transl_record ~scopes loc env fields repres opt_init_expr = Array.mapi (fun i (_, definition) -> match definition with - | Kept typ -> + | Kept (typ, mut) -> let field_kind = value_kind env typ in let access = match repres with - Record_regular | Record_inlined _ -> Pfield i + Record_regular | Record_inlined _ -> + Pfield (i, maybe_pointer_type env typ, mut) | Record_unboxed _ -> assert false - | Record_extension _ -> Pfield (i + 1) + | Record_extension _ -> + Pfield (i + 1, maybe_pointer_type env typ, mut) | Record_float -> Pfloatfield i in Lprim(access, [Lvar init_id], 
of_location ~scopes loc), @@ -1009,7 +1012,7 @@ and transl_record ~scopes loc env fields repres opt_init_expr = let copy_id = Ident.create_local "newrecord" in let update_field cont (lbl, definition) = match definition with - | Kept _type -> cont + | Kept _ -> cont | Overridden (_lid, expr) -> let upd = match repres with diff --git a/lambda/translmod.ml b/lambda/translmod.ml index bd9b7bedeb53..6a3f2caecc35 100644 --- a/lambda/translmod.ml +++ b/lambda/translmod.ml @@ -83,7 +83,7 @@ let rec apply_coercion loc strict restr arg = name_lambda strict arg (fun id -> let get_field pos = if pos < 0 then lambda_unit - else Lprim(Pfield pos,[Lvar id], loc) + else Lprim(Pfield (pos, Pointer, Mutable), [Lvar id], loc) in let lam = Lprim(Pmakeblock(0, Immutable, None), @@ -720,8 +720,8 @@ and transl_structure ~scopes loc fields cc rootpath final_env = function rebind_idents (pos + 1) (id :: newfields) ids in Llet(Alias, Pgenval, id, - Lprim(Pfield pos, [Lvar mid], - of_location ~scopes incl.incl_loc), body), + Lprim(Pfield (pos, Pointer, Mutable), + [Lvar mid], of_location ~scopes incl.incl_loc), body), size in let body, size = rebind_idents 0 fields ids in @@ -749,7 +749,7 @@ and transl_structure ~scopes loc fields cc rootpath final_env = function rebind_idents (pos + 1) (id :: newfields) ids in Llet(Alias, Pgenval, id, - Lprim(Pfield pos, [Lvar mid], + Lprim(Pfield (pos, Pointer, Mutable), [Lvar mid], of_location ~scopes od.open_loc), body), size in @@ -968,7 +968,8 @@ let transl_store_subst = ref Ident.Map.empty let nat_toplevel_name id = try match Ident.Map.find id !transl_store_subst with - | Lprim(Pfield pos, [Lprim(Pgetglobal glob, [], _)], _) -> (glob,pos) + | Lprim(Pfield (pos, _, _), + [Lprim(Pgetglobal glob, [], _)], _) -> (glob,pos) | _ -> raise Not_found with Not_found -> fatal_error("Translmod.nat_toplevel_name: " ^ Ident.unique_name id) @@ -1204,7 +1205,8 @@ let transl_store_structure ~scopes glob map prims aliases str = | [] -> transl_store ~scopes rootpath 
(add_idents true ids subst) cont rem | id :: idl -> - Llet(Alias, Pgenval, id, Lprim(Pfield pos, [Lvar mid], + Llet(Alias, Pgenval, id, + Lprim(Pfield (pos, Pointer, Mutable), [Lvar mid], of_location ~scopes loc), Lsequence(store_ident (of_location ~scopes loc) id, store_idents (pos + 1) idl)) @@ -1250,8 +1252,9 @@ let transl_store_structure ~scopes glob map prims aliases str = [] -> transl_store ~scopes rootpath (add_idents true ids subst) cont rem | id :: idl -> - Llet(Alias, Pgenval, id, Lprim(Pfield pos, [Lvar mid], - loc), + Llet(Alias, Pgenval, id, + Lprim(Pfield (pos, Pointer, Mutable), + [Lvar mid], loc), Lsequence(store_ident loc id, store_idents (pos + 1) idl)) in @@ -1285,7 +1288,7 @@ let transl_store_structure ~scopes glob map prims aliases str = match cc with Tcoerce_none -> Ident.Map.add id - (Lprim(Pfield pos, + (Lprim(Pfield (pos, Pointer, Immutable), [Lprim(Pgetglobal glob, [], Loc_unknown)], Loc_unknown)) subst @@ -1424,7 +1427,7 @@ let toplevel_name id = let toploop_getvalue id = Lapply{ ap_loc=Loc_unknown; - ap_func=Lprim(Pfield toploop_getvalue_pos, + ap_func=Lprim(Pfield (toploop_getvalue_pos, Pointer, Mutable), [Lprim(Pgetglobal toploop_ident, [], Loc_unknown)], Loc_unknown); ap_args=[Lconst(Const_base( @@ -1437,7 +1440,7 @@ let toploop_getvalue id = let toploop_setvalue id lam = Lapply{ ap_loc=Loc_unknown; - ap_func=Lprim(Pfield toploop_setvalue_pos, + ap_func=Lprim(Pfield (toploop_setvalue_pos, Pointer, Mutable), [Lprim(Pgetglobal toploop_ident, [], Loc_unknown)], Loc_unknown); ap_args= @@ -1522,7 +1525,8 @@ let transl_toplevel_item ~scopes item = lambda_unit | id :: ids -> Lsequence(toploop_setvalue id - (Lprim(Pfield pos, [Lvar mid], Loc_unknown)), + (Lprim(Pfield (pos, Pointer, Mutable), + [Lvar mid], Loc_unknown)), set_idents (pos + 1) ids) in Llet(Strict, Pgenval, mid, transl_module ~scopes Tcoerce_none None modl, set_idents 0 ids) @@ -1545,7 +1549,8 @@ let transl_toplevel_item ~scopes item = lambda_unit | id :: ids -> 
Lsequence(toploop_setvalue id - (Lprim(Pfield pos, [Lvar mid], Loc_unknown)), + (Lprim(Pfield (pos, Pointer, Mutable), + [Lvar mid], Loc_unknown)), set_idents (pos + 1) ids) in Llet(pure, Pgenval, mid, @@ -1648,7 +1653,8 @@ let transl_store_package component_names target_name coercion = (fun pos _id -> Lprim(Psetfield(pos, Pointer, Root_initialization), [Lprim(Pgetglobal target_name, [], Loc_unknown); - Lprim(Pfield pos, [Lvar blk], Loc_unknown)], + Lprim(Pfield (pos, Pointer, Mutable), + [Lvar blk], Loc_unknown)], Loc_unknown)) 0 pos_cc_list)) (* diff --git a/lambda/translobj.ml b/lambda/translobj.ml index d7f11beaca4a..422c55c3df36 100644 --- a/lambda/translobj.ml +++ b/lambda/translobj.ml @@ -125,7 +125,8 @@ let transl_label_init_flambda f = let transl_store_label_init glob size f arg = assert(not Config.flambda); assert(!Clflags.native_code); - method_cache := Lprim(Pfield size, + method_cache := Lprim(Pfield (size, Pointer, Mutable), + (* XXX KC: conservative *) [Lprim(Pgetglobal glob, [], Loc_unknown)], Loc_unknown); let expr = f arg in diff --git a/lambda/translprim.ml b/lambda/translprim.ml index 5770aa6c620d..dd1114c4e7bf 100644 --- a/lambda/translprim.ml +++ b/lambda/translprim.ml @@ -127,8 +127,8 @@ let primitives_table = "%loc_POS", Loc Loc_POS; "%loc_MODULE", Loc Loc_MODULE; "%loc_FUNCTION", Loc Loc_FUNCTION; - "%field0", Primitive ((Pfield 0), 1); - "%field1", Primitive ((Pfield 1), 1); + "%field0", Primitive (Pfield(0, Pointer, Mutable), 1); + "%field1", Primitive (Pfield(1, Pointer, Mutable), 1); "%setfield0", Primitive ((Psetfield(0, Pointer, Assignment)), 2); "%makeblock", Primitive ((Pmakeblock(0, Immutable, None)), 1); "%makemutable", Primitive ((Pmakeblock(0, Mutable, None)), 1); @@ -363,6 +363,16 @@ let primitives_table = "%greaterequal", Comparison(Greater_equal, Compare_generic); "%greaterthan", Comparison(Greater_than, Compare_generic); "%compare", Comparison(Compare, Compare_generic); + "%atomic_load", + Primitive ((Patomic_load 
{immediate_or_pointer=Pointer}), 1); + "%atomic_exchange", Primitive (Patomic_exchange, 2); + "%atomic_cas", Primitive (Patomic_cas, 3); + "%atomic_fetch_add", Primitive (Patomic_fetch_add, 2); + "%runstack", Primitive (Prunstack, 3); + "%reperform", Primitive (Preperform, 3); + "%perform", Primitive (Pperform, 1); + "%resume", Primitive (Presume, 3); + "%dls_get", Primitive (Pdls_get, 1); ] @@ -427,6 +437,12 @@ let specialize_primitive env ty ~has_constant_constructor prim = | Pointer -> None | Immediate -> Some (Primitive (Psetfield(n, Immediate, init), arity)) end + | Primitive (Pfield (n, Pointer, mut), arity), _ -> + (* try strength reduction based on the *result type* *) + let is_int = match is_function_type env ty with + | None -> Pointer + | Some (_p1, rhs) -> maybe_pointer_type env rhs in + Some (Primitive (Pfield (n, is_int, mut), arity)) | Primitive (Parraylength t, arity), [p] -> begin let array_type = glb_array_type t (array_type_kind env p) in if t = array_type then None @@ -472,6 +488,13 @@ let specialize_primitive env ty ~has_constant_constructor prim = if useful then Some (Primitive (Pmakeblock(tag, mut, Some shape), arity)) else None end + | Primitive (Patomic_load { immediate_or_pointer = Pointer }, + arity), _ ->begin + let is_int = match is_function_type env ty with + | None -> Pointer + | Some (_p1, rhs) -> maybe_pointer_type env rhs in + Some (Primitive (Patomic_load {immediate_or_pointer = is_int}, arity)) + end | Comparison(comp, Compare_generic), p1 :: _ -> if (has_constant_constructor && simplify_constant_constructor comp) then begin @@ -788,6 +811,7 @@ let lambda_primitive_needs_event_after = function | Pbytes_load_64 _ | Pbytes_set_16 _ | Pbytes_set_32 _ | Pbytes_set_64 _ | Pbigstring_load_16 _ | Pbigstring_load_32 _ | Pbigstring_load_64 _ | Pbigstring_set_16 _ | Pbigstring_set_32 _ | Pbigstring_set_64 _ + | Prunstack | Pperform | Preperform | Presume | Pbbswap _ -> true | Pbytes_to_string | Pbytes_of_string | Pignore | Psetglobal _ @@ 
-800,7 +824,9 @@ let lambda_primitive_needs_event_after = function | Pfloatcomp _ | Pstringlength | Pstringrefu | Pbyteslength | Pbytesrefu | Pbytessetu | Pmakearray ((Pintarray | Paddrarray | Pfloatarray), _) | Parraylength _ | Parrayrefu _ | Parraysetu _ | Pisint | Pisout - | Pintofbint _ | Pctconst _ | Pbswap16 | Pint_as_pointer | Popaque -> false + | Patomic_exchange | Patomic_cas | Patomic_fetch_add | Patomic_load _ + | Pintofbint _ | Pctconst _ | Pbswap16 | Pint_as_pointer | Popaque | Pdls_get + -> false (* Determine if a primitive should be surrounded by an "after" debug event *) let primitive_needs_event_after = function diff --git a/manual/src/library/libthreads.etex b/manual/src/library/libthreads.etex index 18f18402ed62..50173a21e289 100644 --- a/manual/src/library/libthreads.etex +++ b/manual/src/library/libthreads.etex @@ -28,8 +28,5 @@ the "-I +threads" option (see chapter~\ref{c:camlc}). \begin{linklist} \libdocitem{Thread}{lightweight threads} -\libdocitem{Mutex}{locks for mutual exclusion} -\libdocitem{Condition}{condition variables to synchronize between threads} -\libdocitem{Semaphore}{semaphores, another thread synchronization mechanism} \libdocitem{Event}{first-class synchronous communication} \end{linklist} diff --git a/manual/src/library/stdlib-blurb.etex b/manual/src/library/stdlib-blurb.etex index 8f2e00dfe0ab..f175b0efa117 100644 --- a/manual/src/library/stdlib-blurb.etex +++ b/manual/src/library/stdlib-blurb.etex @@ -103,6 +103,14 @@ be called from C \\ "Printexc" & p.~\stdpageref{Printexc} & a catch-all exception handler \\ "Sys" & p.~\stdpageref{Sys} & system interface \\ \end{tabular} +\subsubsection*{sss:stdlib-multicore}{Multicore interface:} +\begin{tabular}{lll} +"Domain" & p.~\stdpageref{Domain} & domain spawn and join \\ +"Mutex" & p.~\stdpageref{Mutex} & mutual exclusion locks \\ +"Condition" & p.~\stdpageref{Condition} & condition variables \\ +"Semaphore" & p.~\stdpageref{Semaphore} & semaphores \\ +"EffectHandlers" & 
p.~\stdpageref{EffectHandlers} & deep and shallow effect handlers \\ +\end{tabular} \subsubsection*{sss:stdlib-misc}{Misc:} \begin{tabular}{lll} "Fun" & p.~\stdpageref{Fun} & function values \\ @@ -122,7 +130,10 @@ be called from C \\ \stddocitem{Callback}{registering OCaml values with the C runtime} \stddocitem{Char}{character operations} \stddocitem{Complex}{complex numbers} +\stddocitem{Condition}{condition variables to synchronize between threads} +\stddocitem{Domain}{Domain spawn/join and domain local variables} \stddocitem{Digest}{MD5 message digest} +\stddocitem{EffectHandlers}{deep and shallow effect handlers} \stddocitem{Either}{either values} \stddocitem{Ephemeron}{Ephemerons and weak hash table} \stddocitem{Filename}{operations on file names} @@ -143,6 +154,7 @@ be called from C \\ \stddocitem{Map}{association tables over ordered types} \stddocitem{Marshal}{marshaling of data structures} \stddocitem{MoreLabels}{include modules \texttt{Hashtbl}, \texttt{Map} and \texttt{Set} with labels} +\stddocitem{Mutex}{locks for mutual exclusion} \stddocitem{Nativeint}{processor-native integers} \stddocitem{Oo}{object-oriented extension} \stddocitem{Option}{option values} @@ -156,6 +168,7 @@ be called from C \\ \stddocitem{Scanf}{formatted input functions} \stddocitem{Seq}{functional iterators} \stddocitem{Set}{sets over ordered types} +\stddocitem{Semaphore}{semaphores, another thread synchronization mechanism} \stddocitem{Stack}{last-in first-out stacks} \stddocitem{StdLabels}{include modules \texttt{Array}, \texttt{List} and \texttt{String} with labels} \stddocitem{Stream}{streams and parsers} diff --git a/middle_end/clambda_primitives.ml b/middle_end/clambda_primitives.ml index 3dd0587972a5..14758dcae4cd 100644 --- a/middle_end/clambda_primitives.ml +++ b/middle_end/clambda_primitives.ml @@ -34,13 +34,18 @@ type primitive = | Pread_symbol of string (* Operations on heap blocks *) | Pmakeblock of int * mutable_flag * block_shape - | Pfield of int + | Pfield of int 
* immediate_or_pointer * mutable_flag | Pfield_computed | Psetfield of int * immediate_or_pointer * initialization_or_assignment | Psetfield_computed of immediate_or_pointer * initialization_or_assignment | Pfloatfield of int | Psetfloatfield of int * initialization_or_assignment | Pduprecord of Types.record_representation * int + (* Context switches *) + | Prunstack + | Pperform + | Presume + | Preperform (* External call *) | Pccall of Primitive.description (* Exceptions *) @@ -114,8 +119,15 @@ type primitive = | Pbbswap of boxed_integer (* Integer to external pointer *) | Pint_as_pointer + (* Atomic operations *) + | Patomic_load of {immediate_or_pointer : immediate_or_pointer} + | Patomic_exchange + | Patomic_cas + | Patomic_fetch_add (* Inhibition of optimisation *) | Popaque + (* Fetch domain-local state *) + | Pdls_get and integer_comparison = Lambda.integer_comparison = Ceq | Cne | Clt | Cgt | Cle | Cge diff --git a/middle_end/clambda_primitives.mli b/middle_end/clambda_primitives.mli index a75cd048149d..3ada2d8d8ed8 100644 --- a/middle_end/clambda_primitives.mli +++ b/middle_end/clambda_primitives.mli @@ -34,13 +34,18 @@ type primitive = | Pread_symbol of string (* Operations on heap blocks *) | Pmakeblock of int * mutable_flag * block_shape - | Pfield of int + | Pfield of int * immediate_or_pointer * mutable_flag | Pfield_computed | Psetfield of int * immediate_or_pointer * initialization_or_assignment | Psetfield_computed of immediate_or_pointer * initialization_or_assignment | Pfloatfield of int | Psetfloatfield of int * initialization_or_assignment | Pduprecord of Types.record_representation * int + (* Context switches *) + | Prunstack + | Pperform + | Presume + | Preperform (* External call *) | Pccall of Primitive.description (* Exceptions *) @@ -117,8 +122,16 @@ type primitive = | Pbbswap of boxed_integer (* Integer to external pointer *) | Pint_as_pointer + (* Atomic operations *) + | Patomic_load of {immediate_or_pointer : immediate_or_pointer} + 
| Patomic_exchange + | Patomic_cas + | Patomic_fetch_add (* Inhibition of optimisation *) | Popaque + (* Fetch domain-local state *) + | Pdls_get + and integer_comparison = Lambda.integer_comparison = Ceq | Cne | Clt | Cgt | Cle | Cge diff --git a/middle_end/closure/closure.ml b/middle_end/closure/closure.ml index 74d8819525e0..bf73f487c62b 100644 --- a/middle_end/closure/closure.ml +++ b/middle_end/closure/closure.ml @@ -54,7 +54,8 @@ let rec build_closure_env env_param pos = function [] -> V.Map.empty | id :: rem -> V.Map.add id - (Uprim(P.Pfield pos, [Uvar env_param], Debuginfo.none)) + (Uprim(P.Pfield(pos, Pointer, Immutable), + [Uvar env_param], Debuginfo.none)) (build_closure_env env_param (pos+1) rem) (* Auxiliary for accessing globals. We change the name of the global @@ -479,10 +480,11 @@ let simplif_prim_pure ~backend fpc p (args, approxs) dbg = (Uprim(p, args, dbg), Value_tuple (Array.of_list approxs)) end (* Field access *) - | Pfield n, _, [ Value_const(Uconst_ref(_, Some (Uconst_block(_, l)))) ] + | Pfield (n, _, _), _, + [ Value_const(Uconst_ref(_, Some (Uconst_block(_, l)))) ] when n < List.length l -> make_const (List.nth l n) - | Pfield n, [ Uprim(P.Pmakeblock _, ul, _) ], [approx] + | Pfield(n, _, _), [ Uprim(P.Pmakeblock _, ul, _) ], [approx] when n < List.length ul -> (* This case is particularly useful for removing allocations for optional parameters *) @@ -849,7 +851,7 @@ let check_constant_result ulam approx = let glb = Uprim(P.Pread_symbol id, [], Debuginfo.none) in - Uprim(P.Pfield i, [glb], Debuginfo.none), approx + Uprim(P.Pfield(i, Pointer, Immutable), [glb], Debuginfo.none), approx end | _ -> (ulam, approx) @@ -1102,10 +1104,10 @@ let rec close ({ backend; fenv; cenv ; mutable_vars } as env) lam = let dbg = Debuginfo.from_location loc in check_constant_result (getglobal dbg id) (Compilenv.global_approx id) - | Lprim(Pfield n, [lam], loc) -> + | Lprim(Pfield (n, ptr, mut), [lam], loc) -> let (ulam, approx) = close env lam in let dbg = 
Debuginfo.from_location loc in - check_constant_result (Uprim(P.Pfield n, [ulam], dbg)) + check_constant_result (Uprim(P.Pfield (n, ptr, mut), [ulam], dbg)) (field_approx n approx) | Lprim(Psetfield(n, is_ptr, init), [Lprim(Pgetglobal id, [], _); lam], loc)-> let (ulam, approx) = close env lam in diff --git a/middle_end/convert_primitives.ml b/middle_end/convert_primitives.ml index 479663b941be..667465b7cbfd 100644 --- a/middle_end/convert_primitives.ml +++ b/middle_end/convert_primitives.ml @@ -26,7 +26,8 @@ let convert (prim : Lambda.primitive) : Clambda_primitives.primitive = match prim with | Pmakeblock (tag, mutability, shape) -> Pmakeblock (tag, mutability, shape) - | Pfield field -> Pfield field + | Pfield (field, imm_or_pointer, mutability) -> + Pfield (field, imm_or_pointer, mutability) | Pfield_computed -> Pfield_computed | Psetfield (field, imm_or_pointer, init_or_assign) -> Psetfield (field, imm_or_pointer, init_or_assign) @@ -36,6 +37,10 @@ let convert (prim : Lambda.primitive) : Clambda_primitives.primitive = | Psetfloatfield (field, init_or_assign) -> Psetfloatfield (field, init_or_assign) | Pduprecord (repr, size) -> Pduprecord (repr, size) + | Prunstack -> Prunstack + | Pperform -> Pperform + | Presume -> Presume + | Preperform -> Preperform | Pccall prim -> Pccall prim | Praise kind -> Praise kind | Psequand -> Psequand @@ -139,8 +144,13 @@ let convert (prim : Lambda.primitive) : Clambda_primitives.primitive = | Pbigarraydim dim -> Pbigarraydim dim | Pbswap16 -> Pbswap16 | Pint_as_pointer -> Pint_as_pointer + | Patomic_load { immediate_or_pointer } -> + Patomic_load { immediate_or_pointer } + | Patomic_exchange -> Patomic_exchange + | Patomic_cas -> Patomic_cas + | Patomic_fetch_add -> Patomic_fetch_add | Popaque -> Popaque - + | Pdls_get -> Pdls_get | Pbytes_to_string | Pbytes_of_string | Pctconst _ diff --git a/middle_end/flambda/build_export_info.ml b/middle_end/flambda/build_export_info.ml index 554b69a2e245..a3cb96d2519e 100644 --- 
a/middle_end/flambda/build_export_info.ml +++ b/middle_end/flambda/build_export_info.ml @@ -286,7 +286,7 @@ and descr_of_named (env : Env.t) (named : Flambda.named) Value_block (Tag.create_exn tag, Array.of_list approxs) in Value_id (Env.new_descr env descr) - | Prim (Pfield i, [arg], _) -> + | Prim (Pfield (i, _, _), [arg], _) -> begin match Env.get_descr env (Env.find_approx env arg) with | Some (Value_block (_, fields)) when Array.length fields > i -> fields.(i) | _ -> Value_unknown diff --git a/middle_end/flambda/closure_conversion.ml b/middle_end/flambda/closure_conversion.ml index b218ed40dcca..7b811166afa9 100644 --- a/middle_end/flambda/closure_conversion.ml +++ b/middle_end/flambda/closure_conversion.ml @@ -91,7 +91,8 @@ let tupled_function_call_stub original_params unboxed_version ~closure_bound_var let _, body = List.fold_left (fun (pos, body) param -> let lam : Flambda.named = - Prim (Pfield pos, [tuple_param_var], Debuginfo.none) + Prim (Pfield (pos, Pointer, Mutable), + [tuple_param_var], Debuginfo.none) in pos + 1, Flambda.create_let param lam body) (0, call) params @@ -698,9 +699,10 @@ let lambda_to_flambda ~backend ~module_ident ~size lam Flambda.create_let sym_v (Symbol block_symbol) (Flambda.create_let result_v - (Prim (Pfield 0, [sym_v], Debuginfo.none)) + (Prim (Pfield (0, Pointer, Mutable), [sym_v], Debuginfo.none)) (Flambda.create_let value_v - (Prim (Pfield pos, [result_v], Debuginfo.none)) + (Prim (Pfield (pos, Pointer, Mutable), + [result_v], Debuginfo.none)) (Var value_v)))) in let module_initializer : Flambda.program_body = diff --git a/middle_end/flambda/extract_projections.ml b/middle_end/flambda/extract_projections.ml index 33cd473ecdf2..a368b3bd7f7c 100644 --- a/middle_end/flambda/extract_projections.ml +++ b/middle_end/flambda/extract_projections.ml @@ -124,7 +124,7 @@ let rec analyse_expr ~which_variables expr = when Variable.Map.mem move.closure which_variables -> projections := Projection.Set.add (Move_within_set_of_closures 
move) !projections - | Prim (Pfield field_index, [var], _dbg) + | Prim (Pfield (field_index, _, _), [var], _dbg) when Variable.Map.mem var which_variables -> projections := Projection.Set.add (Field (field_index, var)) !projections diff --git a/middle_end/flambda/flambda_to_clambda.ml b/middle_end/flambda/flambda_to_clambda.ml index 79ae25caa3f4..70d0f72ee4ce 100644 --- a/middle_end/flambda/flambda_to_clambda.ml +++ b/middle_end/flambda/flambda_to_clambda.ml @@ -369,7 +369,8 @@ and to_clambda_named t env var (named : Flambda.named) : Clambda.ulambda = Flambda.print_named named end | Read_symbol_field (symbol, field) -> - Uprim (Pfield field, [to_clambda_symbol env symbol], Debuginfo.none) + Uprim (Pfield (field, Pointer, Mutable), + [to_clambda_symbol env symbol], Debuginfo.none) | Set_of_closures set_of_closures -> to_clambda_set_of_closures t env set_of_closures | Project_closure { set_of_closures; closure_id } -> @@ -394,12 +395,13 @@ and to_clambda_named t env var (named : Flambda.named) : Clambda.ulambda = let fun_offset = get_fun_offset t closure_id in let var_offset = get_fv_offset t var in let pos = var_offset - fun_offset in - Uprim (Pfield pos, + Uprim (Pfield (pos, Pointer, Mutable), [check_field t (check_closure t ulam (Expr (Var closure))) pos (Some named)], Debuginfo.none) - | Prim (Pfield index, [block], dbg) -> - Uprim (Pfield index, [check_field t (subst_var env block) index None], dbg) + | Prim (Pfield (index, ptr, mut), [block], dbg) -> + Uprim (Pfield (index, ptr, mut), + [check_field t (subst_var env block) index None], dbg) | Prim (Psetfield (index, maybe_ptr, init), [block; new_value], dbg) -> Uprim (Psetfield (index, maybe_ptr, init), [ check_field t (subst_var env block) index None; @@ -512,7 +514,8 @@ and to_clambda_set_of_closures t env in let pos = var_offset - fun_offset in Env.add_subst env id - (Uprim (Pfield pos, [Clambda.Uvar env_var], Debuginfo.none)) + (Uprim (Pfield (pos, Pointer, Mutable), + [Clambda.Uvar env_var], 
Debuginfo.none)) in let env = Variable.Map.fold add_env_free_variable free_vars env in (* Add the Clambda expressions for all functions defined in the current diff --git a/middle_end/flambda/flambda_utils.ml b/middle_end/flambda/flambda_utils.ml index 39b299eff24c..01172ecb6046 100644 --- a/middle_end/flambda/flambda_utils.ml +++ b/middle_end/flambda/flambda_utils.ml @@ -546,7 +546,7 @@ let substitute_read_symbol_field_for_variables Expr ( Flambda.create_let block (make_named t) (Flambda.create_let field - (Prim (Pfield h, [block], Debuginfo.none)) + (Prim (Pfield (h, Pointer, Mutable), [block], Debuginfo.none)) (Var field))) in Flambda.create_let fresh_var (make_named path) expr @@ -905,7 +905,7 @@ let projection_to_named (projection : Projection.t) : Flambda.named = | Project_closure project_closure -> Project_closure project_closure | Move_within_set_of_closures move -> Move_within_set_of_closures move | Field (field_index, var) -> - Prim (Pfield field_index, [var], Debuginfo.none) + Prim (Pfield (field_index, Pointer, Mutable), [var], Debuginfo.none) type specialised_to_same_as = | Not_specialised diff --git a/middle_end/flambda/inline_and_simplify.ml b/middle_end/flambda/inline_and_simplify.ml index e2f8c3f4a89a..c5c51bf8af07 100644 --- a/middle_end/flambda/inline_and_simplify.ml +++ b/middle_end/flambda/inline_and_simplify.ml @@ -991,7 +991,7 @@ and simplify_named env r (tree : Flambda.named) : Flambda.named * R.t = let tree = Flambda.Prim (prim, args, dbg) in begin match prim, args, args_approxs with (* CR-someday mshinwell: Optimise [Pfield_computed]. 
*) - | Pfield field_index, [arg], [arg_approx] -> + | Pfield (field_index, _, _), [arg], [arg_approx] -> let projection : Projection.t = Field (field_index, arg) in begin match E.find_projection env ~projection with | Some var -> diff --git a/middle_end/flambda/lift_constants.ml b/middle_end/flambda/lift_constants.ml index dd60de9ce2fc..e514a31d906b 100644 --- a/middle_end/flambda/lift_constants.ml +++ b/middle_end/flambda/lift_constants.ml @@ -89,7 +89,7 @@ let assign_symbols_and_collect_constant_definitions | Project_closure ({ closure_id } as project_closure) -> assign_existing_symbol (closure_symbol ~backend closure_id); record_definition (AA.Project_closure project_closure) - | Prim (Pfield index, [block], _) -> + | Prim (Pfield (index, _, _), [block], _) -> record_definition (AA.Field (block, index)) | Prim (Pfield _, _, _) -> Misc.fatal_errorf "[Pfield] with the wrong number of arguments" diff --git a/middle_end/flambda/ref_to_variables.ml b/middle_end/flambda/ref_to_variables.ml index aa2a73c63055..746374e88521 100644 --- a/middle_end/flambda/ref_to_variables.ml +++ b/middle_end/flambda/ref_to_variables.ml @@ -155,7 +155,7 @@ let eliminate_ref_of_expr flam = flam and aux_named (named : Flambda.named) : Flambda.named = match named with - | Prim(Pfield field, [v], _) + | Prim(Pfield (field, _, _), [v], _) when convertible_variable v -> (match get_variable v field with | None -> Expr Proved_unreachable diff --git a/middle_end/internal_variable_names.ml b/middle_end/internal_variable_names.ml index 1b7dd6254c4c..55a1deee1487 100644 --- a/middle_end/internal_variable_names.ml +++ b/middle_end/internal_variable_names.ml @@ -169,6 +169,16 @@ let psubfloat = "Psubfloat" let psubint = "Psubint" let pxorbint = "Pxorbint" let pxorint = "Pxorint" +let patomic_cas = "Patomic_cas" +let patomic_exchange = "Patomic_exchange" +let patomic_fetch_add = "Patomic_fetch_add" +let patomic_load = "Patomic_load" +let prunstack = "Prunstack" +let pperform = "Pperform" +let presume = 
"Presume" +let preperform = "Preperform" +let pdls_get = "Pdls_get" + let pabsfloat_arg = "Pabsfloat_arg" let paddbint_arg = "Paddbint_arg" let paddfloat_arg = "Paddfloat_arg" @@ -271,6 +281,16 @@ let psubfloat_arg = "Psubfloat_arg" let psubint_arg = "Psubint_arg" let pxorbint_arg = "Pxorbint_arg" let pxorint_arg = "Pxorint_arg" +let patomic_cas_arg = "Patomic_cas_arg" +let patomic_exchange_arg = "Patomic_exchange_arg" +let patomic_fetch_add_arg = "Patomic_fetch_add_arg" +let patomic_load_arg = "Patomic_load_arg" +let prunstack_arg = "Prunstack_arg" +let pperform_arg = "Pperform_arg" +let presume_arg = "Presume_arg" +let preperform_arg = "Preperform_arg" +let pdls_get_arg = "Pdls_get_arg" + let raise = "raise" let raise_arg = "raise_arg" let read_mutable = "read_mutable" @@ -405,6 +425,15 @@ let of_primitive : Lambda.primitive -> string = function | Pbbswap _ -> pbbswap | Pint_as_pointer -> pint_as_pointer | Popaque -> popaque + | Patomic_cas -> patomic_cas + | Patomic_exchange -> patomic_exchange + | Patomic_fetch_add -> patomic_fetch_add + | Patomic_load _ -> patomic_load + | Prunstack -> prunstack + | Pperform -> pperform + | Presume -> presume + | Preperform -> preperform + | Pdls_get -> pdls_get let of_primitive_arg : Lambda.primitive -> string = function | Pbytes_of_string -> pbytes_of_string_arg @@ -508,3 +537,12 @@ let of_primitive_arg : Lambda.primitive -> string = function | Pbbswap _ -> pbbswap_arg | Pint_as_pointer -> pint_as_pointer_arg | Popaque -> popaque_arg + | Patomic_cas -> patomic_cas_arg + | Patomic_exchange -> patomic_exchange_arg + | Patomic_fetch_add -> patomic_fetch_add_arg + | Patomic_load _ -> patomic_load_arg + | Prunstack -> prunstack_arg + | Pperform -> pperform_arg + | Presume -> presume_arg + | Preperform -> preperform_arg + | Pdls_get -> pdls_get_arg diff --git a/middle_end/printclambda_primitives.ml b/middle_end/printclambda_primitives.ml index 2e94989155f0..52060b800df4 100644 --- a/middle_end/printclambda_primitives.ml +++ 
b/middle_end/printclambda_primitives.ml @@ -61,7 +61,14 @@ let primitive ppf (prim:Clambda_primitives.primitive) = fprintf ppf "makeblock %i%a" tag Printlambda.block_shape shape | Pmakeblock(tag, Mutable, shape) -> fprintf ppf "makemutable %i%a" tag Printlambda.block_shape shape - | Pfield n -> fprintf ppf "field %i" n + | Pfield(n, ptr, mut) -> + let instr = + match ptr, mut with + | Immediate, _ -> "field_int " + | Pointer, Mutable -> "field_mut " + | Pointer, Immutable -> "field_imm " + in + fprintf ppf "%s%i" instr n | Pfield_computed -> fprintf ppf "field_computed" | Psetfield(n, ptr, init) -> let instr = @@ -100,6 +107,10 @@ let primitive ppf (prim:Clambda_primitives.primitive) = fprintf ppf "setfloatfield%s %i" init n | Pduprecord (rep, size) -> fprintf ppf "duprecord %a %i" Printlambda.record_rep rep size + | Prunstack -> fprintf ppf "runstack" + | Pperform -> fprintf ppf "perform" + | Presume -> fprintf ppf "resume" + | Preperform -> fprintf ppf "reperform" | Pccall p -> fprintf ppf "%s" p.Primitive.prim_name | Praise k -> fprintf ppf "%s" (Lambda.raise_kind k) | Psequand -> fprintf ppf "&&" @@ -202,4 +213,12 @@ let primitive ppf (prim:Clambda_primitives.primitive) = | Pbswap16 -> fprintf ppf "bswap16" | Pbbswap(bi) -> print_boxed_integer "bswap" ppf bi | Pint_as_pointer -> fprintf ppf "int_as_pointer" + | Patomic_load {immediate_or_pointer} -> + (match immediate_or_pointer with + | Immediate -> fprintf ppf "atomic_load_imm" + | Pointer -> fprintf ppf "atomic_load_ptr") + | Patomic_exchange -> fprintf ppf "atomic_exchange" + | Patomic_cas -> fprintf ppf "atomic_cas" + | Patomic_fetch_add -> fprintf ppf "atomic_fetch_add" | Popaque -> fprintf ppf "opaque" + | Pdls_get -> fprintf ppf "dls_get" diff --git a/middle_end/semantics_of_primitives.ml b/middle_end/semantics_of_primitives.ml index 47ed8c3e594d..c4efbf3cd1f9 100644 --- a/middle_end/semantics_of_primitives.ml +++ b/middle_end/semantics_of_primitives.ml @@ -35,6 +35,8 @@ let for_primitive (prim : 
Clambda_primitives.primitive) = No_effects, No_coeffects | Pccall _ -> Arbitrary_effects, Has_coeffects | Praise _ -> Arbitrary_effects, No_coeffects + | Prunstack | Pperform | Presume | Preperform -> + Arbitrary_effects, Has_coeffects | Pnot | Pnegint | Paddint @@ -115,6 +117,10 @@ let for_primitive (prim : Clambda_primitives.primitive) = | Psetfield _ | Psetfield_computed _ | Psetfloatfield _ + | Patomic_load _ + | Patomic_exchange + | Patomic_cas + | Patomic_fetch_add | Parraysetu _ | Parraysets _ | Pbytessetu @@ -133,6 +139,9 @@ let for_primitive (prim : Clambda_primitives.primitive) = | Psequor -> (* Removed by [Closure_conversion] in the flambda pipeline. *) No_effects, No_coeffects + | Pdls_get -> + (* only read *) + No_effects, No_coeffects type return_type = | Float diff --git a/ocaml-variants.opam b/ocaml-variants.opam index 261ca9b8adaf..2ff79defd4be 100644 --- a/ocaml-variants.opam +++ b/ocaml-variants.opam @@ -7,14 +7,57 @@ depends: [ "base-unix" {post} "base-bigarray" {post} "base-threads" {post} + "base-domains" {post} + "ocaml-option-nnp" ] conflict-class: "ocaml-core-compiler" flags: compiler build: [ - ["./configure" "--prefix=%{prefix}%" "--docdir=%{doc}%/ocaml"] + [ + "./configure" + "--prefix=%{prefix}%" + "--docdir=%{doc}%/ocaml" + "--with-afl" {ocaml-option-afl:installed} + "--disable-native-compiler" {ocaml-option-bytecode-only:installed} + "--disable-force-safe-string" {ocaml-option-default-unsafe-string:installed} + "DEFAULT_STRING=unsafe" {ocaml-option-default-unsafe-string:installed} + "--disable-flat-float-array" {ocaml-option-no-flat-float-array:installed} + "--enable-flambda" {ocaml-option-flambda:installed} + "--enable-frame-pointers" {ocaml-option-fp:installed} + "CC=cc" {!ocaml-option-32bit:installed & !ocaml-option-musl:installed & (os="openbsd"|os="macos")} + "CC=musl-gcc" {ocaml-option-musl:installed & os-distribution!="alpine"} + "CFLAGS=-Os" {ocaml-option-musl:installed} + #"CC=gcc -m32" {ocaml-option-32bit:installed & 
os="linux"} + #"CC=gcc -Wl,-read_only_relocs,suppress -arch i386 -m32" {ocaml-option-32bit:installed & os="macos"} + "ASPP=cc -c" {!ocaml-option-32bit:installed & !ocaml-option-musl:installed & (os="openbsd"|os="macos")} + "ASPP=musl-gcc -c" {ocaml-option-musl:installed & os-distribution!="alpine"} + #"ASPP=gcc -m32 -c" {ocaml-option-32bit:installed & os="linux"} + #"ASPP=gcc -arch i386 -m32 -c" {ocaml-option-32bit:installed & os="macos"} + #"AS=as --32" {ocaml-option-32bit:installed & os="linux"} + #"AS=as -arch i386" {ocaml-option-32bit:installed & os="macos"} + #"--host=i386-linux" {ocaml-option-32bit:installed & os="linux"} + #"--host=i386-apple-darwin13.2.0" {ocaml-option-32bit:installed & os="macos"} + #"PARTIALLD=ld -r -melf_i386" {ocaml-option-32bit:installed & os="linux"} + # 32bit options above commented out just to reduce diff with ocaml-variants.4.12.0+options + "LIBS=-static" {ocaml-option-static:installed} + ] [make "-j%{jobs}%"] ] install: [make "install"] +conflicts: [ + "ocaml-option-32bit" # Not yet implemented + "ocaml-option-nnpchecker" # Fundamentally not possible +] +depopts: [ + "ocaml-option-afl" + "ocaml-option-bytecode-only" + "ocaml-option-default-unsafe-string" + "ocaml-option-no-flat-float-array" + "ocaml-option-flambda" + "ocaml-option-fp" + "ocaml-option-musl" + "ocaml-option-static" +] maintainer: "caml-list@inria.fr" homepage: "https://github.com/ocaml/ocaml/" bug-reports: "https://github.com/ocaml/ocaml/issues" diff --git a/ocamltest/main.ml b/ocamltest/main.ml index ea7a99d6bdee..7d1c48d61a31 100644 --- a/ocamltest/main.ml +++ b/ocamltest/main.ml @@ -67,24 +67,27 @@ let tsl_block_of_file_safe test_filename = let print_usage () = Printf.printf "%s\n%!" 
Options.usage -type result_summary = No_failure | Some_failure +type result_summary = No_failure | Some_failure | All_skipped +let join_result summary result = + let open Result in + match result.status, summary with + | Fail, _ + | _, Some_failure -> Some_failure + | Skip, All_skipped -> All_skipped + | _ -> No_failure + let join_summaries sa sb = match sa, sb with - | Some_failure, _ | _, Some_failure -> Some_failure - | No_failure, No_failure -> No_failure - -let summary_of_result res = - let open Result in - match res.status with - | Pass -> No_failure - | Skip -> No_failure - | Fail -> Some_failure + | Some_failure, _ + | _, Some_failure -> Some_failure + | All_skipped, All_skipped -> All_skipped + | _ -> No_failure let rec run_test log common_prefix path behavior = function Node (testenvspec, test, env_modifiers, subtrees) -> Printf.printf "%s %s (%s) => %!" common_prefix path test.Tests.test_name; - let (msg, children_behavior, summary) = match behavior with - | Skip_all_tests -> "n/a", Skip_all_tests, No_failure + let (msg, children_behavior, result) = match behavior with + | Skip_all_tests -> "n/a", Skip_all_tests, Result.skip | Run env -> let testenv0 = interpret_environment_statements env testenvspec in let testenv = List.fold_left apply_modifiers testenv0 env_modifiers in @@ -92,14 +95,13 @@ let rec run_test log common_prefix path behavior = function let msg = Result.string_of_result result in let children_behavior = if Result.is_pass result then Run newenv else Skip_all_tests in - let summary = summary_of_result result in - (msg, children_behavior, summary) in + (msg, children_behavior, result) in Printf.printf "%s\n%!" 
msg; - join_summaries summary - (run_test_trees log common_prefix path children_behavior subtrees) + join_result + (run_test_trees log common_prefix path children_behavior subtrees) result and run_test_trees log common_prefix path behavior trees = - List.fold_left join_summaries No_failure + List.fold_left join_summaries All_skipped (List.mapi (run_test_i log common_prefix path behavior) trees) and run_test_i log common_prefix path behavior i test_tree = @@ -127,6 +129,7 @@ let init_tests_to_skip () = tests_to_skip := String.words (Sys.safe_getenv "OCAMLTEST_SKIP_TESTS") let test_file test_filename = + let start = Unix.gettimeofday () in let skip_test = List.mem test_filename !tests_to_skip in let tsl_block = tsl_block_of_file_safe test_filename in let (rootenv_statements, test_trees) = test_trees_of_tsl_block tsl_block in @@ -209,10 +212,14 @@ let test_file test_filename = | Some_failure -> if not Options.log_to_stderr then Sys.dump_file stderr ~prefix:"> " log_filename - | No_failure -> + | No_failure | All_skipped -> if not Options.keep_test_dir_on_success then clean_test_build_directory () - end + end; + if Options.show_timings && summary = No_failure then + let wall_clock_duration = Unix.gettimeofday () -. start in + Printf.eprintf "Wall clock: %s took %.02fs\n%!" 
+ test_filename wall_clock_duration let is_test s = match tsl_block_of_file s with diff --git a/ocamltest/ocaml_compilers.ml b/ocamltest/ocaml_compilers.ml index a47c2ae6f721..51f00a017143 100644 --- a/ocamltest/ocaml_compilers.ml +++ b/ocamltest/ocaml_compilers.ml @@ -40,12 +40,12 @@ class compiler method target = target method program_variable = - if Ocaml_backends.is_native host + if Ocaml_backends.is_native host && not Sys.win32 then Builtin_variables.program2 else Builtin_variables.program method program_output_variable = - if Ocaml_backends.is_native host + if Ocaml_backends.is_native host && not Sys.win32 then None else Some Builtin_variables.output diff --git a/ocamltest/ocaml_tests.ml b/ocamltest/ocaml_tests.ml index 47603f660d4f..a1d0602b7aa6 100644 --- a/ocamltest/ocaml_tests.ml +++ b/ocamltest/ocaml_tests.ml @@ -20,38 +20,65 @@ open Builtin_actions open Ocaml_actions let bytecode = - let opt_actions = + let byte_build = + [ + setup_ocamlc_byte_build_env; + ocamlc_byte; + check_ocamlc_byte_output + ] in + let opt_build = [ setup_ocamlc_opt_build_env; ocamlc_opt; - check_ocamlc_opt_output; - compare_bytecode_programs + check_ocamlc_opt_output ] in { test_name = "bytecode"; test_run_by_default = true; test_actions = + (if Sys.win32 && Ocamltest_config.arch<>"none" then + opt_build + else + byte_build) @ [ - setup_ocamlc_byte_build_env; - ocamlc_byte; - check_ocamlc_byte_output; run; check_program_output; - ] @ (if Ocamltest_config.native_compiler then opt_actions else []) + ] @ + (if not Sys.win32 && Ocamltest_config.native_compiler then + opt_build @ [compare_bytecode_programs] + else + [] + ) } let native = - let opt_actions = + let byte_build = [ setup_ocamlopt_byte_build_env; ocamlopt_byte; check_ocamlopt_byte_output; - run; - check_program_output; + ] in + let opt_build = + [ setup_ocamlopt_opt_build_env; ocamlopt_opt; check_ocamlopt_opt_output; ] in + let opt_actions = + (if Sys.win32 then + opt_build + else + byte_build + ) @ + [ + run; + 
check_program_output; + ] @ + (if not Sys.win32 then + opt_build + else + [] + ) in { test_name = "native"; test_run_by_default = true; diff --git a/ocamltest/ocamltest_unix.mli b/ocamltest/ocamltest_unix.mli index 1a111fd9d02d..2a7fc4156b7c 100644 --- a/ocamltest/ocamltest_unix.mli +++ b/ocamltest/ocamltest_unix.mli @@ -18,3 +18,4 @@ val has_symlink : unit -> bool val symlink : ?to_dir:bool -> string -> string -> unit val chmod : string -> int -> unit +val gettimeofday : unit -> float diff --git a/ocamltest/ocamltest_unix_dummy.ml b/ocamltest/ocamltest_unix_dummy.ml index 32b805992c89..fc5d351d5f2e 100644 --- a/ocamltest/ocamltest_unix_dummy.ml +++ b/ocamltest/ocamltest_unix_dummy.ml @@ -16,3 +16,4 @@ let has_symlink () = false let symlink ?to_dir:_ _ _ = invalid_arg "symlink not available" let chmod _ _ = invalid_arg "chmod not available" +let gettimeofday () = invalid_arg "gettimeofday not available" diff --git a/ocamltest/ocamltest_unix_real.ml b/ocamltest/ocamltest_unix_real.ml index 322b911f9a5a..6a270906a81f 100644 --- a/ocamltest/ocamltest_unix_real.ml +++ b/ocamltest/ocamltest_unix_real.ml @@ -12,8 +12,9 @@ (* *) (**************************************************************************) -(* Unix.has_symlink never raises *) +(* Unix.gettimeofday and Unix.has_symlink never raise *) let has_symlink = Unix.has_symlink +let gettimeofday = Unix.gettimeofday (* Convert Unix_error to Sys_error *) let wrap f x = diff --git a/ocamltest/options.ml b/ocamltest/options.ml index d10820193e33..4df329614b17 100644 --- a/ocamltest/options.ml +++ b/ocamltest/options.ml @@ -57,6 +57,8 @@ let find_test_dirs = ref [] let list_tests = ref [] +let show_timings = ref false + let add_to_list r x = r := !r @ [x] @@ -68,6 +70,8 @@ let commandline_options = ("-show-actions", Arg.Unit show_actions, " Show available actions."); ("-show-tests", Arg.Unit show_tests, " Show available tests."); ("-show-variables", Arg.Unit show_variables, " Show available variables."); + 
("-show-timings", Arg.Set show_timings, + " Show the wall clock time taken for each test file."); ("-timeout", Arg.Int (fun t -> if t >= 0 then default_timeout := t @@ -95,3 +99,4 @@ let default_timeout = !default_timeout let find_test_dirs = !find_test_dirs let list_tests = !list_tests let keep_test_dir_on_success = !keep_test_dir_on_success +let show_timings = !show_timings diff --git a/ocamltest/options.mli b/ocamltest/options.mli index 56da374e6762..7b7c5943a488 100644 --- a/ocamltest/options.mli +++ b/ocamltest/options.mli @@ -30,3 +30,5 @@ val find_test_dirs : string list val list_tests : string list val keep_test_dir_on_success : bool + +val show_timings : bool diff --git a/otherlibs/dynlink/dynlink_common.ml b/otherlibs/dynlink/dynlink_common.ml index c7247c4644dd..3f5933aeb024 100644 --- a/otherlibs/dynlink/dynlink_common.ml +++ b/otherlibs/dynlink/dynlink_common.ml @@ -20,6 +20,14 @@ open! Dynlink_compilerlibs +(* Dynlink is only allowed on the main domain. + Entrypoints to public functions should check for this. *) +let is_dynlink_allowed () = + if not (Domain.is_main_domain ()) then + failwith "Dynlink can only be called from the main domain." 
+ else + () + module String = struct include Misc.Stdlib.String @@ -79,6 +87,7 @@ module Make (P : Dynlink_platform_intf.S) = struct let unsafe_allowed = ref false let allow_unsafe_modules b = + is_dynlink_allowed(); unsafe_allowed := b let check_symbols_disjoint ~descr syms1 syms2 = @@ -137,6 +146,7 @@ module Make (P : Dynlink_platform_intf.S) = struct global_state := state let init () = + is_dynlink_allowed(); if not !inited then begin P.init (); default_available_units (); @@ -270,6 +280,7 @@ module Make (P : Dynlink_platform_intf.S) = struct end let set_allowed_units allowed_units = + is_dynlink_allowed(); let allowed_units = String.Set.of_list allowed_units in let state = let state = !global_state in @@ -280,6 +291,7 @@ module Make (P : Dynlink_platform_intf.S) = struct global_state := state let allow_only units = + is_dynlink_allowed(); let allowed_units = String.Set.inter (!global_state).allowed_units (String.Set.of_list units) @@ -293,6 +305,7 @@ module Make (P : Dynlink_platform_intf.S) = struct global_state := state let prohibit units = + is_dynlink_allowed(); let allowed_units = String.Set.diff (!global_state).allowed_units (String.Set.of_list units) diff --git a/otherlibs/str/Makefile b/otherlibs/str/Makefile index e49e9f6ec326..8b93f67dcc82 100644 --- a/otherlibs/str/Makefile +++ b/otherlibs/str/Makefile @@ -24,6 +24,8 @@ include ../Makefile.otherlibs.common str.cmo: str.cmi str.cmx: str.cmi +LDOPTS = $(PTHREAD_LINK) + .PHONY: depend depend: $(OCAMLRUN) $(ROOTDIR)/boot/ocamlc -depend -slash *.mli *.ml > .depend diff --git a/otherlibs/str/str.ml b/otherlibs/str/str.ml index b8acd97c1bfb..5be37c18feb6 100644 --- a/otherlibs/str/str.ml +++ b/otherlibs/str/str.ml @@ -595,51 +595,54 @@ external re_search_forward: regexp -> string -> int -> int array external re_search_backward: regexp -> string -> int -> int array = "re_search_backward" -let last_search_result = ref [||] +let last_search_result_key = Domain.DLS.new_key (fun () -> [||]) let string_match re s 
pos = let res = re_string_match re s pos in - last_search_result := res; + Domain.DLS.set last_search_result_key res; Array.length res > 0 let string_partial_match re s pos = let res = re_partial_match re s pos in - last_search_result := res; + Domain.DLS.set last_search_result_key res; Array.length res > 0 let search_forward re s pos = let res = re_search_forward re s pos in - last_search_result := res; + Domain.DLS.set last_search_result_key res; if Array.length res = 0 then raise Not_found else res.(0) let search_backward re s pos = let res = re_search_backward re s pos in - last_search_result := res; + Domain.DLS.set last_search_result_key res; if Array.length res = 0 then raise Not_found else res.(0) let group_beginning n = + let last_search_result = Domain.DLS.get last_search_result_key in let n2 = n + n in - if n < 0 || n2 >= Array.length !last_search_result then + if n < 0 || n2 >= Array.length last_search_result then invalid_arg "Str.group_beginning" else - let pos = !last_search_result.(n2) in + let pos = last_search_result.(n2) in if pos = -1 then raise Not_found else pos let group_end n = + let last_search_result = Domain.DLS.get last_search_result_key in let n2 = n + n in - if n < 0 || n2 >= Array.length !last_search_result then + if n < 0 || n2 >= Array.length last_search_result then invalid_arg "Str.group_end" else - let pos = !last_search_result.(n2 + 1) in + let pos = last_search_result.(n2 + 1) in if pos = -1 then raise Not_found else pos let matched_group n txt = + let last_search_result = Domain.DLS.get last_search_result_key in let n2 = n + n in - if n < 0 || n2 >= Array.length !last_search_result then + if n < 0 || n2 >= Array.length last_search_result then invalid_arg "Str.matched_group" else - let b = !last_search_result.(n2) - and e = !last_search_result.(n2 + 1) in + let b = last_search_result.(n2) + and e = last_search_result.(n2 + 1) in if b = -1 then raise Not_found else String.sub txt b (e - b) let match_beginning () = group_beginning 
0 @@ -652,7 +655,8 @@ external re_replacement_text: string -> int array -> string -> string = "re_replacement_text" let replace_matched repl matched = - re_replacement_text repl !last_search_result matched + let last_search_result = Domain.DLS.get last_search_result_key in + re_replacement_text repl last_search_result matched let substitute_first expr repl_fun text = try diff --git a/otherlibs/systhreads/.depend b/otherlibs/systhreads/.depend index 3bd0a0078f52..68f4b2f351ee 100644 --- a/otherlibs/systhreads/.depend +++ b/otherlibs/systhreads/.depend @@ -1,34 +1,8 @@ -condition.cmo : \ - mutex.cmi \ - condition.cmi -condition.cmx : \ - mutex.cmx \ - condition.cmi -condition.cmi : \ - mutex.cmi event.cmo : \ - mutex.cmi \ - condition.cmi \ event.cmi event.cmx : \ - mutex.cmx \ - condition.cmx \ event.cmi event.cmi : -mutex.cmo : \ - mutex.cmi -mutex.cmx : \ - mutex.cmi -mutex.cmi : -semaphore.cmo : \ - mutex.cmi \ - condition.cmi \ - semaphore.cmi -semaphore.cmx : \ - mutex.cmx \ - condition.cmx \ - semaphore.cmi -semaphore.cmi : thread.cmo : \ thread.cmi thread.cmx : \ diff --git a/otherlibs/systhreads/Makefile b/otherlibs/systhreads/Makefile index 8fc1bdb92631..4e8a53257f3c 100644 --- a/otherlibs/systhreads/Makefile +++ b/otherlibs/systhreads/Makefile @@ -22,7 +22,7 @@ ifneq "$(CCOMPTYPE)" "msvc" OC_CFLAGS += -g endif -OC_CFLAGS += $(SHAREDLIB_CFLAGS) +OC_CFLAGS += $(SHAREDLIB_CFLAGS) $(PTHREAD_CFLAGS) OC_CPPFLAGS += -I$(ROOTDIR)/runtime @@ -50,14 +50,12 @@ LIBNAME=threads BYTECODE_C_OBJS=st_stubs.b.$(O) NATIVECODE_C_OBJS=st_stubs.n.$(O) -THREADS_SOURCES = thread.ml mutex.ml condition.ml event.ml threadUnix.ml \ - semaphore.ml +THREADS_SOURCES = thread.ml event.ml threadUnix.ml THREADS_BCOBJS = $(THREADS_SOURCES:.ml=.cmo) THREADS_NCOBJS = $(THREADS_SOURCES:.ml=.cmx) -MLIFILES=thread.mli mutex.mli condition.mli event.mli threadUnix.mli \ - semaphore.mli +MLIFILES=thread.mli event.mli threadUnix.mli CMIFILES=$(MLIFILES:.mli=.cmi) diff --git 
a/otherlibs/systhreads/st_posix.h b/otherlibs/systhreads/st_posix.h index 957f4717b6fe..f7fdf14ea124 100644 --- a/otherlibs/systhreads/st_posix.h +++ b/otherlibs/systhreads/st_posix.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #ifdef __linux__ #include @@ -30,13 +31,15 @@ typedef int st_retcode; -#define SIGPREEMPTION SIGVTALRM +/* Variables used to stop "tick" threads */ +static atomic_uintnat tick_thread_stop[Max_domains]; +#define Tick_thread_stop tick_thread_stop[Caml_state->id] /* OS-specific initialization */ static int st_initialize(void) { - caml_sigmask_hook = pthread_sigmask; + atomic_store_rel(&Tick_thread_stop, 0); return 0; } @@ -44,6 +47,7 @@ static int st_initialize(void) typedef pthread_t st_thread_id; + static int st_thread_create(st_thread_id * res, void * (*fn)(void *), void * arg) { @@ -75,7 +79,11 @@ CAMLnoreturn_end; static void st_thread_exit(void) { +#ifdef _WIN32 + ExitThread(0); +#else pthread_exit(NULL); +#endif } static void st_thread_join(st_thread_id thr) @@ -115,44 +123,76 @@ Caml_inline void st_thread_set_id(intnat id) threads. 
*/ typedef struct { - pthread_mutex_t lock; /* to protect contents */ - int busy; /* 0 = free, 1 = taken */ - volatile int waiters; /* number of threads waiting on master lock */ - pthread_cond_t is_free; /* signaled when free */ + pthread_mutex_t lock; /* to protect contents */ + uintnat busy; /* 0 = free, 1 = taken */ + atomic_uintnat waiters; /* number of threads waiting on master lock */ + pthread_cond_t is_free; /* signaled when free */ } st_masterlock; static void st_masterlock_init(st_masterlock * m) { + pthread_mutex_init(&m->lock, NULL); pthread_cond_init(&m->is_free, NULL); m->busy = 1; - m->waiters = 0; + atomic_store_rel(&m->waiters, 0); + + return; +}; + +static void st_bt_lock_acquire(st_masterlock *m) { + + /* We do not want to signal the backup thread is it is not "working" + as it may very well not be, because we could have just resumed + execution from another thread right away. */ + if (caml_bt_is_in_blocking_section()) { + caml_bt_enter_ocaml(); + } + + caml_acquire_domain_lock(); + + return; } -static void st_masterlock_acquire(st_masterlock * m) +static void st_bt_lock_release(st_masterlock *m) { + + /* Here we do want to signal the backup thread iff there's + no thread waiting to be scheduled, and the backup thread is currently + idle. 
*/ + if (atomic_load_acq(&m->waiters) == 0 && + caml_bt_is_in_blocking_section() == 0) { + caml_bt_exit_ocaml(); + } + + caml_release_domain_lock(); + + return; +} + +static void st_masterlock_acquire(st_masterlock *m) { pthread_mutex_lock(&m->lock); while (m->busy) { - m->waiters ++; + atomic_fetch_add(&m->waiters, +1); pthread_cond_wait(&m->is_free, &m->lock); - m->waiters --; + atomic_fetch_add(&m->waiters, -1); } m->busy = 1; + st_bt_lock_acquire(m); pthread_mutex_unlock(&m->lock); + + return; } static void st_masterlock_release(st_masterlock * m) { pthread_mutex_lock(&m->lock); m->busy = 0; - pthread_mutex_unlock(&m->lock); + st_bt_lock_release(m); pthread_cond_signal(&m->is_free); -} + pthread_mutex_unlock(&m->lock); -CAMLno_tsan /* This can be called for reading [waiters] without locking. */ -Caml_inline int st_masterlock_waiters(st_masterlock * m) -{ - return m->waiters; + return; } /* Scheduling hints */ @@ -166,21 +206,30 @@ Caml_inline int st_masterlock_waiters(st_masterlock * m) */ Caml_inline void st_thread_yield(st_masterlock * m) { + uintnat waiters; + pthread_mutex_lock(&m->lock); /* We must hold the lock to call this. */ - assert(m->busy); /* We already checked this without the lock, but we might have raced--if there's no waiter, there's nothing to do and no one to wake us if we did wait, so just keep going. 
*/ - if (m->waiters == 0) { + waiters = atomic_load_acq(&m->waiters); + + if (waiters == 0) { pthread_mutex_unlock(&m->lock); return; } m->busy = 0; + atomic_fetch_add(&m->waiters, +1); pthread_cond_signal(&m->is_free); - m->waiters++; + /* releasing the domain lock but not triggering bt messaging + messaging the bt should not be required because yield assumes + that a thread will resume execution (be it the yielding thread + or a waiting thread */ + caml_release_domain_lock(); + do { /* Note: the POSIX spec prevents the above signal from pairing with this wait, which is good: we'll reliably continue waiting until the next @@ -188,106 +237,15 @@ Caml_inline void st_thread_yield(st_masterlock * m) wakeup, which are rare at best.) */ pthread_cond_wait(&m->is_free, &m->lock); } while (m->busy); - m->busy = 1; - m->waiters--; - pthread_mutex_unlock(&m->lock); -} - -/* Mutexes */ - -typedef pthread_mutex_t * st_mutex; - -static int st_mutex_create(st_mutex * res) -{ - int rc; - pthread_mutexattr_t attr; - st_mutex m; - - rc = pthread_mutexattr_init(&attr); - if (rc != 0) goto error1; - rc = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); - if (rc != 0) goto error2; - m = caml_stat_alloc_noexc(sizeof(pthread_mutex_t)); - if (m == NULL) { rc = ENOMEM; goto error2; } - rc = pthread_mutex_init(m, &attr); - if (rc != 0) goto error3; - pthread_mutexattr_destroy(&attr); - *res = m; - return 0; -error3: - caml_stat_free(m); -error2: - pthread_mutexattr_destroy(&attr); -error1: - return rc; -} - -static int st_mutex_destroy(st_mutex m) -{ - int rc; - rc = pthread_mutex_destroy(m); - caml_stat_free(m); - return rc; -} -#define MUTEX_DEADLOCK EDEADLK - -Caml_inline int st_mutex_lock(st_mutex m) -{ - return pthread_mutex_lock(m); -} - -#define MUTEX_PREVIOUSLY_UNLOCKED 0 -#define MUTEX_ALREADY_LOCKED EBUSY - -Caml_inline int st_mutex_trylock(st_mutex m) -{ - return pthread_mutex_trylock(m); -} - -#define MUTEX_NOT_OWNED EPERM - -Caml_inline int st_mutex_unlock(st_mutex 
m) -{ - return pthread_mutex_unlock(m); -} - -/* Condition variables */ - -typedef pthread_cond_t * st_condvar; - -static int st_condvar_create(st_condvar * res) -{ - int rc; - st_condvar c = caml_stat_alloc_noexc(sizeof(pthread_cond_t)); - if (c == NULL) return ENOMEM; - rc = pthread_cond_init(c, NULL); - if (rc != 0) { caml_stat_free(c); return rc; } - *res = c; - return 0; -} - -static int st_condvar_destroy(st_condvar c) -{ - int rc; - rc = pthread_cond_destroy(c); - caml_stat_free(c); - return rc; -} + m->busy = 1; + atomic_fetch_add(&m->waiters, -1); -Caml_inline int st_condvar_signal(st_condvar c) -{ - return pthread_cond_signal(c); -} + caml_acquire_domain_lock(); -Caml_inline int st_condvar_broadcast(st_condvar c) -{ - return pthread_cond_broadcast(c); -} + pthread_mutex_unlock(&m->lock); -Caml_inline int st_condvar_wait(st_condvar c, st_mutex m) -{ - return pthread_cond_wait(c, m); + return; } /* Triggered events */ @@ -298,6 +256,7 @@ typedef struct st_event_struct { pthread_cond_t triggered; /* signaled when triggered */ } * st_event; + static int st_event_create(st_event * res) { int rc; @@ -347,49 +306,27 @@ static int st_event_wait(st_event e) return rc; } -/* Reporting errors */ - -static void st_check_error(int retcode, char * msg) -{ - char * err; - int errlen, msglen; - value str; - - if (retcode == 0) return; - if (retcode == ENOMEM) caml_raise_out_of_memory(); - err = strerror(retcode); - msglen = strlen(msg); - errlen = strlen(err); - str = caml_alloc_string(msglen + 2 + errlen); - memmove (&Byte(str, 0), msg, msglen); - memmove (&Byte(str, msglen), ": ", 2); - memmove (&Byte(str, msglen + 2), err, errlen); - caml_raise_sys_error(str); -} - -/* Variable used to stop the "tick" thread */ -static volatile int caml_tick_thread_stop = 0; - -/* The tick thread: posts a SIGPREEMPTION signal periodically */ +/* The tick thread: interrupt the domain periodically to force preemption */ static void * caml_thread_tick(void * arg) { + caml_domain_state 
*domain; + uintnat *domain_id = (uintnat *) arg; struct timeval timeout; - sigset_t mask; - /* Block all signals so that we don't try to execute an OCaml signal handler*/ - sigfillset(&mask); - pthread_sigmask(SIG_BLOCK, &mask, NULL); - while(! caml_tick_thread_stop) { + caml_init_domain_self(*domain_id); + domain = Caml_state; + + caml_domain_set_name("Tick"); + while(! atomic_load_acq(&Tick_thread_stop)) { /* select() seems to be the most efficient way to suspend the thread for sub-second intervals */ timeout.tv_sec = 0; timeout.tv_usec = Thread_timeout * 1000; select(0, NULL, NULL, NULL, &timeout); - /* The preemption signal should never cause a callback, so don't - go through caml_handle_signal(), just record signal delivery via - caml_record_signal(). */ - caml_record_signal(SIGPREEMPTION); + + atomic_store_rel((atomic_uintnat*)&domain->requested_external_interrupt, 1); + caml_interrupt_self(); } return NULL; } @@ -408,11 +345,13 @@ int pthread_atfork(void (*prepare)(void), void (*parent)(void), static int st_atfork(void (*fn)(void)) { - return pthread_atfork(NULL, NULL, fn); + caml_atfork_hook = fn; + return 0; } /* Signal handling */ +#ifndef _WIN32 static void st_decode_sigset(value vset, sigset_t * set) { sigemptyset(set); @@ -445,9 +384,11 @@ static value st_encode_sigset(sigset_t * set) } static int sigmask_cmd[3] = { SIG_SETMASK, SIG_BLOCK, SIG_UNBLOCK }; +#endif value caml_thread_sigmask(value cmd, value sigs) /* ML */ { +#ifndef _WIN32 int how; sigset_t set, oldset; int retcode; @@ -457,10 +398,14 @@ value caml_thread_sigmask(value cmd, value sigs) /* ML */ caml_enter_blocking_section(); retcode = pthread_sigmask(how, &set, &oldset); caml_leave_blocking_section(); - st_check_error(retcode, "Thread.sigmask"); + sync_check_error(retcode, "Thread.sigmask"); /* Run any handlers for just-unmasked pending signals */ - caml_process_pending_actions(); + caml_process_pending_signals(); return st_encode_sigset(&oldset); +#else + 
caml_invalid_argument("Thread.sigmask not implemented"); + return Val_int(0); /* not reached */ +#endif } value caml_wait_signal(value sigs) /* ML */ @@ -473,7 +418,7 @@ value caml_wait_signal(value sigs) /* ML */ caml_enter_blocking_section(); retcode = sigwait(&set, &signo); caml_leave_blocking_section(); - st_check_error(retcode, "Thread.wait_signal"); + sync_check_error(retcode, "Thread.wait_signal"); return Val_int(caml_rev_convert_signal_number(signo)); #else caml_invalid_argument("Thread.wait_signal not implemented"); diff --git a/otherlibs/systhreads/st_stubs.c b/otherlibs/systhreads/st_stubs.c index b7a6a9a6bb5e..29b169486161 100644 --- a/otherlibs/systhreads/st_stubs.c +++ b/otherlibs/systhreads/st_stubs.c @@ -22,6 +22,7 @@ #include "caml/debugger.h" #include "caml/domain.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/io.h" #include "caml/memory.h" #include "caml/misc.h" @@ -29,22 +30,12 @@ #include "caml/printexc.h" #include "caml/roots.h" #include "caml/signals.h" -#ifdef NATIVE_CODE -#include "caml/stack.h" -#else -#include "caml/stacks.h" -#endif #include "caml/sys.h" #include "caml/memprof.h" /* threads.h is *not* included since it contains the _external_ declarations for the caml_c_thread_register and caml_c_thread_unregister functions. */ -#ifndef NATIVE_CODE -/* Initial size of bytecode stack when a thread is created (4 Ko) */ -#define Thread_stack_size (Stack_size / 4) -#endif - /* Max computation time before rescheduling, in milliseconds */ #define Thread_timeout 50 @@ -70,59 +61,72 @@ struct caml_thread_descr { /* The infos on threads (allocated via caml_stat_alloc()) */ struct caml_thread_struct { + value descr; /* The heap-allocated descriptor (root) */ - struct caml_thread_struct * next; /* Double linking of running threads */ + struct caml_thread_struct * next; /* Doubly-linked list of running threads */ struct caml_thread_struct * prev; -#ifdef NATIVE_CODE - char * top_of_stack; /* Top of stack for this thread (approx.) 
*/ - char * bottom_of_stack; /* Saved value of Caml_state->bottom_of_stack */ - uintnat last_retaddr; /* Saved value of Caml_state->last_return_address */ - value * gc_regs; /* Saved value of Caml_state->gc_regs */ - char * exception_pointer; /* Saved value of Caml_state->exception_pointer */ - struct caml__roots_block * local_roots; /* Saved value of local_roots */ - struct longjmp_buffer * exit_buf; /* For thread exit */ -#else - value * stack_low; /* The execution stack for this thread */ - value * stack_high; - value * stack_threshold; - value * sp; /* Saved value of Caml_state->extern_sp for this thread */ - value * trapsp; /* Saved value of Caml_state->trapsp for this thread */ - /* Saved value of Caml_state->local_roots */ - struct caml__roots_block * local_roots; - struct longjmp_buffer * external_raise; /* Saved Caml_state->external_raise */ + int domain_id; /* The id of the domain to which this thread belongs */ + struct stack_info* current_stack; /* saved Caml_state->current_stack */ + struct c_stack_link* c_stack; /* saved Caml_state->c_stack */ + struct caml__roots_block *local_roots; /* saved value of local_roots */ + struct longjmp_buffer *exit_buf; /* For thread exit */ + int backtrace_pos; /* saved value of Caml_state->backtrace_pos */ + code_t * backtrace_buffer; /* saved value of Caml_state->backtrace_buffer */ + value backtrace_last_exn; + /* saved value of Caml_state->backtrace_last_exn (root) */ + value * gc_regs; /* saved value of Caml_state->gc_regs */ + value * gc_regs_buckets; /* saved value of Caml_state->gc_regs_buckets */ + value ** gc_regs_slot; /* saved value of Caml_state->gc_regs_slot */ + void * exn_handler; /* saved value of Caml_state->exn_handler */ + +#ifndef NATIVE_CODE + intnat trap_sp_off; /* saved value of Caml_state->trap_sp_off */ + intnat trap_barrier_off; /* saved value of Caml_state->trap_barrier_off */ + struct caml_exception_context* external_raise; + /* saved value of Caml_state->external_raise */ +#endif + +#ifdef 
POSIX_SIGNALS + sigset_t init_mask; #endif - int backtrace_pos; /* Saved Caml_state->backtrace_pos */ - backtrace_slot * backtrace_buffer; /* Saved Caml_state->backtrace_buffer */ - value backtrace_last_exn; /* Saved Caml_state->backtrace_last_exn (root) */ - struct caml_memprof_th_ctx *memprof_ctx; }; -typedef struct caml_thread_struct * caml_thread_t; +typedef struct caml_thread_struct* caml_thread_t; + +/* overall table for threads accross domains */ +struct caml_thread_table { + caml_thread_t all_threads; + caml_thread_t current_thread; + st_tlskey thread_key; + st_masterlock thread_lock; + int tick_thread_running; + st_thread_id tick_thread_id; +}; + +/* thread_table instance, up to Max_domains */ +static struct caml_thread_table thread_table[Max_domains]; -/* The "head" of the circular list of thread descriptors */ -static caml_thread_t all_threads = NULL; +/* the "head" of the circular list of thread descriptors for this domain */ +#define All_threads thread_table[Caml_state->id].all_threads -/* The descriptor for the currently executing thread */ -static caml_thread_t curr_thread = NULL; +/* The descriptor for the currently executing thread for this domain */ +#define Current_thread thread_table[Caml_state->id].current_thread -/* The master lock protecting the OCaml runtime system */ -static st_masterlock caml_master_lock; +/* The master lock protecting this domain's thread chaining */ +#define Thread_main_lock thread_table[Caml_state->id].thread_lock -/* Whether the "tick" thread is already running */ -static int caml_tick_thread_running = 0; +/* Whether the "tick" thread is already running for this domain */ +#define Tick_thread_running thread_table[Caml_state->id].tick_thread_running -/* The thread identifier of the "tick" thread */ -static st_thread_id caml_tick_thread_id; +/* The thread identifier of the "tick" thread for this domain */ +#define Tick_thread_id thread_table[Caml_state->id].tick_thread_id /* The key used for storing the thread descriptor 
in the specific data of the corresponding system thread. */ -static st_tlskey thread_descriptor_key; - -/* The key used for unlocking I/O channels on exceptions */ -static st_tlskey last_channel_locked_key; +#define Thread_key thread_table[Caml_state->id].thread_key /* Identifier for next thread creation */ -static intnat thread_next_ident = 0; +static atomic_uintnat thread_next_id = 0; /* Forward declarations */ static value caml_threadstatus_new (void); @@ -137,99 +141,86 @@ extern void (*caml_termination_hook)(void); /* Hook for scanning the stacks of the other threads */ -static void (*prev_scan_roots_hook) (scanning_action); - -static void caml_thread_scan_roots(scanning_action action) -{ - caml_thread_t th = curr_thread; - do { - (*action)(th->descr, &th->descr); - (*action)(th->backtrace_last_exn, &th->backtrace_last_exn); - /* Don't rescan the stack of the current thread, it was done already */ - if (th != curr_thread) { -#ifdef NATIVE_CODE - if (th->bottom_of_stack != NULL) - caml_do_local_roots(action, th->bottom_of_stack, th->last_retaddr, - th->gc_regs, th->local_roots); -#else - caml_do_local_roots(action, th->sp, th->stack_high, th->local_roots); -#endif - } - th = th->next; - } while (th != curr_thread); - /* Hook */ - if (prev_scan_roots_hook != NULL) (*prev_scan_roots_hook)(action); -} - -/* Hook for iterating over Memprof's entries arrays */ +static void (*prev_scan_roots_hook) (scanning_action, void *, + caml_domain_state *); -static void memprof_ctx_iter(th_ctx_action f, void* data) +static void caml_thread_scan_roots(scanning_action action, + void *fdata, + caml_domain_state *domain_state) { - caml_thread_t th = curr_thread; - do { - f(th->memprof_ctx, data); - th = th->next; - } while (th != curr_thread); -} - -/* Saving and restoring runtime state in curr_thread */ + caml_thread_t th; -Caml_inline void caml_thread_save_runtime_state(void) -{ -#ifdef NATIVE_CODE - curr_thread->top_of_stack = Caml_state->top_of_stack; - 
curr_thread->bottom_of_stack = Caml_state->bottom_of_stack; - curr_thread->last_retaddr = Caml_state->last_return_address; - curr_thread->gc_regs = Caml_state->gc_regs; - curr_thread->exception_pointer = Caml_state->exception_pointer; -#else - curr_thread->stack_low = Caml_state->stack_low; - curr_thread->stack_high = Caml_state->stack_high; - curr_thread->stack_threshold = Caml_state->stack_threshold; - curr_thread->sp = Caml_state->extern_sp; - curr_thread->trapsp = Caml_state->trapsp; - curr_thread->external_raise = Caml_state->external_raise; + th = Current_thread; + + /* GC could be triggered before [Current_thread] is initialized */ + if (th != NULL) { + do { + (*action)(fdata, th->descr, &th->descr); + (*action)(fdata, th->backtrace_last_exn, &th->backtrace_last_exn); + if (th != Current_thread) { + if (th->current_stack != NULL) + caml_do_local_roots(action, fdata, th->local_roots, + th->current_stack, th->gc_regs); + } + th = th->next; + } while (th != Current_thread); + + }; + + if (prev_scan_roots_hook != NULL) + (*prev_scan_roots_hook)(action, fdata, domain_state); + + return; +} + +void caml_thread_save_runtime_state(void) +{ + Current_thread->current_stack = Caml_state->current_stack; + Current_thread->c_stack = Caml_state->c_stack; + Current_thread->gc_regs = Caml_state->gc_regs; + Current_thread->gc_regs_buckets = Caml_state->gc_regs_buckets; + Current_thread->gc_regs_slot = Caml_state->gc_regs_slot; + Current_thread->exn_handler = Caml_state->exn_handler; + Current_thread->local_roots = Caml_state->local_roots; + Current_thread->backtrace_pos = Caml_state->backtrace_pos; + Current_thread->backtrace_buffer = Caml_state->backtrace_buffer; + Current_thread->backtrace_last_exn = Caml_state->backtrace_last_exn; +#ifndef NATIVE_CODE + Current_thread->trap_sp_off = Caml_state->trap_sp_off; + Current_thread->trap_barrier_off = Caml_state->trap_barrier_off; + Current_thread->external_raise = Caml_state->external_raise; #endif - curr_thread->local_roots = 
Caml_state->local_roots; - curr_thread->backtrace_pos = Caml_state->backtrace_pos; - curr_thread->backtrace_buffer = Caml_state->backtrace_buffer; - curr_thread->backtrace_last_exn = Caml_state->backtrace_last_exn; - caml_memprof_leave_thread(); } -Caml_inline void caml_thread_restore_runtime_state(void) +void caml_thread_restore_runtime_state(void) { -#ifdef NATIVE_CODE - Caml_state->top_of_stack = curr_thread->top_of_stack; - Caml_state->bottom_of_stack= curr_thread->bottom_of_stack; - Caml_state->last_return_address = curr_thread->last_retaddr; - Caml_state->gc_regs = curr_thread->gc_regs; - Caml_state->exception_pointer = curr_thread->exception_pointer; -#else - Caml_state->stack_low = curr_thread->stack_low; - Caml_state->stack_high = curr_thread->stack_high; - Caml_state->stack_threshold = curr_thread->stack_threshold; - Caml_state->extern_sp = curr_thread->sp; - Caml_state->trapsp = curr_thread->trapsp; - Caml_state->external_raise = curr_thread->external_raise; + Caml_state->current_stack = Current_thread->current_stack; + Caml_state->c_stack = Current_thread->c_stack; + Caml_state->gc_regs = Current_thread->gc_regs; + Caml_state->gc_regs_buckets = Current_thread->gc_regs_buckets; + Caml_state->gc_regs_slot = Current_thread->gc_regs_slot; + Caml_state->exn_handler = Current_thread->exn_handler; + Caml_state->local_roots = Current_thread->local_roots; + Caml_state->backtrace_pos = Current_thread->backtrace_pos; + Caml_state->backtrace_buffer = Current_thread->backtrace_buffer; + Caml_state->backtrace_last_exn = Current_thread->backtrace_last_exn; +#ifndef NATIVE_CODE + Caml_state->trap_sp_off = Current_thread->trap_sp_off; + Caml_state->trap_barrier_off = Current_thread->trap_barrier_off; + Caml_state->external_raise = Current_thread->external_raise; #endif - Caml_state->local_roots = curr_thread->local_roots; - Caml_state->backtrace_pos = curr_thread->backtrace_pos; - Caml_state->backtrace_buffer = curr_thread->backtrace_buffer; - 
Caml_state->backtrace_last_exn = curr_thread->backtrace_last_exn; - caml_memprof_enter_thread(curr_thread->memprof_ctx); } /* Hooks for caml_enter_blocking_section and caml_leave_blocking_section */ - static void caml_thread_enter_blocking_section(void) { /* Save the current runtime state in the thread descriptor of the current thread */ + Current_thread = st_tls_get(Thread_key); caml_thread_save_runtime_state(); /* Tell other threads that the runtime is free */ - st_masterlock_release(&caml_master_lock); + st_masterlock_release(&Thread_main_lock); } static void caml_thread_leave_blocking_section(void) @@ -241,10 +232,10 @@ static void caml_thread_leave_blocking_section(void) DWORD error = GetLastError(); #endif /* Wait until the runtime is free */ - st_masterlock_acquire(&caml_master_lock); - /* Update curr_thread to point to the thread descriptor corresponding - to the thread currently executing */ - curr_thread = st_tls_get(thread_descriptor_key); + st_masterlock_acquire(&Thread_main_lock); + /* Update Current_thread to point to the thread descriptor corresponding to + the thread currently executing */ + Current_thread = st_tls_get(Thread_key); /* Restore the runtime state from the curr_thread descriptor */ caml_thread_restore_runtime_state(); #ifdef _WIN32 @@ -252,80 +243,6 @@ static void caml_thread_leave_blocking_section(void) #endif } -/* Hooks for I/O locking */ - -static void caml_io_mutex_free(struct channel *chan) -{ - st_mutex mutex = chan->mutex; - if (mutex != NULL) { - st_mutex_destroy(mutex); - chan->mutex = NULL; - } -} - -static void caml_io_mutex_lock(struct channel *chan) -{ - st_mutex mutex = chan->mutex; - - if (mutex == NULL) { - st_check_error(st_mutex_create(&mutex), "channel locking"); /*PR#7038*/ - chan->mutex = mutex; - } - /* PR#4351: first try to acquire mutex without releasing the master lock */ - if (st_mutex_trylock(mutex) == MUTEX_PREVIOUSLY_UNLOCKED) { - st_tls_set(last_channel_locked_key, (void *) chan); - return; - } - /* If 
unsuccessful, block on mutex */ - caml_enter_blocking_section(); - st_mutex_lock(mutex); - /* Problem: if a signal occurs at this point, - and the signal handler raises an exception, we will not - unlock the mutex. The alternative (doing the setspecific - before locking the mutex is also incorrect, since we could - then unlock a mutex that is unlocked or locked by someone else. */ - st_tls_set(last_channel_locked_key, (void *) chan); - caml_leave_blocking_section(); -} - -static void caml_io_mutex_unlock(struct channel *chan) -{ - st_mutex_unlock(chan->mutex); - st_tls_set(last_channel_locked_key, NULL); -} - -static void caml_io_mutex_unlock_exn(void) -{ - struct channel * chan = st_tls_get(last_channel_locked_key); - if (chan != NULL) caml_io_mutex_unlock(chan); -} - -/* Hook for estimating stack usage */ - -static uintnat (*prev_stack_usage_hook)(void); - -static uintnat caml_thread_stack_usage(void) -{ - uintnat sz; - caml_thread_t th; - - /* Don't add stack for current thread, this is done elsewhere */ - for (sz = 0, th = curr_thread->next; - th != curr_thread; - th = th->next) { -#ifdef NATIVE_CODE - if(th->top_of_stack != NULL && th->bottom_of_stack != NULL && - th->top_of_stack > th->bottom_of_stack) - sz += (value *) th->top_of_stack - (value *) th->bottom_of_stack; -#else - sz += th->stack_high - th->sp; -#endif - } - if (prev_stack_usage_hook != NULL) - sz += prev_stack_usage_hook(); - return sz; -} - /* Create and setup a new thread info block. This block has no associated thread descriptor and is not inserted in the list of threads. 
*/ @@ -333,30 +250,38 @@ static uintnat caml_thread_stack_usage(void) static caml_thread_t caml_thread_new_info(void) { caml_thread_t th; - th = (caml_thread_t) caml_stat_alloc_noexc(sizeof(struct caml_thread_struct)); + caml_domain_state *domain_state; + + domain_state = Caml_state; + th = NULL; + th = (caml_thread_t)caml_stat_alloc_noexc(sizeof(struct caml_thread_struct)); if (th == NULL) return NULL; - th->descr = Val_unit; /* filled later */ -#ifdef NATIVE_CODE - th->bottom_of_stack = NULL; - th->top_of_stack = NULL; - th->last_retaddr = 1; - th->exception_pointer = NULL; + th->descr = Val_unit; + th->next = NULL; + th->prev = NULL; + th->domain_id = domain_state->id; + th->current_stack = caml_alloc_main_stack(Stack_size / sizeof(value));; + if (th->current_stack == NULL) { + caml_stat_free(th); + return NULL; + } + th->c_stack = NULL; th->local_roots = NULL; th->exit_buf = NULL; -#else - /* Allocate the stacks */ - th->stack_low = (value *) caml_stat_alloc(Thread_stack_size); - th->stack_high = th->stack_low + Thread_stack_size / sizeof(value); - th->stack_threshold = th->stack_low + Stack_threshold / sizeof(value); - th->sp = th->stack_high; - th->trapsp = th->stack_high; - th->local_roots = NULL; - th->external_raise = NULL; -#endif th->backtrace_pos = 0; th->backtrace_buffer = NULL; th->backtrace_last_exn = Val_unit; - th->memprof_ctx = caml_memprof_new_th_ctx(); + th->gc_regs = NULL; + th->gc_regs_buckets = NULL; + th->gc_regs_slot = NULL; + th->exn_handler = NULL; + +#ifndef NATIVE_CODE + th->trap_sp_off = 1; + th->trap_barrier_off = 2; + th->external_raise = NULL; +#endif + return th; } @@ -371,10 +296,9 @@ static value caml_thread_new_descriptor(value clos) mu = caml_threadstatus_new(); /* Create a descriptor for the new thread */ descr = caml_alloc_small(3, 0); - Ident(descr) = Val_long(thread_next_ident); + Ident(descr) = Val_long(atomic_fetch_add(&thread_next_id, +1)); Start_closure(descr) = clos; Terminated(descr) = mu; - thread_next_ident++; 
End_roots(); return descr; } @@ -385,180 +309,279 @@ static value caml_thread_new_descriptor(value clos) static void caml_thread_remove_info(caml_thread_t th) { if (th->next == th) - all_threads = NULL; /* last OCaml thread exiting */ - else if (all_threads == th) - all_threads = th->next; /* PR#5295 */ + All_threads = NULL; /* last OCaml thread exiting */ + else if (All_threads == th) + All_threads = th->next; /* PR#5295 */ th->next->prev = th->prev; th->prev->next = th->next; -#ifndef NATIVE_CODE - caml_stat_free(th->stack_low); -#endif - if (th->backtrace_buffer != NULL) caml_stat_free(th->backtrace_buffer); + caml_free_stack(th->current_stack); caml_stat_free(th); + return; } /* Reinitialize the thread machinery after a fork() (PR#4577) */ +/* TODO(engil): more work on the multicore fork machinery. */ static void caml_thread_reinitialize(void) { - struct channel * chan; + caml_thread_t th, next; - /* Remove all other threads (now nonexistent) - from the doubly-linked list of threads */ - while (curr_thread->next != curr_thread) { - caml_memprof_delete_th_ctx(curr_thread->next->memprof_ctx); - caml_thread_remove_info(curr_thread->next); + th = Current_thread->next; + while (th != Current_thread) { + next = th->next; + caml_free_stack(th->current_stack); + caml_stat_free(th); + th = next; } - - /* Reinitialize the master lock machinery, - just in case the fork happened while other threads were doing - caml_leave_blocking_section */ - st_masterlock_init(&caml_master_lock); - /* Tick thread is not currently running in child process, will be - re-created at next Thread.create */ - caml_tick_thread_running = 0; - /* Destroy all IO mutexes; will be reinitialized on demand */ - for (chan = caml_all_opened_channels; - chan != NULL; - chan = chan->next) { - if (chan->mutex != NULL) { - st_mutex_destroy(chan->mutex); - chan->mutex = NULL; + Current_thread->next = Current_thread; + Current_thread->prev = Current_thread; + All_threads = Current_thread; + + /* Within the 
child, the domain_lock needs to be reset and acquired. */ + caml_reset_domain_lock(); + caml_acquire_domain_lock(); + /* The master lock needs to be initialized again. This process will also be + the effective owner of the lock. So there is no need to run + st_masterlock_acquire (busy = 1) */ + st_masterlock_init(&Thread_main_lock); +} + +CAMLprim value caml_thread_join(value th); + +/* This hook is run when a domain shuts down (see domains.c). + + When a domain shuts down, the state must be cleared to allow proper reuse of + the domain slot the next time a domain is started on this slot. If a program + is single-domain, we mimic OCaml 4's behavior and do not care about ongoing + thread: the program will exit. */ +static void caml_thread_domain_stop_hook(void) { + /* If the program runs multiple domains, we should not let systhreads to hang + around when a domain exit. If the domain is not the last one (and the last + one will always be domain 0) we force the domain to join on every thread + on its chain before wrapping up. 
*/ + if (!caml_domain_alone()) { + + while (Current_thread->next != Current_thread) { + caml_thread_join(Current_thread->next->descr); } - } + + /* another domain thread may be joining on this domain's descriptor */ + caml_threadstatus_terminate(Terminated(Current_thread->descr)); + + caml_stat_free(Current_thread); + Current_thread = NULL; + All_threads = NULL; + }; } -/* Initialize the thread machinery */ +#ifdef NATIVE_CODE +static void caml_thread_termination_hook(void) { + st_thread_exit(); +} +#endif /* NATIVE_CODE */ -CAMLprim value caml_thread_initialize(value unit) /* ML */ +CAMLprim value caml_thread_initialize_domain(value v) { - /* Protect against repeated initialization (PR#3532) */ - if (curr_thread != NULL) return Val_unit; + CAMLparam0(); + + caml_thread_t new_thread; + /* OS-specific initialization */ st_initialize(); - /* Initialize and acquire the master lock */ - st_masterlock_init(&caml_master_lock); - /* Initialize the keys */ - st_tls_newkey(&thread_descriptor_key); - st_tls_newkey(&last_channel_locked_key); - /* Set up a thread info block for the current thread */ - curr_thread = + + st_masterlock_init(&Thread_main_lock); + + new_thread = (caml_thread_t) caml_stat_alloc(sizeof(struct caml_thread_struct)); - curr_thread->descr = caml_thread_new_descriptor(Val_unit); - curr_thread->next = curr_thread; - curr_thread->prev = curr_thread; - all_threads = curr_thread; - curr_thread->backtrace_last_exn = Val_unit; + + new_thread->descr = caml_thread_new_descriptor(Val_unit); + new_thread->next = new_thread; + new_thread->prev = new_thread; + new_thread->backtrace_last_exn = Val_unit; #ifdef NATIVE_CODE - curr_thread->exit_buf = &caml_termination_jmpbuf; + new_thread->exit_buf = &caml_termination_jmpbuf; #endif - curr_thread->memprof_ctx = &caml_memprof_main_ctx; - /* The stack-related fields will be filled in at the next - caml_enter_blocking_section */ - /* Associate the thread descriptor with the thread */ - st_tls_set(thread_descriptor_key, 
(void *) curr_thread); - st_thread_set_id(Ident(curr_thread->descr)); - /* Set up the hooks */ + + st_tls_newkey(&Thread_key); + st_tls_set(Thread_key, (void *) new_thread); + st_thread_set_id(Ident(new_thread->descr)); + + All_threads = new_thread; + Current_thread = new_thread; + Tick_thread_running = 0; + + CAMLreturn(Val_unit); +} + +CAMLprim value caml_thread_yield(value unit); + +void caml_thread_interrupt_hook(void) +{ + uintnat is_on = 1; + caml_domain_state *domain = Caml_state; + atomic_uintnat* req_external_interrupt = + (atomic_uintnat*)&domain->requested_external_interrupt; + + if (atomic_compare_exchange_strong(req_external_interrupt, &is_on, 0)) { + caml_thread_yield(Val_unit); + } + + return; +} + +/* [caml_thread_initialize] initialises the systhreads infrastructure. This + function first sets up the chain for systhreads on this domain, then setup + the global variables and hooks for systhreads to cooperate with the runtime + system. */ +CAMLprim value caml_thread_initialize(value unit) /* ML */ +{ + CAMLparam0(); + + /* First initialise the systhread chain on this domain */ + caml_thread_initialize_domain(Val_unit); + prev_scan_roots_hook = caml_scan_roots_hook; caml_scan_roots_hook = caml_thread_scan_roots; caml_enter_blocking_section_hook = caml_thread_enter_blocking_section; caml_leave_blocking_section_hook = caml_thread_leave_blocking_section; #ifdef NATIVE_CODE - caml_termination_hook = st_thread_exit; + caml_termination_hook = caml_thread_termination_hook; #endif - caml_channel_mutex_free = caml_io_mutex_free; - caml_channel_mutex_lock = caml_io_mutex_lock; - caml_channel_mutex_unlock = caml_io_mutex_unlock; - caml_channel_mutex_unlock_exn = caml_io_mutex_unlock_exn; - prev_stack_usage_hook = caml_stack_usage_hook; - caml_stack_usage_hook = caml_thread_stack_usage; - caml_memprof_th_ctx_iter_hook = memprof_ctx_iter; - /* Set up fork() to reinitialize the thread machinery in the child - (PR#4577) */ + caml_domain_external_interrupt_hook = 
caml_thread_interrupt_hook; + caml_domain_stop_hook = caml_thread_domain_stop_hook; + st_atfork(caml_thread_reinitialize); - return Val_unit; -} -/* Cleanup the thread machinery when the runtime is shut down. Joining the tick - thread take 25ms on average / 50ms in the worst case, so we don't do it on - program exit. */ + CAMLreturn(Val_unit); +} CAMLprim value caml_thread_cleanup(value unit) /* ML */ { - if (caml_tick_thread_running){ - caml_tick_thread_stop = 1; - st_thread_join(caml_tick_thread_id); - caml_tick_thread_stop = 0; - caml_tick_thread_running = 0; + if (Tick_thread_running){ + atomic_store_rel(&Tick_thread_stop, 1); + st_thread_join(Tick_thread_id); + atomic_store_rel(&Tick_thread_stop, 0); + Tick_thread_running = 0; } + return Val_unit; } - /* Thread cleanup at termination */ static void caml_thread_stop(void) { + caml_thread_t next; + /* PR#5188, PR#7220: some of the global runtime state may have changed as the thread was running, so we save it in the curr_thread data to make sure that the cleanup logic below uses accurate information. */ caml_thread_save_runtime_state(); - /* Tell memprof that this thread is terminating. */ - caml_memprof_delete_th_ctx(curr_thread->memprof_ctx); - /* Signal that the thread has terminated */ - caml_threadstatus_terminate(Terminated(curr_thread->descr)); - /* Remove th from the doubly-linked list of threads and free its info block */ - caml_thread_remove_info(curr_thread); + + next = Current_thread->next; + + /* The main domain thread does not go through [caml_thread_stop]. There is + always one more thread in the chain at this point in time. */ + CAMLassert(next != Current_thread); + + caml_threadstatus_terminate(Terminated(Current_thread->descr)); + caml_thread_remove_info(Current_thread); + + /* FIXME: tricky bit with backup thread + + Normally we expect the next thread to kick in and resume operation by + first setting Current_thread to the right TLS dec data. 
However it may + very well be that there's no runnable dec next (eg: next dec is + blocking.), so we set it to next for now to give a valid state to the + backup thread. */ + Current_thread = next; + + caml_thread_restore_runtime_state(); + /* If no other OCaml thread remains, ask the tick thread to stop so that it does not prevent the whole process from exiting (#9971) */ - if (all_threads == NULL) caml_thread_cleanup(Val_unit); - /* OS-specific cleanups */ - st_thread_cleanup(); - /* Release the runtime system */ - st_masterlock_release(&caml_master_lock); + if (All_threads == NULL) caml_thread_cleanup(Val_unit); + + st_masterlock_release(&Thread_main_lock); } /* Create a thread */ -static ST_THREAD_FUNCTION caml_thread_start(void * arg) +static void * caml_thread_start(void * v) { - caml_thread_t th = (caml_thread_t) arg; + caml_thread_t th = (caml_thread_t) v; value clos; #ifdef NATIVE_CODE struct longjmp_buffer termination_buf; - char tos; - /* Record top of stack (approximative) */ - th->top_of_stack = &tos; #endif - /* Associate the thread descriptor with the thread */ - st_tls_set(thread_descriptor_key, (void *) th); - /* Acquire the global mutex */ - caml_leave_blocking_section(); + caml_init_domain_self(th->domain_id); + + st_tls_set(Thread_key, th); + + caml_domain_set_name("Domain"); + + st_masterlock_acquire(&Thread_main_lock); + Current_thread = st_tls_get(Thread_key); + caml_thread_restore_runtime_state(); + st_thread_set_id(Ident(th->descr)); - caml_setup_stack_overflow_detection(); + +#ifdef POSIX_SIGNALS + /* restore the signal mask from the spawning thread, now it is safe for the + signal handler to run (as Caml_state is initialised) */ + pthread_sigmask(SIG_SETMASK, &th->init_mask, NULL); +#endif + #ifdef NATIVE_CODE /* Setup termination handler (for caml_thread_exit) */ if (sigsetjmp(termination_buf.buf, 0) == 0) { - th->exit_buf = &termination_buf; + Current_thread->exit_buf = &termination_buf; #endif - /* Callback the closure */ - clos = 
Start_closure(th->descr); - caml_modify(&(Start_closure(th->descr)), Val_unit); - caml_callback_exn(clos, Val_unit); - caml_thread_stop(); + clos = Start_closure(Current_thread->descr); + caml_modify(&(Start_closure(Current_thread->descr)), Val_unit); + caml_callback_exn(clos, Val_unit); + caml_thread_stop(); #ifdef NATIVE_CODE } #endif - caml_stop_stack_overflow_detection(); - /* The thread now stops running */ + return 0; } +static int create_tick_thread() +{ + int err; +#ifdef POSIX_SIGNALS + sigset_t mask, old_mask; + + /* Block all signals so that we don't try to execute an OCaml signal + handler in the new tick thread */ + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, &old_mask); +#endif + + err = st_thread_create(&Tick_thread_id, caml_thread_tick, + (void *) &Caml_state->id); + +#ifdef POSIX_SIGNALS + pthread_sigmask(SIG_SETMASK, &old_mask, NULL); +#endif + + return err; +} + CAMLprim value caml_thread_new(value clos) /* ML */ { + CAMLparam1(clos); caml_thread_t th; st_retcode err; +#ifdef POSIX_SIGNALS + sigset_t mask, old_mask; + + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, &old_mask); +#endif #ifndef NATIVE_CODE if (caml_debugger_in_use) @@ -566,30 +589,41 @@ CAMLprim value caml_thread_new(value clos) /* ML */ #endif /* Create a thread info block */ th = caml_thread_new_info(); - if (th == NULL) caml_raise_out_of_memory(); - /* Equip it with a thread descriptor */ + + if (th == NULL) + caml_raise_out_of_memory(); + th->descr = caml_thread_new_descriptor(clos); - /* Add thread info block to the list of threads */ - th->next = curr_thread->next; - th->prev = curr_thread; - curr_thread->next->prev = th; - curr_thread->next = th; - /* Create the new thread */ + +#ifdef POSIX_SIGNALS + th->init_mask = mask; +#endif + + th->next = Current_thread->next; + th->prev = Current_thread; + + Current_thread->next->prev = th; + Current_thread->next = th; + err = st_thread_create(NULL, caml_thread_start, (void *) th); + +#ifdef POSIX_SIGNALS + /* 
regardless of error, return our sigmask to the original state */ + pthread_sigmask(SIG_SETMASK, &old_mask, NULL); +#endif + if (err != 0) { /* Creation failed, remove thread info block from list of threads */ caml_thread_remove_info(th); - st_check_error(err, "Thread.create"); + sync_check_error(err, "Thread.create"); } - /* Create the tick thread if not already done. - Because of PR#4666, we start the tick thread late, only when we create - the first additional thread in the current process*/ - if (! caml_tick_thread_running) { - err = st_thread_create(&caml_tick_thread_id, caml_thread_tick, NULL); - st_check_error(err, "Thread.create"); - caml_tick_thread_running = 1; + + if (! Tick_thread_running) { + err = create_tick_thread(); + sync_check_error(err, "Thread.create"); + Tick_thread_running = 1; } - return th->descr; + CAMLreturn(th->descr); } /* Register a thread already created from C */ @@ -600,41 +634,44 @@ CAMLexport int caml_c_thread_register(void) st_retcode err; /* Already registered? */ - if (st_tls_get(thread_descriptor_key) != NULL) return 0; + if (Caml_state == NULL) { + caml_init_domain_self(0); + }; + if (st_tls_get(Thread_key) != NULL) return 0; + /* Take master lock to protect access to the runtime */ + st_masterlock_acquire(&Thread_main_lock); /* Create a thread info block */ th = caml_thread_new_info(); - if (th == NULL) return 0; -#ifdef NATIVE_CODE - th->top_of_stack = (char *) &err; -#endif - /* Take master lock to protect access to the chaining of threads */ - st_masterlock_acquire(&caml_master_lock); + /* If it fails, we release the lock and return an error. 
*/ + if (th == NULL) { + st_masterlock_release(&Thread_main_lock); + return 0; + } /* Add thread info block to the list of threads */ - if (all_threads == NULL) { + if (All_threads == NULL) { th->next = th; th->prev = th; - all_threads = th; + All_threads = th; } else { - th->next = all_threads->next; - th->prev = all_threads; - all_threads->next->prev = th; - all_threads->next = th; + th->next = All_threads->next; + th->prev = All_threads; + All_threads->next->prev = th; + All_threads->next = th; } /* Associate the thread descriptor with the thread */ - st_tls_set(thread_descriptor_key, (void *) th); - /* Release the master lock */ - st_masterlock_release(&caml_master_lock); - /* Now we can re-enter the run-time system and heap-allocate the descriptor */ - caml_leave_blocking_section(); + st_tls_set(Thread_key, (void *) th); + /* Allocate the thread descriptor on the heap */ th->descr = caml_thread_new_descriptor(Val_unit); /* no closure */ st_thread_set_id(Ident(th->descr)); - /* Create the tick thread if not already done. */ - if (! caml_tick_thread_running) { - err = st_thread_create(&caml_tick_thread_id, caml_thread_tick, NULL); - if (err == 0) caml_tick_thread_running = 1; + + if (! Tick_thread_running) { + err = create_tick_thread(); + sync_check_error(err, "caml_register_c_thread"); + Tick_thread_running = 1; } - /* Exit the run-time system */ - caml_enter_blocking_section(); + + /* Release the master lock */ + st_masterlock_release(&Thread_main_lock); return 1; } @@ -643,20 +680,32 @@ CAMLexport int caml_c_thread_register(void) CAMLexport int caml_c_thread_unregister(void) { - caml_thread_t th = st_tls_get(thread_descriptor_key); + caml_thread_t th; + + /* If Caml_state is not set, this thread was likely not registered */ + if (Caml_state == NULL) return 0; + + th = st_tls_get(Thread_key); /* Not registered? 
*/ if (th == NULL) return 0; /* Wait until the runtime is available */ - st_masterlock_acquire(&caml_master_lock); - /* Forget the thread descriptor */ - st_tls_set(thread_descriptor_key, NULL); + st_masterlock_acquire(&Thread_main_lock); + /* Forget the thread descriptor */ + st_tls_set(Thread_key, NULL); /* Remove thread info block from list of threads, and free it */ caml_thread_remove_info(th); + + Current_thread = All_threads; + /* If no other OCaml thread remains, ask the tick thread to stop so that it does not prevent the whole process from exiting (#9971) */ - if (all_threads == NULL) caml_thread_cleanup(Val_unit); + if (All_threads == NULL) + caml_thread_cleanup(Val_unit); + else + caml_thread_restore_runtime_state(); + /* Release the runtime */ - st_masterlock_release(&caml_master_lock); + st_masterlock_release(&Thread_main_lock); return 1; } @@ -664,9 +713,7 @@ CAMLexport int caml_c_thread_unregister(void) CAMLprim value caml_thread_self(value unit) /* ML */ { - if (curr_thread == NULL) - caml_invalid_argument("Thread.self: not initialized"); - return curr_thread->descr; + return Current_thread->descr; } /* Return the identifier of a thread */ @@ -682,7 +729,7 @@ CAMLprim value caml_thread_uncaught_exception(value exn) /* ML */ { char * msg = caml_format_exception(exn); fprintf(stderr, "Thread %d killed on uncaught exception %s\n", - Int_val(Ident(curr_thread->descr)), msg); + Int_val(Ident(Current_thread->descr)), msg); caml_stat_free(msg); if (Caml_state->backtrace_active) caml_print_exception_backtrace(); fflush(stderr); @@ -695,7 +742,12 @@ CAMLprim value caml_thread_exit(value unit) /* ML */ { struct longjmp_buffer * exit_buf = NULL; - if (curr_thread == NULL) + /* we check if another domain was ever started */ + if (caml_domain_is_multicore()) + caml_invalid_argument + ("Thread.exit: unsupported call under multiple domains"); + + if (Current_thread == NULL) caml_invalid_argument("Thread.exit: not initialized"); /* In native code, we cannot call 
pthread_exit here because on some @@ -707,7 +759,7 @@ CAMLprim value caml_thread_exit(value unit) /* ML */ a creation point (exit_buf == NULL). */ #ifdef NATIVE_CODE - exit_buf = curr_thread->exit_buf; + exit_buf = Current_thread->exit_buf; #endif caml_thread_stop(); if (exit_buf != NULL) { @@ -724,19 +776,14 @@ CAMLprim value caml_thread_exit(value unit) /* ML */ CAMLprim value caml_thread_yield(value unit) /* ML */ { - if (st_masterlock_waiters(&caml_master_lock) == 0) return Val_unit; - - /* Do all the parts of a blocking section enter/leave except lock - manipulation, which we'll do more efficiently in st_thread_yield. (Since - our blocking section doesn't contain anything interesting, don't bother - with saving errno.) - */ - caml_raise_if_exception(caml_process_pending_signals_exn()); + if (atomic_load_acq(&Thread_main_lock.waiters) == 0) return Val_unit; + + caml_process_pending_signals(); caml_thread_save_runtime_state(); - st_thread_yield(&caml_master_lock); - curr_thread = st_tls_get(thread_descriptor_key); + st_thread_yield(&Thread_main_lock); + Current_thread = st_tls_get(Thread_key); caml_thread_restore_runtime_state(); - caml_raise_if_exception(caml_process_pending_signals_exn()); + caml_process_pending_signals(); return Val_unit; } @@ -746,159 +793,7 @@ CAMLprim value caml_thread_yield(value unit) /* ML */ CAMLprim value caml_thread_join(value th) /* ML */ { st_retcode rc = caml_threadstatus_wait(Terminated(th)); - st_check_error(rc, "Thread.join"); - return Val_unit; -} - -/* Mutex operations */ - -#define Mutex_val(v) (* ((st_mutex *) Data_custom_val(v))) - -static void caml_mutex_finalize(value wrapper) -{ - st_mutex_destroy(Mutex_val(wrapper)); -} - -static int caml_mutex_compare(value wrapper1, value wrapper2) -{ - st_mutex mut1 = Mutex_val(wrapper1); - st_mutex mut2 = Mutex_val(wrapper2); - return mut1 == mut2 ? 0 : mut1 < mut2 ? 
-1 : 1; -} - -static intnat caml_mutex_hash(value wrapper) -{ - return (intnat) (Mutex_val(wrapper)); -} - -static struct custom_operations caml_mutex_ops = { - "_mutex", - caml_mutex_finalize, - caml_mutex_compare, - caml_mutex_hash, - custom_serialize_default, - custom_deserialize_default, - custom_compare_ext_default, - custom_fixed_length_default -}; - -CAMLprim value caml_mutex_new(value unit) /* ML */ -{ - st_mutex mut = NULL; /* suppress warning */ - value wrapper; - st_check_error(st_mutex_create(&mut), "Mutex.create"); - wrapper = caml_alloc_custom(&caml_mutex_ops, sizeof(st_mutex *), - 0, 1); - Mutex_val(wrapper) = mut; - return wrapper; -} - -CAMLprim value caml_mutex_lock(value wrapper) /* ML */ -{ - st_mutex mut = Mutex_val(wrapper); - st_retcode retcode; - - /* PR#4351: first try to acquire mutex without releasing the master lock */ - if (st_mutex_trylock(mut) == MUTEX_PREVIOUSLY_UNLOCKED) return Val_unit; - /* If unsuccessful, block on mutex */ - Begin_root(wrapper) /* prevent the deallocation of mutex */ - caml_enter_blocking_section(); - retcode = st_mutex_lock(mut); - caml_leave_blocking_section(); - End_roots(); - st_check_error(retcode, "Mutex.lock"); - return Val_unit; -} - -CAMLprim value caml_mutex_unlock(value wrapper) /* ML */ -{ - st_mutex mut = Mutex_val(wrapper); - st_retcode retcode; - /* PR#4351: no need to release and reacquire master lock */ - retcode = st_mutex_unlock(mut); - st_check_error(retcode, "Mutex.unlock"); - return Val_unit; -} - -CAMLprim value caml_mutex_try_lock(value wrapper) /* ML */ -{ - st_mutex mut = Mutex_val(wrapper); - st_retcode retcode; - retcode = st_mutex_trylock(mut); - if (retcode == MUTEX_ALREADY_LOCKED) return Val_false; - st_check_error(retcode, "Mutex.try_lock"); - return Val_true; -} - -/* Conditions operations */ - -#define Condition_val(v) (* (st_condvar *) Data_custom_val(v)) - -static void caml_condition_finalize(value wrapper) -{ - st_condvar_destroy(Condition_val(wrapper)); -} - -static int 
caml_condition_compare(value wrapper1, value wrapper2) -{ - st_condvar cond1 = Condition_val(wrapper1); - st_condvar cond2 = Condition_val(wrapper2); - return cond1 == cond2 ? 0 : cond1 < cond2 ? -1 : 1; -} - -static intnat caml_condition_hash(value wrapper) -{ - return (intnat) (Condition_val(wrapper)); -} - -static struct custom_operations caml_condition_ops = { - "_condition", - caml_condition_finalize, - caml_condition_compare, - caml_condition_hash, - custom_serialize_default, - custom_deserialize_default, - custom_compare_ext_default, - custom_fixed_length_default -}; - -CAMLprim value caml_condition_new(value unit) /* ML */ -{ - st_condvar cond = NULL; /* suppress warning */ - value wrapper; - st_check_error(st_condvar_create(&cond), "Condition.create"); - wrapper = caml_alloc_custom(&caml_condition_ops, sizeof(st_condvar *), - 0, 1); - Condition_val(wrapper) = cond; - return wrapper; -} - -CAMLprim value caml_condition_wait(value wcond, value wmut) /* ML */ -{ - st_condvar cond = Condition_val(wcond); - st_mutex mut = Mutex_val(wmut); - st_retcode retcode; - - Begin_roots2(wcond, wmut) /* prevent deallocation of cond and mutex */ - caml_enter_blocking_section(); - retcode = st_condvar_wait(cond, mut); - caml_leave_blocking_section(); - End_roots(); - st_check_error(retcode, "Condition.wait"); - return Val_unit; -} - -CAMLprim value caml_condition_signal(value wrapper) /* ML */ -{ - st_check_error(st_condvar_signal(Condition_val(wrapper)), - "Condition.signal"); - return Val_unit; -} - -CAMLprim value caml_condition_broadcast(value wrapper) /* ML */ -{ - st_check_error(st_condvar_broadcast(Condition_val(wrapper)), - "Condition.broadcast"); + sync_check_error(rc, "Thread.join"); return Val_unit; } @@ -933,8 +828,9 @@ static value caml_threadstatus_new (void) { st_event ts = NULL; /* suppress warning */ value wrapper; - st_check_error(st_event_create(&ts), "Thread.create"); - wrapper = caml_alloc_custom(&caml_threadstatus_ops, sizeof(st_event *), + 
sync_check_error(st_event_create(&ts), "Thread.create"); + wrapper = caml_alloc_custom(&caml_threadstatus_ops, + sizeof(st_event *), 0, 1); Threadstatus_val(wrapper) = ts; return wrapper; @@ -955,5 +851,6 @@ static st_retcode caml_threadstatus_wait (value wrapper) retcode = st_event_wait(ts); caml_leave_blocking_section(); End_roots(); + return retcode; } diff --git a/otherlibs/systhreads/st_win32.h b/otherlibs/systhreads/st_win32.h index 3f598a715dee..a04c8f6802d8 100644 --- a/otherlibs/systhreads/st_win32.h +++ b/otherlibs/systhreads/st_win32.h @@ -18,521 +18,6 @@ #undef _WIN32_WINNT #define _WIN32_WINNT 0x0400 #include -#include -#include -#include -#include - -#if 1 -#define TRACE(x) -#define TRACE1(x,y) -#else -#include -#define TRACE(x) printf("%d: %s\n", GetCurrentThreadId(), x); fflush(stdout) -#define TRACE1(x,y) printf("%d: %s %p\n", GetCurrentThreadId(), x, (void *)y); \ - fflush(stdout) -#endif - -typedef DWORD st_retcode; - -#define SIGPREEMPTION SIGTERM - -/* Unique thread identifiers and atomic operations over them */ -#ifdef ARCH_SIXTYFOUR -typedef LONG64 st_tid; -#define Tid_Atomic_Exchange InterlockedExchange64 -#define Tid_Atomic_Compare_Exchange InterlockedCompareExchange64 -#else -typedef LONG st_tid; -#define Tid_Atomic_Exchange InterlockedExchange -#define Tid_Atomic_Compare_Exchange InterlockedCompareExchange -#endif - -/* Thread-local storage associating a Win32 event to every thread. */ -static DWORD st_thread_sem_key; - -/* Thread-local storage for the OCaml thread ID. */ -static DWORD st_thread_id_key; - -/* OS-specific initialization */ - -static DWORD st_initialize(void) -{ - DWORD result = 0; - st_thread_sem_key = TlsAlloc(); - if (st_thread_sem_key == TLS_OUT_OF_INDEXES) - return GetLastError(); - st_thread_id_key = TlsAlloc(); - if (st_thread_id_key == TLS_OUT_OF_INDEXES) { - result = GetLastError(); - TlsFree(st_thread_sem_key); - } - return result; -} - -/* Thread creation. Created in detached mode if [res] is NULL. 
*/ - -typedef HANDLE st_thread_id; - -static DWORD st_thread_create(st_thread_id * res, - LPTHREAD_START_ROUTINE fn, void * arg) -{ - HANDLE h = CreateThread(NULL, 0, fn, arg, 0, NULL); - TRACE1("st_thread_create", h); - if (h == NULL) return GetLastError(); - if (res == NULL) - CloseHandle(h); - else - *res = h; - return 0; -} - -#define ST_THREAD_FUNCTION DWORD WINAPI - -/* Cleanup at thread exit */ - -static void st_thread_cleanup(void) -{ - HANDLE ev = (HANDLE) TlsGetValue(st_thread_sem_key); - if (ev != NULL) CloseHandle(ev); -} - -/* Thread termination */ - -CAMLnoreturn_start -static void st_thread_exit(void) -CAMLnoreturn_end; - -static void st_thread_exit(void) -{ - TRACE("st_thread_exit"); - ExitThread(0); -} - -static void st_thread_join(st_thread_id thr) -{ - TRACE1("st_thread_join", h); - WaitForSingleObject(thr, INFINITE); -} - -/* Thread-specific state */ - -typedef DWORD st_tlskey; - -static DWORD st_tls_newkey(st_tlskey * res) -{ - *res = TlsAlloc(); - if (*res == TLS_OUT_OF_INDEXES) - return GetLastError(); - else - return 0; -} - -Caml_inline void * st_tls_get(st_tlskey k) -{ - return TlsGetValue(k); -} - -Caml_inline void st_tls_set(st_tlskey k, void * v) -{ - TlsSetValue(k, v); -} - -/* OS-specific handling of the OCaml thread ID (must be called with the runtime - lock). */ -Caml_inline void st_thread_set_id(intnat id) -{ - CAMLassert(id != 0); - st_tls_set(st_thread_id_key, (void *)id); -} - -/* Return the identifier for the current thread. The 0 value is reserved. */ -Caml_inline intnat st_current_thread_id(void) -{ - intnat id = (intnat)st_tls_get(st_thread_id_key); - CAMLassert(id != 0); - return id; -} - -/* The master lock. 
*/ - -typedef CRITICAL_SECTION st_masterlock; - -static void st_masterlock_init(st_masterlock * m) -{ - TRACE("st_masterlock_init"); - InitializeCriticalSection(m); - EnterCriticalSection(m); -} - -Caml_inline void st_masterlock_acquire(st_masterlock * m) -{ - TRACE("st_masterlock_acquire"); - EnterCriticalSection(m); - TRACE("st_masterlock_acquire (done)"); -} - -Caml_inline void st_masterlock_release(st_masterlock * m) -{ - LeaveCriticalSection(m); - TRACE("st_masterlock_released"); -} - -Caml_inline int st_masterlock_waiters(st_masterlock * m) -{ - return 1; /* info not maintained */ -} - -/* Scheduling hints */ - -Caml_inline void st_thread_yield(st_masterlock * m) -{ - LeaveCriticalSection(m); - Sleep(0); - EnterCriticalSection(m); -} - -/* Mutexes */ - -struct st_mutex_ { - CRITICAL_SECTION crit; - volatile st_tid owner; /* 0 if unlocked */ - /* The "owner" field is not always protected by "crit"; it is also - accessed without holding "crit", using the Interlocked API for - atomic accesses */ -}; - -typedef struct st_mutex_ * st_mutex; - -static DWORD st_mutex_create(st_mutex * res) -{ - st_mutex m = caml_stat_alloc_noexc(sizeof(struct st_mutex_)); - if (m == NULL) return ERROR_NOT_ENOUGH_MEMORY; - InitializeCriticalSection(&m->crit); - m->owner = 0; - *res = m; - return 0; -} - -static DWORD st_mutex_destroy(st_mutex m) -{ - DeleteCriticalSection(&m->crit); - caml_stat_free(m); - return 0; -} - -/* Error codes with the 29th bit set are reserved for the application */ - -#define MUTEX_DEADLOCK (1<<29 | 1) -#define MUTEX_PREVIOUSLY_UNLOCKED 0 -#define MUTEX_ALREADY_LOCKED (1 << 29) -#define MUTEX_NOT_OWNED (1<<29 | 2) - -Caml_inline DWORD st_mutex_lock(st_mutex m) -{ - st_tid self, prev; - TRACE1("st_mutex_lock", m); - self = st_current_thread_id(); - /* Critical sections are recursive locks, so this will succeed - if we already own the lock */ - EnterCriticalSection(&m->crit); - /* Record that we are the owner of the lock */ - prev = 
Tid_Atomic_Exchange(&m->owner, self); - if (prev != 0) { - /* The mutex was already locked by ourselves. - Cancel the EnterCriticalSection above and return an error. */ - TRACE1("st_mutex_lock (deadlock)", m); - LeaveCriticalSection(&m->crit); - return MUTEX_DEADLOCK; - } - TRACE1("st_mutex_lock (done)", m); - return 0; -} - -Caml_inline DWORD st_mutex_trylock(st_mutex m) -{ - st_tid self, prev; - TRACE1("st_mutex_trylock", m); - self = st_current_thread_id(); - if (! TryEnterCriticalSection(&m->crit)) { - TRACE1("st_mutex_trylock (failure)", m); - return MUTEX_ALREADY_LOCKED; - } - /* Record that we are the owner of the lock */ - prev = Tid_Atomic_Exchange(&m->owner, self); - if (prev != 0) { - /* The mutex was already locked by ourselves. - Cancel the EnterCriticalSection above and return "already locked". */ - TRACE1("st_mutex_trylock (already locked by self)", m); - LeaveCriticalSection(&m->crit); - return MUTEX_ALREADY_LOCKED; - } - TRACE1("st_mutex_trylock (done)", m); - return MUTEX_PREVIOUSLY_UNLOCKED; -} - -Caml_inline DWORD st_mutex_unlock(st_mutex m) -{ - st_tid self, prev; - /* If the calling thread holds the lock, m->owner is stable and equal - to st_current_thread_id(). - Otherwise, the value of m->owner can be 0 (if the mutex is unlocked) - or some other thread ID (if the mutex is held by another thread), - but is never equal to st_current_thread_id(). */ - self = st_current_thread_id(); - prev = Tid_Atomic_Compare_Exchange(&m->owner, 0, self); - if (prev != self) { - /* The value of m->owner is unchanged */ - TRACE1("st_mutex_unlock (error)", m); - return MUTEX_NOT_OWNED; - } - TRACE1("st_mutex_unlock", m); - LeaveCriticalSection(&m->crit); - return 0; -} - -/* Condition variables */ - -/* A condition variable is just a list of threads currently - waiting on this c.v. Each thread is represented by its - associated event. 
*/ - -struct st_wait_list { - HANDLE event; /* event of the first waiting thread */ - struct st_wait_list * next; -}; - -typedef struct st_condvar_struct { - CRITICAL_SECTION lock; /* protect the data structure */ - struct st_wait_list * waiters; /* list of threads waiting */ -} * st_condvar; - -static DWORD st_condvar_create(st_condvar * res) -{ - st_condvar c = caml_stat_alloc_noexc(sizeof(struct st_condvar_struct)); - if (c == NULL) return ERROR_NOT_ENOUGH_MEMORY; - InitializeCriticalSection(&c->lock); - c->waiters = NULL; - *res = c; - return 0; -} - -static DWORD st_condvar_destroy(st_condvar c) -{ - TRACE1("st_condvar_destroy", c); - DeleteCriticalSection(&c->lock); - caml_stat_free(c); - return 0; -} - -static DWORD st_condvar_signal(st_condvar c) -{ - DWORD rc = 0; - struct st_wait_list * curr, * next; - - TRACE1("st_condvar_signal", c); - EnterCriticalSection(&c->lock); - curr = c->waiters; - if (curr != NULL) { - next = curr->next; - /* Wake up the first waiting thread */ - TRACE1("st_condvar_signal: waking up", curr->event); - if (! SetEvent(curr->event)) rc = GetLastError(); - /* Remove it from the waiting list */ - c->waiters = next; - } - LeaveCriticalSection(&c->lock); - return rc; -} - -static DWORD st_condvar_broadcast(st_condvar c) -{ - DWORD rc = 0; - struct st_wait_list * curr, * next; - - TRACE1("st_condvar_broadcast", c); - EnterCriticalSection(&c->lock); - /* Wake up all waiting threads */ - curr = c->waiters; - while (curr != NULL) { - next = curr->next; - TRACE1("st_condvar_signal: waking up", curr->event); - if (! 
SetEvent(curr->event)) rc = GetLastError(); - curr = next; - } - /* Remove them all from the waiting list */ - c->waiters = NULL; - LeaveCriticalSection(&c->lock); - return rc; -} - -static DWORD st_condvar_wait(st_condvar c, st_mutex m) -{ - HANDLE ev; - struct st_wait_list wait; - DWORD rc; - st_tid self, prev; - - TRACE1("st_condvar_wait", c); - /* Recover (or create) the event associated with the calling thread */ - ev = (HANDLE) TlsGetValue(st_thread_sem_key); - if (ev == 0) { - ev = CreateEvent(NULL, - FALSE /*auto reset*/, - FALSE /*initially unset*/, - NULL); - if (ev == NULL) return GetLastError(); - TlsSetValue(st_thread_sem_key, (void *) ev); - } - /* Get ready to release the mutex */ - self = st_current_thread_id(); - prev = Tid_Atomic_Compare_Exchange(&m->owner, 0, self); - if (prev != self) { - /* The value of m->owner is unchanged */ - TRACE1("st_condvar_wait: error: mutex not held", m); - return MUTEX_NOT_OWNED; - } - /* Insert the current thread in the waiting list (atomically) */ - EnterCriticalSection(&c->lock); - wait.event = ev; - wait.next = c->waiters; - c->waiters = &wait; - LeaveCriticalSection(&c->lock); - /* Finish releasing the mutex m (like st_mutex_unlock does, minus - the error checking, which we've already done above). */ - LeaveCriticalSection(&m->crit); - /* Wait for our event to be signaled. 
There is no risk of lost - wakeup, since we inserted ourselves on the waiting list of c - before releasing m */ - TRACE1("st_condvar_wait: blocking on event", ev); - if (WaitForSingleObject(ev, INFINITE) == WAIT_FAILED) - return GetLastError(); - /* Reacquire the mutex m */ - TRACE1("st_condvar_wait: restarted, acquiring mutex", c); - rc = st_mutex_lock(m); - if (rc != 0) return rc; - TRACE1("st_condvar_wait: acquired mutex", c); - return 0; -} - -/* Triggered events */ - -typedef HANDLE st_event; - -static DWORD st_event_create(st_event * res) -{ - st_event m = - CreateEvent(NULL, TRUE/*manual reset*/, FALSE/*initially unset*/, NULL); - TRACE1("st_event_create", m); - if (m == NULL) return GetLastError(); - *res = m; - return 0; -} - -static DWORD st_event_destroy(st_event e) -{ - TRACE1("st_event_destroy", e); - if (CloseHandle(e)) - return 0; - else - return GetLastError(); -} - -static DWORD st_event_trigger(st_event e) -{ - TRACE1("st_event_trigger", e); - if (SetEvent(e)) - return 0; - else - return GetLastError(); -} - -static DWORD st_event_wait(st_event e) -{ - TRACE1("st_event_wait", e); - if (WaitForSingleObject(e, INFINITE) == WAIT_FAILED) - return GetLastError(); - else - return 0; -} - -/* Reporting errors */ - -static void st_check_error(DWORD retcode, char * msg) -{ - wchar_t err[1024]; - int errlen, msglen, ret; - value str; - - if (retcode == 0) return; - if (retcode == ERROR_NOT_ENOUGH_MEMORY) caml_raise_out_of_memory(); - switch (retcode) { - case MUTEX_DEADLOCK: - ret = swprintf(err, sizeof(err)/sizeof(wchar_t), - L"Mutex is already locked by calling thread"); - break; - case MUTEX_NOT_OWNED: - ret = swprintf(err, sizeof(err)/sizeof(wchar_t), - L"Mutex is not locked by calling thread"); - break; - default: - ret = FormatMessage( - FORMAT_MESSAGE_FROM_SYSTEM|FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, - retcode, - 0, - err, - sizeof(err)/sizeof(wchar_t), - NULL); - if (! 
ret) { - ret = - swprintf(err, sizeof(err)/sizeof(wchar_t), L"error code %lx", retcode); - } - } - msglen = strlen(msg); - errlen = win_wide_char_to_multi_byte(err, ret, NULL, 0); - str = caml_alloc_string(msglen + 2 + errlen); - memmove (&Byte(str, 0), msg, msglen); - memmove (&Byte(str, msglen), ": ", 2); - win_wide_char_to_multi_byte(err, ret, &Byte(str, msglen + 2), errlen); - caml_raise_sys_error(str); -} - -/* Variable used to stop the "tick" thread */ -static volatile int caml_tick_thread_stop = 0; - -/* The tick thread: posts a SIGPREEMPTION signal periodically */ - -static DWORD WINAPI caml_thread_tick(void * arg) -{ - while(! caml_tick_thread_stop) { - Sleep(Thread_timeout); - /* The preemption signal should never cause a callback, so don't - go through caml_handle_signal(), just record signal delivery via - caml_record_signal(). */ - caml_record_signal(SIGPREEMPTION); - } - return 0; -} - -/* "At fork" processing -- none under Win32 */ - -static DWORD st_atfork(void (*fn)(void)) -{ - return 0; -} - -/* Signal handling -- none under Win32 */ - -value caml_thread_sigmask(value cmd, value sigs) /* ML */ -{ - caml_invalid_argument("Thread.sigmask not implemented"); - return Val_int(0); /* not reached */ -} - -value caml_wait_signal(value sigs) /* ML */ -{ - caml_invalid_argument("Thread.wait_signal not implemented"); - return Val_int(0); /* not reached */ -} +/* FIXME Replace winpthreads implementation with native */ +#include "st_posix.h" diff --git a/otherlibs/systhreads/thread.ml b/otherlibs/systhreads/thread.ml index a3c7ad274b25..3e507f63b67a 100644 --- a/otherlibs/systhreads/thread.ml +++ b/otherlibs/systhreads/thread.ml @@ -18,6 +18,8 @@ type t external thread_initialize : unit -> unit = "caml_thread_initialize" +external thread_initialize_domain : unit -> unit = + "caml_thread_initialize_domain" external thread_cleanup : unit -> unit = "caml_thread_cleanup" external thread_new : (unit -> unit) -> t = "caml_thread_new" external 
thread_uncaught_exception : exn -> unit = @@ -84,6 +86,7 @@ let preempt_signal = let () = Sys.set_signal preempt_signal (Sys.Signal_handle preempt); + Domain.at_startup thread_initialize_domain; thread_initialize (); Callback.register "Thread.at_shutdown" (fun () -> thread_cleanup(); diff --git a/otherlibs/systhreads/threads.h b/otherlibs/systhreads/threads.h index 97fd1b2746c9..8b2a99767f3b 100644 --- a/otherlibs/systhreads/threads.h +++ b/otherlibs/systhreads/threads.h @@ -59,6 +59,8 @@ CAMLextern int caml_c_thread_unregister(void); functions. Just call [caml_c_thread_register] once. Before the thread finishes, it must call [caml_c_thread_unregister]. Both functions return 1 on success, 0 on error. + In multicore OCaml, note that threads created by C code will be registered + to the domain 0 threads chaining. */ #ifdef __cplusplus diff --git a/otherlibs/unix/fork.c b/otherlibs/unix/fork.c index a244a5cf8372..6367f61aa832 100644 --- a/otherlibs/unix/fork.c +++ b/otherlibs/unix/fork.c @@ -19,14 +19,21 @@ #include #include #include "unixsupport.h" +#include +#include CAMLprim value unix_fork(value unit) { int ret; + if (caml_domain_is_multicore()) { + caml_failwith + ("Unix.fork may not be called while other domains were created"); + } CAML_EV_FLUSH(); ret = fork(); + if (ret == 0) caml_atfork_hook(); if (ret == -1) uerror("fork", Nothing); CAML_EVENTLOG_DO({ diff --git a/otherlibs/unix/kill.c b/otherlibs/unix/kill.c index 7154e1d1014d..aa924f4581f6 100644 --- a/otherlibs/unix/kill.c +++ b/otherlibs/unix/kill.c @@ -27,6 +27,6 @@ CAMLprim value unix_kill(value pid, value signal) sig = caml_convert_signal_number(Int_val(signal)); if (kill(Int_val(pid), sig) == -1) uerror("kill", Nothing); - caml_process_pending_actions(); + caml_process_pending_signals(); return Val_unit; } diff --git a/otherlibs/unix/signals.c b/otherlibs/unix/signals.c index 6e54032d6485..e1b516bd28d8 100644 --- a/otherlibs/unix/signals.c +++ b/otherlibs/unix/signals.c @@ -49,9 +49,9 @@ static value 
encode_sigset(sigset_t * set) Begin_root(res) for (i = 1; i < NSIG; i++) if (sigismember(set, i) > 0) { - value newcons = caml_alloc_small(2, 0); - Field(newcons, 0) = Val_int(caml_rev_convert_signal_number(i)); - Field(newcons, 1) = res; + value newcons = caml_alloc_2(0, + Val_int(caml_rev_convert_signal_number(i)), + res); res = newcons; } End_roots(); @@ -69,10 +69,10 @@ CAMLprim value unix_sigprocmask(value vaction, value vset) how = sigprocmask_cmd[Int_val(vaction)]; decode_sigset(vset, &set); caml_enter_blocking_section(); - retcode = caml_sigmask_hook(how, &set, &oldset); + retcode = sigprocmask(how, &set, &oldset); caml_leave_blocking_section(); /* Run any handlers for just-unmasked pending signals */ - caml_process_pending_actions(); + caml_process_pending_signals(); if (retcode != 0) unix_error(retcode, "sigprocmask", Nothing); return encode_sigset(&oldset); } @@ -83,7 +83,7 @@ CAMLprim value unix_sigpending(value unit) int i; if (sigpending(&pending) == -1) uerror("sigpending", Nothing); for (i = 1; i < NSIG; i++) - if(caml_pending_signals[i]) + if(atomic_load_explicit(&caml_pending_signals[i], memory_order_seq_cst)) sigaddset(&pending, i); return encode_sigset(&pending); } diff --git a/otherlibs/unix/unixsupport.c b/otherlibs/unix/unixsupport.c index daff13177dd0..5726c453de36 100644 --- a/otherlibs/unix/unixsupport.c +++ b/otherlibs/unix/unixsupport.c @@ -253,8 +253,6 @@ int error_table[] = { EHOSTUNREACH, ELOOP, EOVERFLOW /*, EUNKNOWNERR */ }; -static const value * unix_error_exn = NULL; - value unix_error_of_code (int errcode) { int errconstr; @@ -288,18 +286,17 @@ int code_of_unix_error (value error) void unix_error(int errcode, const char *cmdname, value cmdarg) { value res; + const value * unix_error_exn; value name = Val_unit, err = Val_unit, arg = Val_unit; Begin_roots3 (name, err, arg); arg = cmdarg == Nothing ? 
caml_copy_string("") : cmdarg; name = caml_copy_string(cmdname); err = unix_error_of_code (errcode); - if (unix_error_exn == NULL) { - unix_error_exn = caml_named_value("Unix.Unix_error"); - if (unix_error_exn == NULL) - caml_invalid_argument("Exception Unix.Unix_error not initialized," - " please link unix.cma"); - } + unix_error_exn = caml_named_value("Unix.Unix_error"); + if (unix_error_exn == NULL) + caml_invalid_argument("Exception Unix.Unix_error not initialized," + " please link unix.cma"); res = caml_alloc_small(4, 0); Field(res, 0) = *unix_error_exn; Field(res, 1) = err; diff --git a/runtime/Makefile b/runtime/Makefile index f5c08d8798ff..fc3c3d89f77b 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -20,22 +20,24 @@ include $(ROOTDIR)/Makefile.common # Lists of source files BYTECODE_C_SOURCES := $(addsuffix .c, \ - interp misc stacks fix_code startup_aux startup_byt freelist major_gc \ - minor_gc memory alloc roots_byt globroots fail_byt signals \ + interp misc fix_code startup_aux startup_byt major_gc \ + minor_gc memory alloc roots globroots fail_byt signals \ signals_byt printexc backtrace_byt backtrace compare ints eventlog \ floats str array io extern intern hash sys meta parsing gc_ctrl md5 obj \ - lexing callback debugger weak compact finalise custom dynlink \ - afl $(UNIX_OR_WIN32) bigarray main memprof domain \ - skiplist codefrag) + lexing callback debugger weak finalise custom dynlink \ + platform fiber shared_heap addrmap \ + afl $(UNIX_OR_WIN32) bigarray main memprof domain sync \ + skiplist lf_skiplist codefrag) NATIVE_C_SOURCES := $(addsuffix .c, \ - startup_aux startup_nat main fail_nat roots_nat signals \ - signals_nat misc freelist major_gc minor_gc memory alloc compare ints \ + startup_aux startup_nat main fail_nat roots signals \ + signals_nat misc major_gc minor_gc memory alloc compare ints \ floats str array io extern intern hash sys parsing gc_ctrl eventlog md5 obj \ - lexing $(UNIX_OR_WIN32) printexc callback weak compact 
finalise custom \ + lexing $(UNIX_OR_WIN32) printexc callback weak finalise custom \ globroots backtrace_nat backtrace dynlink_nat debugger meta \ + platform fiber shared_heap addrmap frame_descriptors \ dynlink clambda_checks afl bigarray \ - memprof domain skiplist codefrag) + memprof domain sync skiplist lf_skiplist codefrag) # Header files generated by configure CONFIGURED_HEADERS := caml/m.h caml/s.h caml/version.h @@ -99,9 +101,11 @@ libcamlrunpic_OBJECTS := $(BYTECODE_C_SOURCES:.c=.bpic.$(O)) libasmrun_OBJECTS := $(NATIVE_C_SOURCES:.c=.n.$(O)) $(ASM_OBJECTS) -libasmrund_OBJECTS := $(NATIVE_C_SOURCES:.c=.nd.$(O)) $(ASM_OBJECTS) +libasmrund_OBJECTS := \ + $(NATIVE_C_SOURCES:.c=.nd.$(O)) $(ASM_OBJECTS:.$(O)=.d.$(O)) -libasmruni_OBJECTS := $(NATIVE_C_SOURCES:.c=.ni.$(O)) $(ASM_OBJECTS) +libasmruni_OBJECTS := \ + $(NATIVE_C_SOURCES:.c=.ni.$(O)) $(ASM_OBJECTS:.$(O)=.i.$(O)) libasmrunpic_OBJECTS := $(NATIVE_C_SOURCES:.c=.npic.$(O)) \ $(ASM_OBJECTS:.$(O)=_libasmrunpic.$(O)) @@ -375,12 +379,19 @@ $(foreach object_type,$(subst %,,$(object_types)), \ # Compilation of assembly files -%.o: %.S - $(ASPP) $(ASPPFLAGS) -o $@ $< || \ - { echo "If your assembler produced syntax errors, it is probably";\ +ASPP_ERROR = \ + { echo "If your assembler produced syntax errors, it is probably";\ echo "unhappy with the preprocessor. 
Check your assembler, or";\ echo "try producing $*.o by hand.";\ exit 2; } +%.o: %.S + $(ASPP) $(ASPPFLAGS) -o $@ $< || $(ASPP_ERROR) + +%.d.o: %.S + $(ASPP) $(ASPPFLAGS) $(OC_DEBUG_CPPFLAGS) -o $@ $< || $(ASPP_ERROR) + +%.i.o: %.S + $(ASPP) $(ASPPFLAGS) $(OC_INSTR_CPPFLAGS) -o $@ $< || $(ASPP_ERROR) %_libasmrunpic.o: %.S $(ASPP) $(ASPPFLAGS) $(SHAREDLIB_CFLAGS) -o $@ $< @@ -397,6 +408,18 @@ amd64nt.obj: amd64nt.asm domain_state64.inc i386nt.obj: i386nt.asm domain_state32.inc $(ASM)$@ $(ASMFLAGS) $< +amd64nt.d.obj: amd64nt.asm domain_state64.inc + $(ASM)$@ $(ASMFLAGS) $(OC_DEBUG_CPPFLAGS) $< + +i386nt.d.obj: i386nt.asm domain_state32.inc + $(ASM)$@ $(ASMFLAGS) $(OC_DEBUG_CPPFLAGS) $< + +amd64nt.i.obj: amd64nt.asm domain_state64.inc + $(ASM)$@ $(ASMFLAGS) $(OC_INSTR_CPPFLAGS) $< + +i386nt.i.obj: i386nt.asm domain_state32.inc + $(ASM)$@ $(ASMFLAGS) $(OC_INSTR_CPPFLAGS) $< + %_libasmrunpic.obj: %.asm $(ASM)$@ $(ASMFLAGS) $< diff --git a/runtime/addrmap.c b/runtime/addrmap.c new file mode 100644 index 000000000000..a7562b3b6a17 --- /dev/null +++ b/runtime/addrmap.c @@ -0,0 +1,134 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2015 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ + +#include "caml/config.h" +#include "caml/memory.h" +#include "caml/addrmap.h" + +#define MAX_CHAIN 100 + +Caml_inline uintnat pos_initial(struct addrmap* t, value key) +{ + uintnat pos = (uintnat)key; + pos *= 0xcc9e2d51; + pos ^= (pos >> 17); + + CAMLassert(Is_power_of_2(t->size)); + return pos & (t->size - 1); +} + +Caml_inline uintnat pos_next(struct addrmap* t, uintnat pos) +{ + return (pos + 1) & (t->size - 1); +} + +int caml_addrmap_contains(struct addrmap* t, value key) +{ + uintnat pos, i; + + CAMLassert(Is_block(key)); + if (!t->entries) return 0; + + for (i = 0, pos = pos_initial(t, key); + i < MAX_CHAIN; + i++, pos = pos_next(t, pos)) { + if (t->entries[pos].key == ADDRMAP_INVALID_KEY) break; + if (t->entries[pos].key == key) return 1; + } + return 0; +} + +value caml_addrmap_lookup(struct addrmap* t, value key) +{ + uintnat pos; + + CAMLassert(Is_block(key)); + CAMLassert(t->entries); + + for (pos = pos_initial(t, key); ; pos = pos_next(t, pos)) { + CAMLassert(t->entries[pos].key != ADDRMAP_INVALID_KEY); + if (t->entries[pos].key == key) + return t->entries[pos].value; + } +} + +static void addrmap_alloc(struct addrmap* t, uintnat sz) +{ + uintnat i; + CAMLassert(Is_power_of_2(sz)); + t->entries = caml_stat_alloc(sizeof(struct addrmap_entry) * sz); + t->size = sz; + for (i = 0; i < sz; i++) { + t->entries[i].key = ADDRMAP_INVALID_KEY; + t->entries[i].value = ADDRMAP_NOT_PRESENT; + } +} + +void caml_addrmap_clear(struct addrmap* t) { + caml_stat_free(t->entries); + t->entries = 0; + t->size = 0; +} + + + +value* caml_addrmap_insert_pos(struct addrmap* t, value key) { + uintnat i, pos; + CAMLassert(Is_block(key)); + if (!t->entries) { + /* first call, initialise table with a small initial size */ + addrmap_alloc(t, 256); + } + for (i = 0, pos = pos_initial(t, key); + i < MAX_CHAIN; + i++, pos = pos_next(t, pos)) { + if (t->entries[pos].key == ADDRMAP_INVALID_KEY) { 
+      t->entries[pos].key = key;
+    }
+    if (t->entries[pos].key == key) {
+      return &t->entries[pos].value;
+    }
+  }
+  /* failed to insert, rehash and try again */
+  {
+    struct addrmap_entry* old_table = t->entries;
+    uintnat old_size = t->size;
+    addrmap_alloc(t, old_size * 2);
+    for (i = 0; i < old_size; i++) {
+      if (old_table[i].key != ADDRMAP_INVALID_KEY) {
+        value* p = caml_addrmap_insert_pos(t, old_table[i].key);
+        CAMLassert(*p == ADDRMAP_NOT_PRESENT);
+        *p = old_table[i].value;
+      }
+    }
+    caml_stat_free(old_table);
+  }
+  return caml_addrmap_insert_pos(t, key);
+}
+
+void caml_addrmap_insert(struct addrmap* t, value k, value v) {
+  value* p = caml_addrmap_insert_pos(t, k);
+  CAMLassert(*p == ADDRMAP_NOT_PRESENT);
+  *p = v;
+}
+
+void caml_addrmap_iter(struct addrmap* t, void (*f)(value, value)) {
+  addrmap_iterator i;
+  for (i = caml_addrmap_iterator(t);
+       caml_addrmap_iter_ok(t, i);
+       i = caml_addrmap_next(t, i)) {
+    f(caml_addrmap_iter_key(t, i),
+      caml_addrmap_iter_value(t, i));
+  }
+}
diff --git a/runtime/afl.c b/runtime/afl.c
index 9d5852424403..c78a3a8fb0ae 100644
--- a/runtime/afl.c
+++ b/runtime/afl.c
@@ -23,6 +23,7 @@ uintnat caml_afl_prev_loc;
 #if !defined(HAS_SYS_SHM_H) || !defined(HAS_SHMAT)
 
 #include "caml/mlvalues.h"
+#include "caml/domain.h"
 
 CAMLprim value caml_reset_afl_instrumentation(value full)
 {
@@ -113,10 +114,15 @@ CAMLexport value caml_setup_afl(value unit)
   }
   afl_read();
 
+  /* ensure that another module has not already spawned a domain */
+  if (caml_domain_is_multicore())
+    caml_fatal_error("afl-fuzz: cannot fork with multiple domains running");
+
   while (1) {
     int child_pid = fork();
     if (child_pid < 0) caml_fatal_error("afl-fuzz: could not fork");
     else if (child_pid == 0) {
+      caml_atfork_hook();
       /* Run the program */
       close(FORKSRV_FD_READ);
       close(FORKSRV_FD_WRITE);
diff --git a/runtime/alloc.c b/runtime/alloc.c
index 189d309d314a..8b5900c8ba24 100644
--- a/runtime/alloc.c
+++ b/runtime/alloc.c
@@ -21,16 +21,14 @@
  */
 
 #include <string.h>
+#include <stdarg.h>
 #include 
"caml/alloc.h" #include "caml/custom.h" #include "caml/major_gc.h" #include "caml/memory.h" #include "caml/mlvalues.h" -#include "caml/stacks.h" -#include "caml/signals.h" - -#define Setup_for_gc -#define Restore_after_gc +#include "caml/fiber.h" +#include "caml/domain.h" CAMLexport value caml_alloc (mlsize_t wosize, tag_t tag) { @@ -43,14 +41,15 @@ CAMLexport value caml_alloc (mlsize_t wosize, tag_t tag) if (wosize == 0){ result = Atom (tag); }else{ - Alloc_small (result, wosize, tag); + Alloc_small (result, wosize, tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); if (tag < No_scan_tag){ for (i = 0; i < wosize; i++) Field (result, i) = Val_unit; } } - }else{ + } else { result = caml_alloc_shr (wosize, tag); - if (tag < No_scan_tag){ + if (tag < No_scan_tag) { for (i = 0; i < wosize; i++) Field (result, i) = Val_unit; } result = caml_check_urgent_gc (result); @@ -58,6 +57,108 @@ CAMLexport value caml_alloc (mlsize_t wosize, tag_t tag) return result; } +Caml_inline void enter_gc_preserving_vals(mlsize_t wosize, value* vals) +{ + mlsize_t i; + CAMLparam0(); + /* Copy the values to be preserved to a different array. + The original vals array never escapes, generating better code in + the fast path. 
*/ + CAMLlocalN(vals_copy, wosize); + for (i = 0; i < wosize; i++) vals_copy[i] = vals[i]; + caml_handle_gc_interrupt_no_async_exceptions(); + for (i = 0; i < wosize; i++) vals[i] = vals_copy[i]; + CAMLreturn0; +} + +Caml_inline value do_alloc_small(mlsize_t wosize, tag_t tag, value* vals) +{ + value v; + mlsize_t i; + CAMLassert (tag < 256); + Alloc_small(v, wosize, tag, + { enter_gc_preserving_vals(wosize, vals); }); + for (i = 0; i < wosize; i++) { + Field(v, i) = vals[i]; + } + return v; +} + + +CAMLexport value caml_alloc_1 (tag_t tag, value a) +{ + value v[1] = {a}; + return do_alloc_small(1, tag, v); +} + +CAMLexport value caml_alloc_2 (tag_t tag, value a, value b) +{ + value v[2] = {a, b}; + return do_alloc_small(2, tag, v); +} + +CAMLexport value caml_alloc_3 (tag_t tag, value a, value b, value c) +{ + value v[3] = {a, b, c}; + return do_alloc_small(3, tag, v); +} + +CAMLexport value caml_alloc_4 (tag_t tag, value a, value b, value c, value d) +{ + value v[4] = {a, b, c, d}; + return do_alloc_small(4, tag, v); +} + +CAMLexport value caml_alloc_5 (tag_t tag, value a, value b, value c, value d, + value e) +{ + value v[5] = {a, b, c, d, e}; + return do_alloc_small(5, tag, v); +} + +CAMLexport value caml_alloc_6 (tag_t tag, value a, value b, value c, value d, + value e, value f) +{ + value v[6] = {a, b, c, d, e, f}; + return do_alloc_small(6, tag, v); +} + +CAMLexport value caml_alloc_7 (tag_t tag, value a, value b, value c, value d, + value e, value f, value g) +{ + value v[7] = {a, b, c, d, e, f, g}; + return do_alloc_small(7, tag, v); +} + +CAMLexport value caml_alloc_8 (tag_t tag, value a, value b, value c, value d, + value e, value f, value g, value h) +{ + value v[8] = {a, b, c, d, e, f, g, h}; + return do_alloc_small(8, tag, v); +} + +CAMLexport value caml_alloc_9 (tag_t tag, value a, value b, value c, value d, + value e, value f, value g, value h, value i) +{ + value v[9] = {a, b, c, d, e, f, g, h, i}; + return do_alloc_small(9, tag, v); +} + 
+CAMLexport value caml_alloc_N (mlsize_t wosize, tag_t tag, ...) +{ + va_list args; + mlsize_t i; + value vals[wosize]; + value ret; + va_start(args, tag); + for (i = 0; i < wosize; i++) + vals[i] = va_arg(args, value); + ret = do_alloc_small(wosize, tag, vals); + va_end(args); + return ret; +} + + CAMLexport value caml_alloc_small (mlsize_t wosize, tag_t tag) { value result; @@ -65,7 +166,9 @@ CAMLexport value caml_alloc_small (mlsize_t wosize, tag_t tag) CAMLassert (wosize > 0); CAMLassert (wosize <= Max_young_wosize); CAMLassert (tag < 256); - Alloc_small (result, wosize, tag); + CAMLassert (tag != Infix_tag); + Alloc_small (result, wosize, tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); return result; } @@ -83,7 +186,8 @@ CAMLexport value caml_alloc_string (mlsize_t len) mlsize_t wosize = (len + sizeof (value)) / sizeof (value); if (wosize <= Max_young_wosize) { - Alloc_small (result, wosize, String_tag); + Alloc_small (result, wosize, String_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); }else{ result = caml_alloc_shr (wosize, String_tag); result = caml_check_urgent_gc (result); @@ -123,7 +227,7 @@ CAMLexport value caml_copy_string(char const *s) } CAMLexport value caml_alloc_array(value (*funct)(char const *), - char const ** arr) + char const * const* arr) { CAMLparam0 (); mlsize_t nbr, n; @@ -155,8 +259,9 @@ value caml_alloc_float_array(mlsize_t len) if (wosize == 0) return Atom(0); else - Alloc_small (result, wosize, Double_array_tag); - }else { + Alloc_small (result, wosize, Double_array_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); + } else { result = caml_alloc_shr (wosize, Double_array_tag); result = caml_check_urgent_gc (result); } @@ -167,12 +272,12 @@ value caml_alloc_float_array(mlsize_t len) } -CAMLexport value caml_copy_string_array(char const ** arr) +CAMLexport value caml_copy_string_array(char const * const * arr) { return caml_alloc_array(caml_copy_string, arr); } -CAMLexport int 
caml_convert_flag_list(value list, int *flags) +CAMLexport int caml_convert_flag_list(value list, const int *flags) { int res; res = 0; @@ -218,7 +323,7 @@ CAMLprim value caml_alloc_dummy_infix(value vsize, value voffset) Closinfo_val(v) = Make_closinfo(0, wosize); if (offset > 0) { v += Bsize_wsize(offset); - Hd_val(v) = Make_header(offset, Infix_tag, Caml_white); + (((header_t *) (v)) [-1]) = Make_header(offset, Infix_tag, 0); } return v; } diff --git a/runtime/amd64.S b/runtime/amd64.S index 756d4a5a0abe..44bcd1ed4062 100644 --- a/runtime/amd64.S +++ b/runtime/amd64.S @@ -86,160 +86,223 @@ #define CFI_ENDPROC .cfi_endproc #define CFI_ADJUST(n) .cfi_adjust_cfa_offset n #define CFI_OFFSET(r, n) .cfi_offset r, n +#define CFI_DEF_CFA_OFFSET(n) .cfi_def_cfa_offset n +#define CFI_DEF_CFA_REGISTER(r) .cfi_def_cfa_register r #define CFI_SAME_VALUE(r) .cfi_same_value r +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state #else #define CFI_STARTPROC #define CFI_ENDPROC #define CFI_ADJUST(n) #define CFI_OFFSET(r, n) +#define CFI_DEF_CFA_OFFSET(n) +#define CFI_DEF_CFA_REGISTER(r) #define CFI_SAME_VALUE(r) -#endif - -#ifdef WITH_FRAME_POINTERS +#define CFI_SIGNAL_FRAME +#define CFI_REMEMBER_STATE +#define CFI_RESTORE_STATE -#define ENTER_FUNCTION \ - pushq %rbp; CFI_ADJUST(8); \ - movq %rsp, %rbp -#define LEAVE_FUNCTION \ - popq %rbp; CFI_ADJUST(-8); +#endif +#ifdef DEBUG +#define CHECK_STACK_ALIGNMENT \ + test $0xf, %rsp; jz 9f; int3; 9: +#define IF_DEBUG(...) __VA_ARGS__ #else +#define CHECK_STACK_ALIGNMENT +#define IF_DEBUG(...) 
+#endif -#define ENTER_FUNCTION \ - subq $8, %rsp; CFI_ADJUST (8); -#define LEAVE_FUNCTION \ - addq $8, %rsp; CFI_ADJUST (-8); +/* struct stack_info */ +#define Stack_sp 0 +#define Stack_exception 8 +#define Stack_handler 16 -#endif +/* struct stack_handler */ +#define Handler_value(REG) 0(REG) +#define Handler_exception(REG) 8(REG) +#define Handler_effect(REG) 16(REG) +#define Handler_parent 24 - .set domain_curr_field, 0 -#define DOMAIN_STATE(c_type, name) \ - .equ domain_field_caml_##name, domain_curr_field ; \ - .set domain_curr_field, domain_curr_field + 1 -#include "../runtime/caml/domain_state.tbl" -#undef DOMAIN_STATE +/* struct c_stack_link */ +#define Cstack_stack 0 +#define Cstack_sp 8 +#define Cstack_prev 16 -#define Caml_state(var) (8*domain_field_caml_##var)(%r14) +/******************************************************************************/ +/* DWARF */ +/******************************************************************************/ -#if defined(__PIC__) && !defined(SYS_mingw64) && !defined(SYS_cygwin) +/* These constants are taken from: -/* Position-independent operations on global variables. */ + DWARF Debugging Information Format, Version 3 + http://dwarfstd.org/doc/Dwarf3.pdf -/* Store [srcreg] in global [dstlabel]. Clobbers %r11. */ -#define STORE_VAR(srcreg,dstlabel) \ - movq GREL(dstlabel)(%rip), %r11 ; \ - movq srcreg, (%r11) + with the amd64-specific register numbers coming from + Fig. 3.36 ("DWARF Register Number Mapping") of: -#define STORE_VAR32(srcreg,dstlabel) \ - movq GREL(dstlabel)(%rip), %r11 ; \ - movl srcreg, (%r11) + System V Application Binary Interface + AMD64 Architecture Processor Supplement + Version 1.0 + https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf */ -/* Load global [srclabel] in register [dstreg]. Clobbers %r11. 
*/ -#define LOAD_VAR(srclabel,dstreg) \ - movq GREL(srclabel)(%rip), %r11 ; \ - movq (%r11), dstreg +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_val_offset 0x14 +#define DW_REG_rbp 6 +#define DW_REG_rsp 7 +#define DW_REG_r13 13 +#define DW_OP_breg 0x70 +#define DW_OP_deref 0x06 +#define DW_OP_plus_uconst 0x23 -/* Compare global [label] with register [reg]. Clobbers %rax. */ -#define CMP_VAR(label,reg) \ - movq GREL(label)(%rip), %rax ; \ - cmpq (%rax), reg +/******************************************************************************/ +/* Access to the current domain state block. */ +/******************************************************************************/ -/* Test 32-bit global [label] against mask [imm]. Clobbers %r11. */ -#define TESTL_VAR(imm,label) \ - movq GREL(label)(%rip), %r11 ; \ - testl imm, (%r11) +#define CAML_CONFIG_H_NO_TYPEDEFS +#include "../runtime/caml/config.h" -/* Push global [label] on stack. Clobbers %r11. */ -#define PUSH_VAR(srclabel) \ - movq GREL(srclabel)(%rip), %r11 ; \ - pushq (%r11); CFI_ADJUST (8) + .set domain_curr_field, 0 +#define DOMAIN_STATE(c_type, name) \ + .equ domain_field_caml_##name, domain_curr_field ; \ + .set domain_curr_field, domain_curr_field + 1 +#include "../runtime/caml/domain_state.tbl" +#undef DOMAIN_STATE -/* Pop global [label] off stack. Clobbers %r11. */ -#define POP_VAR(dstlabel) \ - movq GREL(dstlabel)(%rip), %r11 ; \ - popq (%r11); CFI_ADJUST (-8) +#define Caml_state(var) (8*domain_field_caml_##var)(%r14) /* Load address of global [label] in register [dst]. */ +#if defined(__PIC__) && !defined(SYS_mingw64) && !defined(SYS_cygwin) #define LEA_VAR(label,dst) \ movq GREL(label)(%rip), dst - #else +#define LEA_VAR(label,dst) \ + leaq G(label)(%rip), dst +#endif -/* Non-PIC operations on global variables. Slightly faster. 
*/ - -#define STORE_VAR(srcreg,dstlabel) \ - movq srcreg, G(dstlabel)(%rip) - -#define STORE_VAR32(srcreg,dstlabel) \ - movl srcreg, G(dstlabel)(%rip) - -#define LOAD_VAR(srclabel,dstreg) \ - movq G(srclabel)(%rip), dstreg - -#define CMP_VAR(label,reg) \ - cmpq G(label)(%rip), %r15 - -#define TESTL_VAR(imm,label) \ - testl imm, G(label)(%rip) +/* Push the current exception handler. Clobbers %r11 */ +#define PUSH_EXN_HANDLER \ + movq Caml_state(exn_handler), %r11; \ + pushq %r11; CFI_ADJUST(8); -#define PUSH_VAR(srclabel) \ - pushq G(srclabel)(%rip) ; CFI_ADJUST(8) +/* Pop the current exception handler. Undoes PUSH_EXN_HANDLER. Clobbers %r11 */ +#define POP_EXN_HANDLER \ + leaq Caml_state(exn_handler), %r11; \ + popq (%r11); CFI_ADJUST(-8) -#define POP_VAR(dstlabel) \ - popq G(dstlabel)(%rip); CFI_ADJUST(-8) +/******************************************************************************/ +/* Stack switching operations */ +/******************************************************************************/ -#define LEA_VAR(label,dst) \ - leaq G(label)(%rip), dst +/* Switch from OCaml to C stack. Clobbers %r10, %r11. */ +#ifdef ASM_CFI_SUPPORTED +#define SWITCH_OCAML_TO_C_CFI \ + CFI_REMEMBER_STATE; \ + /* %rsp points to the c_stack_link. */ \ + .cfi_escape DW_CFA_def_cfa_expression, 5, \ + DW_OP_breg + DW_REG_rsp, Cstack_sp, DW_OP_deref, \ + DW_OP_plus_uconst, 8 /* retaddr */ +#else +#define SWITCH_OCAML_TO_C_CFI #endif - +#define SWITCH_OCAML_TO_C \ + /* Fill in Caml_state->current_stack->sp */ \ + movq Caml_state(current_stack), %r10; \ + movq %rsp, Stack_sp(%r10); \ + /* Fill in Caml_state->c_stack */ \ + movq Caml_state(c_stack), %r11; \ + movq %rsp, Cstack_sp(%r11); \ + movq %r10, Cstack_stack(%r11); \ + /* Switch to C stack */ \ + movq %r11, %rsp; \ + SWITCH_OCAML_TO_C_CFI + +/* Switch from C to OCaml stack. Clobbers %r11. 
*/ +#define SWITCH_C_TO_OCAML \ + /* Assert that %rsp == Caml_state->c_stack && + Caml_state->c_stack->sp == Caml_state->current_stack->sp */ \ + IF_DEBUG(cmpq %rsp, Caml_state(c_stack); je 8f; int3; 8: \ + movq Caml_state(current_stack), %r11; \ + movq Stack_sp(%r11), %r11; \ + cmpq %r11, Cstack_sp(%rsp); je 8f; int3; 8:) \ + movq Cstack_sp(%rsp), %rsp; \ + CFI_RESTORE_STATE + +/* Load Caml_state->exn_handler into %rsp and restores prior exn_handler. + Clobbers %r10 and %r11. */ +#define RESTORE_EXN_HANDLER_OCAML \ + movq Caml_state(exn_handler), %rsp; \ + CFI_DEF_CFA_OFFSET(16); \ + POP_EXN_HANDLER + +/* Switch between OCaml stacks. Clobbers %r12. + Expects old stack in %rsi and target stack in %r10. + Leaves old stack in %rsi and target stack in %r10. */ +#define SWITCH_OCAML_STACKS \ + /* Save OCaml SP and exn_handler in the stack info */ \ + movq %rsp, Stack_sp(%rsi); \ + movq Caml_state(exn_handler), %r12; \ + movq %r12, Stack_exception(%rsi); \ + /* switch stacks */ \ + movq %r10, Caml_state(current_stack); \ + movq Stack_sp(%r10), %rsp; \ + CFI_DEF_CFA_OFFSET(8); \ + /* restore exn_handler for new stack */ \ + movq Stack_exception(%r10), %r12; \ + movq %r12, Caml_state(exn_handler) + +/******************************************************************************/ /* Save and restore all callee-save registers on stack. Keep the stack 16-aligned. 
*/ +/******************************************************************************/ #if defined(SYS_mingw64) || defined(SYS_cygwin) /* Win64 API: callee-save regs are rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 */ -#define PUSH_CALLEE_SAVE_REGS \ +#define PUSH_CALLEE_SAVE_REGS \ pushq %rbx; CFI_ADJUST (8); CFI_OFFSET(rbx, -16); \ pushq %rbp; CFI_ADJUST (8); CFI_OFFSET(rbp, -24); \ - /* Allows debugger to walk the stack */ \ + /* Allows debugger to walk the stack */ \ pushq %rsi; CFI_ADJUST (8); CFI_OFFSET(rsi, -32); \ pushq %rdi; CFI_ADJUST (8); CFI_OFFSET(rdi, -40); \ pushq %r12; CFI_ADJUST (8); CFI_OFFSET(r12, -48); \ pushq %r13; CFI_ADJUST (8); CFI_OFFSET(r13, -56); \ pushq %r14; CFI_ADJUST (8); CFI_OFFSET(r14, -64); \ pushq %r15; CFI_ADJUST (8); CFI_OFFSET(r15, -72); \ - subq $(8+10*16), %rsp; CFI_ADJUST (8+10*16); \ - movupd %xmm6, 0*16(%rsp); \ - movupd %xmm7, 1*16(%rsp); \ - movupd %xmm8, 2*16(%rsp); \ - movupd %xmm9, 3*16(%rsp); \ - movupd %xmm10, 4*16(%rsp); \ - movupd %xmm11, 5*16(%rsp); \ - movupd %xmm12, 6*16(%rsp); \ - movupd %xmm13, 7*16(%rsp); \ - movupd %xmm14, 8*16(%rsp); \ + subq $(10*16), %rsp; CFI_ADJUST (10*16); \ + movupd %xmm6, 0*16(%rsp); \ + movupd %xmm7, 1*16(%rsp); \ + movupd %xmm8, 2*16(%rsp); \ + movupd %xmm9, 3*16(%rsp); \ + movupd %xmm10, 4*16(%rsp); \ + movupd %xmm11, 5*16(%rsp); \ + movupd %xmm12, 6*16(%rsp); \ + movupd %xmm13, 7*16(%rsp); \ + movupd %xmm14, 8*16(%rsp); \ movupd %xmm15, 9*16(%rsp) -#define POP_CALLEE_SAVE_REGS \ - movupd 0*16(%rsp), %xmm6; \ - movupd 1*16(%rsp), %xmm7; \ - movupd 2*16(%rsp), %xmm8; \ - movupd 3*16(%rsp), %xmm9; \ - movupd 4*16(%rsp), %xmm10; \ - movupd 5*16(%rsp), %xmm11; \ - movupd 6*16(%rsp), %xmm12; \ - movupd 7*16(%rsp), %xmm13; \ - movupd 8*16(%rsp), %xmm14; \ - movupd 9*16(%rsp), %xmm15; \ - addq $(8+10*16), %rsp; CFI_ADJUST (-8-10*16); \ - popq %r15; CFI_ADJUST(-8); CFI_SAME_VALUE(r15); \ - popq %r14; CFI_ADJUST(-8); CFI_SAME_VALUE(r14); \ - popq %r13; CFI_ADJUST(-8); CFI_SAME_VALUE(r13); \ - 
popq %r12; CFI_ADJUST(-8); CFI_SAME_VALUE(r12); \ - popq %rdi; CFI_ADJUST(-8); CFI_SAME_VALUE(rdi); \ - popq %rsi; CFI_ADJUST(-8); CFI_SAME_VALUE(rsi); \ - popq %rbp; CFI_ADJUST(-8); CFI_SAME_VALUE(rbp); \ +#define POP_CALLEE_SAVE_REGS \ + movupd 0*16(%rsp), %xmm6; \ + movupd 1*16(%rsp), %xmm7; \ + movupd 2*16(%rsp), %xmm8; \ + movupd 3*16(%rsp), %xmm9; \ + movupd 4*16(%rsp), %xmm10; \ + movupd 5*16(%rsp), %xmm11; \ + movupd 6*16(%rsp), %xmm12; \ + movupd 7*16(%rsp), %xmm13; \ + movupd 8*16(%rsp), %xmm14; \ + movupd 9*16(%rsp), %xmm15; \ + addq $(10*16), %rsp; CFI_ADJUST (-10*16); \ + popq %r15; CFI_ADJUST(-8); CFI_SAME_VALUE(r15); \ + popq %r14; CFI_ADJUST(-8); CFI_SAME_VALUE(r14); \ + popq %r13; CFI_ADJUST(-8); CFI_SAME_VALUE(r13); \ + popq %r12; CFI_ADJUST(-8); CFI_SAME_VALUE(r12); \ + popq %rdi; CFI_ADJUST(-8); CFI_SAME_VALUE(rdi); \ + popq %rsi; CFI_ADJUST(-8); CFI_SAME_VALUE(rsi); \ + popq %rbp; CFI_ADJUST(-8); CFI_SAME_VALUE(rbp); \ popq %rbx; CFI_ADJUST(-8); CFI_SAME_VALUE(rbx) #else @@ -252,11 +315,9 @@ pushq %r12; CFI_ADJUST(8); CFI_OFFSET(r12, -32); \ pushq %r13; CFI_ADJUST(8); CFI_OFFSET(r13, -40); \ pushq %r14; CFI_ADJUST(8); CFI_OFFSET(r14, -48); \ - pushq %r15; CFI_ADJUST(8); CFI_OFFSET(r15, -56); \ - subq $8, %rsp; CFI_ADJUST(8) + pushq %r15; CFI_ADJUST(8); CFI_OFFSET(r15, -56) #define POP_CALLEE_SAVE_REGS \ - addq $8, %rsp; CFI_ADJUST(-8); \ popq %r15; CFI_ADJUST(-8); CFI_SAME_VALUE(r15); \ popq %r14; CFI_ADJUST(-8); CFI_SAME_VALUE(r14); \ popq %r13; CFI_ADJUST(-8); CFI_SAME_VALUE(r13); \ @@ -278,7 +339,12 @@ # define STACK_PROBE_SIZE 4096 #endif +#define C_call(target) \ + PREPARE_FOR_C_CALL; CHECK_STACK_ALIGNMENT; call target; CLEANUP_AFTER_C_CALL + +/******************************************************************************/ /* Registers holding arguments of C functions. 
*/ +/******************************************************************************/ #if defined(SYS_mingw64) || defined(SYS_cygwin) #define C_ARG_1 %rcx @@ -304,108 +370,129 @@ G(caml_hot__code_begin): G(caml_hot__code_end): #endif +/******************************************************************************/ +/* text section */ +/******************************************************************************/ + TEXT_SECTION(caml_system__code_begin) .globl G(caml_system__code_begin) G(caml_system__code_begin): ret /* just one instruction, so that debuggers don't display caml_system__code_begin instead of caml_call_gc */ +/******************************************************************************/ /* Allocation */ +/******************************************************************************/ + +/* Save all of the registers that may be in use to a free gc_regs bucket. + Returns: bucket in %r15. Clobbers %r11 (after saving it) */ +#define SAVE_ALL_REGS \ + /* First, save the young_ptr. 
*/ \ + movq %r15, Caml_state(young_ptr); \ + /* Now, use %r15 to point to the gc_regs bucket */ \ + /* We save %r11 first to allow it to be scratch */ \ + movq Caml_state(gc_regs_buckets), %r15; \ + movq %r11, 11*8(%r15); \ + movq 0(%r15), %r11; /* next ptr */ \ + movq %r11, Caml_state(gc_regs_buckets); \ + movq %rax, 0*8(%r15); \ + movq %rbx, 1*8(%r15); \ + movq %rdi, 2*8(%r15); \ + movq %rsi, 3*8(%r15); \ + movq %rdx, 4*8(%r15); \ + movq %rcx, 5*8(%r15); \ + movq %r8, 6*8(%r15); \ + movq %r9, 7*8(%r15); \ + movq %r12, 8*8(%r15); \ + movq %r13, 9*8(%r15); \ + movq %r10, 10*8(%r15); \ + /* %r11 is at 11*8(%r15); */ \ + movq %rbp, 12*8(%r15); \ + movsd %xmm0, (0+13)*8(%r15); \ + movsd %xmm1, (1+13)*8(%r15); \ + movsd %xmm2, (2+13)*8(%r15); \ + movsd %xmm3, (3+13)*8(%r15); \ + movsd %xmm4, (4+13)*8(%r15); \ + movsd %xmm5, (5+13)*8(%r15); \ + movsd %xmm6, (6+13)*8(%r15); \ + movsd %xmm7, (7+13)*8(%r15); \ + movsd %xmm8, (8+13)*8(%r15); \ + movsd %xmm9, (9+13)*8(%r15); \ + movsd %xmm10, (10+13)*8(%r15); \ + movsd %xmm11, (11+13)*8(%r15); \ + movsd %xmm12, (12+13)*8(%r15); \ + movsd %xmm13, (13+13)*8(%r15); \ + movsd %xmm14, (14+13)*8(%r15); \ + movsd %xmm15, (15+13)*8(%r15) + +/* Undo SAVE_ALL_REGS. 
Expects gc_regs bucket in %r15 */ +#define RESTORE_ALL_REGS \ + /* Restore %rax, freeing up the next ptr slot */ \ + movq 0*8(%r15), %rax; \ + movq Caml_state(gc_regs_buckets), %r11; \ + movq %r11, 0(%r15); /* next ptr */ \ + movq %r15, Caml_state(gc_regs_buckets); \ + /* above: 0*8(%r15),%rax; */ \ + movq 1*8(%r15),%rbx; \ + movq 2*8(%r15),%rdi; \ + movq 3*8(%r15),%rsi; \ + movq 4*8(%r15),%rdx; \ + movq 5*8(%r15),%rcx; \ + movq 6*8(%r15),%r8; \ + movq 7*8(%r15),%r9; \ + movq 8*8(%r15),%r12; \ + movq 9*8(%r15),%r13; \ + movq 10*8(%r15),%r10; \ + movq 11*8(%r15),%r11; \ + movq 12*8(%r15),%rbp; \ + movsd (0+13)*8(%r15),%xmm0; \ + movsd (1+13)*8(%r15),%xmm1; \ + movsd (2+13)*8(%r15),%xmm2; \ + movsd (3+13)*8(%r15),%xmm3; \ + movsd (4+13)*8(%r15),%xmm4; \ + movsd (5+13)*8(%r15),%xmm5; \ + movsd (6+13)*8(%r15),%xmm6; \ + movsd (7+13)*8(%r15),%xmm7; \ + movsd (8+13)*8(%r15),%xmm8; \ + movsd (9+13)*8(%r15),%xmm9; \ + movsd (10+13)*8(%r15),%xmm10; \ + movsd (11+13)*8(%r15),%xmm11; \ + movsd (12+13)*8(%r15),%xmm12; \ + movsd (13+13)*8(%r15),%xmm13; \ + movsd (14+13)*8(%r15),%xmm14; \ + movsd (15+13)*8(%r15),%xmm15; \ + movq Caml_state(young_ptr), %r15 + +FUNCTION(G(caml_call_realloc_stack)) +CFI_STARTPROC + CFI_SIGNAL_FRAME + SAVE_ALL_REGS + movq 8(%rsp), C_ARG_1 /* argument */ + SWITCH_OCAML_TO_C + C_call (GCALL(caml_try_realloc_stack)) + SWITCH_C_TO_OCAML + cmpq $0, %rax + jz 1f + RESTORE_ALL_REGS + ret +1: RESTORE_ALL_REGS + LEA_VAR(caml_exn_Stack_overflow, %rax) + add $16, %rsp /* pop argument, retaddr */ + jmp GCALL(caml_raise_exn) +CFI_ENDPROC +ENDFUNCTION(G(caml_call_realloc_stack)) FUNCTION(G(caml_call_gc)) - CFI_STARTPROC +CFI_STARTPROC + CFI_SIGNAL_FRAME LBL(caml_call_gc): - /* Record lowest stack address and return address. 
*/ - movq (%rsp), %r11 - movq %r11, Caml_state(last_return_address) - leaq 8(%rsp), %r11 - movq %r11, Caml_state(bottom_of_stack) - /* Touch the stack to trigger a recoverable segfault - if insufficient space remains */ - subq $(STACK_PROBE_SIZE), %rsp; CFI_ADJUST(STACK_PROBE_SIZE); - movq %r11, 0(%rsp) - addq $(STACK_PROBE_SIZE), %rsp; CFI_ADJUST(-STACK_PROBE_SIZE); - /* Build array of registers, save it into Caml_state->gc_regs */ -#ifdef WITH_FRAME_POINTERS - ENTER_FUNCTION ; -#else - pushq %rbp; CFI_ADJUST(8); -#endif - pushq %r11; CFI_ADJUST (8); - pushq %r10; CFI_ADJUST (8); - pushq %r13; CFI_ADJUST (8); - pushq %r12; CFI_ADJUST (8); - pushq %r9; CFI_ADJUST (8); - pushq %r8; CFI_ADJUST (8); - pushq %rcx; CFI_ADJUST (8); - pushq %rdx; CFI_ADJUST (8); - pushq %rsi; CFI_ADJUST (8); - pushq %rdi; CFI_ADJUST (8); - pushq %rbx; CFI_ADJUST (8); - pushq %rax; CFI_ADJUST (8); - movq %rsp, Caml_state(gc_regs) - /* Save young_ptr */ - movq %r15, Caml_state(young_ptr) - /* Save floating-point registers */ - subq $(16*8), %rsp; CFI_ADJUST (16*8); - movsd %xmm0, 0*8(%rsp) - movsd %xmm1, 1*8(%rsp) - movsd %xmm2, 2*8(%rsp) - movsd %xmm3, 3*8(%rsp) - movsd %xmm4, 4*8(%rsp) - movsd %xmm5, 5*8(%rsp) - movsd %xmm6, 6*8(%rsp) - movsd %xmm7, 7*8(%rsp) - movsd %xmm8, 8*8(%rsp) - movsd %xmm9, 9*8(%rsp) - movsd %xmm10, 10*8(%rsp) - movsd %xmm11, 11*8(%rsp) - movsd %xmm12, 12*8(%rsp) - movsd %xmm13, 13*8(%rsp) - movsd %xmm14, 14*8(%rsp) - movsd %xmm15, 15*8(%rsp) - /* Call the garbage collector */ - PREPARE_FOR_C_CALL - call GCALL(caml_garbage_collection) - CLEANUP_AFTER_C_CALL - /* Restore young_ptr */ - movq Caml_state(young_ptr), %r15 - /* Restore all regs used by the code generator */ - movsd 0*8(%rsp), %xmm0 - movsd 1*8(%rsp), %xmm1 - movsd 2*8(%rsp), %xmm2 - movsd 3*8(%rsp), %xmm3 - movsd 4*8(%rsp), %xmm4 - movsd 5*8(%rsp), %xmm5 - movsd 6*8(%rsp), %xmm6 - movsd 7*8(%rsp), %xmm7 - movsd 8*8(%rsp), %xmm8 - movsd 9*8(%rsp), %xmm9 - movsd 10*8(%rsp), %xmm10 - movsd 11*8(%rsp), 
%xmm11 - movsd 12*8(%rsp), %xmm12 - movsd 13*8(%rsp), %xmm13 - movsd 14*8(%rsp), %xmm14 - movsd 15*8(%rsp), %xmm15 - addq $(16*8), %rsp; CFI_ADJUST(-16*8) - popq %rax; CFI_ADJUST(-8) - popq %rbx; CFI_ADJUST(-8) - popq %rdi; CFI_ADJUST(-8) - popq %rsi; CFI_ADJUST(-8) - popq %rdx; CFI_ADJUST(-8) - popq %rcx; CFI_ADJUST(-8) - popq %r8; CFI_ADJUST(-8) - popq %r9; CFI_ADJUST(-8) - popq %r12; CFI_ADJUST(-8) - popq %r13; CFI_ADJUST(-8) - popq %r10; CFI_ADJUST(-8) - popq %r11; CFI_ADJUST(-8) -#ifdef WITH_FRAME_POINTERS - LEAVE_FUNCTION -#else - popq %rbp; CFI_ADJUST(-8); -#endif - /* Return to caller */ + SAVE_ALL_REGS + movq %r15, Caml_state(gc_regs) + SWITCH_OCAML_TO_C + C_call (GCALL(caml_garbage_collection)) + SWITCH_C_TO_OCAML + movq Caml_state(gc_regs), %r15 + RESTORE_ALL_REGS ret CFI_ENDPROC ENDFUNCTION(G(caml_call_gc)) @@ -445,170 +532,226 @@ CFI_STARTPROC CFI_ENDPROC ENDFUNCTION(G(caml_allocN)) +/******************************************************************************/ /* Call a C function from OCaml */ +/******************************************************************************/ FUNCTION(G(caml_c_call)) CFI_STARTPROC + CFI_SIGNAL_FRAME LBL(caml_c_call): - /* Record lowest stack address and return address */ - popq Caml_state(last_return_address); CFI_ADJUST(-8) - movq %rsp, Caml_state(bottom_of_stack) - /* equivalent to pushing last return address */ - subq $8, %rsp; CFI_ADJUST(8) - /* Touch the stack to trigger a recoverable segfault - if insufficient space remains */ - subq $(STACK_PROBE_SIZE), %rsp; CFI_ADJUST(STACK_PROBE_SIZE); - movq %rax, 0(%rsp) - addq $(STACK_PROBE_SIZE), %rsp; CFI_ADJUST(-STACK_PROBE_SIZE); + /* Arguments: + C arguments : %rdi, %rsi, %rdx, %rcx, %r8, and %r9 + C function : %rax */ + /* Switch from OCaml to C */ + SWITCH_OCAML_TO_C /* Make the alloc ptr available to the C code */ movq %r15, Caml_state(young_ptr) /* Call the function (address in %rax) */ - /* No need to PREPARE_FOR_C_CALL since the caller already - reserved the 
stack space if needed (cf. amd64/proc.ml) */ - jmp *%rax + C_call (*%rax) + /* Prepare for return to OCaml */ + movq Caml_state(young_ptr), %r15 + /* Load ocaml stack and restore global variables */ + SWITCH_C_TO_OCAML + /* Return to OCaml caller */ + ret CFI_ENDPROC ENDFUNCTION(G(caml_c_call)) +FUNCTION(G(caml_c_call_stack_args)) +CFI_STARTPROC + CFI_SIGNAL_FRAME + /* Arguments: + C arguments : %rdi, %rsi, %rdx, %rcx, %r8, and %r9 + C function : %rax + C stack args : begin=%r13 end=%r12 */ + /* Switch from OCaml to C */ + SWITCH_OCAML_TO_C + /* we use %rbp (otherwise unused) to enable backtraces */ + movq %rsp, %rbp +#ifdef ASM_CFI_SUPPORTED + .cfi_escape DW_CFA_def_cfa_expression, 5, \ + /* %rbp points to the c_stack_link structure */ \ + DW_OP_breg + DW_REG_rbp, Cstack_sp, DW_OP_deref, \ + DW_OP_plus_uconst, 8 /* ret addr */ +#endif + /* Make the alloc ptr available to the C code */ + movq %r15, Caml_state(young_ptr) + /* Copy arguments from OCaml to C stack */ +LBL(105): + subq $8, %r12 + cmpq %r13,%r12 + jb LBL(106) + push (%r12); CFI_ADJUST(8) + jmp LBL(105) +LBL(106): + /* Call the function (address in %rax) */ + C_call (*%rax) + /* Pop arguments back off the stack */ + movq Caml_state(c_stack), %rsp + /* Prepare for return to OCaml */ + movq Caml_state(young_ptr), %r15 + /* Load ocaml stack and restore global variables */ + SWITCH_C_TO_OCAML + /* Return to OCaml caller */ + ret +CFI_ENDPROC +ENDFUNCTION(G(caml_c_call_stack_args)) + +/******************************************************************************/ /* Start the OCaml program */ +/******************************************************************************/ FUNCTION(G(caml_start_program)) - CFI_STARTPROC +CFI_STARTPROC + CFI_SIGNAL_FRAME /* Save callee-save registers */ PUSH_CALLEE_SAVE_REGS /* Load Caml_state into r14 (was passed as an argument from C) */ movq C_ARG_1, %r14 /* Initial entry point is G(caml_program) */ LEA_VAR(caml_program, %r12) +#ifdef DEBUG + movq $0, %rax /* dummy */ + 
movq $0, %rbx /* dummy */ + movq $0, %rdi /* dummy */ + movq $0, %rsi /* dummy */ +#endif /* Common code for caml_start_program and caml_callback* */ LBL(caml_start_program): - /* Build a callback link */ - subq $8, %rsp; CFI_ADJUST (8) /* stack 16-aligned */ - pushq Caml_state(gc_regs); CFI_ADJUST(8) - pushq Caml_state(last_return_address); CFI_ADJUST(8) - pushq Caml_state(bottom_of_stack); CFI_ADJUST(8) - /* Setup alloc ptr */ + /* Load young_ptr into %r15 */ movq Caml_state(young_ptr), %r15 - /* Build an exception handler */ - lea LBL(108)(%rip), %r13 - pushq %r13; CFI_ADJUST(8) - pushq Caml_state(exception_pointer); CFI_ADJUST(8) - movq %rsp, Caml_state(exception_pointer) - /* Call the OCaml code */ + /* Build struct c_stack_link on the C stack */ + subq $24 /* sizeof struct c_stack_link */, %rsp; CFI_ADJUST(24) + movq $0, Cstack_stack(%rsp) + movq $0, Cstack_sp(%rsp) + movq Caml_state(c_stack), %r10 + movq %r10, Cstack_prev(%rsp) + movq %rsp, Caml_state(c_stack) + CHECK_STACK_ALIGNMENT + /* Load the OCaml stack. */ + movq Caml_state(current_stack), %r11 + movq Stack_sp(%r11), %r10 + /* Store the stack pointer to allow DWARF unwind */ + subq $16, %r10 + movq %rsp, 0(%r10) /* C_STACK_SP */ + /* Store the gc_regs for callbacks during a GC */ + movq Caml_state(gc_regs), %r11 + movq %r11, 8(%r10) + /* Build a handler for exceptions raised in OCaml on the OCaml stack. 
*/ + subq $16, %r10 + lea LBL(109)(%rip), %r11 + movq %r11, 8(%r10) + /* link in the previous exn_handler so that copying stacks works */ + movq Caml_state(exn_handler), %r11 + movq %r11, 0(%r10) + movq %r10, Caml_state(exn_handler) + /* Switch stacks and call the OCaml code */ + movq %r10, %rsp +#ifdef ASM_CFI_SUPPORTED + CFI_REMEMBER_STATE + .cfi_escape DW_CFA_def_cfa_expression, 3 + 2, \ + /* %rsp points to the exn handler on the OCaml stack */ \ + /* %rsp + 16 contains the C_STACK_SP */ \ + DW_OP_breg + DW_REG_rsp, 16 /* exn handler */, DW_OP_deref, \ + DW_OP_plus_uconst, \ + 24 /* struct c_stack_link */ + \ + 6*8 /* callee save regs */ + \ + 8 /* ret addr */ +#endif call *%r12 -LBL(107): - /* Pop the exception handler */ - popq Caml_state(exception_pointer); CFI_ADJUST(-8) - popq %r12; CFI_ADJUST(-8) /* dummy register */ -LBL(109): +LBL(108): + /* pop exn handler */ + movq 0(%rsp), %r11 + movq %r11, Caml_state(exn_handler) + leaq 16(%rsp), %r10 +1: /* restore GC regs */ + movq 8(%r10), %r11 + movq %r11, Caml_state(gc_regs) + addq $16, %r10 /* Update alloc ptr */ movq %r15, Caml_state(young_ptr) - /* Pop the callback link, restoring the global variables */ - popq Caml_state(bottom_of_stack); CFI_ADJUST(-8) - popq Caml_state(last_return_address); CFI_ADJUST(-8) - popq Caml_state(gc_regs); CFI_ADJUST(-8) - addq $8, %rsp; CFI_ADJUST (-8); + /* Return to C stack. */ + movq Caml_state(current_stack), %r11 + movq %r10, Stack_sp(%r11) + movq Caml_state(c_stack), %rsp + CFI_RESTORE_STATE + /* Pop the struct c_stack_link */ + movq Cstack_prev(%rsp), %r10 + movq %r10, Caml_state(c_stack) + addq $24, %rsp; CFI_ADJUST(-24) /* Restore callee-save registers. */ POP_CALLEE_SAVE_REGS /* Return to caller. 
*/ ret -LBL(108): +LBL(109): /* Exception handler*/ /* Mark the bucket as an exception result and return it */ orq $2, %rax - jmp LBL(109) + /* exn handler already popped here */ + movq %rsp, %r10 + jmp 1b CFI_ENDPROC ENDFUNCTION(G(caml_start_program)) +/******************************************************************************/ +/* Exceptions */ +/******************************************************************************/ + /* Raise an exception from OCaml */ FUNCTION(G(caml_raise_exn)) CFI_STARTPROC +LBL(caml_raise_exn): testq $1, Caml_state(backtrace_active) - jne LBL(110) - movq Caml_state(exception_pointer), %rsp - popq Caml_state(exception_pointer); CFI_ADJUST(-8) + jne LBL(116) + RESTORE_EXN_HANDLER_OCAML ret -LBL(110): - movq %rax, %r12 /* Save exception bucket */ - movq %rax, C_ARG_1 /* arg 1: exception bucket */ -#ifdef WITH_FRAME_POINTERS - ENTER_FUNCTION - movq 8(%rsp), C_ARG_2 /* arg 2: pc of raise */ - leaq 16(%rsp), C_ARG_3 /* arg 3: sp at raise */ -#else - popq C_ARG_2 /* arg 2: pc of raise */ - movq %rsp, C_ARG_3 /* arg 3: sp at raise */ -#endif - /* arg 4: sp of handler */ - movq Caml_state(exception_pointer), C_ARG_4 - /* PR#5700: thanks to popq above, stack is now 16-aligned */ - /* Thanks to ENTER_FUNCTION, stack is now 16-aligned */ - PREPARE_FOR_C_CALL /* no need to cleanup after */ - call GCALL(caml_stash_backtrace) - movq %r12, %rax /* Recover exception bucket */ - movq Caml_state(exception_pointer), %rsp - popq Caml_state(exception_pointer); CFI_ADJUST(-8) +LBL(116): + movq $0, Caml_state(backtrace_pos) +LBL(117): + movq %rsp, %r10 /* Save OCaml stack pointer */ + movq %rax, %r12 /* Save exception bucket */ + movq Caml_state(c_stack), %rsp + movq %rax, C_ARG_1 /* arg 1: exception bucket */ + movq (%r10), C_ARG_2 /* arg 2: pc of raise */ + leaq 8(%r10), C_ARG_3 /* arg 3: sp at raise */ + movq Caml_state(exn_handler), C_ARG_4 + /* arg 4: sp of handler */ + C_call (GCALL(caml_stash_backtrace)) + movq %r12, %rax /* Recover exception 
bucket */ + RESTORE_EXN_HANDLER_OCAML ret CFI_ENDPROC ENDFUNCTION(G(caml_raise_exn)) +FUNCTION(G(caml_reraise_exn)) +CFI_STARTPROC + testq $1, Caml_state(backtrace_active) + jne LBL(117) + RESTORE_EXN_HANDLER_OCAML + ret +CFI_ENDPROC +ENDFUNCTION(G(caml_reraise_exn)) + /* Raise an exception from C */ FUNCTION(G(caml_raise_exception)) CFI_STARTPROC - movq C_ARG_1, %r14 /* Caml_state */ - testq $1, Caml_state(backtrace_active) - jne LBL(112) + movq C_ARG_1, %r14 /* Caml_state */ movq C_ARG_2, %rax - movq Caml_state(exception_pointer), %rsp /* Cut stack */ - /* Recover previous exception handler */ - popq Caml_state(exception_pointer); CFI_ADJUST(-8) - movq Caml_state(young_ptr), %r15 /* Reload alloc ptr */ - ret -LBL(112): -#ifdef WITH_FRAME_POINTERS - ENTER_FUNCTION ; -#endif - /* Save exception bucket. Caml_state in r14 saved across C calls. */ - movq C_ARG_2, %r12 - /* arg 1: exception bucket */ - movq C_ARG_2, C_ARG_1 - /* arg 2: pc of raise */ - movq Caml_state(last_return_address), C_ARG_2 - /* arg 3: sp of raise */ - movq Caml_state(bottom_of_stack), C_ARG_3 - /* arg 4: sp of handler */ - movq Caml_state(exception_pointer), C_ARG_4 -#ifndef WITH_FRAME_POINTERS - subq $8, %rsp /* PR#5700: maintain stack alignment */ -#endif - PREPARE_FOR_C_CALL /* no need to cleanup after */ - call GCALL(caml_stash_backtrace) - movq %r12, %rax /* Recover exception bucket */ - movq Caml_state(exception_pointer), %rsp - /* Recover previous exception handler */ - popq Caml_state(exception_pointer); CFI_ADJUST(-8) - movq Caml_state(young_ptr), %r15 /* Reload alloc ptr */ - ret + /* Load young_ptr into %r15 */ + movq Caml_state(young_ptr), %r15 + /* Discard the C stack pointer and reset to ocaml stack */ + movq Caml_state(current_stack), %r10 + movq Stack_sp(%r10), %rsp /* FIXME: CFI */ + jmp LBL(caml_raise_exn) CFI_ENDPROC ENDFUNCTION(G(caml_raise_exception)) -/* Raise a Stack_overflow exception on return from segv_handler() - (in runtime/signals_nat.c). 
On entry, the stack is full, so we - cannot record a backtrace. - No CFI information here since this function disrupts the stack - backtrace anyway. */ - -FUNCTION(G(caml_stack_overflow)) - movq C_ARG_1, %r14 /* Caml_state */ - LEA_VAR(caml_exn_Stack_overflow, %rax) - movq Caml_state(exception_pointer), %rsp /* cut the stack */ - /* Recover previous exn handler */ - popq Caml_state(exception_pointer) - ret /* jump to handler's code */ -ENDFUNCTION(G(caml_stack_overflow)) - +/******************************************************************************/ /* Callback from C to OCaml */ +/******************************************************************************/ FUNCTION(G(caml_callback_asm)) CFI_STARTPROC @@ -619,6 +762,8 @@ CFI_STARTPROC movq C_ARG_2, %rbx /* closure */ movq 0(C_ARG_3), %rax /* argument */ movq 0(%rbx), %r12 /* code pointer */ + movq $0, %rdi /* dummy */ + movq $0, %rsi /* dummy */ jmp LBL(caml_start_program) CFI_ENDPROC ENDFUNCTION(G(caml_callback_asm)) @@ -633,6 +778,7 @@ CFI_STARTPROC movq 0(C_ARG_3), %rax /* first argument */ movq 8(C_ARG_3), %rbx /* second argument */ LEA_VAR(caml_apply2, %r12) /* code pointer */ + movq $0, %rsi /* dummy */ jmp LBL(caml_start_program) CFI_ENDPROC ENDFUNCTION(G(caml_callback2_asm)) @@ -652,6 +798,157 @@ CFI_STARTPROC CFI_ENDPROC ENDFUNCTION(G(caml_callback3_asm)) +/******************************************************************************/ +/* Fibers */ +/* + * A continuation is a one word object that points to a fiber. A fiber [f] will + * point to its parent at Handler_parent(Stack_handler(f)). In the following, + * the [last_fiber] refers to the last fiber in the linked-list formed by the + * parent pointer. 
+ */ +/******************************************************************************/ + +FUNCTION(G(caml_perform)) +CFI_STARTPROC + /* %rax: effect to perform + %rbx: freshly allocated continuation */ + movq Caml_state(current_stack), %rsi /* %rsi := old stack */ + leaq 1(%rsi), %rdi /* %rdi (last_fiber) := Val_ptr(old stack) */ + movq %rdi, 0(%rbx) /* Initialise continuation */ +LBL(do_perform): + /* %rsi: old stack + %rdi: last_fiber */ + movq Stack_handler(%rsi), %r11 /* %r11 := old stack -> handler */ + movq Handler_parent(%r11), %r10 /* %r10 := parent stack */ + cmpq $0, %r10 /* parent is NULL? */ + je LBL(112) + SWITCH_OCAML_STACKS /* preserves r11 and rsi */ + /* we have to null the Handler_parent after the switch because + the Handler_parent is needed to unwind the stack for backtraces */ + movq $0, Handler_parent(%r11) /* Set parent of performer to NULL */ + movq Handler_effect(%r11), %rsi /* %rsi := effect handler */ + jmp GCALL(caml_apply3) +LBL(112): + /* switch back to original performer before raising Unhandled + (no-op unless this is a reperform) */ + movq 0(%rbx), %r10 /* load performer stack from continuation */ + subq $1, %r10 /* r10 := Ptr_val(r10) */ + movq Caml_state(current_stack), %rsi + SWITCH_OCAML_STACKS + /* No parent stack. Raise Unhandled. 
*/ + LEA_VAR(caml_exn_Unhandled, %rax) + jmp LBL(caml_raise_exn) +CFI_ENDPROC +ENDFUNCTION(G(caml_perform)) + +FUNCTION(G(caml_reperform)) +CFI_STARTPROC + /* %rax: effect to reperform + %rbx: continuation + %rdi: last_fiber */ + movq Caml_state(current_stack), %rsi /* %rsi := old stack */ + movq (Stack_handler-1)(%rdi), %r10 + movq %rsi, Handler_parent(%r10) /* Append to last_fiber */ + leaq 1(%rsi), %rdi /* %rdi (last_fiber) := Val_ptr(old stack) */ + jmp LBL(do_perform) +CFI_ENDPROC +ENDFUNCTION(G(caml_reperform)) + +FUNCTION(G(caml_resume)) +CFI_STARTPROC + /* %rax -> stack, %rbx -> fun, %rdi -> arg */ + leaq -1(%rax), %r10 /* %r10 (new stack) = Ptr_val(%rax) */ + movq %rdi, %rax /* %rax := argument to function in %rbx */ + /* check if stack null, then already used */ + testq %r10, %r10 + jz 2f + /* Find end of list of stacks and add current */ + movq %r10, %rsi +1: movq Stack_handler(%rsi), %rcx + movq Handler_parent(%rcx), %rsi + testq %rsi, %rsi + jnz 1b + movq Caml_state(current_stack), %rsi + movq %rsi, Handler_parent(%rcx) + SWITCH_OCAML_STACKS + jmp *(%rbx) +2: LEA_VAR(caml_exn_Continuation_already_taken, %rax) + jmp LBL(caml_raise_exn) +CFI_ENDPROC +ENDFUNCTION(G(caml_resume)) + +/* Run a function on a new stack, + then invoke either the value or exception handler */ +FUNCTION(G(caml_runstack)) +CFI_STARTPROC + CFI_SIGNAL_FRAME + /* %rax -> stack, %rbx -> fun, %rdi -> arg */ + andq $-2, %rax /* %rax = Ptr_val(%rax) */ + /* save old stack pointer and exception handler */ + movq Caml_state(current_stack), %rcx + movq Caml_state(exn_handler), %r10 + movq %rsp, Stack_sp(%rcx) + movq %r10, Stack_exception(%rcx) + /* Load new stack pointer and set parent */ + movq Stack_handler(%rax), %r11 + movq %rcx, Handler_parent(%r11) + movq %rax, Caml_state(current_stack) + movq Stack_sp(%rax), %r11 + /* Create an exception handler on the target stack + after 16byte DWARF & gc_regs block (which is unused here) */ + subq $32, %r11 + leaq LBL(fiber_exn_handler)(%rip), 
%r10 + movq %r10, 8(%r11) + /* link the previous exn_handler so that copying stacks works */ + movq Stack_exception(%rax), %r10 + movq %r10, 0(%r11) + movq %r11, Caml_state(exn_handler) + /* Switch to the new stack */ + movq %r11, %rsp +#ifdef ASM_CFI_SUPPORTED + CFI_REMEMBER_STATE + .cfi_escape DW_CFA_def_cfa_expression, 3+3+2, \ + DW_OP_breg + DW_REG_rsp, \ + 16 /* exn */ + \ + 8 /* gc_regs slot (unused) */ + \ + 8 /* C_STACK_SP for DWARF (unused) */ \ + + Handler_parent, DW_OP_deref, \ + DW_OP_plus_uconst, Stack_sp, DW_OP_deref, \ + DW_OP_plus_uconst, 8 /* ret addr */ +#endif + movq %rdi, %rax /* first argument */ + callq *(%rbx) /* closure in %rbx (second argument) */ +LBL(frame_runstack): + leaq 32(%rsp), %r11 /* SP with exn handler popped */ + movq Handler_value(%r11), %rbx +1: movq Caml_state(current_stack), C_ARG_1 /* arg to caml_free_stack */ + /* restore parent stack and exn_handler into Caml_state */ + movq Handler_parent(%r11), %r10 + movq Stack_exception(%r10), %r11 + movq %r10, Caml_state(current_stack) + movq %r11, Caml_state(exn_handler) + /* free old stack by switching directly to c_stack; is a no-alloc call */ + movq Stack_sp(%r10), %r13 /* saved across C call */ + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + CFI_DEF_CFA_REGISTER(DW_REG_r13) + movq %rax, %r12 /* save %rax across C call */ + movq Caml_state(c_stack), %rsp + C_call (GCALL(caml_free_stack)) + /* switch directly to parent stack with correct return */ + movq %r13, %rsp + CFI_RESTORE_STATE + movq %r12, %rax + + /* Invoke handle_value (or handle_exn) */ + jmp *(%rbx) +LBL(fiber_exn_handler): + leaq 16(%rsp), %r11 + movq Handler_exception(%r11), %rbx + jmp 1b +CFI_ENDPROC +ENDFUNCTION(G(caml_runstack)) + FUNCTION(G(caml_ml_array_bound_error)) CFI_STARTPROC LEA_VAR(caml_array_bound_error, %rax) @@ -659,6 +956,20 @@ CFI_STARTPROC CFI_ENDPROC ENDFUNCTION(G(caml_ml_array_bound_error)) +FUNCTION(G(caml_assert_stack_invariants)) +CFI_STARTPROC +/* CHECK_STACK_ALIGNMENT */ + movq 
Caml_state(current_stack), %r11 + movq %rsp, %r10 + subq %r11, %r10 /* %r10: number of bytes left on stack */ + /* can be two words over: the return addresses */ + cmp $((Stack_threshold_words + Stack_ctx_words - 2)*8), %r10 + jge 1f + int3 +1: ret +CFI_ENDPROC +ENDFUNCTION(G(caml_assert_stack_invariants)) + TEXT_SECTION(caml_system__code_end) .globl G(caml_system__code_end) G(caml_system__code_end): @@ -667,14 +978,14 @@ G(caml_system__code_end): .globl G(caml_system__frametable) .align EIGHT_ALIGN G(caml_system__frametable): - .quad 1 /* one descriptor */ - .quad LBL(107) /* return address into callback */ + .quad 2 /* two descriptors */ + .quad LBL(108) /* return address into callback */ .value -1 /* negative frame size => use callback link */ .value 0 /* no roots here */ .align EIGHT_ALIGN - .quad 16 - .quad 0 - .string "amd64.S" + .quad LBL(frame_runstack) /* return address into fiber_val_handler */ + .value -1 /* negative frame size => use callback link */ + .value 0 /* no roots here */ #if defined(SYS_macosx) .literal16 diff --git a/runtime/array.c b/runtime/array.c index 0c2577bd2daa..dec9e230fcd4 100644 --- a/runtime/array.c +++ b/runtime/array.c @@ -68,11 +68,8 @@ CAMLprim value caml_floatarray_get(value array, value index) if (idx < 0 || idx >= Wosize_val(array) / Double_wosize) caml_array_bound_error(); d = Double_flat_field(array, idx); -#define Setup_for_gc -#define Restore_after_gc - Alloc_small(res, Double_wosize, Double_tag); -#undef Setup_for_gc -#undef Restore_after_gc + Alloc_small(res, Double_wosize, Double_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); Store_double_val(res, d); return res; } @@ -131,11 +128,8 @@ CAMLprim value caml_floatarray_unsafe_get(value array, value index) CAMLassert (Tag_val(array) == Double_array_tag); d = Double_flat_field(array, idx); -#define Setup_for_gc -#define Restore_after_gc - Alloc_small(res, Double_wosize, Double_tag); -#undef Setup_for_gc -#undef Restore_after_gc + Alloc_small(res, 
Double_wosize, Double_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); Store_double_val(res, d); return res; } @@ -161,6 +155,10 @@ static value caml_array_unsafe_set_addr(value array, value index,value newval) } /* [ floatarray -> int -> float -> unit ] */ +/* [MM]: [caml_array_unsafe_set_addr] has a fence for enforcing the OCaml + memory model through its use of [caml_modify]. + [MM] [TODO]: [caml_floatarray_unsafe_set] will also need a similar fence in + [Store_double_flat_field]. */ CAMLprim value caml_floatarray_unsafe_set(value array, value index,value newval) { intnat idx = Long_val(index); @@ -191,18 +189,15 @@ CAMLprim value caml_floatarray_create(value len) if (wosize == 0) return Atom(0); else -#define Setup_for_gc -#define Restore_after_gc - Alloc_small (result, wosize, Double_array_tag); -#undef Setup_for_gc -#undef Restore_after_gc + Alloc_small (result, wosize, Double_array_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); }else if (wosize > Max_wosize) caml_invalid_argument("Float.Array.create"); else { result = caml_alloc_shr (wosize, Double_array_tag); } - // Give the GC a chance to run, and run memprof callbacks - return caml_process_pending_actions_with_root (result); + /* Give the GC a chance to run */ + return caml_check_urgent_gc (result); } /* [len] is a [value] representing number of words or floats */ @@ -217,8 +212,7 @@ CAMLprim value caml_make_vect(value len, value init) res = Atom(0); #ifdef FLAT_FLOAT_ARRAY } else if (Is_block(init) - && Is_in_value_area(init) - && Tag_val(init) == Double_tag) { + && Tag_val(init) == Double_tag) { mlsize_t wsize; double d; d = Double_val(init); @@ -249,7 +243,7 @@ CAMLprim value caml_make_vect(value len, value init) for (i = 0; i < size; i++) Field(res, i) = init; } } - // Give the GC a chance to run, and run memprof callbacks + /* Give the GC a chance to run */ caml_process_pending_actions (); CAMLreturn (res); } @@ -289,7 +283,6 @@ CAMLprim value caml_make_array(value init) } 
else { v = Field(init, 0); if (Is_long(v) - || ! Is_in_value_area(v) || Tag_val(v) != Double_tag) { CAMLreturn (init); } else { @@ -303,7 +296,6 @@ CAMLprim value caml_make_array(value init) double d = Double_val(Field(init, i)); Store_double_flat_field(res, i, d); } - // run memprof callbacks caml_process_pending_actions(); CAMLreturn (res); } @@ -315,9 +307,48 @@ CAMLprim value caml_make_array(value init) /* Blitting */ +/* [wo_memmove] copies [nvals] values from [src] to [dst]. If there is a single + domain running, then we use [memmove]. Otherwise, we copy one word at a + time. + + Since the [memmove] implementation does not guarantee that the writes are + always word-sized, we explicitly perform word-sized writes of the release + kind to avoid mixed-mode accesses. Performing release writes should be + sufficient to prevent smart compilers from coalesing the writes into vector + writes, and hence prevent mixed-mode accesses. [MM]. + */ +static void wo_memmove (value* const dst, const value* const src, + mlsize_t nvals) +{ + mlsize_t i; + + if (caml_domain_alone ()) { + memmove (dst, src, nvals * sizeof (value)); + } else { + /* See memory model [MM] notes in memory.c */ + atomic_thread_fence(memory_order_acquire); + if (dst < src) { + /* copy ascending */ + for (i = 0; i < nvals; i++) + atomic_store_explicit(&((atomic_value*)dst)[i], src[i], + memory_order_release); + + } else { + /* copy descending */ + for (i = nvals; i > 0; i--) + atomic_store_explicit(&((atomic_value*)dst)[i-1], src[i-1], + memory_order_release); + } + } +} + +/* [MM] [TODO]: Not consistent with the memory model. See the discussion in + https://github.com/ocaml-multicore/ocaml-multicore/pull/822. 
*/ CAMLprim value caml_floatarray_blit(value a1, value ofs1, value a2, value ofs2, value n) { + /* See memory model [MM] notes in memory.c */ + atomic_thread_fence(memory_order_acquire); memmove((double *)a2 + Long_val(ofs2), (double *)a1 + Long_val(ofs1), Long_val(n) * sizeof(double)); @@ -339,10 +370,10 @@ CAMLprim value caml_array_blit(value a1, value ofs1, value a2, value ofs2, /* Arrays of values, destination is in young generation. Here too we can do a direct copy since this cannot create old-to-young pointers, nor mess up with the incremental major GC. - Again, memmove takes care of overlap. */ - memmove(&Field(a2, Long_val(ofs2)), - &Field(a1, Long_val(ofs1)), - Long_val(n) * sizeof(value)); + Again, wo_memmove takes care of overlap. */ + wo_memmove(&Field(a2, Long_val(ofs2)), + &Field(a1, Long_val(ofs1)), + Long_val(n)); return Val_unit; } /* Array of values, destination is in old generation. @@ -406,6 +437,8 @@ static value caml_array_gather(intnat num_arrays, wsize = size * Double_wosize; res = caml_alloc(wsize, Double_array_tag); for (i = 0, pos = 0; i < num_arrays; i++) { + /* [res] is freshly allocated, and no other domain has a reference to it. + Hence, a plain [memcpy] is sufficient. */ memcpy((double *)res + pos, (double *)arrays[i] + offsets[i], lengths[i] * sizeof(double)); @@ -419,6 +452,8 @@ static value caml_array_gather(intnat num_arrays, We can use memcpy directly. */ res = caml_alloc_small(size, 0); for (i = 0, pos = 0; i < num_arrays; i++) { + /* [res] is freshly allocated, and no other domain has a reference to it. + Hence, a plain [memcpy] is sufficient. */ memcpy(&Field(res, pos), &Field(arrays[i], offsets[i]), lengths[i] * sizeof(value)); @@ -445,7 +480,7 @@ static value caml_array_gather(intnat num_arrays, /* Many caml_initialize in a row can create a lot of old-to-young refs. Give the minor GC a chance to run if it needs to. Run memprof callbacks for the major allocation. 
*/ - res = caml_process_pending_actions_with_root (res); + res = caml_check_urgent_gc(res); } CAMLreturn (res); } @@ -539,17 +574,16 @@ CAMLprim value caml_array_fill(value array, for (; len > 0; len--, fp++) *fp = val; } else { int is_val_young_block = Is_block(val) && Is_young(val); - CAMLassert(Is_in_heap(fp)); for (; len > 0; len--, fp++) { value old = *fp; if (old == val) continue; *fp = val; if (Is_block(old)) { if (Is_young(old)) continue; - if (caml_gc_phase == Phase_mark) caml_darken(old, NULL); + caml_darken(NULL, old, NULL); } if (is_val_young_block) - add_to_ref_table (Caml_state->ref_table, fp); + Ref_table_add(&Caml_state->minor_tables->major_ref, fp); } if (is_val_young_block) caml_check_urgent_gc (Val_unit); } diff --git a/runtime/backtrace.c b/runtime/backtrace.c index 3af8b9ec7e95..1200449f8be8 100644 --- a/runtime/backtrace.c +++ b/runtime/backtrace.c @@ -29,23 +29,14 @@ #include "caml/debugger.h" #include "caml/startup.h" -void caml_init_backtrace(void) -{ - caml_register_global_root(&Caml_state->backtrace_last_exn); -} - /* Start or stop the backtrace machinery */ CAMLexport void caml_record_backtraces(int flag) { if (flag != Caml_state->backtrace_active) { Caml_state->backtrace_active = flag; Caml_state->backtrace_pos = 0; - Caml_state->backtrace_last_exn = Val_unit; - /* Note: We do lazy initialization of Caml_state->backtrace_buffer when - needed in order to simplify the interface with the thread - library (thread creation doesn't need to allocate - Caml_state->backtrace_buffer). So we don't have to allocate it here. - */ + caml_modify_generational_global_root(&Caml_state->backtrace_last_exn, + Val_unit); } return; } @@ -166,17 +157,35 @@ CAMLprim value caml_get_exception_raw_backtrace(value unit) CAMLparam0(); CAMLlocal1(res); + /* Beware: the allocations below may cause finalizers to be run, and another + backtrace---possibly of a different length---to be stashed (for example + if the finalizer raises then catches an exception). 
We choose to ignore + any such finalizer backtraces and return the original one. */ + if (!Caml_state->backtrace_active || Caml_state->backtrace_buffer == NULL || Caml_state->backtrace_pos == 0) { res = caml_alloc(0, 0); } else { - intnat i, len = Caml_state->backtrace_pos; + backtrace_slot saved_caml_backtrace_buffer[BACKTRACE_BUFFER_SIZE]; + int saved_caml_backtrace_pos; + intnat i; + + saved_caml_backtrace_pos = Caml_state->backtrace_pos; + + if (saved_caml_backtrace_pos > BACKTRACE_BUFFER_SIZE) { + saved_caml_backtrace_pos = BACKTRACE_BUFFER_SIZE; + } + + memcpy(saved_caml_backtrace_buffer, Caml_state->backtrace_buffer, + saved_caml_backtrace_pos * sizeof(backtrace_slot)); - res = caml_alloc(len, 0); - for (i = 0; i < len; i++) - Field(res, i) = Val_backtrace_slot(Caml_state->backtrace_buffer[i]); + res = caml_alloc(saved_caml_backtrace_pos, 0); + for (i = 0; i < saved_caml_backtrace_pos; i++) { + caml_initialize(&Field(res, i), + Val_backtrace_slot(saved_caml_backtrace_buffer[i])); + } } CAMLreturn(res); @@ -190,7 +199,9 @@ CAMLprim value caml_restore_raw_backtrace(value exn, value backtrace) intnat i; mlsize_t bt_size; - Caml_state->backtrace_last_exn = exn; + caml_domain_state* domain_state = Caml_state; + + caml_modify_generational_global_root (&domain_state->backtrace_last_exn, exn); bt_size = Wosize_val(backtrace); if(bt_size > BACKTRACE_BUFFER_SIZE){ @@ -200,19 +211,19 @@ CAMLprim value caml_restore_raw_backtrace(value exn, value backtrace) /* We don't allocate if the backtrace is empty (no -g or backtrace not activated) */ if(bt_size == 0){ - Caml_state->backtrace_pos = 0; + domain_state->backtrace_pos = 0; return Val_unit; } /* Allocate if needed and copy the backtrace buffer */ - if (Caml_state->backtrace_buffer == NULL && - caml_alloc_backtrace_buffer() == -1) { + if (domain_state->backtrace_buffer == NULL + && caml_alloc_backtrace_buffer() == -1){ return Val_unit; } - Caml_state->backtrace_pos = bt_size; - for(i=0; i < Caml_state->backtrace_pos; 
i++){ - Caml_state->backtrace_buffer[i] = Backtrace_slot_val(Field(backtrace, i)); + domain_state->backtrace_pos = bt_size; + for(i=0; i < domain_state->backtrace_pos; i++){ + domain_state->backtrace_buffer[i] = Backtrace_slot_val(Field(backtrace, i)); } return Val_unit; @@ -358,23 +369,9 @@ CAMLprim value caml_get_exception_backtrace(value unit) Store_field(arr, i, caml_convert_debuginfo(dbg)); } - res = caml_alloc_small(1, 0); Field(res, 0) = arr; /* Some */ + res = caml_alloc_small(1, 0); + Field(res, 0) = arr; /* Some */ } CAMLreturn(res); } - -CAMLprim value caml_get_current_callstack(value max_frames_value) -{ - CAMLparam1(max_frames_value); - CAMLlocal1(res); - value* callstack = NULL; - intnat callstack_alloc_len = 0; - intnat callstack_len = - caml_collect_current_callstack(&callstack, &callstack_alloc_len, - Long_val(max_frames_value), -1); - res = caml_alloc(callstack_len, 0); - memcpy(Op_val(res), callstack, sizeof(value) * callstack_len); - caml_stat_free(callstack); - CAMLreturn(res); -} diff --git a/runtime/backtrace_byt.c b/runtime/backtrace_byt.c index 61ca5603d4f4..281c734acda8 100644 --- a/runtime/backtrace_byt.c +++ b/runtime/backtrace_byt.c @@ -37,7 +37,7 @@ #include "caml/fix_code.h" #include "caml/memory.h" #include "caml/startup.h" -#include "caml/stacks.h" +#include "caml/fiber.h" #include "caml/sys.h" #include "caml/backtrace.h" #include "caml/fail.h" @@ -257,7 +257,8 @@ value caml_remove_debug_info(code_t start) CAMLreturn(Val_unit); } -int caml_alloc_backtrace_buffer(void){ +int caml_alloc_backtrace_buffer (void) +{ CAMLassert(Caml_state->backtrace_pos == 0); Caml_state->backtrace_buffer = caml_stat_alloc_noexc(BACKTRACE_BUFFER_SIZE * sizeof(code_t)); @@ -270,9 +271,11 @@ int caml_alloc_backtrace_buffer(void){ void caml_stash_backtrace(value exn, value * sp, int reraise) { + value *trap_sp; + if (exn != Caml_state->backtrace_last_exn || !reraise) { Caml_state->backtrace_pos = 0; - Caml_state->backtrace_last_exn = exn; + 
caml_modify_generational_global_root(&Caml_state->backtrace_last_exn, exn); } if (Caml_state->backtrace_buffer == NULL && @@ -281,7 +284,8 @@ void caml_stash_backtrace(value exn, value * sp, int reraise) /* Traverse the stack and put all values pointing into bytecode into the backtrace buffer. */ - for (/*nothing*/; sp < Caml_state->trapsp; sp++) { + trap_sp = Stack_high(Caml_state->current_stack) + Caml_state->trap_sp_off; + for (/*nothing*/; sp < trap_sp; sp++) { code_t p; if (Is_long(*sp)) continue; p = (code_t) *sp; @@ -292,18 +296,19 @@ void caml_stash_backtrace(value exn, value * sp, int reraise) } /* returns the next frame pointer (or NULL if none is available); - updates *sp to point to the following one, and *trsp to the next + updates *sp to point to the following one, and *trap_spoff to the next trap frame, which we will skip when we reach it */ -code_t caml_next_frame_pointer(value ** sp, value ** trsp) +code_t caml_next_frame_pointer(value* stack_high, value ** sp, + intnat * trap_spoff) { - while (*sp < Caml_state->stack_high) { + while (*sp < stack_high) { value *spv = (*sp)++; code_t *p; if (Is_long(*spv)) continue; p = (code_t*) spv; - if(&Trap_pc(*trsp) == p) { - *trsp = *trsp + Long_val(Trap_link_offset(*trsp)); + if((code_t*)&Trap_pc(stack_high + *trap_spoff) == p) { + *trap_spoff = Trap_link(stack_high + *trap_spoff); continue; } @@ -313,40 +318,110 @@ code_t caml_next_frame_pointer(value ** sp, value ** trsp) return NULL; } -#define Default_callstack_size 32 -intnat caml_collect_current_callstack(value** ptrace, intnat* plen, - intnat max_frames, int alloc_idx) +/* Stores upto [max_frames_value] frames of the current call stack to + return to the user. This is used not in an exception-raising context, but + only when the user requests to save the trace (hopefully less often). + Instead of using a bounded buffer as [Caml_state->stash_backtrace], we first + traverse the stack to compute the right size, then allocate space for the + trace. 
*/ + +static void get_callstack(value* sp, intnat trap_spoff, + struct stack_info* stack, + intnat max_frames, + code_t** trace, intnat* trace_size) { - value * sp = Caml_state->extern_sp; - value * trsp = Caml_state->trapsp; - intnat trace_pos = 0; - CAMLassert(alloc_idx == 0 || alloc_idx == -1); - - if (max_frames <= 0) return 0; - if (*plen == 0) { - value* trace = - caml_stat_alloc_noexc(Default_callstack_size * sizeof(value)); - if (trace == NULL) return 0; - *ptrace = trace; - *plen = Default_callstack_size; + struct stack_info* parent = Stack_parent(stack); + value *stack_high = Stack_high(stack); + value* saved_sp = sp; + intnat saved_trap_spoff = trap_spoff; + + CAMLnoalloc; + + /* first compute the size of the trace */ + { + *trace_size = 0; + while (*trace_size < max_frames) { + code_t p = caml_next_frame_pointer(stack_high, &sp, &trap_spoff); + if (p == NULL) { + if (parent == NULL) break; + sp = parent->sp; + trap_spoff = Long_val(sp[0]); + stack_high = Stack_high(parent); + parent = Stack_parent(parent); + } else { + ++*trace_size; + } + } } - while (trace_pos < max_frames) { - code_t p = caml_next_frame_pointer(&sp, &trsp); - if (p == NULL) break; - if (trace_pos == *plen) { - intnat new_len = *plen * 2; - value * trace = caml_stat_resize_noexc(*ptrace, new_len * sizeof(value)); - if (trace == NULL) break; - *ptrace = trace; - *plen = new_len; + *trace = caml_stat_alloc(sizeof(code_t*) * *trace_size); + + sp = saved_sp; + parent = Stack_parent(stack); + stack_high = Stack_high(stack); + trap_spoff = saved_trap_spoff; + + /* then collect the trace */ + { + uintnat trace_pos = 0; + + while (trace_pos < *trace_size) { + code_t p = caml_next_frame_pointer(stack_high, &sp, &trap_spoff); + if (p == NULL) { + sp = parent->sp; + trap_spoff = Long_val(sp[0]); + stack_high = Stack_high(parent); + parent = Stack_parent(parent); + } else { + (*trace)[trace_pos] = p; + ++trace_pos; + } } - (*ptrace)[trace_pos++] = Val_backtrace_slot(p); } +} - return trace_pos; 
+static value alloc_callstack(code_t* trace, intnat trace_len) +{ + CAMLparam0(); + CAMLlocal1(callstack); + int i; + callstack = caml_alloc(trace_len, 0); + for (i = 0; i < trace_len; i++) + Store_field(callstack, i, Val_backtrace_slot(trace[i])); + caml_stat_free(trace); + CAMLreturn(callstack); +} + +CAMLprim value caml_get_current_callstack (value max_frames_value) +{ + code_t* trace; + intnat trace_len; + get_callstack(Caml_state->current_stack->sp, Caml_state->trap_sp_off, + Caml_state->current_stack, Long_val(max_frames_value), + &trace, &trace_len); + return alloc_callstack(trace, trace_len); } +CAMLprim value caml_get_continuation_callstack (value cont, value max_frames) +{ + code_t* trace; + intnat trace_len; + struct stack_info *stack; + value *sp; + + stack = Ptr_val(caml_continuation_use(cont)); + { + CAMLnoalloc; /* GC must not see the stack outside the cont */ + sp = stack->sp; + get_callstack(sp, Long_val(sp[0]), stack, Long_val(max_frames), + &trace, &trace_len); + caml_continuation_replace(cont, stack); + } + + return alloc_callstack(trace, trace_len); +} + + /* Read the debugging info contained in the current bytecode executable. */ static void read_main_debug_info(struct debug_info *di) @@ -366,13 +441,13 @@ static void read_main_debug_info(struct debug_info *di) See https://github.com/ocaml/ocaml/issues/9344 for details. 
*/ - if (caml_cds_file == NULL && caml_byte_program_mode == COMPLETE_EXE) + if (caml_params->cds_file == NULL && caml_byte_program_mode == COMPLETE_EXE) CAMLreturn0; - if (caml_cds_file != NULL) { - exec_name = caml_cds_file; + if (caml_params->cds_file != NULL) { + exec_name = (char_os*) caml_params->cds_file; } else { - exec_name = caml_exe_name; + exec_name = (char_os*) caml_params->exe_name; } fd = caml_attempt_open(&exec_name, &trail, 1); @@ -397,7 +472,7 @@ static void read_main_debug_info(struct debug_info *di) /* Relocate events in event list */ for (l = evl; l != Val_int(0); l = Field(l, 1)) { value ev = Field(l, 0); - Field(ev, EV_POS) = Val_long(Long_val(Field(ev, EV_POS)) + orig); + Store_field (ev, EV_POS, Val_long(Long_val(Field(ev, EV_POS)) + orig)); } /* Record event list */ Store_field(events, i, evl); @@ -422,7 +497,7 @@ CAMLexport void caml_init_debug_info(void) CAMLexport void caml_load_main_debug_info(void) { - if (Caml_state->backtrace_active > 1) { + if (caml_params->backtrace_enabled > 1) { read_main_debug_info(caml_debug_info.contents[0]); } } diff --git a/runtime/backtrace_nat.c b/runtime/backtrace_nat.c index 61228311ad5c..9c155e643095 100644 --- a/runtime/backtrace_nat.c +++ b/runtime/backtrace_nat.c @@ -24,43 +24,46 @@ #include "caml/alloc.h" #include "caml/backtrace.h" #include "caml/backtrace_prim.h" +#include "caml/frame_descriptors.h" +#include "caml/stack.h" #include "caml/memory.h" #include "caml/misc.h" #include "caml/mlvalues.h" -#include "caml/stack.h" +#include "caml/fiber.h" +#include "caml/fail.h" /* Returns the next frame descriptor (or NULL if none is available), and updates *pc and *sp to point to the following one. 
*/ -frame_descr * caml_next_frame_descriptor(uintnat * pc, char ** sp) +frame_descr * caml_next_frame_descriptor + (caml_frame_descrs fds, uintnat * pc, char ** sp, struct stack_info* stack) { frame_descr * d; - uintnat h; while (1) { - h = Hash_retaddr(*pc); - while (1) { - d = caml_frame_descriptors[h]; - if (d == NULL) return NULL; /* happens if some code compiled without -g */ - if (d->retaddr == *pc) break; - h = (h+1) & caml_frame_descriptors_mask; + d = caml_find_frame_descr(fds, *pc); + + if( d == NULL ) { + return NULL; } + /* Skip to next frame */ if (d->frame_size != 0xFFFF) { /* Regular frame, update sp/pc and return the frame descriptor */ *sp += (d->frame_size & 0xFFFC); *pc = Saved_return_address(*sp); -#ifdef Mask_already_scanned - *pc = Mask_already_scanned(*pc); -#endif return d; } else { - /* Special frame marking the top of a stack chunk for an ML callback. - Skip C portion of stack and continue with next ML stack chunk. */ - struct caml_context * next_context = Callback_link(*sp); - *sp = next_context->bottom_of_stack; - *pc = next_context->last_retaddr; - /* A null sp means no more ML stack chunks; stop here. */ - if (*sp == NULL) return NULL; + /* This marks the top of an ML stack chunk. Move sp to the previous stack + chunk. This includes skipping over the DWARF link & trap frame + (4 words). */ + *sp += 4 * sizeof(value); + if (*sp == (char*)Stack_high(stack)) { + /* We've reached the top of stack. No more frames. */ + *pc = 0; + return NULL; + } + *pc = **(uintnat**)sp; + *sp += sizeof(value); /* return address */ } } } @@ -79,24 +82,30 @@ int caml_alloc_backtrace_buffer(void){ preserved the global, statically bounded buffer of the old implementation -- before the more flexible [caml_get_current_callstack] was implemented. 
*/ -void caml_stash_backtrace(value exn, uintnat pc, char * sp, char * trapsp) +void caml_stash_backtrace(value exn, uintnat pc, char * sp, char* trapsp) { - if (exn != Caml_state->backtrace_last_exn) { - Caml_state->backtrace_pos = 0; - Caml_state->backtrace_last_exn = exn; + caml_domain_state* domain_state = Caml_state; + caml_frame_descrs fds; + + if (exn != domain_state->backtrace_last_exn) { + domain_state->backtrace_pos = 0; + caml_modify_generational_global_root + (&domain_state->backtrace_last_exn, exn); } if (Caml_state->backtrace_buffer == NULL && caml_alloc_backtrace_buffer() == -1) return; + fds = caml_get_frame_descrs(); /* iterate on each frame */ while (1) { - frame_descr * descr = caml_next_frame_descriptor(&pc, &sp); + frame_descr * descr = caml_next_frame_descriptor + (fds, &pc, &sp, domain_state->current_stack); if (descr == NULL) return; /* store its descriptor in the backtrace buffer */ - if (Caml_state->backtrace_pos >= BACKTRACE_BUFFER_SIZE) return; - Caml_state->backtrace_buffer[Caml_state->backtrace_pos++] = + if (domain_state->backtrace_pos >= BACKTRACE_BUFFER_SIZE) return; + domain_state->backtrace_buffer[domain_state->backtrace_pos++] = (backtrace_slot) descr; /* Stop when we reach the current exception handler */ @@ -104,66 +113,108 @@ void caml_stash_backtrace(value exn, uintnat pc, char * sp, char * trapsp) } } -/* A backtrace_slot is either a debuginfo or a frame_descr* */ -#define Slot_is_debuginfo(s) ((uintnat)(s) & 2) -#define Debuginfo_slot(s) ((debuginfo)((uintnat)(s) - 2)) -#define Slot_debuginfo(d) ((backtrace_slot)((uintnat)(d) + 2)) -#define Frame_descr_slot(s) ((frame_descr*)(s)) -#define Slot_frame_descr(f) ((backtrace_slot)(f)) -static debuginfo debuginfo_extract(frame_descr* d, int alloc_idx); - -#define Default_callstack_size 32 -intnat caml_collect_current_callstack(value** ptrace, intnat* plen, - intnat max_frames, int alloc_idx) +/* Stores upto [max_frames_value] frames of the current call stack to + return to the 
user. This is used not in an exception-raising + context, but only when the user requests to save the trace + (hopefully less often). Instead of using a bounded buffer as + [caml_stash_backtrace], we first traverse the stack to compute the + right size, then allocate space for the trace. */ +static void get_callstack(struct stack_info* orig_stack, intnat max_frames, + frame_descr*** trace, intnat* trace_size) { - uintnat pc = Caml_state->last_return_address; - char * sp = Caml_state->bottom_of_stack; - intnat trace_pos = 0; - - if (max_frames <= 0) return 0; - if (*plen == 0) { - value* trace = - caml_stat_alloc_noexc(Default_callstack_size * sizeof(value)); - if (trace == NULL) return 0; - *ptrace = trace; - *plen = Default_callstack_size; - } + intnat trace_pos; + char *sp; + uintnat pc; + caml_frame_descrs fds; + CAMLnoalloc; - if (alloc_idx >= 0) { - /* First frame has a Comballoc selector */ - frame_descr * descr = caml_next_frame_descriptor(&pc, &sp); - debuginfo info; - if (descr == NULL) return 0; - info = debuginfo_extract(descr, alloc_idx); - if (info != NULL) { - CAMLassert(((uintnat)info & 3) == 0); - (*ptrace)[trace_pos++] = Val_backtrace_slot(Slot_debuginfo(info)); - } else { - (*ptrace)[trace_pos++] = Val_backtrace_slot(Slot_frame_descr(descr)); + fds = caml_get_frame_descrs(); + + /* first compute the size of the trace */ + { + struct stack_info* stack = orig_stack; + caml_get_stack_sp_pc(stack, &sp, &pc); + trace_pos = 0; + + while(1) { + frame_descr *descr = caml_next_frame_descriptor(fds, &pc, &sp, stack); + if (trace_pos >= max_frames) break; + if (descr == NULL) { + stack = Stack_parent(stack); + if (stack == NULL) break; + caml_get_stack_sp_pc(stack, &sp, &pc); + } else { + ++trace_pos; + } } } - while (trace_pos < max_frames) { - frame_descr * descr = caml_next_frame_descriptor(&pc, &sp); - if (descr == NULL) break; - CAMLassert(((uintnat)descr & 3) == 0); - if (trace_pos == *plen) { - intnat new_len = *plen * 2; - value * trace = 
caml_stat_resize_noexc(*ptrace, new_len * sizeof(value)); - if (trace == NULL) break; - *ptrace = trace; - *plen = new_len; + *trace_size = trace_pos; + *trace = caml_stat_alloc(sizeof(frame_descr*) * trace_pos); + + /* then collect the trace */ + { + struct stack_info* stack = orig_stack; + caml_get_stack_sp_pc(stack, &sp, &pc); + trace_pos = 0; + + while(1) { + frame_descr *descr = caml_next_frame_descriptor(fds, &pc, &sp, stack); + if (trace_pos >= max_frames) break; + if (descr == NULL) { + stack = Stack_parent(stack); + if (stack == NULL) break; + caml_get_stack_sp_pc(stack, &sp, &pc); + } else { + (*trace)[trace_pos] = descr; + ++trace_pos; + } } - (*ptrace)[trace_pos++] = Val_backtrace_slot(Slot_frame_descr(descr)); + } +} + +static value alloc_callstack(frame_descr** trace, intnat trace_len) +{ + CAMLparam0(); + CAMLlocal1(callstack); + int i; + callstack = caml_alloc(trace_len, 0); + for (i = 0; i < trace_len; i++) + Store_field(callstack, i, Val_backtrace_slot(trace[i])); + caml_stat_free(trace); + CAMLreturn(callstack); +} + +CAMLprim value caml_get_current_callstack (value max_frames_value) { + frame_descr** trace; + intnat trace_len; + get_callstack(Caml_state->current_stack, Long_val(max_frames_value), + &trace, &trace_len); + return alloc_callstack(trace, trace_len); +} + +CAMLprim value caml_get_continuation_callstack (value cont, value max_frames) +{ + frame_descr** trace; + intnat trace_len; + struct stack_info* stack; + + stack = Ptr_val(caml_continuation_use(cont)); + { + CAMLnoalloc; + get_callstack(stack, max_frames, + &trace, &trace_len); + caml_continuation_replace(cont, stack); } - return trace_pos; + return alloc_callstack(trace, trace_len); } -static debuginfo debuginfo_extract(frame_descr* d, int alloc_idx) +debuginfo caml_debuginfo_extract(backtrace_slot slot) { unsigned char* infoptr; uint32_t debuginfo_offset; + frame_descr * d = (frame_descr *)slot; /* The special frames marking the top of an ML stack chunk are never returned by 
caml_next_frame_descriptor, so should never reach here. */ @@ -175,46 +226,24 @@ static debuginfo debuginfo_extract(frame_descr* d, int alloc_idx) /* Recover debugging info */ infoptr = (unsigned char*)&d->live_ofs[d->num_live]; if (d->frame_size & 2) { - CAMLassert(alloc_idx == -1 || (0 <= alloc_idx && alloc_idx < *infoptr)); /* skip alloc_lengths */ infoptr += *infoptr + 1; /* align to 32 bits */ infoptr = Align_to(infoptr, uint32_t); - /* select the right debug info for this allocation */ - if (alloc_idx != -1) { - infoptr += alloc_idx * sizeof(uint32_t); - if (*(uint32_t*)infoptr == 0) { - /* No debug info for this particular allocation */ - return NULL; - } - } else { - /* We don't care which alloc_idx we use, so use the first - that has debug info. (e.g. this is a backtrace through a - finaliser/signal handler triggered via a Comballoc alloc) */ - while (*(uint32_t*)infoptr == 0) { - infoptr += sizeof(uint32_t); - } + /* we know there's at least one valid debuginfo, + but it may not be the one for the first alloc */ + while (*(uint32_t*)infoptr == 0) { + infoptr += sizeof(uint32_t); } } else { /* align to 32 bits */ infoptr = Align_to(infoptr, uint32_t); - CAMLassert(alloc_idx == -1); } + /* read offset to debuginfo */ debuginfo_offset = *(uint32_t*)infoptr; - CAMLassert(debuginfo_offset != 0 && (debuginfo_offset & 3) == 0); return (debuginfo)(infoptr + debuginfo_offset); } -debuginfo caml_debuginfo_extract(backtrace_slot slot) -{ - if (Slot_is_debuginfo(slot)) { - /* already a decoded debuginfo */ - return Debuginfo_slot(slot); - } else { - return debuginfo_extract(Frame_descr_slot(slot), -1); - } -} - debuginfo caml_debuginfo_next(debuginfo dbg) { uint32_t * infoptr; diff --git a/runtime/bigarray.c b/runtime/bigarray.c index 871b81ef21e0..6f19a603ccd4 100644 --- a/runtime/bigarray.c +++ b/runtime/bigarray.c @@ -66,7 +66,7 @@ CAMLexport uintnat caml_ba_byte_size(struct caml_ba_array * b) /* Operation table for bigarrays */ -CAMLexport struct 
custom_operations caml_ba_ops = { +CAMLexport const struct custom_operations caml_ba_ops = { "_bigarr02", caml_ba_finalize, caml_ba_compare, diff --git a/runtime/callback.c b/runtime/callback.c index 347e3a9d1f2f..fefc11c9281c 100644 --- a/runtime/callback.c +++ b/runtime/callback.c @@ -19,26 +19,49 @@ #include #include "caml/callback.h" -#include "caml/domain.h" +#include "caml/codefrag.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/memory.h" #include "caml/mlvalues.h" +#include "caml/platform.h" + +/* + * These functions are to ensure effects are handled correctly inside + * callbacks. There are two aspects: + * - we clear the stack parent for a callback to force an Unhandled + * exception rather than effects being passed over the callback + * - we register the stack parent as a local root while the callback + * is executing to ensure that the garbage collector follows the + * stack parent + */ +Caml_inline value save_and_clear_stack_parent(caml_domain_state* domain_state) { + struct stack_info* parent_stack = Stack_parent(domain_state->current_stack); + value cont = caml_alloc_1(Cont_tag, Val_ptr(parent_stack)); + Stack_parent(domain_state->current_stack) = NULL; + return cont; +} + +Caml_inline void restore_stack_parent(caml_domain_state* domain_state, + value cont) { + struct stack_info* parent_stack = Ptr_val(Op_val(cont)[0]); + CAMLassert(Stack_parent(domain_state->current_stack) == NULL); + Stack_parent(domain_state->current_stack) = parent_stack; +} + #ifndef NATIVE_CODE /* Bytecode callbacks */ -#include "caml/codefrag.h" #include "caml/interp.h" #include "caml/instruct.h" #include "caml/fix_code.h" -#include "caml/stacks.h" - -CAMLexport int caml_callback_depth = 0; +#include "caml/fiber.h" -static opcode_t callback_code[] = { ACC, 0, APPLY, 0, POP, 1, STOP }; +static __thread opcode_t callback_code[] = { ACC, 0, APPLY, 0, POP, 1, STOP }; -static int callback_code_inited = 0; +static __thread int callback_code_inited = 0; static void 
init_callback_code(void) { @@ -53,23 +76,38 @@ static void init_callback_code(void) CAMLexport value caml_callbackN_exn(value closure, int narg, value args[]) { - int i; + CAMLparam1(closure); + CAMLxparamN(args, narg); + CAMLlocal1(cont); value res; + int i; + caml_domain_state* domain_state = Caml_state; CAMLassert(narg + 4 <= 256); + domain_state->current_stack->sp -= narg + 4; + for (i = 0; i < narg; i++) + domain_state->current_stack->sp[i] = args[i]; /* arguments */ - Caml_state->extern_sp -= narg + 4; - for (i = 0; i < narg; i++) Caml_state->extern_sp[i] = args[i]; /* arguments */ - Caml_state->extern_sp[narg] = (value)(callback_code + 4); /* return address */ - Caml_state->extern_sp[narg + 1] = Val_unit; /* environment */ - Caml_state->extern_sp[narg + 2] = Val_long(0); /* extra args */ - Caml_state->extern_sp[narg + 3] = closure; if (!callback_code_inited) init_callback_code(); + callback_code[1] = narg + 3; callback_code[3] = narg; + + domain_state->current_stack->sp[narg] = + (value)(callback_code + 4); /* return address */ + domain_state->current_stack->sp[narg + 1] = Val_unit; /* environment */ + domain_state->current_stack->sp[narg + 2] = Val_long(0); /* extra args */ + domain_state->current_stack->sp[narg + 3] = closure; + + cont = save_and_clear_stack_parent(domain_state); + res = caml_interprete(callback_code, sizeof(callback_code)); - if (Is_exception_result(res)) Caml_state->extern_sp += narg + 4; /* PR#3419 */ - return res; + if (Is_exception_result(res)) + domain_state->current_stack->sp += narg + 4; /* PR#3419 */ + + restore_stack_parent(domain_state, cont); + + CAMLreturn (res); } CAMLexport value caml_callback_exn(value closure, value arg1) @@ -97,33 +135,82 @@ CAMLexport value caml_callback3_exn(value closure, return caml_callbackN_exn(closure, 3, arg); } -#else +#else /* Nativecode callbacks */ -/* Native-code callbacks. 
*/ +static void init_callback_code(void) +{ +} -typedef value (callback_stub)(caml_domain_state* state, value closure, +typedef value (callback_stub)(caml_domain_state* state, + value closure, value* args); callback_stub caml_callback_asm, caml_callback2_asm, caml_callback3_asm; CAMLexport value caml_callback_exn(value closure, value arg) { - return caml_callback_asm(Caml_state, closure, &arg); + caml_domain_state* domain_state = Caml_state; + caml_maybe_expand_stack(); + + if (Stack_parent(domain_state->current_stack)) { + CAMLparam2 (closure, arg); + CAMLlocal1 (cont); + value res; + + cont = save_and_clear_stack_parent(domain_state); + res = caml_callback_asm(domain_state, closure, &arg); + restore_stack_parent(domain_state, cont); + + CAMLreturn (res); + } else { + return caml_callback_asm(domain_state, closure, &arg); + } } CAMLexport value caml_callback2_exn(value closure, value arg1, value arg2) { value args[] = {arg1, arg2}; - return caml_callback2_asm(Caml_state, closure, args); + caml_domain_state* domain_state = Caml_state; + caml_maybe_expand_stack(); + + if (Stack_parent(domain_state->current_stack)) { + CAMLparam3 (closure, arg1, arg2); + CAMLlocal1 (cont); + value res; + + cont = save_and_clear_stack_parent(domain_state); + res = caml_callback2_asm(domain_state, closure, args); + restore_stack_parent(domain_state, cont); + + CAMLreturn (res); + } else { + return caml_callback2_asm(domain_state, closure, args); + } } CAMLexport value caml_callback3_exn(value closure, value arg1, value arg2, value arg3) { value args[] = {arg1, arg2, arg3}; - return caml_callback3_asm(Caml_state, closure, args); + caml_domain_state* domain_state = Caml_state; + caml_maybe_expand_stack(); + + if (Stack_parent(domain_state->current_stack)) { + CAMLparam4 (closure, arg1, arg2, arg3); + CAMLlocal1 (cont); + value res; + + cont = save_and_clear_stack_parent(domain_state); + res = caml_callback3_asm(domain_state, closure, args); + restore_stack_parent(domain_state, cont); + + 
CAMLreturn (res); + } else { + return caml_callback3_asm(domain_state, closure, args); + } } +/* Native-code callbacks. caml_callback[123]_asm are implemented in asm. */ CAMLexport value caml_callbackN_exn(value closure, int narg, value args[]) { @@ -192,6 +279,12 @@ struct named_value { #define Named_value_size 13 static struct named_value * named_value_table[Named_value_size] = { NULL, }; +static caml_plat_mutex named_value_lock = CAML_PLAT_MUTEX_INITIALIZER; + +void caml_init_callbacks(void) +{ + init_callback_code(); +} static unsigned int hash_value_name(char const *name) { @@ -206,31 +299,42 @@ CAMLprim value caml_register_named_value(value vname, value val) const char * name = String_val(vname); size_t namelen = strlen(name); unsigned int h = hash_value_name(name); + int found = 0; + caml_plat_lock(&named_value_lock); for (nv = named_value_table[h]; nv != NULL; nv = nv->next) { if (strcmp(name, nv->name) == 0) { caml_modify_generational_global_root(&nv->val, val); - return Val_unit; + found = 1; + break; } } - nv = (struct named_value *) - caml_stat_alloc(sizeof(struct named_value) + namelen); - memcpy(nv->name, name, namelen + 1); - nv->val = val; - nv->next = named_value_table[h]; - named_value_table[h] = nv; - caml_register_generational_global_root(&nv->val); + if (!found) { + nv = (struct named_value *) + caml_stat_alloc(sizeof(struct named_value) + namelen); + memcpy(nv->name, name, namelen + 1); + nv->val = val; + nv->next = named_value_table[h]; + named_value_table[h] = nv; + caml_register_generational_global_root(&nv->val); + } + caml_plat_unlock(&named_value_lock); return Val_unit; } -CAMLexport const value * caml_named_value(char const *name) +CAMLexport const value* caml_named_value(char const *name) { struct named_value * nv; + caml_plat_lock(&named_value_lock); for (nv = named_value_table[hash_value_name(name)]; nv != NULL; nv = nv->next) { - if (strcmp(name, nv->name) == 0) return &nv->val; + if (strcmp(name, nv->name) == 0){ + 
caml_plat_unlock(&named_value_lock); + return &nv->val; + } } + caml_plat_unlock(&named_value_lock); return NULL; } @@ -240,7 +344,7 @@ CAMLexport void caml_iterate_named_values(caml_named_action f) for(i = 0; i < Named_value_size; i++){ struct named_value * nv; for (nv = named_value_table[i]; nv != NULL; nv = nv->next) { - f( &nv->val, nv->name ); + f( Op_val(nv->val), nv->name ); } } } diff --git a/runtime/caml/address_class.h b/runtime/caml/address_class.h index 82e5cf71ffb1..08403e2dda94 100644 --- a/runtime/caml/address_class.h +++ b/runtime/caml/address_class.h @@ -15,44 +15,25 @@ /* Classification of addresses for GC and runtime purposes. */ -/* The current runtime supports two different configurations that - correspond to two different value models, depending on whether - "naked pointers", that do not point to a well-formed OCaml block, - are allowed (considered valid values). +/* Multicore runtime supports only the "no naked pointers" mode where any + out-of-heap pointers are not observable by the GC. The out-of-heap pointers + are either: - In "classic mode", naked pointers are allowed, and the - implementation uses a page table. A valid value is then either: - - a tagged integer (Is_long or !Is_block from mlvalues.h) - - a pointer to the minor heap (Is_young) - - a pointer to the major heap (Is_in_heap) - - a pointer to a constant block statically-allocated by OCaml code - or the OCaml runtime (Is_in_static_data) - - a "foreign" pointer, which is none of the above; the destination - of those pointers may be a well-formed OCaml blocks, but it may - also be a naked pointer. + - wrapped in Abstract_tag or Custom_tag objects, or + - have a valid header with colour `NOT_MARKABLE`, or + - made to look like immediate values by tagging the least significant bit so + that the GC does not follow it. This strategy has the downside that + out-of-heap pointers may not point to odd addresses. 
- The macros and functions below give access to a global page table - to classify addresses to be able to implement Is_in_heap, - In_static_data (or their disjunction Is_in_value_area) and thus - detect values which may be naked pointers. The runtime - conservatively assumes that all foreign pointers may be naked - pointers, and uses the page table to not dereference/follow them. - - In "no naked pointers" mode (when NO_NAKED_POINTERS is defined), - naked pointers are illegal, so pointers that are values can always - be assumed to point to well-formed blocks. - - To support an implementation without a global page table, runtime - code should not rely on Is_in_heap and Is_in_static_data. This - corresponds to a simpler model where a valid value is either: + A valid value is either: - a tagged integer (Is_long) - - a pointer to the minor heap (Is_young) - - a pointer to a well-formed block outside the minor heap - (it may be in the major heap, or static, or a foreign pointer, - without a check to distinguish the various cases). + - a pointer to the minor heap + - a pointer to a well-formed block outside the minor heap. It may be in the + major heap, or static data allocated by the OCaml code or the OCaml + runtime, or a foreign pointer. - (To create a well-formed block outside the heap that the GC will - not scan, one can use the Caml_out_of_heap_header from mlvalues.h.) + To create a well-formed block outside the heap that the GC will not scan, + one can use the Caml_out_of_heap_header from mlvalues.h. */ #ifndef CAML_ADDRESS_CLASS_H @@ -62,65 +43,8 @@ #include "misc.h" #include "mlvalues.h" -/* Use the following macros to test an address for the different classes - it might belong to. 
*/ - -#define Is_young(val) \ - (CAMLassert (Is_block (val)), \ - (char *)(val) < (char *)Caml_state_field(young_end) && \ - (char *)(val) > (char *)Caml_state_field(young_start)) - -#define Is_in_heap(a) (Classify_addr(a) & In_heap) - -#ifdef NO_NAKED_POINTERS - +/* These definitions are retained for backwards compatibility */ #define Is_in_heap_or_young(a) 1 #define Is_in_value_area(a) 1 -#else - -#define Is_in_heap_or_young(a) (Classify_addr(a) & (In_heap | In_young)) - -#define Is_in_value_area(a) \ - (Classify_addr(a) & (In_heap | In_young | In_static_data)) - -#define Is_in_static_data(a) (Classify_addr(a) & In_static_data) - -#endif - -/***********************************************************************/ -/* The rest of this file is private and may change without notice. */ - -#define Not_in_heap 0 -#define In_heap 1 -#define In_young 2 -#define In_static_data 4 - -#ifdef ARCH_SIXTYFOUR - -/* 64 bits: Represent page table as a sparse hash table */ -int caml_page_table_lookup(void * addr); -#define Classify_addr(a) (caml_page_table_lookup((void *)(a))) - -#else - -/* 32 bits: Represent page table as a 2-level array */ -#define Pagetable2_log 11 -#define Pagetable2_size (1 << Pagetable2_log) -#define Pagetable1_log (Page_log + Pagetable2_log) -#define Pagetable1_size (1 << (32 - Pagetable1_log)) -CAMLextern unsigned char * caml_page_table[Pagetable1_size]; - -#define Pagetable_index1(a) (((uintnat)(a)) >> Pagetable1_log) -#define Pagetable_index2(a) \ - ((((uintnat)(a)) >> Page_log) & (Pagetable2_size - 1)) -#define Classify_addr(a) \ - caml_page_table[Pagetable_index1(a)][Pagetable_index2(a)] - -#endif - -int caml_page_table_add(int kind, void * start, void * end); -int caml_page_table_remove(int kind, void * start, void * end); -int caml_page_table_initialize(mlsize_t bytesize); - #endif /* CAML_ADDRESS_CLASS_H */ diff --git a/runtime/caml/addrmap.h b/runtime/caml/addrmap.h new file mode 100644 index 000000000000..2d80cf2d1e58 --- /dev/null +++ 
b/runtime/caml/addrmap.h @@ -0,0 +1,97 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2015 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ +#include "mlvalues.h" + +#ifndef CAML_ADDRMAP_H +#define CAML_ADDRMAP_H + +/* An addrmap is a value -> value hashmap, where + the values are blocks */ + +struct addrmap_entry { value key, value; }; +struct addrmap { + struct addrmap_entry* entries; + uintnat size; +}; + +#define ADDRMAP_INIT {0,0} + +int caml_addrmap_contains(struct addrmap* t, value k); +value caml_addrmap_lookup(struct addrmap* t, value k); + +#define ADDRMAP_NOT_PRESENT ((value)(0)) +#define ADDRMAP_INVALID_KEY ((value)(0)) + +value* caml_addrmap_insert_pos(struct addrmap* t, value k); + +/* must not already be present */ +void caml_addrmap_insert(struct addrmap* t, value k, value v); + +void caml_addrmap_clear(struct addrmap* t); + +void caml_addrmap_iter(struct addrmap* t, void (*f)(value, value)); + +/* iteration */ +typedef uintnat addrmap_iterator; +Caml_inline addrmap_iterator caml_addrmap_iter_ok(struct addrmap* t, + addrmap_iterator i) +{ + if (i < t->size) { + CAMLassert(t->entries[i].key != ADDRMAP_INVALID_KEY); + return 1; + } else { + return 0; + } +} + +Caml_inline addrmap_iterator caml_addrmap_next(struct addrmap* t, + addrmap_iterator i) +{ + if (!t->entries) return (uintnat)(-1); + i++; + while (i < t->size && t->entries[i].key == ADDRMAP_INVALID_KEY) { + i++; + } + caml_addrmap_iter_ok(t, i); /* just for assert-checks */ + return i; +} + +Caml_inline value caml_addrmap_iter_key(struct addrmap* t, + addrmap_iterator i) +{ + 
CAMLassert(caml_addrmap_iter_ok(t, i)); + return t->entries[i].key; +} + +Caml_inline value caml_addrmap_iter_value(struct addrmap* t, + addrmap_iterator i) +{ + CAMLassert(caml_addrmap_iter_ok(t, i)); + return t->entries[i].value; +} + +Caml_inline value* caml_addrmap_iter_val_pos(struct addrmap* t, + addrmap_iterator i) +{ + CAMLassert(caml_addrmap_iter_ok(t, i)); + return &t->entries[i].value; +} + +Caml_inline addrmap_iterator caml_addrmap_iterator(struct addrmap* t) +{ + return caml_addrmap_next(t, (uintnat)(-1)); +} + + +#endif diff --git a/runtime/caml/alloc.h b/runtime/caml/alloc.h index 13f0fac2fb50..d15781bae993 100644 --- a/runtime/caml/alloc.h +++ b/runtime/caml/alloc.h @@ -27,23 +27,35 @@ extern "C" { #endif -/* It is guaranteed that these allocation functions will not trigger - any OCaml callback such as finalizers or signal handlers. */ - -CAMLextern value caml_alloc (mlsize_t wosize, tag_t); -CAMLextern value caml_alloc_small (mlsize_t wosize, tag_t); -CAMLextern value caml_alloc_tuple (mlsize_t wosize); +CAMLextern value caml_alloc (mlsize_t, tag_t); +CAMLextern value caml_alloc_N(mlsize_t, tag_t, ...); +CAMLextern value caml_alloc_1(tag_t, value); +CAMLextern value caml_alloc_2(tag_t, value, value); +CAMLextern value caml_alloc_3(tag_t, value, value, value); +CAMLextern value caml_alloc_4(tag_t, value, value, value, value); +CAMLextern value caml_alloc_5(tag_t, value, value, value, value, + value); +CAMLextern value caml_alloc_6(tag_t, value, value, value, value, + value, value); +CAMLextern value caml_alloc_7(tag_t, value, value, value, value, + value, value, value); +CAMLextern value caml_alloc_8(tag_t, value, value, value, value, + value, value, value, value); +CAMLextern value caml_alloc_9(tag_t, value, value, value, value, + value, value, value, value, value); +CAMLextern value caml_alloc_small (mlsize_t, tag_t); +CAMLextern value caml_alloc_tuple (mlsize_t); CAMLextern value caml_alloc_float_array (mlsize_t len); CAMLextern value 
caml_alloc_string (mlsize_t len); /* len in bytes (chars) */ CAMLextern value caml_alloc_initialized_string (mlsize_t len, const char *); CAMLextern value caml_copy_string (char const *); -CAMLextern value caml_copy_string_array (char const **); +CAMLextern value caml_copy_string_array (char const * const*); CAMLextern value caml_copy_double (double); CAMLextern value caml_copy_int32 (int32_t); /* defined in [ints.c] */ CAMLextern value caml_copy_int64 (int64_t); /* defined in [ints.c] */ CAMLextern value caml_copy_nativeint (intnat); /* defined in [ints.c] */ CAMLextern value caml_alloc_array (value (*funct) (char const *), - char const ** array); + char const * const * array); CAMLextern value caml_alloc_sprintf(const char * format, ...) #ifdef __GNUC__ __attribute__ ((format (printf, 1, 2))) @@ -52,12 +64,12 @@ CAMLextern value caml_alloc_sprintf(const char * format, ...) CAMLextern value caml_alloc_some(value); typedef void (*final_fun)(value); -CAMLextern value caml_alloc_final (mlsize_t wosize, +CAMLextern value caml_alloc_final (mlsize_t, /*size in words*/ final_fun, /*finalization function*/ mlsize_t, /*resources consumed*/ mlsize_t /*max resources*/); -CAMLextern int caml_convert_flag_list (value, int *); +CAMLextern int caml_convert_flag_list (value, const int *); /* Convenience functions to deal with unboxable types. */ Caml_inline value caml_alloc_unboxed (value arg) { return arg; } diff --git a/runtime/caml/backtrace.h b/runtime/caml/backtrace.h index bf2f9cabaa84..00c3e92305d7 100644 --- a/runtime/caml/backtrace.h +++ b/runtime/caml/backtrace.h @@ -96,10 +96,11 @@ CAMLextern void caml_record_backtraces(int); * raise and re-raise are distinguished by: * - passing reraise = 1 to [caml_stash_backtrace] (see below) in the bytecode * interpreter; - * - directly resetting [Caml_state->backtrace_pos] to 0 in native - runtimes for raise. + * - directly resetting [Caml_state->backtrace_pos] to 0 in native runtimes for + * raise. 
*/ + #ifndef NATIVE_CODE /* Path to the file containing debug information, if any, or NULL. */ diff --git a/runtime/caml/backtrace_prim.h b/runtime/caml/backtrace_prim.h index cd084da0c9b2..38e6c06d2f9d 100644 --- a/runtime/caml/backtrace_prim.h +++ b/runtime/caml/backtrace_prim.h @@ -97,21 +97,10 @@ value caml_remove_debug_info(code_t start); * It defines the [caml_stash_backtrace] function, which is called to quickly * fill the backtrace buffer by walking the stack when an exception is raised. * - * It also defines [caml_collect_current_callstack], which stores up - * to [max_frames] frames of the current call stack into the - * statically allocated buffer [*pbuffer] of length [*plen]. If the - * buffer is not long enough, it will be reallocated. The number of - * frames collected is returned. - * - * The alloc_idx parameter is used to select between the backtraces of - * different allocation sites which were combined by Comballoc. - * Passing -1 here means the caller doesn't care which is chosen. - * - * We use `intnat` for max_frames because, were it only `int`, passing - * `max_int` from the OCaml side would overflow on 64bits machines. */ - -intnat caml_collect_current_callstack(value** pbuffer, intnat* plen, - intnat max_frames, int alloc_idx); + * It also defines the [caml_get_current_callstack] OCaml primitive, which also + * walks the stack but directly turns it into a [raw_backtrace] and is called + * explicitly. 
+ */ #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/callback.h b/runtime/caml/callback.h index eef3342ec78a..1172b16ff800 100644 --- a/runtime/caml/callback.h +++ b/runtime/caml/callback.h @@ -22,11 +22,14 @@ #include "compatibility.h" #endif #include "mlvalues.h" +#include "memory.h" #ifdef __cplusplus extern "C" { #endif +void caml_init_callbacks (void); + CAMLextern value caml_callback (value closure, value arg); CAMLextern value caml_callback2 (value closure, value arg1, value arg2); CAMLextern value caml_callback3 (value closure, value arg1, value arg2, @@ -50,8 +53,6 @@ CAMLextern void caml_startup_pooled (char_os ** argv); CAMLextern value caml_startup_pooled_exn (char_os ** argv); CAMLextern void caml_shutdown (void); -CAMLextern int caml_callback_depth; - #ifdef __cplusplus } #endif diff --git a/runtime/caml/camlatomic.h b/runtime/caml/camlatomic.h new file mode 100644 index 000000000000..b03722de9366 --- /dev/null +++ b/runtime/caml/camlatomic.h @@ -0,0 +1,87 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2018 Indian Institute of Technology, Madras */ +/* Copyright 2018 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ +#ifndef CAML_ATOMIC_H +#define CAML_ATOMIC_H + +#ifndef CAML_NAME_SPACE +#include "compatibility.h" +#endif +#include "config.h" +#include "misc.h" + +/* On platforms supporting C11 atomics, this file just includes . 
+ + On other platforms, this file includes platform-specific stubs for + the subset of C11 atomics needed by the OCaml runtime + */ + +#ifdef __cplusplus + +extern "C++" { +#include +#define ATOMIC_UINTNAT_INIT(x) (x) +typedef std::atomic atomic_uintnat; +typedef std::atomic atomic_intnat; +using std::memory_order_relaxed; +using std::memory_order_acquire; +using std::memory_order_release; +using std::memory_order_acq_rel; +using std::memory_order_seq_cst; +} + +#elif defined(HAS_STDATOMIC_H) + +#include +#define ATOMIC_UINTNAT_INIT(x) (x) +typedef _Atomic uintnat atomic_uintnat; +typedef _Atomic intnat atomic_intnat; + +#elif defined(__GNUC__) + +/* Support for versions of gcc which have built-in atomics but do not + expose stdatomic.h (e.g. gcc 4.8) */ +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +#define ATOMIC_UINTNAT_INIT(x) { (x) } +typedef struct { uintnat repr; } atomic_uintnat; +typedef struct { intnat repr; } atomic_intnat; + +#define atomic_load_explicit(x, m) __atomic_load_n(&(x)->repr, (m)) +#define atomic_load(x) atomic_load_explicit((x), memory_order_seq_cst) +#define atomic_store_explicit(x, v, m) __atomic_store_n(&(x)->repr, (v), (m)) +#define atomic_store(x, v) atomic_store_explicit((x), (v), memory_order_seq_cst) +#define atomic_compare_exchange_strong(x, oldv, newv) \ + __atomic_compare_exchange_n( \ + &(x)->repr, \ + (oldv), (newv), 0, \ + memory_order_seq_cst, memory_order_seq_cst) +#define atomic_exchange(x, newv) \ + __atomic_exchange_n(&(x)->repr, (newv), memory_order_seq_cst) +#define atomic_fetch_add(x, n) \ + __atomic_fetch_add(&(x)->repr, (n), memory_order_seq_cst) +#define atomic_thread_fence __atomic_thread_fence + +#else +#error "C11 atomics are unavailable on this platform. 
See camlatomic.h" +#endif + +#endif /* CAML_ATOMIC_H */ diff --git a/runtime/caml/codefrag.h b/runtime/caml/codefrag.h index ff623c01ff7c..afc897a3e0da 100644 --- a/runtime/caml/codefrag.h +++ b/runtime/caml/codefrag.h @@ -36,6 +36,10 @@ struct code_fragment { enum digest_status digest_status; }; +/* Initialise codefrag. This must be done before any of the other + operations in codefrag. */ +void caml_init_codefrag(void); + /* Register a code fragment for addresses [start] (included) to [end] (excluded). This range of addresses is assumed disjoint from all currently-registered code fragments. @@ -75,6 +79,10 @@ extern struct code_fragment * Returns NULL if the code fragment was registered with [DIGEST_IGNORE]. */ extern unsigned char * caml_digest_of_code_fragment(struct code_fragment *); +/* Cleans up (and frees) removed code fragments. Must be called from a stop the + world pause by only a single thread. */ +extern void caml_code_fragment_cleanup(void); + #endif #endif diff --git a/runtime/caml/compatibility.h b/runtime/caml/compatibility.h index 410f975cf7c4..e1c9be7b858e 100644 --- a/runtime/caml/compatibility.h +++ b/runtime/caml/compatibility.h @@ -56,10 +56,6 @@ /* **** array.c */ /* **** backtrace.c */ -#define backtrace_active CAML_DEPRECATED("backtrace_active", "caml_backtrace_active") caml_backtrace_active -#define backtrace_pos CAML_DEPRECATED("backtrace_pos", "caml_backtrace_pos") caml_backtrace_pos -#define backtrace_buffer CAML_DEPRECATED("backtrace_buffer", "caml_backtrace_buffer") caml_backtrace_buffer -#define backtrace_last_exn CAML_DEPRECATED("backtrace_last_exn", "caml_backtrace_last_exn") caml_backtrace_last_exn #define print_exception_backtrace CAML_DEPRECATED("print_exception_backtrace", "caml_print_exception_backtrace") caml_print_exception_backtrace /* **** callback.c */ @@ -76,7 +72,6 @@ /* **** compact.c */ /* **** compare.c */ -#define compare_unordered CAML_DEPRECATED("compare_unordered", "caml_compare_unordered") 
caml_compare_unordered /* **** custom.c */ #define alloc_custom CAML_DEPRECATED("alloc_custom", "caml_alloc_custom") caml_alloc_custom @@ -103,7 +98,6 @@ #define serialize_block_float_8 CAML_DEPRECATED("serialize_block_float_8", "caml_serialize_block_float_8") caml_serialize_block_float_8 /* **** fail.c */ -#define external_raise CAML_DEPRECATED("external_raise", "caml_external_raise") caml_external_raise #define mlraise CAML_DEPRECATED("mlraise", "caml_raise") caml_raise /*SP*/ #define raise_constant CAML_DEPRECATED("raise_constant", "caml_raise_constant") caml_raise_constant #define raise_with_arg CAML_DEPRECATED("raise_with_arg", "caml_raise_with_arg") caml_raise_with_arg @@ -226,7 +220,6 @@ /* **** memory.c */ #define alloc_shr CAML_DEPRECATED("alloc_shr", "caml_alloc_shr") caml_alloc_shr -#define initialize CAML_DEPRECATED("initialize", "caml_initialize") caml_initialize #define modify CAML_DEPRECATED("modify", "caml_modify") caml_modify #define stat_alloc CAML_DEPRECATED("stat_alloc", "caml_stat_alloc") caml_stat_alloc #define stat_free CAML_DEPRECATED("stat_free", "caml_stat_free") caml_stat_free @@ -234,15 +227,6 @@ /* **** meta.c */ -/* **** minor_gc.c */ -#define young_start CAML_DEPRECATED("young_start", "caml_young_start") caml_young_start -#define young_end CAML_DEPRECATED("young_end", "caml_young_end") caml_young_end -#define young_ptr CAML_DEPRECATED("young_ptr", "caml_young_ptr") caml_young_ptr -#define young_limit CAML_DEPRECATED("young_limit", "caml_young_limit") caml_young_limit -#define ref_table CAML_DEPRECATED("ref_table", "caml_ref_table") caml_ref_table -#define minor_collection CAML_DEPRECATED("minor_collection", "caml_minor_collection") caml_minor_collection -#define check_urgent_gc CAML_DEPRECATED("check_urgent_gc", "caml_check_urgent_gc") caml_check_urgent_gc - /* **** misc.c */ /* **** obj.c */ @@ -255,32 +239,20 @@ #define format_caml_exception CAML_DEPRECATED("format_caml_exception", "caml_format_exception") caml_format_exception 
/*SP*/ /* **** roots.c */ -#define local_roots CAML_DEPRECATED("local_roots", "caml_local_roots") caml_local_roots -#define scan_roots_hook CAML_DEPRECATED("scan_roots_hook", "caml_scan_roots_hook") caml_scan_roots_hook #define do_local_roots CAML_DEPRECATED("do_local_roots", "caml_do_local_roots") caml_do_local_roots /* **** signals.c */ #define pending_signals CAML_DEPRECATED("pending_signals", "caml_pending_signals") caml_pending_signals -#define something_to_do CAML_DEPRECATED("something_to_do", "caml_something_to_do") caml_something_to_do -#define enter_blocking_section_hook CAML_DEPRECATED("enter_blocking_section_hook", "caml_enter_blocking_section_hook") caml_enter_blocking_section_hook -#define leave_blocking_section_hook CAML_DEPRECATED("leave_blocking_section_hook", "caml_leave_blocking_section_hook") caml_leave_blocking_section_hook #define enter_blocking_section CAML_DEPRECATED("enter_blocking_section", "caml_enter_blocking_section") caml_enter_blocking_section #define leave_blocking_section CAML_DEPRECATED("leave_blocking_section", "caml_leave_blocking_section") caml_leave_blocking_section #define convert_signal_number CAML_DEPRECATED("convert_signal_number", "caml_convert_signal_number") caml_convert_signal_number /* **** runtime/signals.c */ -#define garbage_collection CAML_DEPRECATED("garbage_collection", "caml_garbage_collection") caml_garbage_collection /* **** stacks.c */ -#define stack_low CAML_DEPRECATED("stack_low", "caml_stack_low") caml_stack_low -#define stack_high CAML_DEPRECATED("stack_high", "caml_stack_high") caml_stack_high -#define stack_threshold CAML_DEPRECATED("stack_threshold", "caml_stack_threshold") caml_stack_threshold -#define extern_sp CAML_DEPRECATED("extern_sp", "caml_extern_sp") caml_extern_sp -#define trapsp CAML_DEPRECATED("trapsp", "caml_trapsp") caml_trapsp -#define trap_barrier CAML_DEPRECATED("trap_barrier", "caml_trap_barrier") caml_trap_barrier /* **** startup.c */ -#define atom_table CAML_DEPRECATED("atom_table", 
"caml_atom_table") caml_atom_table + /* **** runtime/startup_nat.c */ #define static_data_start CAML_DEPRECATED("static_data_start", "caml_static_data_start") caml_static_data_start #define static_data_end CAML_DEPRECATED("static_data_end", "caml_static_data_end") caml_static_data_end diff --git a/runtime/caml/config.h b/runtime/caml/config.h index 471a6bc6f4e9..29acf6d7e97c 100644 --- a/runtime/caml/config.h +++ b/runtime/caml/config.h @@ -153,6 +153,8 @@ typedef uint64_t uintnat; #error "No integer type available to represent pointers" #endif +#define UINTNAT_MAX ((uintnat)-1) + #endif /* CAML_CONFIG_H_NO_TYPEDEFS */ /* Endianness of floats */ @@ -190,10 +192,19 @@ typedef uint64_t uintnat; #define Page_size (1 << Page_log) /* Initial size of stack (bytes). */ +#ifdef DEBUG +#define Stack_size (32 * sizeof(value)) +#else #define Stack_size (4096 * sizeof(value)) +#endif /* Minimum free size of stack (bytes); below that, it is reallocated. */ -#define Stack_threshold (256 * sizeof(value)) +#define Stack_threshold_words 16 +#define Stack_threshold (Stack_threshold_words * sizeof(value)) + +/* Number of words used in the control structure at the start of a stack + (see fiber.h) */ +#define Stack_ctx_words 5 /* Default maximum size of the stack (words). */ #define Max_stack_def (1024 * 1024) @@ -206,8 +217,8 @@ typedef uint64_t uintnat; /* Minimum size of the minor zone (words). - This must be at least [2 * Max_young_whsize]. */ -#define Minor_heap_min 4096 + This must be at least [Max_young_wosize + 1]. */ +#define Minor_heap_min (Max_young_wosize + 1) /* Maximum size of the minor zone (words). Must be greater than or equal to [Minor_heap_min]. @@ -245,6 +256,9 @@ typedef uint64_t uintnat; */ #define Max_percent_free_def 500 +/* Maximum number of domains */ +#define Max_domains 128 + /* Default setting for the major GC slice smoothing window: 1 (i.e. 
no smoothing) */ diff --git a/runtime/caml/custom.h b/runtime/caml/custom.h index 420121f43ebd..3e9162f54885 100644 --- a/runtime/caml/custom.h +++ b/runtime/caml/custom.h @@ -48,23 +48,24 @@ struct custom_operations { #define custom_compare_ext_default NULL #define custom_fixed_length_default NULL -#define Custom_ops_val(v) (*((struct custom_operations **) (v))) +#define Custom_ops_val(v) (*((const struct custom_operations **) (v))) #ifdef __cplusplus extern "C" { #endif -CAMLextern value caml_alloc_custom(struct custom_operations * ops, +CAMLextern value caml_alloc_custom(const struct custom_operations * ops, uintnat size, /*size in bytes*/ mlsize_t mem, /*resources consumed*/ mlsize_t max /*max resources*/); -CAMLextern value caml_alloc_custom_mem(struct custom_operations * ops, +CAMLextern value caml_alloc_custom_mem(const struct custom_operations * ops, uintnat size, /*size in bytes*/ mlsize_t mem /*memory consumed*/); -CAMLextern void caml_register_custom_operations(struct custom_operations * ops); +CAMLextern void + caml_register_custom_operations(const struct custom_operations * ops); /* Global variable moved to Caml_state in 4.10 */ #define caml_compare_unordered (Caml_state_field(compare_unordered)) @@ -76,10 +77,10 @@ extern struct custom_operations * extern void caml_init_custom_operations(void); -extern struct custom_operations caml_nativeint_ops; -extern struct custom_operations caml_int32_ops; -extern struct custom_operations caml_int64_ops; -extern struct custom_operations caml_ba_ops; +extern const struct custom_operations caml_nativeint_ops; +extern const struct custom_operations caml_int32_ops; +extern const struct custom_operations caml_int64_ops; +extern const struct custom_operations caml_ba_ops; #endif /* CAML_INTERNALS */ #ifdef __cplusplus diff --git a/runtime/caml/domain.h b/runtime/caml/domain.h index 23833d24f1d4..7c8799522729 100644 --- a/runtime/caml/domain.h +++ b/runtime/caml/domain.h @@ -23,9 +23,78 @@ extern "C" { #ifdef 
CAML_INTERNALS +#include "config.h" +#include "mlvalues.h" #include "domain_state.h" +#include "platform.h" -void caml_init_domain(void); +#define Caml_check_gc_interrupt(dom_st) \ + (CAMLalloc_point_here, \ + CAMLunlikely((uintnat)(dom_st)->young_ptr < (dom_st)->young_limit)) + +asize_t caml_norm_minor_heap_size (intnat); +int caml_reallocate_minor_heap(asize_t); + +int caml_incoming_interrupts_queued(void); + +void caml_handle_gc_interrupt(void); +void caml_handle_gc_interrupt_no_async_exceptions(void); +void caml_handle_incoming_interrupts(void); + +CAMLextern void caml_interrupt_self(void); + +CAMLextern void caml_reset_domain_lock(void); +CAMLextern int caml_bt_is_in_blocking_section(void); +CAMLextern intnat caml_domain_is_multicore (void); +CAMLextern void caml_bt_enter_ocaml(void); +CAMLextern void caml_bt_exit_ocaml(void); +CAMLextern void caml_acquire_domain_lock(void); +CAMLextern void caml_release_domain_lock(void); + +CAMLextern void (*caml_atfork_hook)(void); + +CAMLextern void (*caml_domain_start_hook)(void); +CAMLextern void (*caml_domain_stop_hook)(void); +CAMLextern void (*caml_domain_external_interrupt_hook)(void); + +CAMLextern void caml_init_domains(uintnat minor_heap_size); +CAMLextern void caml_init_domain_self(int); + +CAMLextern atomic_uintnat caml_num_domains_running; +CAMLextern uintnat caml_minor_heaps_base; +CAMLextern uintnat caml_minor_heaps_end; + +Caml_inline intnat caml_domain_alone(void) +{ + return atomic_load_acq(&caml_num_domains_running) == 1; +} + +#ifdef DEBUG +int caml_domain_is_in_stw(void); +#endif + +int caml_try_run_on_all_domains_with_spin_work( + void (*handler)(caml_domain_state*, void*, int, caml_domain_state**), + void* data, + void (*leader_setup)(caml_domain_state*), + void (*enter_spin_callback)(caml_domain_state*, void*), + void* enter_spin_data); +int caml_try_run_on_all_domains( + void (*handler)(caml_domain_state*, void*, int, caml_domain_state**), + void*, + void (*leader_setup)(caml_domain_state*)); + +/* 
barriers */ +typedef uintnat barrier_status; +void caml_global_barrier(void); +barrier_status caml_global_barrier_begin(void); +int caml_global_barrier_is_final(barrier_status); +void caml_global_barrier_end(barrier_status); +int caml_global_barrier_num_domains(void); + +int caml_domain_is_terminating(void); + +CAMLextern void caml_domain_set_name(char*); #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/domain_state.h b/runtime/caml/domain_state.h index 84e677d04174..e708bbe021b3 100644 --- a/runtime/caml/domain_state.h +++ b/runtime/caml/domain_state.h @@ -17,23 +17,21 @@ #ifndef CAML_STATE_H #define CAML_STATE_H +#ifdef __APPLE__ +#include +#endif #include #include + #include "misc.h" -#include "mlvalues.h" #define NUM_EXTRA_PARAMS 64 typedef value extra_params_area[NUM_EXTRA_PARAMS]; /* This structure sits in the TLS area and is also accessed efficiently * via native code, which is why the indices are important */ - typedef struct { -#ifdef CAML_NAME_SPACE #define DOMAIN_STATE(type, name) CAMLalign(8) type name; -#else -#define DOMAIN_STATE(type, name) CAMLalign(8) type _##name; -#endif #include "domain_state.tbl" #undef DOMAIN_STATE } caml_domain_state; @@ -45,11 +43,7 @@ enum { #undef DOMAIN_STATE }; -#ifdef CAML_NAME_SPACE #define LAST_DOMAIN_STATE_MEMBER extra_params -#else -#define LAST_DOMAIN_STATE_MEMBER _extra_params -#endif /* Check that the structure was laid out without padding, since the runtime assumes this in computing offsets */ @@ -57,11 +51,20 @@ CAML_STATIC_ASSERT( offsetof(caml_domain_state, LAST_DOMAIN_STATE_MEMBER) == (Domain_state_num_fields - 1) * 8); -CAMLextern caml_domain_state* Caml_state; -#ifdef CAML_NAME_SPACE -#define Caml_state_field(field) Caml_state->field +#ifdef __APPLE__ + CAMLextern pthread_key_t caml_domain_state_key; + CAMLextern void caml_init_domain_state_key(void); + #define CAML_INIT_DOMAIN_STATE caml_init_domain_state_key() + #define Caml_state \ + ((caml_domain_state*) pthread_getspecific(caml_domain_state_key)) 
+ #define SET_Caml_state(x) \ + (pthread_setspecific(caml_domain_state_key, x)) #else -#define Caml_state_field(field) Caml_state->_##field + CAMLextern __thread caml_domain_state* Caml_state; + #define CAML_INIT_DOMAIN_STATE + #define SET_Caml_state(x) (Caml_state = (x)) #endif +#define Caml_state_field(field) (Caml_state->field) + #endif /* CAML_STATE_H */ diff --git a/runtime/caml/domain_state.tbl b/runtime/caml/domain_state.tbl index 4429f24b64e0..84ee53f9ba59 100644 --- a/runtime/caml/domain_state.tbl +++ b/runtime/caml/domain_state.tbl @@ -14,81 +14,144 @@ /* */ /**************************************************************************/ -DOMAIN_STATE(value*, young_limit) -DOMAIN_STATE(value*, young_ptr) -/* Minor heap limit. See minor_gc.c. */ +DOMAIN_STATE(volatile uintnat, young_limit) +/* Minor heap limit. Typically young_limit == young_start, but this field is set + * by other domains to signal this domain by causing a spurious allocation + * failure. */ -DOMAIN_STATE(char*, exception_pointer) -/* Exception pointer that points into the current stack */ +DOMAIN_STATE(value*, young_ptr) +/* Minor heap pointer */ -DOMAIN_STATE(void*, young_base) DOMAIN_STATE(value*, young_start) +/* Start of the minor heap */ + DOMAIN_STATE(value*, young_end) -DOMAIN_STATE(value*, young_alloc_start) -DOMAIN_STATE(value*, young_alloc_end) -DOMAIN_STATE(value*, young_alloc_mid) -DOMAIN_STATE(value*, young_trigger) -DOMAIN_STATE(asize_t, minor_heap_wsz) -DOMAIN_STATE(intnat, in_minor_collection) -DOMAIN_STATE(double, extra_heap_resources_minor) -DOMAIN_STATE(struct caml_ref_table*, ref_table) -DOMAIN_STATE(struct caml_ephe_ref_table*, ephe_ref_table) -DOMAIN_STATE(struct caml_custom_table*, custom_table) -/* See minor_gc.c */ +/* End of the minor heap. 
always(young_start <= young_ptr <= young_end) */ + +DOMAIN_STATE(struct stack_info*, current_stack) +/* Current stack */ + +DOMAIN_STATE(void*, exn_handler) +/* Pointer into the current stack */ + +DOMAIN_STATE(struct c_stack_link*, c_stack) +/* The C stack associated with this domain. + * Used by this domain to perform external calls. */ + +DOMAIN_STATE(struct stack_info**, stack_cache) +/* This is a list of freelist buckets of stacks */ + +DOMAIN_STATE(value*, gc_regs_buckets) -DOMAIN_STATE(struct mark_stack*, mark_stack) -/* See major_gc.c */ - -DOMAIN_STATE(value*, stack_low) -DOMAIN_STATE(value*, stack_high) -DOMAIN_STATE(value*, stack_threshold) -DOMAIN_STATE(value*, extern_sp) -DOMAIN_STATE(value*, trapsp) -DOMAIN_STATE(value*, trap_barrier) -DOMAIN_STATE(struct longjmp_buffer*, external_raise) -DOMAIN_STATE(value, exn_bucket) -/* See interp.c */ - -DOMAIN_STATE(char*, top_of_stack) -DOMAIN_STATE(char*, bottom_of_stack) -DOMAIN_STATE(uintnat, last_return_address) DOMAIN_STATE(value*, gc_regs) -/* See roots_nat.c */ -DOMAIN_STATE(intnat, backtrace_active) +DOMAIN_STATE(value**, gc_regs_slot) + +DOMAIN_STATE(struct caml_minor_tables*, minor_tables) + +DOMAIN_STATE(struct mark_stack*, mark_stack) +/* Mark stack */ + +DOMAIN_STATE(uintnat, marking_done) +/* Is marking done for the current major cycle. */ + +DOMAIN_STATE(uintnat, sweeping_done) +/* Is sweeping done for the current major cycle. 
*/ + +DOMAIN_STATE(uintnat, allocated_words) + +DOMAIN_STATE(uintnat, swept_words) + +DOMAIN_STATE(intnat, major_work_computed) +/* total work accumulated in this GC clock cycle (in words) */ + +DOMAIN_STATE(intnat, major_work_todo) +/* balance of work to do in this GC clock cycle (in words) + * positive: we need to do this amount of work to finish the slice + * negative: we have done more than we need and this is credit + */ + +DOMAIN_STATE(double, major_gc_clock) + +DOMAIN_STATE(struct caml__roots_block*, local_roots) + +DOMAIN_STATE(struct caml_ephe_info*, ephe_info) + +DOMAIN_STATE(struct caml_final_info*, final_info) + DOMAIN_STATE(intnat, backtrace_pos) -DOMAIN_STATE(backtrace_slot*, backtrace_buffer) + +DOMAIN_STATE(intnat, backtrace_active) + +DOMAIN_STATE(code_t*, backtrace_buffer) + DOMAIN_STATE(value, backtrace_last_exn) -/* See backtrace.c */ DOMAIN_STATE(intnat, compare_unordered) -DOMAIN_STATE(intnat, requested_major_slice) -DOMAIN_STATE(intnat, requested_minor_gc) -DOMAIN_STATE(struct caml__roots_block *, local_roots) -DOMAIN_STATE(double, stat_minor_words) -DOMAIN_STATE(double, stat_promoted_words) -DOMAIN_STATE(double, stat_major_words) +DOMAIN_STATE(uintnat, oo_next_id_local) + +DOMAIN_STATE(uintnat, requested_major_slice) + +DOMAIN_STATE(uintnat, requested_minor_gc) + +DOMAIN_STATE(uintnat, requested_external_interrupt) + +DOMAIN_STATE(int, parser_trace) +/* See parsing.c */ + +DOMAIN_STATE(asize_t, minor_heap_wsz) + +DOMAIN_STATE(struct caml_heap_state*, shared_heap) + +DOMAIN_STATE(int, id) + +DOMAIN_STATE(int, unique_id) + +DOMAIN_STATE(struct pool**, pools_to_rescan) +DOMAIN_STATE(int, pools_to_rescan_len) +DOMAIN_STATE(int, pools_to_rescan_count) + +DOMAIN_STATE(value, dls_root) +/* Domain-local state */ + +DOMAIN_STATE(double, extra_heap_resources) +DOMAIN_STATE(double, extra_heap_resources_minor) + +DOMAIN_STATE(uintnat, dependent_size) +DOMAIN_STATE(uintnat, dependent_allocated) + +DOMAIN_STATE(struct caml_extern_state*, extern_state) 
+DOMAIN_STATE(struct caml_intern_state*, intern_state) + +/*****************************************************************************/ +/* GC stats (see gc_ctrl.c and the Gc module). */ +/* These stats represent only the current domain's respective values. */ +/* Use the Gc module to get aggregated total program stats. */ +/*****************************************************************************/ +DOMAIN_STATE(uintnat, stat_minor_words) +DOMAIN_STATE(uintnat, stat_promoted_words) +DOMAIN_STATE(uintnat, stat_major_words) DOMAIN_STATE(intnat, stat_minor_collections) DOMAIN_STATE(intnat, stat_major_collections) -DOMAIN_STATE(intnat, stat_heap_wsz) -DOMAIN_STATE(intnat, stat_top_heap_wsz) -DOMAIN_STATE(intnat, stat_compactions) DOMAIN_STATE(intnat, stat_forced_major_collections) -DOMAIN_STATE(intnat, stat_heap_chunks) -/* See gc_ctrl.c */ +DOMAIN_STATE(uintnat, stat_blocks_marked) -DOMAIN_STATE(uintnat, eventlog_startup_timestamp) -DOMAIN_STATE(long, eventlog_startup_pid) DOMAIN_STATE(uintnat, eventlog_paused) DOMAIN_STATE(uintnat, eventlog_enabled) DOMAIN_STATE(FILE*, eventlog_out) +DOMAIN_STATE(struct event_buffer *, eventlog_buffer) /* See eventlog.c */ -#if defined(NAKED_POINTERS_CHECKER) && !defined(_WIN32) -DOMAIN_STATE(void*, checking_pointer_pc) -/* See major_gc.c */ -#endif +DOMAIN_STATE(int, inside_stw_handler) +/* whether or not a domain is inside of a stop-the-world handler + this is used for several debug assertions inside of methods + that can only be called from either in or outside this state. 
*/ + +DOMAIN_STATE(intnat, trap_sp_off) +DOMAIN_STATE(intnat, trap_barrier_off) +DOMAIN_STATE(struct caml_exception_context*, external_raise) +/* Bytecode TLS vars, not used for native code */ DOMAIN_STATE(extra_params_area, extra_params) /* This member must occur last, because it is an array, not a scalar */ diff --git a/runtime/caml/eventlog.h b/runtime/caml/eventlog.h index 3f2a4fca2da9..8daaf8562daa 100644 --- a/runtime/caml/eventlog.h +++ b/runtime/caml/eventlog.h @@ -56,7 +56,35 @@ typedef enum { EV_MINOR_COPY, EV_MINOR_UPDATE_WEAK, EV_MINOR_FINALIZED, - EV_EXPLICIT_GC_MAJOR_SLICE + EV_EXPLICIT_GC_MAJOR_SLICE, + EV_DOMAIN_SPAWN, + EV_DOMAIN_SEND_INTERRUPT, + EV_DOMAIN_IDLE_WAIT, + EV_FINALISE_UPDATE_FIRST, + EV_FINALISE_UPDATE_LAST, + EV_INTERRUPT_GC, + EV_INTERRUPT_REMOTE, + EV_MAJOR_EPHE_MARK, + EV_MAJOR_EPHE_SWEEP, + EV_MAJOR_FINISH_MARKING, + EV_MAJOR_GC_CYCLE_DOMAINS, + EV_MAJOR_GC_PHASE_CHANGE, + EV_MAJOR_GC_STW, + EV_MAJOR_MARK_OPPORTUNISTIC, + EV_MAJOR_SLICE, + EV_MINOR_CLEAR, + EV_MINOR_FINALIZERS_OLDIFY, + EV_MINOR_GLOBAL_ROOTS, + EV_MINOR_LEAVE_BARRIER, + EV_STW_API_BARRIER, + EV_STW_HANDLER, + EV_STW_LEADER, + EV_MAJOR_FINISH_SWEEPING, + EV_MINOR_FINALIZERS_ADMIN, + EV_MINOR_REMEMBERED_SET, + EV_MINOR_REMEMBERED_SET_PROMOTE, + EV_MINOR_LOCAL_ROOTS_PROMOTE, + EV_DOMAIN_CONDITION_WAIT } ev_gc_phase; typedef enum { @@ -87,12 +115,14 @@ typedef enum { #define CAML_EVENTLOG_INIT() caml_eventlog_init() #define CAML_EVENTLOG_DISABLE() caml_eventlog_disable() +#define CAML_EVENTLOG_IS_BACKUP_THREAD() caml_eventlog_is_backup_thread() #define CAML_EV_BEGIN(p) caml_ev_begin(p) #define CAML_EV_END(p) caml_ev_end(p) #define CAML_EV_COUNTER(c, v) caml_ev_counter(c, v) #define CAML_EV_ALLOC(s) caml_ev_alloc(s) #define CAML_EV_ALLOC_FLUSH() caml_ev_alloc_flush() #define CAML_EV_FLUSH() caml_ev_flush() +#define CAML_EVENTLOG_TEARDOWN() caml_eventlog_teardown() /* General note about the public API for the eventlog framework The caml_ev_* functions are no-op when 
called with the eventlog framework @@ -105,6 +135,8 @@ typedef enum { void caml_eventlog_init(void); void caml_eventlog_disable(void); +void caml_eventlog_teardown(void); +void caml_eventlog_is_backup_thread(void); void caml_ev_begin(ev_gc_phase phase); void caml_ev_end(ev_gc_phase phase); void caml_ev_counter(ev_gc_counter counter, uint64_t val); @@ -118,12 +150,14 @@ void caml_ev_flush(void); #define CAML_EVENTLOG_INIT() /**/ #define CAML_EVENTLOG_DISABLE() /**/ +#define CAML_EVENTLOG_IS_BACKUP_THREAD() /**/ #define CAML_EV_BEGIN(p) /**/ #define CAML_EV_END(p) /**/ #define CAML_EV_COUNTER(c, v) /**/ #define CAML_EV_ALLOC(S) /**/ #define CAML_EV_ALLOC_FLUSH() /**/ #define CAML_EV_FLUSH() /**/ +#define CAML_EVENTLOG_TEARDOWN() /**/ #endif /*CAML_INSTR*/ diff --git a/runtime/caml/fail.h b/runtime/caml/fail.h index 822c60326069..3a23ca6ee2bc 100644 --- a/runtime/caml/fail.h +++ b/runtime/caml/fail.h @@ -44,6 +44,8 @@ #define SYS_BLOCKED_IO 9 /* "Sys_blocked_io" */ #define ASSERT_FAILURE_EXN 10 /* "Assert_failure" */ #define UNDEFINED_RECURSIVE_MODULE_EXN 11 /* "Undefined_recursive_module" */ +#define UNHANDLED_EXN 12 /* "Unhandled" */ +#define CONTINUATION_ALREADY_TAKEN_EXN 13 /* "Continuation_already_taken" */ #ifdef POSIX_SIGNALS struct longjmp_buffer { @@ -64,6 +66,12 @@ struct longjmp_buffer { #define siglongjmp(buf,val) longjmp(buf,val) #endif +struct caml_exception_context { + struct longjmp_buffer* jmp; + struct caml__roots_block* local_roots; + volatile value* exn_bucket; +}; + /* Global variables moved to Caml_state in 4.10 */ #define caml_external_raise (Caml_state_field(external_raise)) #define caml_exn_bucket (Caml_state_field(exn_bucket)) @@ -146,6 +154,10 @@ CAMLnoreturn_start CAMLextern void caml_raise_sys_blocked_io (void) CAMLnoreturn_end; +CAMLnoreturn_start +CAMLextern void caml_raise_continuation_already_taken (void) +CAMLnoreturn_end; + #ifdef __cplusplus } #endif diff --git a/runtime/caml/fiber.h b/runtime/caml/fiber.h new file mode 100644 index 
000000000000..565f91580bcd --- /dev/null +++ b/runtime/caml/fiber.h @@ -0,0 +1,135 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Tom Kelly, OCaml Labs Consultancy */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2021 Indian Institute of Technology, Madras */ +/* Copyright 2021 OCaml Labs Consultancy */ +/* Copyright 2019 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#ifndef CAML_FIBER_H +#define CAML_FIBER_H + +#ifdef CAML_INTERNALS + +#include "misc.h" +#include "mlvalues.h" +#include "roots.h" + +struct stack_info; + +struct stack_handler { + value handle_value; + value handle_exn; + value handle_effect; + struct stack_info* parent; +}; + +struct stack_info { +#ifdef NATIVE_CODE + void* sp; + void* exception_ptr; +#else + value* sp; + value* exception_ptr; +#endif + struct stack_handler* handler; + + /* [size_bucket] is a pointer to a bucket in Caml->stack_cache if this + * size is pooled. If unpooled, it is NULL. + * + * Stacks may be unpooled if either the stack size is not 2**N multiple of + * [caml_fiber_wsz] or the stack is bigger than pooled sizes. 
*/ + struct stack_info** size_bucket; + uintnat magic; +}; + +CAML_STATIC_ASSERT(sizeof(struct stack_info) == + Stack_ctx_words * sizeof(value)); +#define Stack_base(stk) ((value*)(stk + 1)) +#define Stack_threshold_ptr(stk) \ + (Stack_base(stk) + Stack_threshold / sizeof(value)) +#define Stack_high(stk) (value*)stk->handler + +#define Stack_handle_value(stk) (stk)->handler->handle_value +#define Stack_handle_exception(stk) (stk)->handler->handle_exn +#define Stack_handle_effect(stk) (stk)->handler->handle_effect +#define Stack_parent(stk) (stk)->handler->parent + +/* Stack layout for native code. Stack grows downwards. + * + * +------------------------+ + * | struct stack_handler | + * +------------------------+ <--- Stack_high + * | caml_runstack | + * +------------------------+ + * | | + * . OCaml frames . <--- sp + * | | + * +------------------------+ <--- Stack_threshold + * | | + * . Red Zone . + * | | + * +------------------------+ <--- Stack_base + * | struct stack_info | + * +------------------------+ <--- Caml_state->current_stack + * | HEADER WORD | + * +------------------------+ + */ + +/* This structure is used for storing the OCaml return pointer when + * transitioning from an OCaml stack to a C stack at a C call. When an OCaml + * stack is reallocated, this linked list is walked to update the OCaml stack + * pointers. It is also used for DWARF backtraces. 
*/ +struct c_stack_link { + /* The reference to the OCaml stack */ + struct stack_info* stack; + /* OCaml return address */ + void* sp; + struct c_stack_link* prev; +}; + +#define NUM_STACK_SIZE_CLASSES 5 + +/* The table of global identifiers */ +extern value caml_global_data; + +#define Trap_pc(tp) (((code_t *)(tp))[0]) +#define Trap_link(tp) ((tp)[1]) + +struct stack_info** caml_alloc_stack_cache (void); +CAMLextern struct stack_info* caml_alloc_main_stack (uintnat init_size); +void caml_scan_stack(scanning_action f, void* fdata, + struct stack_info* stack, value* v_gc_regs); +/* try to grow the stack until at least required_size words are available. + returns nonzero on success */ +int caml_try_realloc_stack (asize_t required_size); +void caml_change_max_stack_size (uintnat new_max_size); +void caml_maybe_expand_stack(void); +CAMLextern void caml_free_stack(struct stack_info* stk); + +#ifdef NATIVE_CODE +void caml_get_stack_sp_pc (struct stack_info* stack, + char** sp /* out */, uintnat* pc /* out */); +#endif + +value caml_continuation_use (value cont); + +/* Replace the stack of a continuation that was previouly removed + with caml_continuation_use. The GC must not be allowed to run + between continuation_use and continuation_replace. + Used for cloning continuations and continuation backtraces. 
*/ +void caml_continuation_replace(value cont, struct stack_info* stack); + +#endif /* CAML_INTERNALS */ + +#endif /* CAML_FIBER_H */ diff --git a/runtime/caml/finalise.h b/runtime/caml/finalise.h index a61b2b63d294..c58bfcbfe53a 100644 --- a/runtime/caml/finalise.h +++ b/runtime/caml/finalise.h @@ -19,16 +19,66 @@ #ifdef CAML_INTERNALS #include "roots.h" +#include "domain.h" -void caml_final_update_mark_phase (void); -void caml_final_update_clean_phase (void); -value caml_final_do_calls_exn (void); -void caml_final_do_roots (scanning_action f); -void caml_final_invert_finalisable_values (void); -void caml_final_oldify_young_roots (void); -void caml_final_empty_young (void); -void caml_final_update_minor_roots(void); -void caml_final_invariant_check(void); +struct final { + value fun; + value val; + int offset; +}; + +struct finalisable { + struct final *table; + uintnat old; + uintnat young; + uintnat size; +}; +/* [0..old) : finalisable set, the values are in the major heap + [old..young) : recent set, the values could be in the minor heap + [young..size) : free space + + The element of the finalisable set are moved to the finalising set + below when the value are unreachable (for the first or last time). + +*/ + +struct final_todo { + struct final_todo *next; + int size; + struct final item[1]; /* variable size */ +}; + +/* + todo_head: head of the list of finalisation functions that can be run. + todo_tail: tail of the list of finalisation functions that can be run. + + It is the finalising set. +*/ + +struct caml_final_info { + struct finalisable first; + uintnat updated_first; + struct finalisable last; + uintnat updated_last; + struct final_todo *todo_head; + struct final_todo *todo_tail; + uintnat running_finalisation_function; + struct caml_final_info* next; /* used for orphaned finalisers. 
+ See major_gc.c */ +}; + +void caml_final_merge_finalisable (struct finalisable *source, + struct finalisable *target); +int caml_final_update_first (caml_domain_state* d); +int caml_final_update_last (caml_domain_state* d); +void caml_final_do_calls (void); +void caml_final_do_roots (scanning_action f, void* fdata, + caml_domain_state* domain, int do_val); +void caml_final_do_young_roots (scanning_action f, void* fdata, + caml_domain_state* d, int do_last_val); +void caml_final_empty_young (caml_domain_state* d); +void caml_final_update_last_minor (caml_domain_state* d); +struct caml_final_info* caml_alloc_final_info(void); #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/frame_descriptors.h b/runtime/caml/frame_descriptors.h new file mode 100644 index 000000000000..3a2724bcbbf7 --- /dev/null +++ b/runtime/caml/frame_descriptors.h @@ -0,0 +1,71 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Tom Kelly, OCaml Labs Consultancy */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2019 Indian Institute of Technology, Madras */ +/* Copyright 2021 OCaml Labs Consultancy Ltd */ +/* Copyright 2019 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ + +#ifndef CAML_FRAME_DESCRIPTORS_H +#define CAML_FRAME_DESCRIPTORS_H + +#ifdef CAML_INTERNALS + +#include "config.h" + +#define Hash_retaddr(addr, mask) \ + (((uintnat)(addr) >> 3) & (mask)) + +/* Structure of frame descriptors */ + +typedef struct { + uintnat retaddr; + unsigned short frame_size; + unsigned short num_live; + unsigned short live_ofs[1 /* num_live */]; + /* + If frame_size & 1, then debug info follows: + uint32_t debug_info_offset; + Debug info is stored as a relative offset to a debuginfo structure. */ +} frame_descr; + +/* Allocation lengths are encoded as 0-255, giving sizes 1-256 */ +#define Wosize_encoded_alloc_len(n) ((uintnat)(n) + 1) + +/* Used to compute offsets in frame tables. + ty must have power-of-2 size */ +#define Align_to(p, ty) \ + (void*)(((uintnat)(p) + sizeof(ty) - 1) & -sizeof(ty)) + + +void caml_init_frame_descriptors(void); +void caml_register_frametable(intnat *table); + +typedef struct { + frame_descr** descriptors; + int mask; +} caml_frame_descrs; + +caml_frame_descrs caml_get_frame_descrs(void); + +/* Find the current table of frame descriptors. + The resulting structure is only valid until the next GC */ +frame_descr* caml_find_frame_descr(caml_frame_descrs fds, uintnat pc); + + +frame_descr * caml_next_frame_descriptor + (caml_frame_descrs fds, uintnat * pc, char ** sp, struct stack_info* stack); + +#endif /* CAML_INTERNALS */ + +#endif /* CAML_FRAME_DESCRIPTORS_H */ diff --git a/runtime/caml/freelist.h b/runtime/caml/freelist.h deleted file mode 100644 index 1735d772c69d..000000000000 --- a/runtime/caml/freelist.h +++ /dev/null @@ -1,72 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Damien Doligez, projet Para, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. 
*/ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. */ -/* */ -/**************************************************************************/ - -/* Free lists of heap blocks. */ - -#ifndef CAML_FREELIST_H -#define CAML_FREELIST_H - -#ifdef CAML_INTERNALS - -#include "misc.h" -#include "mlvalues.h" - -extern asize_t caml_fl_cur_wsz; - -/* See [freelist.c] for usage info on these functions. */ -extern header_t *(*caml_fl_p_allocate) (mlsize_t wo_sz); -extern void (*caml_fl_p_init_merge) (void); -extern header_t *(*caml_fl_p_merge_block) (value bp, char *limit); -extern void (*caml_fl_p_add_blocks) (value bp); -extern void (*caml_fl_p_make_free_blocks) - (value *p, mlsize_t size, int do_merge, int color); -#ifdef DEBUG -extern void (*caml_fl_p_check) (void); -#endif - -Caml_inline header_t *caml_fl_allocate (mlsize_t wo_sz) - { return (*caml_fl_p_allocate) (wo_sz); } - -Caml_inline void caml_fl_init_merge (void) - { (*caml_fl_p_init_merge) (); } - -Caml_inline header_t *caml_fl_merge_block (value bp, char *limit) - { return (*caml_fl_p_merge_block) (bp, limit); } - -Caml_inline void caml_fl_add_blocks (value bp) - { (*caml_fl_p_add_blocks) (bp); } - -Caml_inline void caml_make_free_blocks - (value *p, mlsize_t size, int do_merge, int color) - { (*caml_fl_p_make_free_blocks) (p, size, do_merge, color); } - -enum { - caml_policy_next_fit = 0, - caml_policy_first_fit = 1, - caml_policy_best_fit = 2, -}; -extern void caml_set_allocation_policy (uintnat); - -extern void caml_fl_reset_and_switch_policy (intnat); -/* -1 means do not change the allocation policy */ - -#ifdef DEBUG -Caml_inline void caml_fl_check (void) - { (*caml_fl_p_check) (); } -#endif - -#endif /* CAML_INTERNALS */ - -#endif /* CAML_FREELIST_H */ diff --git a/runtime/caml/gc.h b/runtime/caml/gc.h index 854f9dba81f7..8d3505b1e872 100644 --- 
a/runtime/caml/gc.h +++ b/runtime/caml/gc.h @@ -19,25 +19,6 @@ #include "mlvalues.h" -#define Caml_white (0 << 8) -#define Caml_gray (1 << 8) -#define Caml_blue (2 << 8) -#define Caml_black (3 << 8) - -#define Color_hd(hd) ((color_t) ((hd) & Caml_black)) -#define Color_hp(hp) (Color_hd (Hd_hp (hp))) -#define Color_val(val) (Color_hd (Hd_val (val))) - -#define Is_white_hd(hd) (Color_hd (hd) == Caml_white) -#define Is_gray_hd(hd) (Color_hd (hd) == Caml_gray) -#define Is_blue_hd(hd) (Color_hd (hd) == Caml_blue) -#define Is_black_hd(hd) (Color_hd (hd) == Caml_black) - -#define Whitehd_hd(hd) (((hd) & ~Caml_black)/*| Caml_white*/) -#define Grayhd_hd(hd) (((hd) & ~Caml_black) | Caml_gray) -#define Blackhd_hd(hd) (((hd)/*& ~Caml_black*/)| Caml_black) -#define Bluehd_hd(hd) (((hd) & ~Caml_black) | Caml_blue) - /* This depends on the layout of the header. See [mlvalues.h]. */ #define Make_header(wosize, tag, color) \ (/*CAMLassert ((wosize) <= Max_wosize),*/ \ @@ -46,22 +27,9 @@ + (tag_t) (tag))) \ ) -#ifdef WITH_PROFINFO -#define Make_header_with_profinfo(wosize, tag, color, profinfo) \ - (Make_header(wosize, tag, color) \ - | ((((intnat) profinfo) & PROFINFO_MASK) << PROFINFO_SHIFT) \ - ) -#else #define Make_header_with_profinfo(wosize, tag, color, profinfo) \ Make_header(wosize, tag, color) -#endif - -#define Is_white_val(val) (Color_val(val) == Caml_white) -#define Is_blue_val(val) (Color_val(val) == Caml_blue) -#define Is_black_val(val) (Color_val(val) == Caml_black) -/* For extern.c */ -#define Colornum_hd(hd) ((color_t) (((hd) >> 8) & 3)) -#define Coloredhd_hd(hd,colnum) (((hd) & ~Caml_black) | ((colnum) << 8)) +#define Whitehd_hd(hd) (((hd) & ~(3 << 8))) #endif /* CAML_GC_H */ diff --git a/runtime/caml/gc_ctrl.h b/runtime/caml/gc_ctrl.h index 184b6f58589f..ecbd4e54cd3b 100644 --- a/runtime/caml/gc_ctrl.h +++ b/runtime/caml/gc_ctrl.h @@ -20,34 +20,24 @@ #include "misc.h" -/* Global variables moved to Caml_state in 4.10 */ -#define caml_stat_minor_words 
(Caml_state_field(stat_minor_words)) -#define caml_stat_promoted_words (Caml_state_field(stat_promoted_words)) -#define caml_stat_major_words (Caml_state_field(stat_major_words)) -#define caml_stat_minor_collections (Caml_state_field(stat_minor_collections)) -#define caml_stat_major_collections (Caml_state_field(stat_major_collections)) -#define caml_stat_heap_wsz (Caml_state_field(stat_heap_wsz)) -#define caml_stat_top_heap_wsz (Caml_state_field(stat_top_heap_wsz)) -#define caml_stat_compactions (Caml_state_field(stat_compactions)) -#define caml_stat_heap_chunks (Caml_state_field(stat_heap_chunks)) - -/* - minor_size: cf. minor_heap_size in gc.mli - major_size: Size in words of the initial major heap - major_incr: cf. major_heap_increment in gc.mli - percent_fr: cf. space_overhead in gc.mli - percent_m : cf. max_overhead in gc.mli - window : cf. window_size in gc.mli - custom_maj: cf. custom_major_ratio in gc.mli - custom_min: cf. custom_minor_ratio in gc.mli - custom_bsz: cf. custom_minor_max_size in gc.mli - policy : cf. 
allocation_policy in gc.mli -*/ -void caml_init_gc (uintnat minor_size, uintnat major_size, uintnat major_incr, - uintnat percent_fr, uintnat percent_m, uintnat window, - uintnat custom_maj, uintnat custom_min, uintnat custom_bsz, - uintnat policy); - +extern uintnat caml_max_stack_size; +extern uintnat caml_fiber_wsz; + +void caml_init_gc (void); +value caml_gc_stat(value); +value caml_gc_major(value); + + +#define caml_stat_top_heap_wsz caml_top_heap_words(Caml_state->shared_heap) +#define caml_stat_compactions 0 +#define caml_stat_heap_wsz Wsize_bsize(caml_heap_size(Caml_state->shared_heap)) +#define caml_stat_heap_chunks caml_heap_blocks(Caml_state->shared_heap) +#define caml_stat_major_collections Caml_state->stat_major_collections +#define caml_stat_minor_collections Caml_state->stat_minor_collections +#define caml_stat_promoted_words Caml_state->stat_promoted_words +#define caml_allocated_words Caml_state->allocated_words +#define caml_stat_major_words Caml_state->stat_major_words +#define caml_stat_minor_words Caml_state->stat_minor_words #ifdef DEBUG void caml_heap_check (void); diff --git a/runtime/caml/globroots.h b/runtime/caml/globroots.h index 10fe66f5b778..31f59b00f20d 100644 --- a/runtime/caml/globroots.h +++ b/runtime/caml/globroots.h @@ -23,8 +23,12 @@ #include "mlvalues.h" #include "roots.h" -void caml_scan_global_roots(scanning_action f); -void caml_scan_global_young_roots(scanning_action f); +void caml_scan_global_roots(scanning_action f, void* fdata); +void caml_scan_global_young_roots(scanning_action f, void* fdata); + +#ifdef NATIVE_CODE +void caml_register_dyn_global(void *v); +#endif #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/instrtrace.h b/runtime/caml/instrtrace.h index 17ee3ce0c733..ba15fa237145 100644 --- a/runtime/caml/instrtrace.h +++ b/runtime/caml/instrtrace.h @@ -23,7 +23,7 @@ #include "mlvalues.h" #include "misc.h" -extern intnat caml_icount; +extern __thread intnat caml_icount; void caml_stop_here (void); void 
caml_disasm_instr (code_t pc); void caml_trace_value_file (value v, code_t prog, asize_t proglen, FILE * f); diff --git a/runtime/caml/instruct.h b/runtime/caml/instruct.h index d29a7b7bb5a2..50ebef7acb87 100644 --- a/runtime/caml/instruct.h +++ b/runtime/caml/instruct.h @@ -61,6 +61,7 @@ enum instructions { EVENT, BREAK, RERAISE, RAISE_NOTRACE, GETSTRINGCHAR, + PERFORM, RESUME, RESUMETERM, REPERFORMTERM, FIRST_UNIMPLEMENTED_OP}; #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/intext.h b/runtime/caml/intext.h index a2a3fb8248f0..a4ddfb09a053 100644 --- a/runtime/caml/intext.h +++ b/runtime/caml/intext.h @@ -77,7 +77,7 @@ #define CODE_DOUBLE_ARRAY64_LITTLE 0x17 #define CODE_CODEPOINTER 0x10 #define CODE_INFIXPOINTER 0x11 -#define CODE_CUSTOM 0x12 /* deprecated */ +/* #define CODE_CUSTOM 0x12 -- no longer supported */ #define CODE_CUSTOM_LEN 0x18 #define CODE_CUSTOM_FIXED 0x19 @@ -100,11 +100,15 @@ #define ENTRIES_PER_TRAIL_BLOCK 1025 #define SIZE_EXTERN_OUTPUT_BLOCK 8100 +void caml_free_extern_state (void); + /* The entry points */ void caml_output_val (struct channel * chan, value v, value flags); /* Output [v] with flags [flags] on the channel [chan]. */ +void caml_free_intern_state (void); + #endif /* CAML_INTERNALS */ #ifdef __cplusplus @@ -137,7 +141,7 @@ CAMLextern value caml_input_value_from_malloc(char * data, intnat ofs); to the beginning of the buffer, and [ofs] is the offset of the beginning of the externed data in this buffer. The buffer is deallocated with [free] on return, or if an exception is raised. */ -CAMLextern value caml_input_value_from_block(char * data, intnat len); +CAMLextern value caml_input_value_from_block(const char * data, intnat len); /* Read a structured value from a user-provided buffer. [data] points to the beginning of the externed data in this buffer, and [len] is the length in bytes of valid data in this buffer. 
diff --git a/runtime/caml/io.h b/runtime/caml/io.h index 26661160fd1d..54c312adb048 100644 --- a/runtime/caml/io.h +++ b/runtime/caml/io.h @@ -20,8 +20,10 @@ #ifdef CAML_INTERNALS +#include "camlatomic.h" #include "misc.h" #include "mlvalues.h" +#include "platform.h" #ifndef IO_BUFFER_SIZE #define IO_BUFFER_SIZE 65536 @@ -40,9 +42,9 @@ struct channel { char * end; /* Physical end of the buffer */ char * curr; /* Current position in the buffer */ char * max; /* Logical end of the buffer (for input) */ - void * mutex; /* Placeholder for mutex (for systhreads) */ + caml_plat_mutex mutex; /* Mutex protecting buffer */ struct channel * next, * prev;/* Double chaining of channels (flush_all) */ - int refcount; /* Number of custom blocks owning the channel */ + atomic_uintnat refcount; /* Number of custom blocks owning the channel */ int flags; /* Bitfield */ char buff[IO_BUFFER_SIZE]; /* The buffer itself */ char * name; /* Optional name (to report fd leaks) */ diff --git a/runtime/caml/lf_skiplist.h b/runtime/caml/lf_skiplist.h new file mode 100644 index 000000000000..f35f112256e6 --- /dev/null +++ b/runtime/caml/lf_skiplist.h @@ -0,0 +1,128 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Sadiq Jaffer, OCaml Labs Consultancy Ltd */ +/* Xavier Leroy, projet Cambium, INRIA Paris */ +/* */ +/* Copyright 2021 OCaml Labs Consultancy Ltd */ +/* Copyright 2020 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +/* A concurrent dictionary data structure implemented as skip lists. See + implementation for much more detail. 
*/ + +/* Keys and associated data are natural-width integers (type [uintnat]). + Key values 0 and uintnat_max are reserved for internal use; do not use. + Pointers can be used too, modulo conversion to [uintnat]. */ + +#ifndef CAML_SKIPLIST_H +#define CAML_SKIPLIST_H + +#ifdef CAML_INTERNALS + +#include "config.h" + +#define NUM_LEVELS 17 + +/* The head of a skip list */ + +struct lf_skiplist { + struct lf_skipcell *head; + struct lf_skipcell *tail; + uintnat _Atomic search_level; /* racy level to start searches at */ + struct lf_skipcell *_Atomic garbage_head; +}; + +/* The cells of a skip list */ + +struct lf_skipcell { + uintnat key; + uintnat data; + uintnat top_level; + void *stat_block; + struct lf_skipcell *_Atomic garbage_next; +#if (__STDC_VERSION__ >= 199901L) + struct lf_skipcell *_Atomic forward[]; /* variable-length array */ +#else + struct lf_skipcell *_Atomic forward[1]; /* variable-length array */ +#endif +}; + +/* Initialize a skip list */ +extern void caml_lf_skiplist_init(struct lf_skiplist *sk); + +/* Search a skip list. + If [key] is found, store associated data in [*data] and return 1. + If [key] is not found, return 0 and leave [*data] unchanged. */ +extern int caml_lf_skiplist_find(struct lf_skiplist *sk, uintnat key, + /*out*/ uintnat *data); + +/* Search the entry of the skip list that has the largest key less than + or equal to [k]. + If such an entry exists, store its key in [*key], the associated data in + [*data], and return 1. + If no such entry exists (all keys in the skip list are strictly greater + than [k]), return 0 and leave [*key] and [*data] unchanged. */ +extern int caml_lf_skiplist_find_below(struct lf_skiplist *sk, uintnat k, + /*out*/ uintnat *key, + /*out*/ uintnat *data); +/* Insertion in a skip list. [key] must be between 1 and UINTNAT_MAX-1. + If [key] was already there, change the associated data and return 1. + If [key] was not there, insert new [key, data] binding and return 0. 
*/ +extern int caml_lf_skiplist_insert(struct lf_skiplist *sk, uintnat key, + uintnat data); + +/* Deletion in a skip list. + If [key] was there, remove it and return 1. + If [key] was not there, leave the skip list unchanged and return 0. */ +extern int caml_lf_skiplist_remove(struct lf_skiplist *sk, uintnat key); + +/* This must only be called by a single domain during a stop-the world + protected by global barriers. */ +extern void caml_lf_skiplist_free_garbage(struct lf_skiplist *sk); + +/* Macros used for marking pointers and that are unfortunately necessary + in the header for FOREACH_LF_SKIPLIST_ELEMENT to work */ +#define LF_SK_IS_MARKED(p) ((p)&1) +#define LF_SK_MARKED(p) ((struct lf_skipcell *)(((uintptr_t)(p)) | 1)) +#define LF_SK_UNMARK(p) ((struct lf_skipcell *)(((uintptr_t)(p)) & ~1)) +#define LF_SK_EXTRACT(from, mark_to, ptr_to) \ + { \ + uintptr_t tmp = \ + (uintptr_t)atomic_load_explicit(&from, memory_order_acquire); \ + mark_to = LF_SK_IS_MARKED(tmp); \ + ptr_to = LF_SK_UNMARK(tmp); \ + } + +/* Iterate over a skip list, in increasing order of keys. + [var] designates the current element. + [action] can refer to [var->key] and [var->data]. + [action] can safely remove the current element, i.e. call + [caml_skiplist_remove] on [var->key]. The traversal will + continue with the skiplist element following the removed element. + Other operations performed over the skiplist during its traversal have + unspecified effects on the traversal. 
*/ + +#define FOREACH_LF_SKIPLIST_ELEMENT(var, sk, action) \ + { \ + struct lf_skipcell *var, *caml__next; \ + int caml__marked; \ + var = (sk)->head->forward[0]; \ + while (var != (sk)->tail) { \ + LF_SK_EXTRACT(var->forward[0], caml__marked, caml__next); \ + if (!caml__marked) { \ + action; \ + } \ + var = caml__next; \ + } \ + } +#endif /* CAML_INTERNALS */ + +#endif /* CAML_SKIPLIST_H */ diff --git a/runtime/caml/major_gc.h b/runtime/caml/major_gc.h index 054aa90dc8d6..c5c45d265d3a 100644 --- a/runtime/caml/major_gc.h +++ b/runtime/caml/major_gc.h @@ -18,84 +18,73 @@ #ifdef CAML_INTERNALS -#include "freelist.h" -#include "misc.h" - -/* An interval of a single object to be scanned. - The end pointer must always be one-past-the-end of a heap block, - but the start pointer is not necessarily the start of the block */ -typedef struct { - value* start; - value* end; -} mark_entry; - -typedef struct { - void *block; /* address of the malloced block this chunk lives in */ - asize_t allocated; /* in bytes, used for compaction */ - asize_t size; /* in bytes */ - char *next; - mark_entry redarken_first; /* first block in chunk to redarken */ - value* redarken_end; /* one-past-end of last block for redarkening */ -} heap_chunk_head; - -#define Chunk_head(c) (((heap_chunk_head *) (c)) - 1) -#define Chunk_size(c) Chunk_head(c)->size -#define Chunk_alloc(c) Chunk_head(c)->allocated -#define Chunk_next(c) Chunk_head(c)->next -#define Chunk_block(c) Chunk_head(c)->block - -extern int caml_gc_phase; -extern int caml_gc_subphase; -extern uintnat caml_allocated_words; -extern double caml_extra_heap_resources; -extern uintnat caml_dependent_size, caml_dependent_allocated; -extern uintnat caml_fl_wsz_at_phase_change; -extern int caml_ephe_list_pure; - -#define Phase_mark 0 -#define Phase_clean 1 -#define Phase_sweep 2 -#define Phase_idle 3 - -/* Subphase of mark */ -#define Subphase_mark_roots 10 -/* Subphase_mark_roots: At the end of this subphase all the global - roots are 
marked. */ -#define Subphase_mark_main 11 -/* Subphase_mark_main: At the end of this subphase all the value alive at - the start of this subphase and created during it are marked. */ -#define Subphase_mark_final 12 -/* Subphase_mark_final: At the start of this subphase register which - value with an ocaml finalizer are not marked, the associated - finalizer will be run later. So we mark now these values as alive, - since they must be available for their finalizer. - */ - -CAMLextern char *caml_heap_start; -extern uintnat total_heap_size; -extern char *caml_gc_sweep_hp; - -extern int caml_major_window; -extern double caml_major_ring[Max_major_window]; -extern int caml_major_ring_index; -extern double caml_major_work_credit; -extern double caml_gc_clock; - -/* [caml_major_gc_hook] is called just between the end of the mark - phase and the beginning of the sweep phase of the major GC. - - This hook must not allocate, change any heap value, nor - call OCaml code. */ -CAMLextern void (*caml_major_gc_hook)(void); - -void caml_init_major_heap (asize_t); /* size in bytes */ -asize_t caml_clip_heap_chunk_wsz (asize_t wsz); -void caml_darken (value, value *); +typedef enum { + Phase_sweep_and_mark_main, + Phase_mark_final, + Phase_sweep_ephe +} gc_phase_t; +extern gc_phase_t caml_gc_phase; + +Caml_inline char caml_gc_phase_char(gc_phase_t phase) { + switch (phase) { + case Phase_sweep_and_mark_main: + return 'M'; + case Phase_mark_final: + return 'F'; + case Phase_sweep_ephe: + return 'E'; + default: + return 'U'; + } +} + +intnat caml_opportunistic_major_work_available (void); +void caml_opportunistic_major_collection_slice (intnat); +/* auto-triggered slice from within the GC */ +#define AUTO_TRIGGERED_MAJOR_SLICE -1 +/* external triggered slice, but GC will compute the amount of work */ +#define GC_CALCULATE_MAJOR_SLICE 0 void caml_major_collection_slice (intnat); -void caml_shrink_mark_stack (); -void major_collection (void); -void caml_finish_major_cycle (void); -void 
caml_set_major_window (int); +void caml_finish_sweeping(void); +void caml_finish_marking (void); +int caml_init_major_gc(caml_domain_state*); +void caml_teardown_major_gc(void); +void caml_darken(void*, value, value* ignored); +void caml_darken_cont(value); +void caml_mark_root(value, value*); +void caml_empty_mark_stack(void); +void caml_finish_major_cycle(void); + +/* Ephemerons and finalisers */ +void caml_ephe_todo_list_emptied(void); +void caml_orphan_allocated_words(void); +void caml_add_to_orphaned_ephe_list(struct caml_ephe_info* ephe_info); +void caml_add_orphaned_finalisers (struct caml_final_info*); +void caml_final_domain_terminate (caml_domain_state *domain_state); + +struct heap_stats { + intnat pool_words; + intnat pool_max_words; + intnat pool_live_words; + intnat pool_live_blocks; + intnat pool_frag_words; + intnat large_words; + intnat large_max_words; + intnat large_blocks; +}; +void caml_accum_heap_stats(struct heap_stats* acc, const struct heap_stats* s); +void caml_remove_heap_stats(struct heap_stats* acc, const struct heap_stats* s); + +struct gc_stats { + uint64_t minor_words; + uint64_t promoted_words; + uint64_t major_words; + uint64_t minor_collections; + uint64_t forced_major_collections; + struct heap_stats major_heap; +}; +void caml_sample_gc_stats(struct gc_stats* buf); +void caml_sample_gc_collect(caml_domain_state *domain); /* Forces finalisation of all heap-allocated values, disregarding both local and global roots. 
@@ -106,10 +95,12 @@ void caml_set_major_window (int); */ void caml_finalise_heap (void); -#ifdef NAKED_POINTERS_CHECKER -extern int caml_naked_pointers_detected; -#endif +/* This variable is only written with the world stopped, + so it need not be atomic */ +extern uintnat caml_major_cycles_completed; + +double caml_mean_space_overhead(void); -#endif /* CAML_INTERNALiS */ +#endif /* CAML_INTERNALS */ #endif /* CAML_MAJOR_GC_H */ diff --git a/runtime/caml/memory.h b/runtime/caml/memory.h index 1e9cdf6d9ba4..48338a7d85c9 100644 --- a/runtime/caml/memory.h +++ b/runtime/caml/memory.h @@ -18,7 +18,7 @@ #ifndef CAML_MEMORY_H #define CAML_MEMORY_H -#ifndef CAML_NAME_SPACE +#ifndef CAML_INTERNALS #include "compatibility.h" #endif #include "config.h" @@ -27,42 +27,42 @@ #include "major_gc.h" #include "minor_gc.h" #endif /* CAML_INTERNALS */ +#include "domain.h" #include "misc.h" #include "mlvalues.h" -#include "domain.h" +#include "alloc.h" #ifdef __cplusplus extern "C" { #endif CAMLextern value caml_alloc_shr (mlsize_t wosize, tag_t); - -/* Variant of [caml_alloc_shr] with explicit profinfo. - Equivalent to caml_alloc_shr unless WITH_PROFINFO is true */ +CAMLextern value caml_alloc_shr_noexc(mlsize_t wosize, tag_t); +#ifdef WITH_PROFINFO CAMLextern value caml_alloc_shr_with_profinfo (mlsize_t, tag_t, intnat); - -/* Variant of [caml_alloc_shr] where no memprof sampling is performed. */ -CAMLextern value caml_alloc_shr_no_track_noexc (mlsize_t, tag_t); - -/* Variant of [caml_alloc_shr] where no memprof sampling is performed, - and re-using the profinfo associated with the header given in - parameter. 
*/ -CAMLextern value caml_alloc_shr_for_minor_gc (mlsize_t, tag_t, header_t); - +CAMLextern value caml_alloc_shr_preserving_profinfo (mlsize_t, tag_t, + header_t); +#else +#define caml_alloc_shr_with_profinfo(size, tag, profinfo) \ + caml_alloc_shr(size, tag) +#define caml_alloc_shr_preserving_profinfo(size, tag, header) \ + caml_alloc_shr(size, tag) +#endif /* WITH_PROFINFO */ +CAMLextern value caml_alloc_shr_no_raise (mlsize_t wosize, tag_t); CAMLextern void caml_adjust_gc_speed (mlsize_t, mlsize_t); CAMLextern void caml_alloc_dependent_memory (mlsize_t bsz); CAMLextern void caml_free_dependent_memory (mlsize_t bsz); CAMLextern void caml_modify (value *, value); CAMLextern void caml_initialize (value *, value); +CAMLextern int caml_atomic_cas_field (value, intnat, value, value); CAMLextern value caml_check_urgent_gc (value); -CAMLextern color_t caml_allocation_color (void *hp); #ifdef CAML_INTERNALS CAMLextern char *caml_alloc_for_heap (asize_t request); /* Size in bytes. */ CAMLextern void caml_free_for_heap (char *mem); CAMLextern int caml_add_to_heap (char *mem); #endif /* CAML_INTERNALS */ -CAMLextern int caml_huge_fallback_count; +CAMLextern int caml_huge_fallback_count; /* FIXME KC: Make per domain */ /* [caml_stat_*] functions below provide an interface to the static memory @@ -192,11 +192,6 @@ extern uintnat caml_use_huge_pages; (((x) + (Heap_page_size - 1)) & ~ (Heap_page_size - 1)) #endif - -int caml_page_table_add(int kind, void * start, void * end); -int caml_page_table_remove(int kind, void * start, void * end); -int caml_page_table_initialize(mlsize_t bytesize); - #ifdef DEBUG #define DEBUG_clear(result, wosize) do{ \ uintnat caml__DEBUG_i; \ @@ -208,57 +203,39 @@ int caml_page_table_initialize(mlsize_t bytesize); #define DEBUG_clear(result, wosize) #endif -enum caml_alloc_small_flags { - CAML_DONT_TRACK = 0, CAML_DO_TRACK = 1, - CAML_FROM_C = 0, CAML_FROM_CAML = 2 -}; - -extern void caml_alloc_small_dispatch (intnat wosize, int flags, - int 
nallocs, unsigned char* alloc_lens); -// Do not call asynchronous callbacks from allocation functions -#define Alloc_small_origin CAML_FROM_C -#define Alloc_small_aux(result, wosize, tag, profinfo, track) do { \ - CAMLassert ((wosize) >= 1); \ - CAMLassert ((tag_t) (tag) < 256); \ - CAMLassert ((wosize) <= Max_young_wosize); \ - Caml_state_field(young_ptr) -= Whsize_wosize (wosize); \ - if (Caml_state_field(young_ptr) < Caml_state_field(young_limit)) { \ - Setup_for_gc; \ - caml_alloc_small_dispatch((wosize), (track) | Alloc_small_origin, \ - 1, NULL); \ - Restore_after_gc; \ - } \ - Hd_hp (Caml_state_field(young_ptr)) = \ - Make_header_with_profinfo ((wosize), (tag), 0, profinfo); \ - (result) = Val_hp (Caml_state_field(young_ptr)); \ - DEBUG_clear ((result), (wosize)); \ +#define Alloc_small_with_profinfo(result, wosize, tag, GC, profinfo) do{ \ + caml_domain_state* dom_st; \ + CAMLassert ((wosize) >= 1); \ + CAMLassert ((tag_t) (tag) < 256); \ + CAMLassert ((wosize) <= Max_young_wosize); \ + dom_st = Caml_state; \ + dom_st->young_ptr -= Whsize_wosize(wosize); \ + while (Caml_check_gc_interrupt(dom_st)) { \ + dom_st->young_ptr += Whsize_wosize(wosize); \ + { GC } \ + dom_st->young_ptr -= Whsize_wosize(wosize); \ + } \ + Hd_hp (dom_st->young_ptr) = \ + Make_header_with_profinfo ((wosize), (tag), 0, profinfo); \ + (result) = Val_hp (dom_st->young_ptr); \ + DEBUG_clear ((result), (wosize)); \ }while(0) -#define Alloc_small_with_profinfo(result, wosize, tag, profinfo) \ - Alloc_small_aux(result, wosize, tag, profinfo, CAML_DO_TRACK) - -#define Alloc_small(result, wosize, tag) \ - Alloc_small_with_profinfo(result, wosize, tag, (uintnat) 0) -#define Alloc_small_no_track(result, wosize, tag) \ - Alloc_small_aux(result, wosize, tag, (uintnat) 0, CAML_DONT_TRACK) - -/* Deprecated alias for [caml_modify] */ - -#define Modify(fp,val) \ - CAML_DEPRECATED("Modify", "caml_modify") \ - caml_modify((fp), (val)) +#define Alloc_small(result, wosize, tag, GC) \ + 
Alloc_small_with_profinfo(result, wosize, tag, GC, (uintnat)0) #endif /* CAML_INTERNALS */ +struct caml__mutex_unwind; struct caml__roots_block { struct caml__roots_block *next; + struct caml__mutex_unwind *mutexes; intnat ntables; intnat nitems; value *tables [5]; }; -/* Global variable moved to Caml_state in 4.10 */ -#define caml_local_roots (Caml_state_field(local_roots)) +#define CAML_LOCAL_ROOTS (Caml_state->local_roots) /* The following macros are used to declare C local variables and function parameters of type [value]. @@ -292,7 +269,8 @@ struct caml__roots_block { */ #define CAMLparam0() \ - struct caml__roots_block *caml__frame = Caml_state_field(local_roots) + struct caml__roots_block** caml_local_roots_ptr = &CAML_LOCAL_ROOTS;\ + struct caml__roots_block *caml__frame = *caml_local_roots_ptr #define CAMLparam1(x) \ CAMLparam0 (); \ @@ -318,34 +296,12 @@ struct caml__roots_block { CAMLparam0 (); \ CAMLxparamN (x, (size)) -/* CAMLunused is preserved for compatibility reasons. - Instead of the legacy GCC/Clang-only - CAMLunused foo; - you should prefer - CAMLunused_start foo CAMLunused_end; - which supports both GCC/Clang and MSVC. 
-*/ -#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 7)) - #define CAMLunused_start __attribute__ ((unused)) - #define CAMLunused_end - #define CAMLunused __attribute__ ((unused)) -#elif _MSC_VER >= 1500 - #define CAMLunused_start __pragma( warning (push) ) \ - __pragma( warning (disable:4189 ) ) - #define CAMLunused_end __pragma( warning (pop)) - #define CAMLunused -#else - #define CAMLunused_start - #define CAMLunused_end - #define CAMLunused -#endif - #define CAMLxparam1(x) \ struct caml__roots_block caml__roots_##x; \ CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = 1), \ (caml__roots_##x.ntables = 1), \ (caml__roots_##x.tables [0] = &x), \ @@ -355,9 +311,9 @@ struct caml__roots_block { #define CAMLxparam2(x, y) \ struct caml__roots_block caml__roots_##x; \ CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = 1), \ (caml__roots_##x.ntables = 2), \ (caml__roots_##x.tables [0] = &x), \ @@ -368,9 +324,9 @@ struct caml__roots_block { #define CAMLxparam3(x, y, z) \ struct caml__roots_block caml__roots_##x; \ CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = 1), \ 
(caml__roots_##x.ntables = 3), \ (caml__roots_##x.tables [0] = &x), \ @@ -382,9 +338,9 @@ struct caml__roots_block { #define CAMLxparam4(x, y, z, t) \ struct caml__roots_block caml__roots_##x; \ CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = 1), \ (caml__roots_##x.ntables = 4), \ (caml__roots_##x.tables [0] = &x), \ @@ -397,9 +353,9 @@ struct caml__roots_block { #define CAMLxparam5(x, y, z, t, u) \ struct caml__roots_block caml__roots_##x; \ CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = 1), \ (caml__roots_##x.ntables = 5), \ (caml__roots_##x.tables [0] = &x), \ @@ -412,10 +368,10 @@ struct caml__roots_block { #define CAMLxparamN(x, size) \ struct caml__roots_block caml__roots_##x; \ - CAMLunused_start int caml__dummy_##x = ( \ - (void) caml__frame, \ - (caml__roots_##x.next = Caml_state_field(local_roots)), \ - (Caml_state_field(local_roots) = &caml__roots_##x), \ + CAMLunused_start int caml__dummy_##x = ( \ + (caml__roots_##x.next = *caml_local_roots_ptr), \ + (*caml_local_roots_ptr = &caml__roots_##x), \ + (caml__roots_##x.mutexes = 0), \ (caml__roots_##x.nitems = (size)), \ (caml__roots_##x.ntables = 1), \ (caml__roots_##x.tables[0] = &(x[0])), \ @@ -445,13 +401,28 @@ struct caml__roots_block { #define CAMLlocalN(x, size) \ value x [(size)]; \ int caml__i_##x; \ + CAMLxparamN (x, (size)); \ for (caml__i_##x = 0; caml__i_##x < size; caml__i_##x ++) { \ x[caml__i_##x] = Val_unit; 
\ - } \ - CAMLxparamN (x, (size)) + } +#ifdef DEBUG +#define CAMLcheck_mutexes do { \ + struct caml__roots_block* r; \ + for (r = CAML_LOCAL_ROOTS; \ + r != caml__frame; \ + r = r->next) { \ + CAMLassert(r->mutexes == 0); \ + } \ +} while (0) +#else +#define CAMLcheck_mutexes do {} while(0) +#endif -#define CAMLdrop Caml_state_field(local_roots) = caml__frame +#define CAMLdrop do{ \ + CAMLcheck_mutexes; \ + *caml_local_roots_ptr = caml__frame; \ +}while (0) #define CAMLreturn0 do{ \ CAMLdrop; \ @@ -461,7 +432,7 @@ struct caml__roots_block { #define CAMLreturnT(type, result) do{ \ type caml__temp_result = (result); \ CAMLdrop; \ - return caml__temp_result; \ + return (caml__temp_result); \ }while(0) #define CAMLreturn(result) CAMLreturnT(value, result) @@ -500,16 +471,20 @@ struct caml__roots_block { #define Begin_roots1(r0) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = 1; \ caml__roots_block.ntables = 1; \ caml__roots_block.tables[0] = &(r0); #define Begin_roots2(r0, r1) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = 1; \ caml__roots_block.ntables = 2; \ caml__roots_block.tables[0] = &(r0); \ @@ -517,8 +492,10 @@ struct caml__roots_block { #define Begin_roots3(r0, r1, r2) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - 
Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = 1; \ caml__roots_block.ntables = 3; \ caml__roots_block.tables[0] = &(r0); \ @@ -527,8 +504,10 @@ struct caml__roots_block { #define Begin_roots4(r0, r1, r2, r3) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = 1; \ caml__roots_block.ntables = 4; \ caml__roots_block.tables[0] = &(r0); \ @@ -538,8 +517,10 @@ struct caml__roots_block { #define Begin_roots5(r0, r1, r2, r3, r4) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = 1; \ caml__roots_block.ntables = 5; \ caml__roots_block.tables[0] = &(r0); \ @@ -550,13 +531,15 @@ struct caml__roots_block { #define Begin_roots_block(table, size) { \ struct caml__roots_block caml__roots_block; \ - caml__roots_block.next = Caml_state_field(local_roots); \ - Caml_state_field(local_roots) = &caml__roots_block; \ + caml_domain_state* domain_state = Caml_state; \ + caml__roots_block.next = domain_state->local_roots; \ + domain_state->local_roots = &caml__roots_block; \ + caml__roots_block.mutexes = 0; \ caml__roots_block.nitems = (size); \ caml__roots_block.ntables = 1; \ caml__roots_block.tables[0] = 
(table); -#define End_roots() Caml_state_field(local_roots) = caml__roots_block.next; } +#define End_roots() CAML_LOCAL_ROOTS = caml__roots_block.next; } /* [caml_register_global_root] registers a global C variable as a memory root diff --git a/runtime/caml/minor_gc.h b/runtime/caml/minor_gc.h index eefd38507310..d9b9ddf8bd66 100644 --- a/runtime/caml/minor_gc.h +++ b/runtime/caml/minor_gc.h @@ -16,22 +16,16 @@ #ifndef CAML_MINOR_GC_H #define CAML_MINOR_GC_H -#include "address_class.h" +#include "misc.h" #include "config.h" -/* Global variables moved to Caml_state in 4.10 */ -#define caml_young_start (Caml_state_field(young_start)) -#define caml_young_end (Caml_state_field(young_end)) -#define caml_young_ptr (Caml_state_field(young_ptr)) -#define caml_young_limit (Caml_state_field(young_limit)) -#define caml_young_alloc_start (Caml_state_field(young_alloc_start)) -#define caml_young_alloc_end (Caml_state_field(young_alloc_end)) -#define caml_young_alloc_mid (Caml_state_field(young_alloc_mid)) -#define caml_young_trigger (Caml_state_field(young_trigger)) -#define caml_minor_heap_wsz (Caml_state_field(minor_heap_wsz)) -#define caml_in_minor_collection (Caml_state_field(in_minor_collection)) -#define caml_extra_heap_resources_minor \ - (Caml_state_field(extra_heap_resources_minor)) +#define caml_young_end Caml_state->young_end +#define caml_young_ptr Caml_state->young_ptr +#define caml_young_start Caml_state->young_start +#define caml_young_limit Caml_state->young_limit +#define caml_young_alloc_start Caml_state->young_start +#define caml_young_alloc_end Caml_state->young_end +#define caml_minor_heap_wsz Caml_state->minor_heap_wsz #define CAML_TABLE_STRUCT(t) { \ @@ -50,7 +44,6 @@ struct caml_ephe_ref_elt { value ephe; /* an ephemeron in major heap */ mlsize_t offset; /* the offset that points in the minor heap */ }; - struct caml_ephe_ref_table CAML_TABLE_STRUCT(struct caml_ephe_ref_elt); struct caml_custom_elt { @@ -58,52 +51,46 @@ struct caml_custom_elt { mlsize_t 
mem; /* The parameters for adjusting GC speed. */ mlsize_t max; }; - struct caml_custom_table CAML_TABLE_STRUCT(struct caml_custom_elt); -/* Table of custom blocks in the minor heap that contain finalizers - or GC speed parameters. */ + +struct caml_minor_tables { + struct caml_ref_table major_ref; + struct caml_ephe_ref_table ephe_ref; + struct caml_custom_table custom; +}; CAMLextern void caml_minor_collection (void); #ifdef CAML_INTERNALS extern void caml_set_minor_heap_size (asize_t); /* size in bytes */ -extern void caml_empty_minor_heap (void); -extern void caml_gc_dispatch (void); -extern void caml_garbage_collection (void); /* runtime/signals_nat.c */ -extern void caml_oldify_one (value, value *); -extern void caml_oldify_mopup (void); - +extern void caml_empty_minor_heap_no_major_slice_from_stw + (caml_domain_state* domain, void* unused, int participating_count, + caml_domain_state** participating); /* in STW */ +extern int caml_try_stw_empty_minor_heap_on_all_domains(void); /* out STW */ +extern void caml_empty_minor_heaps_once(void); /* out STW */ +CAMLextern void garbage_collection (void); /* def in asmrun/signals.c */ +header_t caml_get_header_val(value v); +void caml_alloc_table (struct caml_ref_table *tbl, asize_t sz, asize_t rsv); extern void caml_realloc_ref_table (struct caml_ref_table *); -extern void caml_alloc_table (struct caml_ref_table *, asize_t, asize_t); extern void caml_realloc_ephe_ref_table (struct caml_ephe_ref_table *); -extern void caml_alloc_ephe_table (struct caml_ephe_ref_table *, - asize_t, asize_t); extern void caml_realloc_custom_table (struct caml_custom_table *); -extern void caml_alloc_custom_table (struct caml_custom_table *, - asize_t, asize_t); -void caml_alloc_minor_tables (void); - -/* Asserts that a word is a valid header for a young object */ -#define CAMLassert_young_header(hd) \ - CAMLassert(Wosize_hd(hd) > 0 && \ - Wosize_hd(hd) <= Max_young_wosize && \ - Color_hd(hd) == 0) - -#define Oldify(p) do{ \ - value 
__oldify__v__ = *p; \ - if (Is_block (__oldify__v__) && Is_young (__oldify__v__)){ \ - caml_oldify_one (__oldify__v__, (p)); \ - } \ - }while(0) - -Caml_inline void add_to_ref_table (struct caml_ref_table *tbl, value *p) -{ - if (tbl->ptr >= tbl->limit){ - CAMLassert (tbl->ptr == tbl->limit); - caml_realloc_ref_table (tbl); - } - *tbl->ptr++ = p; -} +struct caml_minor_tables* caml_alloc_minor_tables(void); +void caml_free_minor_tables(struct caml_minor_tables*); +void caml_empty_minor_heap_setup(caml_domain_state* domain); + +#ifdef DEBUG +extern int caml_debug_is_minor(value val); +extern int caml_debug_is_major(value val); +#endif + +#define Ref_table_add(ref_table, x) do { \ + struct caml_ref_table* ref = (ref_table); \ + if (ref->ptr >= ref->limit) { \ + CAMLassert (ref->ptr == ref->limit); \ + caml_realloc_ref_table (ref); \ + } \ + *ref->ptr++ = (x); \ + } while (0) Caml_inline void add_to_ephe_ref_table (struct caml_ephe_ref_table *tbl, value ar, mlsize_t offset) diff --git a/runtime/caml/misc.h b/runtime/caml/misc.h index 5915c30a7b5e..57babf46313b 100644 --- a/runtime/caml/misc.h +++ b/runtime/caml/misc.h @@ -29,6 +29,8 @@ #include #include +#include "camlatomic.h" + /* Deprecation warnings */ #if defined(__GNUC__) || defined(__clang__) @@ -233,6 +235,36 @@ CAMLnoreturn_end; #define CAMLassert(x) ((void) 0) #endif +#ifdef __GNUC__ +#define CAMLcheckresult __attribute__((warn_unused_result)) +#define CAMLlikely(e) __builtin_expect((e), 1) +#define CAMLunlikely(e) __builtin_expect((e), 0) +#else +#define CAMLcheckresult +#define CAMLlikely(e) (e) +#define CAMLunlikely(e) (e) +#endif + +/* GC status assertions. + + CAMLnoalloc at the start of a block means that the GC must not be + invoked during the block. 
*/ +#if defined(__GNUC__) && defined(DEBUG) +int caml_noalloc_begin(void); +void caml_noalloc_end(int*); +void caml_alloc_point_here(void); +#define CAMLnoalloc \ + int caml__noalloc \ + __attribute__((cleanup(caml_noalloc_end),unused)) \ + = caml_noalloc_begin() +#define CAMLalloc_point_here (caml_alloc_point_here()) +#else +#define CAMLnoalloc +#define CAMLalloc_point_here ((void)0) +#endif + +#define Is_power_of_2(x) ((x) > 0 && ((x) & ((x) - 1)) == 0) + /* This hook is called when a fatal error occurs in the OCaml runtime. It is given arguments to be passed to the [vprintf]-like functions in order to synthetize the error message. @@ -248,6 +280,11 @@ CAMLextern void caml_fatal_error (char *, ...) __attribute__ ((format (printf, 1, 2))) #endif CAMLnoreturn_end; +CAMLextern void caml_fatal_error_arg (const char *fmt, const char *arg) + Noreturn; +CAMLextern void caml_fatal_error_arg2 (const char *fmt1, const char *arg1, + const char *fmt2, const char *arg2) + Noreturn; /* Detection of available C built-in functions, the Clang way. */ @@ -425,7 +462,12 @@ CAMLextern int caml_read_directory(char_os * dirname, /* GC flags and messages */ -extern uintnat caml_verb_gc; +void caml_gc_log (char *, ...) +#ifdef __GNUC__ + __attribute__ ((format (printf, 1, 2))) +#endif +; + void caml_gc_message (int, char *, ...) 
#ifdef __GNUC__ __attribute__ ((format (printf, 2, 3))) @@ -441,8 +483,13 @@ int caml_runtime_warnings_active(void); #define Debug_tag(x) (INT64_LITERAL(0xD700D7D7D700D6D7u) \ | ((uintnat) (x) << 16) \ | ((uintnat) (x) << 48)) +#define Is_debug_tag(x) \ + (((x) & \ + INT64_LITERAL(0xff00ffffff00ffffu)) == INT64_LITERAL(0xD700D7D7D700D6D7u)) #else #define Debug_tag(x) (0xD700D6D7ul | ((uintnat) (x) << 16)) +#define Is_debug_tag(x) \ + (((x) & 0xff00fffful) == 0xD700D6D7ul) #endif /* ARCH_SIXTYFOUR */ /* @@ -473,10 +520,6 @@ int caml_runtime_warnings_active(void); #define Debug_uninit_stat 0xD7 -/* Note: the first argument is in fact a [value] but we don't have this - type available yet because we can't include [mlvalues.h] in this file. -*/ -extern void caml_set_fields (intnat v, uintnat, uintnat); #endif /* DEBUG */ @@ -496,6 +539,9 @@ extern int caml_snwprintf(wchar_t * buf, #define snprintf_os snprintf #endif +/* platform dependent thread naming */ +extern int caml_thread_setname(const char* name); + /* Macro used to deactivate thread and address sanitizers on some functions. */ #define CAMLno_tsan diff --git a/runtime/caml/mlvalues.h b/runtime/caml/mlvalues.h index 677e44e183a5..3ab4e93c6993 100644 --- a/runtime/caml/mlvalues.h +++ b/runtime/caml/mlvalues.h @@ -22,6 +22,14 @@ #include "config.h" #include "misc.h" +/* Needed here for domain_state */ +typedef intnat value; +typedef atomic_intnat atomic_value; +typedef int32_t opcode_t; +typedef opcode_t * code_t; + +#include "domain_state.h" + #ifdef __cplusplus extern "C" { #endif @@ -57,7 +65,6 @@ extern "C" { This is for use only by the GC. */ -typedef intnat value; typedef uintnat header_t; typedef uintnat mlsize_t; typedef unsigned int tag_t; /* Actually, an unsigned char */ @@ -135,9 +142,8 @@ bits 63 (64-P) (63-P) 10 9 8 7 0 #define Profinfo_hd(hd) NO_PROFINFO #endif /* WITH_PROFINFO */ -#define Hd_val(val) (((header_t *) (val)) [-1]) /* Also an l-value. 
*/ -#define Hd_op(op) (Hd_val (op)) /* Also an l-value. */ -#define Hd_bp(bp) (Hd_val (bp)) /* Also an l-value. */ +#define Hd_val(val) (((header_t *) (val)) [-1] + 0) +#define Hp_atomic_val(val) ((atomic_uintnat *)(val) - 1) #define Hd_hp(hp) (* ((header_t *) (hp))) /* Also an l-value. */ #define Hp_val(val) (((header_t *) (val)) - 1) #define Hp_op(op) (Hp_val (op)) @@ -202,11 +208,18 @@ bits 63 (64-P) (63-P) 10 9 8 7 0 /* Pointer to the first field. */ #define Op_val(x) ((value *) (x)) +#define Op_atomic_val(x) ((atomic_value *) (x)) /* Fields are numbered from 0. */ #define Field(x, i) (((value *)(x)) [i]) /* Also an l-value. */ -typedef int32_t opcode_t; -typedef opcode_t * code_t; +/* Is_young(val) is true iff val is in the reserved area for minor heaps */ + +#define Is_young(val) \ + (CAMLassert (Is_block (val)), \ + (char *)(val) < (char *)caml_minor_heaps_end && \ + (char *)(val) > (char *)caml_minor_heaps_base) + +#define Is_block_and_young(val) (Is_block(val) && Is_young(val)) /* NOTE: [Forward_tag] and [Infix_tag] must be just under [No_scan_tag], with [Infix_tag] the lower one. @@ -219,6 +232,7 @@ typedef opcode_t * code_t; See stdlib/lazy.ml. */ #define Forward_tag 250 #define Forward_val(v) Field(v, 0) +/* FIXME: not immutable once shortcutting is implemented */ /* If tag == Infix_tag : an infix header inside a closure */ /* Infix_tag must be odd so that the infix header is scanned as an integer */ @@ -240,6 +254,17 @@ CAMLextern value caml_get_public_method (value obj, value tag); Note however that tags being hashed, same tag does not necessarily mean same method name. */ +Caml_inline value Val_ptr(void* p) +{ + CAMLassert(((value)p & 1) == 0); + return (value)p + 1; +} +Caml_inline void* Ptr_val(value val) +{ + CAMLassert(val & 1); + return (void*)(val - 1); +} + /* Special case of tuples of fields: closures */ #define Closure_tag 247 #define Code_val(val) (((code_t *) (val)) [0]) /* Also an l-value. 
*/ @@ -261,10 +286,17 @@ CAMLextern value caml_get_public_method (value obj, value tag); (((uintnat)(arity) << 24) + ((uintnat)(delta) << 1) + 1) #endif -/* This tag is used (with Forward_tag) to implement lazy values. +/* This tag is used (with Forcing_tag & Forward_tag) to implement lazy values. See major_gc.c and stdlib/lazy.ml. */ #define Lazy_tag 246 +/* Tag used for continuations (see fiber.c) */ +#define Cont_tag 245 + +/* This tag is used (with Lazy_tag & Forward_tag) to implement lazy values. + * See major_gc.c and stdlib/lazy.ml. */ +#define Forcing_tag 244 + /* Another special case: variants */ CAMLextern value caml_hash_variant(char const * tag); @@ -363,7 +395,7 @@ CAMLextern int caml_is_double_array (value); /* 0 is false, 1 is true */ the GC; therefore, they must not contain any [value]. See [custom.h] for operations on method suites. */ #define Custom_tag 255 -#define Data_custom_val(v) ((void *) &Field((v), 1)) +#define Data_custom_val(v) ((void *) (Op_val(v) + 1)) struct custom_operations; /* defined in [custom.h] */ /* Int32.t, Int64.t and Nativeint.t are represented as custom blocks. */ @@ -379,8 +411,8 @@ CAMLextern int64_t caml_Int64_val(value v); /* 3- Atoms are 0-tuples. They are statically allocated once and for all. */ -CAMLextern header_t *caml_atom_table; -#define Atom(tag) (Val_hp (&(caml_atom_table [(tag)]))) +CAMLextern value caml_atom(tag_t); +#define Atom(tag) caml_atom(tag) /* Booleans are integers 0 or 1 */ @@ -406,10 +438,6 @@ CAMLextern header_t *caml_atom_table; #define Is_none(v) ((v) == Val_none) #define Is_some(v) Is_block(v) -/* The table of global identifiers */ - -extern value caml_global_data; - CAMLextern value caml_set_oo_id(value obj); /* Header for out-of-heap blocks. */ @@ -417,8 +445,8 @@ CAMLextern value caml_set_oo_id(value obj); #define Caml_out_of_heap_header(wosize, tag) \ (/*CAMLassert ((wosize) <= Max_wosize),*/ \ ((header_t) (((header_t) (wosize) << 10) \ - + (3 << 8) /* matches [Caml_black]. 
See [gc.h] */ \ - + (tag_t) (tag))) \ + + (3 << 8) /* matches [NOT_MARKABLE]. See [shared_heap.h]. */ \ + + (tag_t) (tag))) \ ) #ifdef __cplusplus diff --git a/runtime/caml/osdeps.h b/runtime/caml/osdeps.h index bc8cd3b9e682..f011375cffeb 100644 --- a/runtime/caml/osdeps.h +++ b/runtime/caml/osdeps.h @@ -138,6 +138,8 @@ CAMLextern clock_t caml_win32_clock(void); #endif /* _WIN32 */ +extern void caml_init_os_params(void); + #endif /* CAML_INTERNALS */ #ifdef _WIN32 diff --git a/runtime/caml/platform.h b/runtime/caml/platform.h new file mode 100644 index 000000000000..68a5875a6da7 --- /dev/null +++ b/runtime/caml/platform.h @@ -0,0 +1,167 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2016 Indian Institute of Technology, Madras */ +/* Copyright 2016 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#ifndef CAML_PLAT_THREADS_H +#define CAML_PLAT_THREADS_H +/* Platform-specific concurrency and memory primitives */ + +#include +#include +#include +#include "config.h" +#include "mlvalues.h" +#include "signals.h" + +#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS) +#define MAP_ANONYMOUS MAP_ANON +#endif + +/* Loads and stores with acquire and release semantics respectively */ + +Caml_inline void cpu_relax() { +#if defined(__x86_64__) || defined(__i386__) + asm volatile("pause" ::: "memory"); +#elif defined(__aarch64__) + asm volatile ("yield" ::: "memory"); +#else + #warning "cpu_relax() undefined for this architecture!" 
+#endif +} + +Caml_inline uintnat atomic_load_acq(atomic_uintnat* p) { + return atomic_load_explicit(p, memory_order_acquire); +} + +Caml_inline void atomic_store_rel(atomic_uintnat* p, uintnat v) { + atomic_store_explicit(p, v, memory_order_release); +} + +/* Spin-wait loops */ + +#define Max_spins 1000 + +unsigned caml_plat_spin_wait(unsigned spins, + const char* file, int line, + const char* function); + +#define GENSYM_3(name, l) name##l +#define GENSYM_2(name, l) GENSYM_3(name, l) +#define GENSYM(name) GENSYM_2(name, __LINE__) + +#define SPIN_WAIT \ + unsigned GENSYM(caml__spins) = 0; \ + for (; 1; cpu_relax(), \ + GENSYM(caml__spins) = \ + CAMLlikely(GENSYM(caml__spins) < Max_spins) ? \ + GENSYM(caml__spins) + 1 : \ + caml_plat_spin_wait(GENSYM(caml__spins), \ + __FILE__, __LINE__, __func__)) + +Caml_inline uintnat atomic_load_wait_nonzero(atomic_uintnat* p) { + SPIN_WAIT { + uintnat v = atomic_load_acq(p); + if (v) return v; + } +} + +/* Atomic read-modify-write instructions, with full fences */ + +Caml_inline uintnat atomic_fetch_add_verify_ge0(atomic_uintnat* p, uintnat v) { + uintnat result = atomic_fetch_add(p,v); + CAMLassert ((intnat)result > 0); + return result; +} + + +typedef pthread_mutex_t caml_plat_mutex; +#define CAML_PLAT_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +void caml_plat_mutex_init(caml_plat_mutex*); +Caml_inline void caml_plat_lock(caml_plat_mutex*); +Caml_inline int caml_plat_try_lock(caml_plat_mutex*); +void caml_plat_assert_locked(caml_plat_mutex*); +void caml_plat_assert_all_locks_unlocked(void); +Caml_inline void caml_plat_unlock(caml_plat_mutex*); +void caml_plat_mutex_free(caml_plat_mutex*); +typedef struct { pthread_cond_t cond; caml_plat_mutex* mutex; } caml_plat_cond; +#define CAML_PLAT_COND_INITIALIZER(m) { PTHREAD_COND_INITIALIZER, m } +void caml_plat_cond_init(caml_plat_cond*, caml_plat_mutex*); +void caml_plat_wait(caml_plat_cond*); +/* like caml_plat_wait, but if nanoseconds surpasses the second parameter + without a 
signal, then this function returns 1. */ +void caml_plat_broadcast(caml_plat_cond*); +void caml_plat_signal(caml_plat_cond*); +void caml_plat_cond_free(caml_plat_cond*); + +struct caml__mutex_unwind { + caml_plat_mutex* mutex; + struct caml__mutex_unwind* next; +}; + +/* Memory management primitives (mmap) */ + +uintnat caml_mem_round_up_pages(uintnat size); +void* caml_mem_map(uintnat size, uintnat alignment, int reserve_only); +void* caml_mem_commit(void* mem, uintnat size); +void caml_mem_decommit(void* mem, uintnat size); +void caml_mem_unmap(void* mem, uintnat size); + + +Caml_inline void check_err(char* action, int err) +{ + if (err) { + caml_fatal_error_arg2( + "Fatal error during %s", action, ": %s\n", strerror(err)); + } +} + +#ifdef DEBUG +static __thread int lockdepth; +#define DEBUG_LOCK(m) (lockdepth++) +#define DEBUG_UNLOCK(m) (lockdepth--) +#else +#define DEBUG_LOCK(m) +#define DEBUG_UNLOCK(m) +#endif + +Caml_inline void caml_plat_lock(caml_plat_mutex* m) +{ + check_err("lock", pthread_mutex_lock(m)); + DEBUG_LOCK(m); +} + +Caml_inline int caml_plat_try_lock(caml_plat_mutex* m) +{ + int r = pthread_mutex_trylock(m); + if (r == EBUSY) { + return 0; + } else { + check_err("try_lock", r); + DEBUG_LOCK(m); + return 1; + } +} + +Caml_inline void caml_plat_unlock(caml_plat_mutex* m) +{ + DEBUG_UNLOCK(m); + check_err("unlock", pthread_mutex_unlock(m)); +} + +/* On Windows, the SYSTEM_INFO.dwPageSize is a DWORD (32-bit), but conveniently + long is also 32-bit */ +extern long caml_sys_pagesize; + +#endif /* CAML_PLAT_THREADS_H */ diff --git a/runtime/caml/roots.h b/runtime/caml/roots.h index 8ac9d8d26359..e31652078e46 100644 --- a/runtime/caml/roots.h +++ b/runtime/caml/roots.h @@ -21,26 +21,16 @@ #include "misc.h" #include "memory.h" -typedef void (*scanning_action) (value, value *); - -void caml_oldify_local_roots (void); -void caml_darken_all_roots_start (void); -intnat caml_darken_all_roots_slice (intnat); -void caml_do_roots (scanning_action, int); -extern 
uintnat caml_incremental_roots_count; -#ifndef NATIVE_CODE -CAMLextern void caml_do_local_roots_byt (scanning_action, value *, value *, - struct caml__roots_block *); -#define caml_do_local_roots caml_do_local_roots_byt -#else -CAMLextern void caml_do_local_roots_nat ( - scanning_action f, char * c_bottom_of_stack, - uintnat last_retaddr, value * v_gc_regs, - struct caml__roots_block * gc_local_roots); -#define caml_do_local_roots caml_do_local_roots_nat -#endif - -CAMLextern void (*caml_scan_roots_hook) (scanning_action); +typedef void (*scanning_action) (void*, value, value *); +CAMLextern void (*caml_scan_roots_hook)(scanning_action, void*, + caml_domain_state*); + +CAMLextern void caml_do_roots (scanning_action f, void* data, + caml_domain_state* d, int do_final_val); +CAMLextern void caml_do_local_roots(scanning_action f, void* data, + struct caml__roots_block* local_roots, + struct stack_info *current_stack, + value * v_gc_regs); #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/s.h.in b/runtime/caml/s.h.in index 8e4efdcc7285..ad524a4a02cc 100644 --- a/runtime/caml/s.h.in +++ b/runtime/caml/s.h.in @@ -70,6 +70,10 @@ #undef HAS_ISSETUGID +#undef HAS_STDATOMIC_H + +#undef HAS_SYS_MMAN_H + /* 2. For the Unix library. */ #undef HAS_SOCKETS diff --git a/runtime/caml/shared_heap.h b/runtime/caml/shared_heap.h new file mode 100644 index 000000000000..b393a94ac54c --- /dev/null +++ b/runtime/caml/shared_heap.h @@ -0,0 +1,101 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2015 Indian Institute of Technology, Madras */ +/* Copyright 2015 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ +#ifndef CAML_SHARED_HEAP_H +#define CAML_SHARED_HEAP_H + +#ifdef CAML_INTERNALS + +#include "config.h" +#include "roots.h" +#include "domain.h" +#include "misc.h" + +struct caml_heap_state; +struct pool; + +struct caml_heap_state* caml_init_shared_heap(void); +void caml_teardown_shared_heap(struct caml_heap_state* heap); + +value* caml_shared_try_alloc(struct caml_heap_state*, mlsize_t, tag_t, int); + +void caml_sample_heap_stats(struct caml_heap_state*, struct heap_stats*); + +uintnat caml_heap_size(struct caml_heap_state*); +uintnat caml_top_heap_words(struct caml_heap_state*); +uintnat caml_heap_blocks(struct caml_heap_state*); + +struct pool* caml_pool_of_shared_block(value v); + +void caml_shared_unpin(value v); + +/* always readable by all threads + written only by a single thread during STW periods */ +typedef uintnat status; +struct global_heap_state { + status MARKED, UNMARKED, GARBAGE; +}; +extern struct global_heap_state caml_global_heap_state; + +/* CR mshinwell: ensure this matches [Emitaux] */ +enum {NOT_MARKABLE = 3 << 8}; + +Caml_inline int Has_status_hd(header_t hd, status s) { + return (hd & (3 << 8)) == s; +} + +Caml_inline header_t With_status_hd(header_t hd, status s) { + return (hd & ~(3 << 8)) | s; +} + +Caml_inline int is_garbage(value v) { + return Has_status_hd(Hd_val(v), caml_global_heap_state.GARBAGE); +} + +Caml_inline int is_unmarked(value v) { + return Has_status_hd(Hd_val(v), caml_global_heap_state.UNMARKED); +} + +Caml_inline int is_marked(value v) { + return Has_status_hd(Hd_val(v), caml_global_heap_state.MARKED); +} + +void caml_redarken_pool(struct pool*, scanning_action, void*); + +intnat caml_sweep(struct caml_heap_state*, intnat); + + +/* must be called during STW */ +void caml_cycle_heap_stw(void); + +/* must be called on each domain + (after caml_cycle_heap_stw) */ +void caml_cycle_heap(struct caml_heap_state*); + +/* Heap invariant 
verification (for debugging) */ + +/* caml_verify_begin must only be called while all domains are paused */ +struct heap_verify_state* caml_verify_begin(void); +void caml_verify_root(void*, value, value*); +void caml_verify_heap(struct heap_verify_state*); /* deallocates arg */ + +#ifdef DEBUG +/* [is_garbage(v)] returns true if [v] is a garbage value */ +int is_garbage (value); +#endif + +#endif /* CAML_INTERNALS */ + +#endif /* CAML_SHARED_HEAP_H */ diff --git a/runtime/caml/signals.h b/runtime/caml/signals.h index c6aeebfc78ac..d6041f941ae5 100644 --- a/runtime/caml/signals.h +++ b/runtime/caml/signals.h @@ -43,37 +43,14 @@ CAMLextern void caml_process_pending_actions (void); CAMLextern int caml_check_pending_actions (void); /* Returns 1 if there are pending actions, 0 otherwise. */ -CAMLextern value caml_process_pending_actions_exn (void); -/* Same as [caml_process_pending_actions], but returns the exception - if any (otherwise returns [Val_unit]). */ - #ifdef CAML_INTERNALS -CAMLextern intnat volatile caml_pending_signals[]; - -/* When an action is pending, either [caml_something_to_do] is 1, or - there is a function currently running which will end by either - executing all actions, or set [caml_something_to_do] back to 1. We - set it to 0 when starting executing all callbacks. - - In the case there are two different callbacks (say, a signal and a - finaliser) arriving at the same time, then the processing of one - awaits the return of the other. In case of long-running callbacks, - we may want to run the second one without waiting the end of the - first one. We do this by provoking an additional polling every - minor collection and every major slice. To guarantee a low latency - for signals, we avoid delaying signal handlers in that case by - calling them first. - - FIXME: We could get into caml_process_pending_actions when - caml_something_to_do is seen as set but not caml_pending_signals, - making us miss the signal. 
-*/ -CAMLextern int volatile caml_something_to_do; +CAMLextern atomic_intnat caml_pending_signals[]; /* Global variables moved to Caml_state in 4.10 */ #define caml_requested_major_slice (Caml_state_field(requested_major_slice)) #define caml_requested_minor_gc (Caml_state_field(requested_minor_gc)) +int caml_check_for_pending_signals(void); void caml_update_young_limit(void); void caml_request_major_slice (void); void caml_request_minor_gc (void); @@ -82,20 +59,17 @@ CAMLextern int caml_rev_convert_signal_number (int); value caml_execute_signal_exn(int signal_number, int in_signal_handler); CAMLextern void caml_record_signal(int signal_number); CAMLextern value caml_process_pending_signals_exn(void); +CAMLextern void caml_process_pending_signals(void); void caml_set_action_pending (void); -value caml_do_pending_actions_exn (void); -value caml_process_pending_actions_with_root (value extra_root); // raises -value caml_process_pending_actions_with_root_exn (value extra_root); int caml_set_signal_action(int signo, int action); -CAMLextern int caml_setup_stack_overflow_detection(void); -CAMLextern int caml_stop_stack_overflow_detection(void); -CAMLextern void caml_init_signals(void); -CAMLextern void caml_terminate_signals(void); + +CAMLextern value caml_process_pending_signals_with_root_exn (value extra_root); +void caml_init_signal_handling(void); +int caml_init_signal_stack(void); +void caml_free_signal_stack(void); + CAMLextern void (*caml_enter_blocking_section_hook)(void); CAMLextern void (*caml_leave_blocking_section_hook)(void); -#ifdef POSIX_SIGNALS -CAMLextern int (*caml_sigmask_hook)(int, const sigset_t *, sigset_t *); -#endif #endif /* CAML_INTERNALS */ #ifdef __cplusplus diff --git a/runtime/caml/sizeclasses.h b/runtime/caml/sizeclasses.h new file mode 100644 index 000000000000..83d9ccf2f556 --- /dev/null +++ b/runtime/caml/sizeclasses.h @@ -0,0 +1,19 @@ +/* This file is generated by tools/gen_sizeclasses.ml */ +#define POOL_WSIZE 4096 +#define 
POOL_HEADER_WSIZE 4 +#define SIZECLASS_MAX 128 +#define NUM_SIZECLASSES 32 +static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] = +{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 33, 37, 42, + 47, 53, 59, 65, 73, 81, 89, 99, 108, 118, 128 }; +static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] = +{ 0, 0, 0, 0, 2, 0, 4, 4, 6, 2, 0, 4, 12, 6, 12, 21, 10, 3, 0, 22, 18, 3, 11, + 21, 62, 4, 42, 87, 33, 96, 80, 124 }; +static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] = +{ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, + 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, + 20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31 }; diff --git a/runtime/caml/stack.h b/runtime/caml/stack.h index 9c182ee6a88f..9b42d1ae813f 100644 --- a/runtime/caml/stack.h +++ b/runtime/caml/stack.h @@ -24,23 +24,15 @@ #ifdef TARGET_i386 #define Saved_return_address(sp) *((intnat *)((sp) - 4)) -#ifndef SYS_win32 -#define Callback_link(sp) ((struct caml_context *)((sp) + 16)) -#else -#define Callback_link(sp) ((struct caml_context *)((sp) + 8)) -#endif #endif #ifdef TARGET_power #if defined(MODEL_ppc) #define Saved_return_address(sp) *((intnat *)((sp) - 4)) -#define Callback_link(sp) ((struct caml_context *)((sp) + 16)) #elif defined(MODEL_ppc64) #define Saved_return_address(sp) *((intnat *)((sp) + 16)) -#define Callback_link(sp) ((struct caml_context *)((sp) + (48 + 32))) #elif defined(MODEL_ppc64le) #define Saved_return_address(sp) *((intnat *)((sp) + 16)) -#define Callback_link(sp) ((struct caml_context *)((sp) + (32 + 32))) #else #error "TARGET_power: wrong MODEL" #endif @@ -52,95 +44,41 @@ 
#ifdef TARGET_s390x #define Saved_return_address(sp) *((intnat *)((sp) - SIZEOF_PTR)) #define Trap_frame_size 16 -#define Callback_link(sp) ((struct caml_context *)((sp) + Trap_frame_size)) #endif #ifdef TARGET_arm #define Saved_return_address(sp) *((intnat *)((sp) - 4)) -#define Callback_link(sp) ((struct caml_context *)((sp) + 8)) #endif #ifdef TARGET_amd64 +/* Size of the gc_regs structure, in words. + See amd64.S and amd64/proc.ml for the indices */ +#define Wosize_gc_regs (13 /* int regs */ + 16 /* float regs */) #define Saved_return_address(sp) *((intnat *)((sp) - 8)) -#define Callback_link(sp) ((struct caml_context *)((sp) + 16)) #endif #ifdef TARGET_arm64 #define Saved_return_address(sp) *((intnat *)((sp) - 8)) -#define Callback_link(sp) ((struct caml_context *)((sp) + 16)) +#define Context_needs_padding /* keep stack 16-byte aligned */ #endif #ifdef TARGET_riscv #define Saved_return_address(sp) *((intnat *)((sp) - 8)) -#define Callback_link(sp) ((struct caml_context *)((sp) + 16)) #endif /* Structure of OCaml callback contexts */ struct caml_context { - char * bottom_of_stack; /* beginning of OCaml stack chunk */ - uintnat last_retaddr; /* last return address in OCaml code */ + uintnat exception_ptr; /* exception pointer */ value * gc_regs; /* pointer to register block */ +#ifdef Context_needs_padding + value padding; +#endif }; -/* Structure of frame descriptors */ - -typedef struct { - uintnat retaddr; - unsigned short frame_size; - unsigned short num_live; - unsigned short live_ofs[1 /* num_live */]; - /* - If frame_size & 2, then allocation info follows: - unsigned char num_allocs; - unsigned char alloc_lengths[num_alloc]; - - If frame_size & 1, then debug info follows: - uint32_t debug_info_offset[num_debug]; - - Debug info is stored as relative offsets to debuginfo structures. - num_debug is num_alloc if frame_size & 2, otherwise 1. 
*/ -} frame_descr; - -/* Allocation lengths are encoded as 0-255, giving sizes 1-256 */ -#define Wosize_encoded_alloc_len(n) ((uintnat)(n) + 1) - -/* Used to compute offsets in frame tables. - ty must have power-of-2 size */ -#define Align_to(p, ty) \ - (void*)(((uintnat)(p) + sizeof(ty) - 1) & -sizeof(ty)) - - -/* Hash table of frame descriptors */ - -extern frame_descr ** caml_frame_descriptors; -extern uintnat caml_frame_descriptors_mask; - -#define Hash_retaddr(addr) \ - (((uintnat)(addr) >> 3) & caml_frame_descriptors_mask) - -extern void caml_init_frame_descriptors(void); -extern void caml_register_frametable(intnat *); -extern void caml_unregister_frametable(intnat *); -extern void caml_register_dyn_global(void *); - -extern uintnat caml_stack_usage (void); -extern uintnat (*caml_stack_usage_hook)(void); - /* Declaration of variables used in the asm code */ extern value * caml_globals[]; -extern char caml_globals_map[]; extern intnat caml_globals_inited; -extern intnat * caml_frametable[]; - -/* Global variables moved to Caml_state in 4.10 */ -#define caml_top_of_stack (Caml_state_field(top_of_stack)) -#define caml_bottom_of_stack (Caml_state_field(bottom_of_stack)) -#define caml_last_return_address (Caml_state_field(last_return_address)) -#define caml_gc_regs (Caml_state_field(gc_regs)) -#define caml_exception_pointer (Caml_state_field(exception_pointer)) - -CAMLextern frame_descr * caml_next_frame_descriptor(uintnat * pc, char ** sp); #endif /* CAML_INTERNALS */ diff --git a/runtime/caml/stacks.h b/runtime/caml/stacks.h deleted file mode 100644 index d309141f9b90..000000000000 --- a/runtime/caml/stacks.h +++ /dev/null @@ -1,47 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. 
This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. */ -/* */ -/**************************************************************************/ - -/* structure of the stacks */ - -#ifndef CAML_STACKS_H -#define CAML_STACKS_H - -#ifdef CAML_INTERNALS - -#include "misc.h" -#include "mlvalues.h" -#include "memory.h" - -/* Global variables moved to Caml_state in 4.10 */ -#define caml_stack_low (Caml_state_field(stack_low)) -#define caml_stack_high (Caml_state_field(stack_high)) -#define caml_stack_threshold (Caml_state_field(stack_threshold)) -#define caml_extern_sp (Caml_state_field(extern_sp)) -#define caml_trapsp (Caml_state_field(trapsp)) -#define caml_trap_barrier (Caml_state_field(trap_barrier)) - -#define Trap_pc(tp) (((code_t *)(tp))[0]) -#define Trap_link_offset(tp) (((value *)(tp))[1]) - -void caml_init_stack (uintnat init_max_size); -void caml_realloc_stack (asize_t required_size); -void caml_change_max_stack_size (uintnat new_max_size); -uintnat caml_stack_usage (void); - -CAMLextern uintnat (*caml_stack_usage_hook)(void); - -#endif /* CAML_INTERNALS */ - -#endif /* CAML_STACKS_H */ diff --git a/runtime/caml/startup.h b/runtime/caml/startup.h index f3e1fe6dd87e..e66b76ae3bb7 100644 --- a/runtime/caml/startup.h +++ b/runtime/caml/startup.h @@ -20,6 +20,7 @@ #include "mlvalues.h" #include "exec.h" +#include "startup_aux.h" CAMLextern void caml_startup_code( code_t code, asize_t code_size, diff --git a/runtime/caml/startup_aux.h b/runtime/caml/startup_aux.h index 16d160be4930..160a8064b6c4 100644 --- a/runtime/caml/startup_aux.h +++ b/runtime/caml/startup_aux.h @@ -23,21 +23,42 @@ extern void caml_init_locale(void); extern void caml_free_locale(void); -extern void caml_init_atom_table (void); - -extern uintnat caml_init_percent_free; -extern uintnat caml_init_max_percent_free; -extern uintnat caml_init_minor_heap_wsz; -extern uintnat 
caml_init_heap_chunk_sz; -extern uintnat caml_init_heap_wsz; -extern uintnat caml_init_max_stack_wsz; -extern uintnat caml_init_major_window; -extern uintnat caml_init_custom_major_ratio; -extern uintnat caml_init_custom_minor_ratio; -extern uintnat caml_init_custom_minor_max_bsz; -extern uintnat caml_init_policy; -extern uintnat caml_trace_level; -extern int caml_cleanup_on_exit; +/* readonly after startup */ +struct caml_params { + const char_os* exe_name; + + /* for meta.c */ + const char* section_table; + asize_t section_table_size; + + const char_os* cds_file; + + uintnat verb_gc; + uintnat parser_trace; + uintnat trace_level; + uintnat eventlog_enabled; + uintnat verify_heap; + uintnat print_magic; + uintnat print_config; + + uintnat init_percent_free; + uintnat init_max_percent_free; + uintnat init_minor_heap_wsz; + uintnat init_heap_chunk_sz; + uintnat init_heap_wsz; + uintnat init_custom_major_ratio; + uintnat init_custom_minor_ratio; + uintnat init_custom_minor_max_bsz; + + uintnat init_max_stack_wsz; + uintnat init_fiber_wsz; + + uintnat backtrace_enabled; + uintnat runtime_warnings; + uintnat cleanup_on_exit; +}; + +extern const struct caml_params* const caml_params; extern void caml_parse_ocamlrunparam (void); @@ -46,6 +67,10 @@ extern void caml_parse_ocamlrunparam (void); If [pooling] is 0, [caml_stat_*] functions will not be backed by a pool. 
*/ extern int caml_startup_aux (int pooling); +void caml_init_exe_name(const char_os* exe_name); +void caml_init_section_table(const char* section_table, + asize_t section_table_size); + #endif /* CAML_INTERNALS */ #endif /* CAML_STARTUP_AUX_H */ diff --git a/runtime/caml/sync.h b/runtime/caml/sync.h new file mode 100644 index 000000000000..9f3bf64570a4 --- /dev/null +++ b/runtime/caml/sync.h @@ -0,0 +1,92 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy and Damien Doligez, INRIA Rocquencourt */ +/* */ +/* Copyright 1996 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +/* POSIX thread implementation of the user facing Mutex and Condition */ + +#define CAML_INTERNALS + +#include "alloc.h" +#include "custom.h" +#include "fail.h" +#include "memory.h" + +#include + +typedef int sync_retcode; + +/* Mutexes */ + +typedef pthread_mutex_t * sync_mutex; + +#define Mutex_val(v) (* ((sync_mutex *) Data_custom_val(v))) + +Caml_inline int sync_mutex_lock(sync_mutex m) +{ + return pthread_mutex_lock(m); +} + +#define MUTEX_PREVIOUSLY_UNLOCKED 0 +#define MUTEX_ALREADY_LOCKED EBUSY + +Caml_inline int sync_mutex_trylock(sync_mutex m) +{ + return pthread_mutex_trylock(m); +} + +Caml_inline int sync_mutex_unlock(sync_mutex m) +{ + return pthread_mutex_unlock(m); +} + +/* Condition variables */ + +typedef pthread_cond_t * sync_condvar; + +#define Condition_val(v) (* (sync_condvar *) Data_custom_val(v)) + +Caml_inline int sync_condvar_signal(sync_condvar c) +{ + return pthread_cond_signal(c); +} + +Caml_inline int sync_condvar_broadcast(sync_condvar c) +{ + return 
pthread_cond_broadcast(c); +} + +Caml_inline int sync_condvar_wait(sync_condvar c, sync_mutex m) +{ + return pthread_cond_wait(c, m); +} + +/* Reporting errors */ + +Caml_inline void sync_check_error(int retcode, char * msg) +{ + char * err; + int errlen, msglen; + value str; + + if (retcode == 0) return; + if (retcode == ENOMEM) caml_raise_out_of_memory(); + err = strerror(retcode); + msglen = strlen(msg); + errlen = strlen(err); + str = caml_alloc_string(msglen + 2 + errlen); + memcpy (&Byte(str, 0), msg, msglen); + memcpy (&Byte(str, msglen), ": ", 2); + memcpy (&Byte(str, msglen + 2), err, errlen); + caml_raise_sys_error(str); +} diff --git a/runtime/caml/sys.h b/runtime/caml/sys.h index 75b97818c108..a22e0bbfb4ff 100644 --- a/runtime/caml/sys.h +++ b/runtime/caml/sys.h @@ -41,8 +41,6 @@ CAMLnoreturn_start CAMLextern void caml_do_exit (int) CAMLnoreturn_end; -extern char_os * caml_exe_name; - #ifdef __cplusplus } #endif diff --git a/runtime/caml/ui.h b/runtime/caml/ui.h index 3047ba7fb75d..9551ff4853e6 100644 --- a/runtime/caml/ui.h +++ b/runtime/caml/ui.h @@ -18,8 +18,6 @@ #ifndef CAML_UI_H #define CAML_UI_H -#ifdef CAML_INTERNALS - #include "config.h" void ui_exit (int return_code); @@ -27,6 +25,4 @@ int ui_read (int file_desc, char *buf, unsigned int length); int ui_write (int file_desc, char *buf, unsigned int length); void ui_print_stderr (char *format, void *arg); -#endif /* CAML_INTERNALS */ - #endif /* CAML_UI_H */ diff --git a/runtime/caml/weak.h b/runtime/caml/weak.h index 8192496f0e80..293f0cef690a 100644 --- a/runtime/caml/weak.h +++ b/runtime/caml/weak.h @@ -24,124 +24,24 @@ #ifdef __cplusplus extern "C" { #endif - -/** The requirements of the functions must be satisfied, it is - unspecified what happens if they are not. The debugging runtime - could check some of them. */ - -CAMLextern value caml_ephemeron_create(mlsize_t len); -/** Create an ephemeron with the given number of keys. - This function allocates. 
- */ - -CAMLextern mlsize_t caml_ephemeron_num_keys(value eph); -/** Return the number of key in the ephemeron. The valid key offset goes - from [0] to the predecessor of the returned value. */ - -CAMLextern int caml_ephemeron_key_is_set(value eph, mlsize_t offset); -/** Return 1 if the key in the ephemeron at the given offset is set. - Otherwise 0. The value [eph] must be an ephemeron and [offset] a - valid key offset. -*/ - -CAMLextern void caml_ephemeron_set_key(value eph, mlsize_t offset, value k); -/** Set the key of the given ephemeron [eph] at the given offset - [offset] to the given value [k]. The value [eph] must be an - ephemeron, [offset] a valid key offset and [k] a block. -*/ - -CAMLextern void caml_ephemeron_unset_key(value eph, mlsize_t offset); -/** Unset the key of the given ephemeron at the given offset. The - value [eph] must be an ephemeron and [offset] a valid key offset. -*/ - -CAMLextern int caml_ephemeron_get_key(value eph, mlsize_t offset, value *key); -/** Return 1 if the key in the ephemeron at the given offset is set. - Otherwise 0. When returning 1, set [*key] to the pointed value. - - The value [eph] must be an ephemeron and [offset] a valid key - offset. -*/ - -CAMLextern int caml_ephemeron_get_key_copy(value eph, mlsize_t offset, - value *key); -/** Return 1 if the key in the ephemeron at the given offset is set. - Otherwise 0. When returning 1, set [*key] to a shallow copy of the - key. This function allocates. - - The value [eph] must be an ephemeron and [offset] a valid key - offset. -*/ - -CAMLextern void caml_ephemeron_blit_key(value eph1, mlsize_t off1, - value eph2, mlsize_t off2, - mlsize_t len); -/** Fill the given range of keys of [eph2] with the given range of - keys of [eph1]. Contrary to using caml_ephemeron_get_key followed - by caml_ephemeron_set_key or caml_ephemeron_unset_key, this - function does not prevent the incremental GC from erasing the - value in its current cycle. The value [eph1] (resp. 
[eph2]) must - be an ephemeron and the offsets between [off1] and [off1+len] - (resp. between [off2] and [off2+offset]) must be valid keys of - [eph1] (resp. [eph2]). -*/ - -CAMLextern int caml_ephemeron_data_is_set(value eph); -/** Return 1 if the data in the ephemeron is set. - Otherwise 0. The value [eph] must be an ephemeron. -*/ - -CAMLextern void caml_ephemeron_set_data(value eph, value k); -/** Set the data of the given ephemeron [eph] to the given value - [k]. The value [eph] must be an ephemeron and [k] a block. -*/ - -CAMLextern void caml_ephemeron_unset_data(value eph); -/** Unset the data of the given ephemeron. The value [eph] must be an - ephemeron. -*/ - -CAMLextern int caml_ephemeron_get_data(value eph, value *data); -/** Return 1 if the data in the ephemeron at the given offset is set. - Otherwise 0. When returning 1, set [*data] to the pointed value. - - The value [eph] must be an ephemeron and [offset] a valid key - offset. -*/ - -CAMLextern int caml_ephemeron_get_data_copy(value eph, value *data); -/** Return 1 if the data in the ephemeron at the given offset is set. - Otherwise 0. When returning 1, set [*data] to a shallow copy of - the data. This function allocates. - - The value [eph] must be an ephemeron and [offset] a valid key - offset. -*/ - -CAMLextern void caml_ephemeron_blit_data(value eph1, value eph2); -/** Sets the data of [eph2] to be the same as the data of [eph1]. - Contrary to using caml_ephemeron_get_data followed by - caml_ephemeron_set_data or caml_ephemeron_unset_data, this - function does not prevent the incremental GC from erasing the - value in its current cycle. The values [eph1] and [eph2] must be - ephemerons. 
-*/ - - -#define caml_weak_array_length caml_ephemeron_num_keys -#define caml_weak_array_create caml_ephemeron_create -#define caml_weak_array_check caml_ephemeron_key_is_set -#define caml_weak_array_unset caml_ephemeron_unset_key -#define caml_weak_array_set caml_ephemeron_set_key -#define caml_weak_array_get caml_ephemeron_get_key -#define caml_weak_array_get_copy caml_ephemeron_get_key_copy -#define caml_weak_array_blit caml_ephemeron_blit_key +extern value caml_ephe_none; #ifdef CAML_INTERNALS -extern value caml_ephe_list_head; -extern value caml_ephe_none; - +struct caml_ephe_info { + value todo; /* These are ephemerons which need to be marked and swept in the + current cycle. If the ephemeron is alive, after marking, they + go into the live list after cleaning them off the unreachable + keys and releasing values if any of the keys are unreachable. + */ + value live; /* These are ephemerons which are alive in the current cycle, + whose keys and data are live (or not set). */ + uintnat cycle; + struct { + value* todop; + uintnat cycle; + } cursor; +}; /** The first field 0: weak list; second field 1: data; @@ -159,70 +59,11 @@ extern value caml_ephe_none; #define CAML_EPHE_FIRST_KEY 2 #define CAML_EPHE_MAX_WOSIZE (Max_wosize - CAML_EPHE_FIRST_KEY) -/* In the header, in order to let major_gc.c - and weak.c see the body of the function */ -Caml_inline void caml_ephe_clean_partial (value v, - mlsize_t offset_start, - mlsize_t offset_end) { - value child; - int release_data = 0; - mlsize_t i; - CAMLassert(caml_gc_phase == Phase_clean); - CAMLassert(2 <= offset_start - && offset_start <= offset_end - && offset_end <= Wosize_hd (Hd_val(v))); - - for (i = offset_start; i < offset_end; i++){ - child = Field (v, i); - ephemeron_again: - if (child != caml_ephe_none - && Is_block (child) && Is_in_value_area (child)){ - if (Tag_val (child) == Forward_tag){ - value f = Forward_val (child); - if (Is_block (f)) { - if (!Is_in_value_area(f) || Tag_val (f) == Forward_tag - || 
Tag_val (f) == Lazy_tag || Tag_val (f) == Double_tag){ - /* Do not short-circuit the pointer. */ - }else{ - Field (v, i) = child = f; - if (Is_block (f) && Is_young (f)) - add_to_ephe_ref_table(Caml_state_field(ephe_ref_table), v, i); - goto ephemeron_again; - } - } - } - if (Tag_val (child) == Infix_tag) child -= Infix_offset_val (child); - if (Is_white_val (child) && !Is_young (child)){ - release_data = 1; - Field (v, i) = caml_ephe_none; - } - } - } - - child = Field (v, 1); - if(child != caml_ephe_none){ - if (release_data) Field (v, 1) = caml_ephe_none; -#ifdef DEBUG - else if (offset_start == 2 && offset_end == Wosize_hd (Hd_val(v)) && - Is_block (child) && Is_in_heap (child)) { - if (Tag_val (child) == Infix_tag) child -= Infix_offset_val (child); - /* If we scanned all the keys and the data field remains filled, - then the mark phase must have marked it */ - CAMLassert( !Is_white_val (child) ); - } -#endif - } -} - -Caml_inline void caml_ephe_clean (value v) { - mlsize_t size; - header_t hd; - hd = Hd_val (v); - size = Wosize_hd (hd); - - caml_ephe_clean_partial(v, 2, size); -} +#define Ephe_link(e) (*(Op_val(e) + CAML_EPHE_LINK_OFFSET)) +#define Ephe_data(e) (*(Op_val(e) + CAML_EPHE_DATA_OFFSET)) +struct caml_ephe_info* caml_alloc_ephe_info (void); +void caml_ephe_clean(value e); #endif /* CAML_INTERNALS */ diff --git a/runtime/codefrag.c b/runtime/codefrag.c index 2ab957167109..56a9f624316b 100644 --- a/runtime/codefrag.c +++ b/runtime/codefrag.c @@ -17,25 +17,37 @@ /* A table of all code fragments (main program and dynlinked modules) */ -#include -#include #include "caml/codefrag.h" -#include "caml/misc.h" +#include "caml/lf_skiplist.h" #include "caml/md5.h" #include "caml/memory.h" -#include "caml/skiplist.h" +#include "caml/misc.h" +#include +#include +#include + +struct code_fragment_garbage { + struct code_fragment *cf; + struct code_fragment_garbage *next; +}; + +static struct code_fragment_garbage *_Atomic garbage_head = NULL; -static struct 
skiplist code_fragments_by_pc = SKIPLIST_STATIC_INITIALIZER; +static struct lf_skiplist code_fragments_by_pc; -static struct skiplist code_fragments_by_num = SKIPLIST_STATIC_INITIALIZER; +static struct lf_skiplist code_fragments_by_num; -static int code_fragments_counter = 0; +static int _Atomic code_fragments_counter = 1; -int caml_register_code_fragment(char * start, char * end, +void caml_init_codefrag(void) { + caml_lf_skiplist_init(&code_fragments_by_pc); + caml_lf_skiplist_init(&code_fragments_by_num); +} + +int caml_register_code_fragment(char *start, char *end, enum digest_status digest_kind, - unsigned char * opt_digest) -{ - struct code_fragment * cf = caml_stat_alloc(sizeof(struct code_fragment)); + unsigned char *opt_digest) { + struct code_fragment *cf = caml_stat_alloc(sizeof(struct code_fragment)); cf->code_start = start; cf->code_end = end; @@ -53,47 +65,59 @@ int caml_register_code_fragment(char * start, char * end, break; } cf->digest_status = digest_kind; - cf->fragnum = code_fragments_counter++; - caml_skiplist_insert(&code_fragments_by_pc, - (uintnat) start, (uintnat) cf); - caml_skiplist_insert(&code_fragments_by_num, - (uintnat) cf->fragnum, (uintnat) cf); + cf->fragnum = atomic_fetch_add_explicit + (&code_fragments_counter, 1, memory_order_relaxed); + caml_lf_skiplist_insert(&code_fragments_by_pc, (uintnat)start, (uintnat)cf); + caml_lf_skiplist_insert(&code_fragments_by_num, (uintnat)cf->fragnum, + (uintnat)cf); return cf->fragnum; } -void caml_remove_code_fragment(struct code_fragment * cf) -{ - caml_skiplist_remove(&code_fragments_by_pc, (uintnat) cf->code_start); - caml_skiplist_remove(&code_fragments_by_num, cf->fragnum); - caml_stat_free(cf); +void caml_remove_code_fragment(struct code_fragment *cf) { + struct code_fragment_garbage *cf_cell; + + caml_lf_skiplist_remove(&code_fragments_by_pc, (uintnat)cf->code_start); + + /* This is conditional on remove returning success because it's possible + for [caml_remove_code_fragment] to be 
called concurrently and we need + to ensure that only one code_fragment is put on to the garbage list */ + if (caml_lf_skiplist_remove(&code_fragments_by_num, cf->fragnum)) { + cf_cell = (struct code_fragment_garbage *)caml_stat_alloc( + sizeof(struct code_fragment_garbage)); + + cf_cell->cf = cf; + + do { + cf_cell->next = atomic_load_explicit(&garbage_head, memory_order_acquire); + } while (!atomic_compare_exchange_strong(&garbage_head, &cf_cell->next, + cf_cell)); + } } -struct code_fragment * caml_find_code_fragment_by_pc(char *pc) -{ - struct code_fragment * cf; +struct code_fragment *caml_find_code_fragment_by_pc(char *pc) { + struct code_fragment *cf; uintnat key, data; - if (caml_skiplist_find_below(&code_fragments_by_pc, - (uintnat) pc, &key, &data)) { - cf = (struct code_fragment *) data; + if (caml_lf_skiplist_find_below(&code_fragments_by_pc, (uintnat)pc, &key, + &data)) { + cf = (struct code_fragment *)data; CAMLassert(cf->code_start <= pc); - if (pc < cf->code_end) return cf; + if (pc < cf->code_end) + return cf; } return NULL; } -struct code_fragment * caml_find_code_fragment_by_num(int fragnum) -{ +struct code_fragment *caml_find_code_fragment_by_num(int fragnum) { uintnat data; - if (caml_skiplist_find(&code_fragments_by_num, fragnum, &data)) { - return (struct code_fragment *) data; + if (caml_lf_skiplist_find(&code_fragments_by_num, fragnum, &data)) { + return (struct code_fragment *)data; } else { return NULL; } } -unsigned char * caml_digest_of_code_fragment(struct code_fragment * cf) -{ +unsigned char *caml_digest_of_code_fragment(struct code_fragment *cf) { if (cf->digest_status == DIGEST_IGNORE) return NULL; if (cf->digest_status == DIGEST_LATER) { @@ -104,12 +128,34 @@ unsigned char * caml_digest_of_code_fragment(struct code_fragment * cf) } struct code_fragment * - caml_find_code_fragment_by_digest(unsigned char digest[16]) -{ - FOREACH_SKIPLIST_ELEMENT(e, &code_fragments_by_pc, { - struct code_fragment * cf = (struct code_fragment *) 
e->data; - unsigned char * d = caml_digest_of_code_fragment(cf); - if (d != NULL && memcmp(digest, d, 16) == 0) return cf; +caml_find_code_fragment_by_digest(unsigned char digest[16]) { + FOREACH_LF_SKIPLIST_ELEMENT(e, &code_fragments_by_pc, { + struct code_fragment *cf = (struct code_fragment *)e->data; + unsigned char *d = caml_digest_of_code_fragment(cf); + if (d != NULL && memcmp(digest, d, 16) == 0) + return cf; }) return NULL; } + +/* This is only ever called from a stw by one domain */ +void caml_code_fragment_cleanup (void) +{ + struct code_fragment_garbage *curr; + + caml_lf_skiplist_free_garbage(&code_fragments_by_pc); + caml_lf_skiplist_free_garbage(&code_fragments_by_num); + + curr = atomic_load_explicit(&garbage_head, memory_order_acquire); + + while (curr != NULL) { + struct code_fragment_garbage *next = curr->next; + + caml_stat_free(curr->cf); + caml_stat_free(curr); + + curr = next; + } + + atomic_store_explicit(&garbage_head, NULL, memory_order_release); +} diff --git a/runtime/compact.c b/runtime/compact.c deleted file mode 100644 index 8ba168e4d2bc..000000000000 --- a/runtime/compact.c +++ /dev/null @@ -1,498 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Damien Doligez, projet Para, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. 
*/ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -#include - -#include "caml/address_class.h" -#include "caml/config.h" -#include "caml/finalise.h" -#include "caml/freelist.h" -#include "caml/gc.h" -#include "caml/gc_ctrl.h" -#include "caml/major_gc.h" -#include "caml/memory.h" -#include "caml/mlvalues.h" -#include "caml/roots.h" -#include "caml/weak.h" -#include "caml/compact.h" -#include "caml/memprof.h" -#include "caml/eventlog.h" - -extern uintnat caml_percent_free; /* major_gc.c */ -extern void caml_shrink_heap (char *); /* memory.c */ - -/* Colors - - We use the GC's color bits in the following way: - - - White words are headers of live blocks except for 0, which is a - fragment. - - Blue words are headers of free blocks. - - Black words are headers of out-of-heap "blocks". - - Gray words are the encoding of pointers in inverted lists. - - Encoded pointers: - Pointers always have their two low-order bits clear. We make use of - this to encode pointers by shifting bits 2-9 to 0-7: - ...XXXyyyyyyyy00 becomes ...XXX01yyyyyyyy - Note that 01 corresponds to the "gray" color of the GC, so we can now - mix pointers and headers because there are no gray headers anywhere in - the heap (or outside) when we start a compaction (which must be done at - the end of a sweep phase). 
-*/ - -typedef uintnat word; - -#define eptr(p) \ - (((word) (p) & ~0x3FF) | ((((word) p) & 0x3FF) >> 2) | Caml_gray) -#define dptr(p) ((word *) (((word) (p) & ~0x3FF) | ((((word) p) & 0xFF) << 2))) - -static void invert_pointer_at (word *p) -{ - word q = *p; - header_t h; - - CAMLassert (((uintnat) p & 3) == 0); - - if (Is_block (q) && Is_in_value_area (q)){ - h = Hd_val (q); - switch (Color_hd (h)){ - case Caml_white: - if (Tag_hd (h) == Infix_tag){ - value realvalue = (value) q - Infix_offset_val (q); - if (Is_black_val (realvalue)) break; - } - /* FALL THROUGH */ - case Caml_gray: - CAMLassert (Is_in_heap (q)); - /* [q] points to some inverted list, insert it. */ - *p = h; - Hd_val (q) = eptr (p); - break; - case Caml_black: - /* [q] points to an out-of-heap value. Leave it alone. */ - break; - default: /* Caml_blue */ - /* We found a pointer to a free block. This cannot happen. */ - CAMLassert (0); - break; - } - } -} - -void caml_invert_root (value v, value *p) -{ -#ifdef NO_NAKED_POINTERS - /* Note: this assertion will become tautological and should be removed when - we finally get rid of the page table in NNP mode. 
- */ - CAMLassert (Is_long (*p) || Is_in_heap (*p) || Is_black_val (*p) - || Tag_val (*p) == Infix_tag); -#endif - invert_pointer_at ((word *) p); -} - -static char *compact_fl; - -static void init_compact_allocate (void) -{ - char *ch = caml_heap_start; - while (ch != NULL){ - Chunk_alloc (ch) = 0; - ch = Chunk_next (ch); - } - compact_fl = caml_heap_start; -} - -/* [size] is a number of bytes and includes the header size */ -static char *compact_allocate (mlsize_t size) -{ - char *chunk, *adr; - - while (Chunk_size(compact_fl) - Chunk_alloc(compact_fl) < Bhsize_wosize(1)){ - compact_fl = Chunk_next (compact_fl); - CAMLassert (compact_fl != NULL); - } - chunk = compact_fl; - while (Chunk_size (chunk) - Chunk_alloc (chunk) < size){ - chunk = Chunk_next (chunk); - CAMLassert (chunk != NULL); - } - adr = chunk + Chunk_alloc (chunk); - Chunk_alloc (chunk) += size; - return adr; -} - -static void do_compaction (intnat new_allocation_policy) -{ - char *ch, *chend; - CAMLassert (caml_gc_phase == Phase_idle); - caml_gc_message (0x10, "Compacting heap...\n"); - -#ifdef DEBUG - caml_heap_check (); -#endif - - /* Make sure the heap is in the right state for compaction: - - all free blocks are blue - - all other blocks are white and contain valid pointers - */ - caml_fl_reset_and_switch_policy (new_allocation_policy); - - /* First pass: removed in 4.12 thanks to the new closure representation. */ - - - /* Second pass: invert pointers. - Don't forget roots and weak pointers. - This is a mark-like pass. 
*/ - { - caml_do_roots (caml_invert_root, 1); - /* The values to be finalised are not roots but should still be inverted */ - caml_final_invert_finalisable_values (); - /* Idem for memprof tracked blocks */ - caml_memprof_invert_tracked (); - - ch = caml_heap_start; - while (ch != NULL){ - word *p = (word *) ch; - chend = ch + Chunk_size (ch); - - while ((char *) p < chend){ - word q = *p; - mlsize_t wosz, i, first_field; - tag_t t; - - while (Is_gray_hd (q)) q = * dptr (q); - wosz = Wosize_hd (q); - if (Is_white_hd (q)){ - t = Tag_hd (q); - CAMLassert (t != Infix_tag); - if (t < No_scan_tag){ - value v = Val_hp (p); - if (t == Closure_tag){ - first_field = Start_env_closinfo (Closinfo_val (v)); - }else{ - first_field = 0; - } - for (i = first_field; i < wosz; i++){ - invert_pointer_at ((word *) &Field (v,i)); - } - } - } - p += Whsize_wosize (wosz); - } - ch = Chunk_next (ch); - } - /* Invert weak pointers. */ - { - value *pp = &caml_ephe_list_head; - value p; - word q; - size_t sz, i; - - while (1){ - p = *pp; - if (p == (value) NULL) break; - q = Hd_val (p); - while (Is_gray_hd (q)) q = * dptr (q); - CAMLassert (Is_white_hd (q)); - sz = Wosize_hd (q); - for (i = 1; i < sz; i++){ - if (Field (p,i) != caml_ephe_none){ - invert_pointer_at ((word *) &(Field (p,i))); - } - } - invert_pointer_at ((word *) pp); - pp = &Field (p, 0); - } - } - } - - - /* Third pass: reallocate virtually; revert pointers. - This is a sweep-like pass. 
*/ - { - init_compact_allocate (); - ch = caml_heap_start; - while (ch != NULL){ - word *p = (word *) ch; - - chend = ch + Chunk_size (ch); - while ((char *) p < chend){ - header_t h = Hd_hp (p); - size_t sz; - - while (Is_gray_hd (h)) h = * dptr (h); - sz = Whsize_hd (h); - - CAMLassert (!Is_black_hd (h)); - CAMLassert (!Is_gray_hd (h)); - if (h != 0 && Is_white_hd (h)){ - word q; - tag_t t; - char *newadr; - - t = Tag_hd (h); - CAMLassert (t != Infix_tag); - - newadr = compact_allocate (Bsize_wsize (sz)); - q = *p; - while (Is_gray_hd (q)){ - word *pp = dptr (q); - q = *pp; - *pp = (word) Val_hp (newadr); - } - CAMLassert (q == h); - *p = q; - - if (t == Closure_tag){ - /* Revert the infix pointers to this block. */ - mlsize_t i, startenv; - value v; - - v = Val_hp (p); - startenv = Start_env_closinfo (Closinfo_val (v)); - i = 0; - while (1){ - int arity = Arity_closinfo (Field (v, i+1)); - i += 2 + (arity != 0 && arity != 1); - if (i >= startenv) break; - - /* Revert the inverted list for infix header at offset [i]. */ - q = Field (v, i); - while (Is_gray_hd (q)){ - word *pp = dptr (q); - q = *pp; - *pp = (word) Val_hp ((header_t *) &Field (Val_hp (newadr), i)); - } - CAMLassert (Tag_hd (q) == Infix_tag); - Field (v, i) = q; - ++i; - } - } - } - p += sz; - } - ch = Chunk_next (ch); - } - } - - - /* Fourth pass: reallocate and move objects. - Use the exact same allocation algorithm as pass 3. */ - { - init_compact_allocate (); - ch = caml_heap_start; - while (ch != NULL){ - word *p = (word *) ch; - - chend = ch + Chunk_size (ch); - while ((char *) p < chend){ - word q = *p; - if (q != 0 && Is_white_hd (q)){ - size_t sz = Bhsize_hd (q); - char *newadr = compact_allocate (sz); - memmove (newadr, p, sz); - p += Wsize_bsize (sz); - }else{ - CAMLassert (q == 0 || Is_blue_hd (q)); - p += Whsize_hd (q); - } - } - ch = Chunk_next (ch); - } - } - - /* Shrink the heap if needed. */ - { - /* Find the amount of live data and the unshrinkable free space. 
*/ - asize_t live = 0; - asize_t free = 0; - asize_t wanted; - - ch = caml_heap_start; - while (ch != NULL){ - if (Chunk_alloc (ch) != 0){ - live += Wsize_bsize (Chunk_alloc (ch)); - free += Wsize_bsize (Chunk_size (ch) - Chunk_alloc (ch)); - } - ch = Chunk_next (ch); - } - - /* Add up the empty chunks until there are enough, then remove the - other empty chunks. */ - wanted = caml_percent_free * (live / 100 + 1); - ch = caml_heap_start; - while (ch != NULL){ - char *next_chunk = Chunk_next (ch); /* Chunk_next (ch) will be erased */ - - if (Chunk_alloc (ch) == 0){ - if (free < wanted){ - free += Wsize_bsize (Chunk_size (ch)); - }else{ - caml_shrink_heap (ch); - } - } - ch = next_chunk; - } - } - - /* Rebuild the free list. This is the right time for a change of - allocation policy, since we are rebuilding the allocator's data - structures from scratch. */ - { - ch = caml_heap_start; - caml_fl_init_merge (); - while (ch != NULL){ - if (Chunk_size (ch) > Chunk_alloc (ch)){ - caml_make_free_blocks ((value *) (ch + Chunk_alloc (ch)), - Wsize_bsize (Chunk_size(ch)-Chunk_alloc(ch)), 1, - Caml_white); - } - ch = Chunk_next (ch); - } - } - ++ Caml_state->stat_compactions; - - caml_shrink_mark_stack(); - - caml_gc_message (0x10, "done.\n"); -} - -uintnat caml_percent_max; /* used in gc_ctrl.c and memory.c */ - -void caml_compact_heap (intnat new_allocation_policy) -{ - uintnat target_wsz, live; - - CAMLassert (Caml_state->young_ptr == Caml_state->young_alloc_end); - CAMLassert (Caml_state->ref_table->ptr == - Caml_state->ref_table->base); - CAMLassert (Caml_state->ephe_ref_table->ptr == - Caml_state->ephe_ref_table->base); - CAMLassert (Caml_state->custom_table->ptr == - Caml_state->custom_table->base); - - CAML_EV_BEGIN(EV_COMPACT_MAIN); - do_compaction (new_allocation_policy); - CAML_EV_END(EV_COMPACT_MAIN); - /* Compaction may fail to shrink the heap to a reasonable size - because it deals in complete chunks: if a very large chunk - is at the beginning of the heap, 
everything gets moved to - it and it is not freed. - - In that case, we allocate a new chunk of the desired heap - size, chain it at the beginning of the heap (thus pretending - its address is smaller), and launch a second compaction. - This will move all data to this new chunk and free the - very large chunk. - - See PR#5389 - */ - /* We compute: - freewords = caml_fl_cur_wsz (exact) - heapwords = Wsize_bsize (caml_heap_size) (exact) - live = heapwords - freewords - wanted = caml_percent_free * (live / 100 + 1) (same as in do_compaction) - target_wsz = live + wanted - We add one page to make sure a small difference in counting sizes - won't make [do_compaction] keep the second block (and break all sorts - of invariants). - - We recompact if target_wsz < heap_size / 2 - */ - live = Caml_state->stat_heap_wsz - caml_fl_cur_wsz; - target_wsz = live + caml_percent_free * (live / 100 + 1) - + Wsize_bsize (Page_size); - target_wsz = caml_clip_heap_chunk_wsz (target_wsz); - -#ifdef HAS_HUGE_PAGES - if (caml_use_huge_pages - && Bsize_wsize (Caml_state->stat_heap_wsz) <= HUGE_PAGE_SIZE) - return; -#endif - - if (target_wsz < Caml_state->stat_heap_wsz / 2){ - /* Recompact. */ - char *chunk; - - caml_gc_message (0x10, "Recompacting heap (target=%" - ARCH_INTNAT_PRINTF_FORMAT "uk words)\n", - target_wsz / 1024); - - chunk = caml_alloc_for_heap (Bsize_wsize (target_wsz)); - if (chunk == NULL) return; - /* PR#5757: we need to make the new blocks blue, or they won't be - recognized as free by the recompaction. 
*/ - caml_make_free_blocks ((value *) chunk, - Wsize_bsize (Chunk_size (chunk)), 0, Caml_blue); - if (caml_page_table_add (In_heap, chunk, chunk + Chunk_size (chunk)) != 0){ - caml_free_for_heap (chunk); - return; - } - Chunk_next (chunk) = caml_heap_start; - caml_heap_start = chunk; - ++ Caml_state->stat_heap_chunks; - Caml_state->stat_heap_wsz += Wsize_bsize (Chunk_size (chunk)); - if (Caml_state->stat_heap_wsz > Caml_state->stat_top_heap_wsz){ - Caml_state->stat_top_heap_wsz = Caml_state->stat_heap_wsz; - } - CAML_EV_BEGIN(EV_COMPACT_RECOMPACT); - do_compaction (-1); - CAMLassert (Caml_state->stat_heap_chunks == 1); - CAMLassert (Chunk_next (caml_heap_start) == NULL); - CAMLassert (Caml_state->stat_heap_wsz == Wsize_bsize (Chunk_size (chunk))); - CAML_EV_END(EV_COMPACT_RECOMPACT); - } -} - -void caml_compact_heap_maybe (double previous_overhead) -{ - CAMLassert (caml_gc_phase == Phase_idle); - if (caml_percent_max >= 1000000) return; - if (Caml_state->stat_major_collections < 3) return; - if (Caml_state->stat_heap_wsz <= 2 * caml_clip_heap_chunk_wsz (0)) return; - -#ifdef HAS_HUGE_PAGES - if (caml_use_huge_pages - && Bsize_wsize (Caml_state->stat_heap_wsz) <= HUGE_PAGE_SIZE) - return; -#endif - - if (previous_overhead >= caml_percent_max){ - double current_overhead; - - caml_gc_message (0x200, "Automatic compaction triggered.\n"); - caml_empty_minor_heap (); /* minor heap must be empty for compaction */ - caml_gc_message - (0x1, "Finishing major GC cycle (triggered by compaction)\n"); - caml_finish_major_cycle (); - ++ Caml_state->stat_forced_major_collections; - - /* Note: There is no floating garbage because we just did a complete - major cycle*/ - current_overhead = - 100.0 * caml_fl_cur_wsz / (Caml_state->stat_heap_wsz - caml_fl_cur_wsz); - caml_gc_message (0x200, "Current overhead: %" - ARCH_INTNAT_PRINTF_FORMAT "u%%\n", - (uintnat) current_overhead); - if (current_overhead >= caml_percent_max) - caml_compact_heap (-1); - else - caml_gc_message (0x200, 
"Automatic compaction aborted.\n"); - } -} diff --git a/runtime/compare.c b/runtime/compare.c index 4a0eb6eac0a5..a061b0664860 100644 --- a/runtime/compare.c +++ b/runtime/compare.c @@ -127,8 +127,6 @@ static intnat do_compare_val(struct compare_stack* stk, if (Is_long(v2)) return Long_val(v1) - Long_val(v2); /* Subtraction above cannot overflow and cannot result in UNORDERED */ - if (!Is_in_value_area(v2)) - return LESS; switch (Tag_val(v2)) { case Forward_tag: v2 = Forward_val(v2); @@ -145,11 +143,10 @@ static intnat do_compare_val(struct compare_stack* stk, } default: /*fallthrough*/; } + return LESS; /* v1 long < v2 block */ } if (Is_long(v2)) { - if (!Is_in_value_area(v1)) - return GREATER; switch (Tag_val(v1)) { case Forward_tag: v1 = Forward_val(v1); @@ -168,14 +165,6 @@ static intnat do_compare_val(struct compare_stack* stk, } return GREATER; /* v1 block > v2 long */ } - /* If one of the objects is outside the heap (but is not an atom), - use address comparison. Since both addresses are 2-aligned, - shift lsb off to avoid overflow in subtraction. */ - if (! Is_in_value_area(v1) || ! 
Is_in_value_area(v2)) { - if (v1 == v2) goto next_item; - return (v1 >> 1) - (v2 >> 1); - /* Subtraction above cannot result in UNORDERED */ - } t1 = Tag_val(v1); t2 = Tag_val(v2); if (t1 != t2) { @@ -253,6 +242,9 @@ static intnat do_compare_val(struct compare_stack* stk, case Infix_tag: compare_free_stack(stk); caml_invalid_argument("compare: functional value"); + case Cont_tag: + compare_free_stack(stk); + caml_invalid_argument("compare: continuation value"); case Object_tag: { intnat oid1 = Oid_val(v1); intnat oid2 = Oid_val(v2); diff --git a/runtime/custom.c b/runtime/custom.c index 3ff5462c344c..8186b215711e 100644 --- a/runtime/custom.c +++ b/runtime/custom.c @@ -23,6 +23,7 @@ #include "caml/gc_ctrl.h" #include "caml/memory.h" #include "caml/mlvalues.h" +#include "caml/shared_heap.h" #include "caml/signals.h" #include "caml/memprof.h" @@ -30,7 +31,7 @@ uintnat caml_custom_major_ratio = Custom_major_ratio_def; uintnat caml_custom_minor_ratio = Custom_minor_ratio_def; uintnat caml_custom_minor_max_bsz = Custom_minor_max_bsz_def; -static value alloc_custom_gen (struct custom_operations * ops, +static value alloc_custom_gen (const struct custom_operations * ops, uintnat bsz, mlsize_t mem, mlsize_t max_major, @@ -55,28 +56,29 @@ static value alloc_custom_gen (struct custom_operations * ops, } /* The remaining [mem_minor] will be counted if the block survives a minor GC */ - add_to_custom_table (Caml_state->custom_table, result, - mem_minor, max_major); + add_to_custom_table (&Caml_state->minor_tables->custom, result, + mem, max_major); /* Keep track of extra resources held by custom block in minor heap. 
*/ if (mem_minor != 0) { if (max_minor == 0) max_minor = 1; Caml_state->extra_heap_resources_minor += (double) mem_minor / (double) max_minor; - if (Caml_state->extra_heap_resources_minor > 1.0) - caml_minor_collection (); + if (Caml_state->extra_heap_resources_minor > 1.0) { + caml_request_minor_gc (); + } } } } else { result = caml_alloc_shr(wosize, Custom_tag); Custom_ops_val(result) = ops; caml_adjust_gc_speed(mem, max_major); - caml_check_urgent_gc(Val_unit); + result = caml_check_urgent_gc(result); } CAMLreturn(result); } -CAMLexport value caml_alloc_custom(struct custom_operations * ops, +CAMLexport value caml_alloc_custom(const struct custom_operations * ops, uintnat bsz, mlsize_t mem, mlsize_t max) @@ -84,10 +86,11 @@ CAMLexport value caml_alloc_custom(struct custom_operations * ops, return alloc_custom_gen (ops, bsz, mem, max, mem, max); } -CAMLexport value caml_alloc_custom_mem(struct custom_operations * ops, +CAMLexport value caml_alloc_custom_mem(const struct custom_operations * ops, uintnat bsz, mlsize_t mem) { + mlsize_t mem_minor = mem < caml_custom_minor_max_bsz ? mem : caml_custom_minor_max_bsz; mlsize_t max_major = @@ -100,22 +103,22 @@ CAMLexport value caml_alloc_custom_mem(struct custom_operations * ops, the major GC takes 1.5 cycles (previous cycle + marking phase) before it starts to deallocate dead blocks allocated during the previous cycle. [heap_size / 150] is really [heap_size * (2/3) / 100] (but faster). 
*/ - Bsize_wsize (Caml_state->stat_heap_wsz) / 150 * caml_custom_major_ratio; + caml_heap_size(Caml_state->shared_heap) / 150 * caml_custom_major_ratio; mlsize_t max_minor = Bsize_wsize (Caml_state->minor_heap_wsz) / 100 * caml_custom_minor_ratio; value v = alloc_custom_gen (ops, bsz, mem, max_major, mem_minor, max_minor); - caml_memprof_track_custom(v, mem); return v; } struct custom_operations_list { - struct custom_operations * ops; + const struct custom_operations * ops; struct custom_operations_list * next; }; static struct custom_operations_list * custom_ops_table = NULL; -CAMLexport void caml_register_custom_operations(struct custom_operations * ops) +CAMLexport void +caml_register_custom_operations(const struct custom_operations * ops) { struct custom_operations_list * l = caml_stat_alloc(sizeof(struct custom_operations_list)); @@ -130,7 +133,8 @@ struct custom_operations * caml_find_custom_operations(char * ident) { struct custom_operations_list * l; for (l = custom_ops_table; l != NULL; l = l->next) - if (strcmp(l->ops->identifier, ident) == 0) return l->ops; + if (strcmp(l->ops->identifier, ident) == 0) + return (struct custom_operations*)l->ops; return NULL; } @@ -141,7 +145,7 @@ struct custom_operations * caml_final_custom_operations(final_fun fn) struct custom_operations_list * l; struct custom_operations * ops; for (l = custom_ops_final_table; l != NULL; l = l->next) - if (l->ops->finalize == fn) return l->ops; + if (l->ops->finalize == fn) return (struct custom_operations*)l->ops; ops = caml_stat_alloc(sizeof(struct custom_operations)); ops->identifier = "_final"; ops->finalize = fn; diff --git a/runtime/debugger.c b/runtime/debugger.c index 97d22832a0bc..9bc08d7cbdc5 100644 --- a/runtime/debugger.c +++ b/runtime/debugger.c @@ -28,13 +28,13 @@ #include "caml/config.h" #include "caml/debugger.h" #include "caml/misc.h" +#include "caml/memory.h" #include "caml/osdeps.h" #include "caml/skiplist.h" int caml_debugger_in_use = 0; uintnat caml_event_count; 
int caml_debugger_fork_mode = 1; /* parent by default */ - #if !defined(HAS_SOCKETS) || defined(NATIVE_CODE) void caml_debugger_init(void) @@ -87,10 +87,10 @@ struct sockaddr_un { #include "caml/intext.h" #include "caml/io.h" #include "caml/mlvalues.h" -#include "caml/stacks.h" +#include "caml/fiber.h" #include "caml/sys.h" -static value marshal_flags = Val_emptylist; +static value marshal_flags; static int sock_domain; /* Socket domain for the debugger */ static union { /* Socket address for the debugger */ @@ -176,12 +176,14 @@ void caml_debugger_init(void) char_os * a; char * port, * p; struct hostent * host; + value flags; int n; - caml_register_global_root(&marshal_flags); - marshal_flags = caml_alloc(2, Tag_cons); - Store_field(marshal_flags, 0, Val_int(1)); /* Marshal.Closures */ - Store_field(marshal_flags, 1, Val_emptylist); + flags = caml_alloc(2, Tag_cons); + Store_field(flags, 0, Val_int(1)); /* Marshal.Closures */ + Store_field(flags, 1, Val_emptylist); + marshal_flags = flags; + caml_register_generational_global_root(&marshal_flags); a = caml_secure_getenv(T("CAML_DEBUG_SOCKET")); address = a ? 
caml_stat_strdup_of_os(a) : NULL; @@ -244,7 +246,8 @@ void caml_debugger_init(void) } open_connection(); caml_debugger_in_use = 1; - Caml_state->trap_barrier = Caml_state->stack_high; + /* Bigger than default caml_trap_sp_off (1) */ + Caml_state->trap_barrier_off = 2; } static value getval(struct channel *chan) @@ -262,12 +265,14 @@ static void putval(struct channel *chan, value val) static void safe_output_value(struct channel *chan, value val) { - struct longjmp_buffer raise_buf, * saved_external_raise; + struct longjmp_buffer raise_buf; + struct caml_exception_context exception_ctx = {&raise_buf, CAML_LOCAL_ROOTS}; + struct caml_exception_context* saved_external_raise; /* Catch exceptions raised by [caml_output_val] */ saved_external_raise = Caml_state->external_raise; if (sigsetjmp(raise_buf.buf, 0) == 0) { - Caml_state->external_raise = &raise_buf; + Caml_state->external_raise = &exception_ctx; caml_output_val(chan, val, marshal_flags); } else { /* Send wrong magic number, will cause [caml_input_value] to fail */ @@ -349,13 +354,14 @@ void caml_debugger(enum event_kind event, value param) value *frame, *newframe; intnat i, pos; value val; + value* stack_high = Stack_high(Caml_state->current_stack); int frag; struct code_fragment *cf; if (dbg_socket == -1) return; /* Not connected to a debugger. 
*/ /* Reset current frame */ - frame = Caml_state->extern_sp + 1; + frame = Caml_state->current_stack->sp + 1; /* Report the event to the debugger */ switch(event) { @@ -397,7 +403,7 @@ void caml_debugger(enum event_kind event, value param) } caml_putword(dbg_out, caml_event_count); if (event == EVENT_COUNT || event == BREAKPOINT) { - caml_putword(dbg_out, Caml_state->stack_high - frame); + caml_putword(dbg_out, stack_high - frame); cf = caml_find_code_fragment_by_pc((char*) Pc(frame)); CAMLassert(cf != NULL); caml_putword(dbg_out, cf->fragnum); @@ -458,11 +464,11 @@ void caml_debugger(enum event_kind event, value param) #endif break; case REQ_INITIAL_FRAME: - frame = Caml_state->extern_sp + 1; + frame = Caml_state->current_stack->sp + 1; /* Fall through */ case REQ_GET_FRAME: - caml_putword(dbg_out, Caml_state->stack_high - frame); - if (frame < Caml_state->stack_high && + caml_putword(dbg_out, stack_high - frame); + if (frame < stack_high && (cf = caml_find_code_fragment_by_pc((char*) Pc(frame))) != NULL) { caml_putword(dbg_out, cf->fragnum); caml_putword(dbg_out, (char*) Pc(frame) - cf->code_start); @@ -474,17 +480,17 @@ void caml_debugger(enum event_kind event, value param) break; case REQ_SET_FRAME: i = caml_getword(dbg_in); - frame = Caml_state->stack_high - i; + frame = stack_high - i; break; case REQ_UP_FRAME: i = caml_getword(dbg_in); newframe = frame + Extra_args(frame) + i + 3; - if (newframe >= Caml_state->stack_high || + if (newframe >= stack_high || (cf = caml_find_code_fragment_by_pc((char *) Pc(newframe))) == NULL) { caml_putword(dbg_out, -1); } else { frame = newframe; - caml_putword(dbg_out, Caml_state->stack_high - frame); + caml_putword(dbg_out, stack_high - frame); caml_putword(dbg_out, cf->fragnum); caml_putword(dbg_out, (char*) Pc(frame) - cf->code_start); } @@ -492,7 +498,7 @@ void caml_debugger(enum event_kind event, value param) break; case REQ_SET_TRAP_BARRIER: i = caml_getword(dbg_in); - Caml_state->trap_barrier = Caml_state->stack_high 
- i; + Caml_state->trap_barrier_off = -i; break; case REQ_GET_LOCAL: i = caml_getword(dbg_in); @@ -510,7 +516,7 @@ void caml_debugger(enum event_kind event, value param) caml_flush(dbg_out); break; case REQ_GET_ACCU: - putval(dbg_out, *Caml_state->extern_sp); + putval(dbg_out, *Caml_state->current_stack->sp); caml_flush(dbg_out); break; case REQ_GET_HEADER: diff --git a/runtime/domain.c b/runtime/domain.c index d4d8de53fcf1..1da0e8344358 100644 --- a/runtime/domain.c +++ b/runtime/domain.c @@ -4,7 +4,9 @@ /* */ /* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ /* Stephen Dolan, University of Cambridge */ +/* Tom Kelly, OCaml Labs Consultancy */ /* */ +/* Copyright 2021 OCaml Labs Consultancy Ltd */ /* Copyright 2019 Indian Institute of Technology, Madras */ /* Copyright 2019 University of Cambridge */ /* */ @@ -16,79 +18,1441 @@ #define CAML_INTERNALS +#include +#include +#include +#include +#include "caml/alloc.h" +#include "caml/backtrace.h" +#include "caml/callback.h" +#include "caml/domain.h" #include "caml/domain_state.h" +#include "caml/eventlog.h" +#include "caml/fail.h" +#include "caml/fiber.h" +#include "caml/finalise.h" +#include "caml/gc_ctrl.h" +#include "caml/globroots.h" +#include "caml/intext.h" +#include "caml/major_gc.h" +#include "caml/minor_gc.h" #include "caml/memory.h" +#include "caml/osdeps.h" +#include "caml/platform.h" +#include "caml/shared_heap.h" +#include "caml/signals.h" +#include "caml/startup.h" +#include "caml/sync.h" +#include "caml/weak.h" -CAMLexport caml_domain_state* Caml_state; +/* From a runtime perspective, domains must handle stop-the-world (STW) + sections, during which: + - they are within a section no mutator code is running + - all domains will execute the section in parallel + - barriers are provided to know all domains have reached the + same stage within a section -void caml_init_domain () + Stop-the-world sections are used to handle duties such as: + - minor GC + - major GC to trigger major state 
machine phase changes + + Two invariants for STW sections: + - domains only execute mutator code if in the stop-the-world set + - domains in the stop-the-world set guarantee to service the sections +*/ + +/* The main C-stack for a domain can enter a blocking call. + In this scenario a 'backup thread' will become responsible for + servicing the STW sections on behalf of the domain. Care is needed + to hand off duties for servicing STW sections between the main + pthread and the backup pthread when caml_enter_blocking_section + and caml_leave_blocking_section are called. + + When the state for the backup thread is BT_IN_BLOCKING_SECTION + the backup thread will service the STW section. + + The state machine for the backup thread (and its transitions) + are: + + BT_INIT <---------------------------------------+ + | | + (install_backup_thread) | + [main pthread] | + | | + v | + BT_ENTERING_OCAML <-----------------+ | + | | | +(caml_enter_blocking_section) | | + [main pthread] | | + | | | + | | | + | (caml_leave_blocking_section) | + | [main pthread] | + v | | + BT_IN_BLOCKING_SECTION ----------------+ | + | | + (domain_terminate) | + [main pthread] | + | | + v | + BT_TERMINATE (backup_thread_func) + | [backup pthread] + | | + +---------------------------------------------+ + + */ +#define BT_IN_BLOCKING_SECTION 0 +#define BT_ENTERING_OCAML 1 +#define BT_TERMINATE 2 +#define BT_INIT 3 + +/* control of STW interrupts */ +struct interruptor { + atomic_uintnat* interrupt_word; + caml_plat_mutex lock; + caml_plat_cond cond; + + int running; + int terminating; + /* unlike the domain ID, this ID number is not reused */ + uintnat unique_id; + + atomic_uintnat interrupt_pending; +}; + +struct dom_internal { + /* readonly fields, initialised and never modified */ + int id; + caml_domain_state* state; + struct interruptor interruptor; + + /* backup thread */ + int backup_thread_running; + pthread_t backup_thread; + atomic_uintnat backup_thread_msg; + caml_plat_mutex domain_lock; 
+ caml_plat_cond domain_cond; + + /* readonly */ + uintnat tls_area; + uintnat tls_area_end; + uintnat minor_heap_area; + uintnat minor_heap_area_end; +}; +typedef struct dom_internal dom_internal; + + +static struct { + atomic_uintnat domains_still_running; + atomic_uintnat num_domains_still_processing; + void (*callback)(caml_domain_state*, + void*, + int participating_count, + caml_domain_state** others_participating); + void* data; + void (*enter_spin_callback)(caml_domain_state*, void*); + void* enter_spin_data; + + /* barrier state */ + int num_domains; + atomic_uintnat barrier; + + caml_domain_state* participating[Max_domains]; +} stw_request = { + ATOMIC_UINTNAT_INIT(0), + ATOMIC_UINTNAT_INIT(0), + NULL, + NULL, + NULL, + NULL, + 0, + ATOMIC_UINTNAT_INIT(0), + { 0 }, +}; + +static caml_plat_mutex all_domains_lock = CAML_PLAT_MUTEX_INITIALIZER; +static caml_plat_cond all_domains_cond = + CAML_PLAT_COND_INITIALIZER(&all_domains_lock); +static atomic_uintnat /* dom_internal* */ stw_leader = 0; +static struct dom_internal all_domains[Max_domains]; + +CAMLexport atomic_uintnat caml_num_domains_running; + +CAMLexport uintnat caml_minor_heaps_base; +CAMLexport uintnat caml_minor_heaps_end; +CAMLexport uintnat caml_tls_areas_base; +static __thread dom_internal* domain_self; + +/* + * This structure is protected by all_domains_lock + * [0, participating_domains) are all the domains taking part in STW sections + * [participating_domains, Max_domains) are all those domains free to be used + */ +static struct { + int participating_domains; + dom_internal* domains[Max_domains]; +} stw_domains = { + 0, + { 0 } +}; + +static void add_to_stw_domains(dom_internal* dom) { + int i; + CAMLassert(stw_domains.participating_domains < Max_domains); + for(i=stw_domains.participating_domains; stw_domains.domains[i]!=dom; ++i) { + CAMLassert(iinterrupt_word, INTERRUPT_MAGIC); +} + +int caml_incoming_interrupts_queued(void) +{ + return 
atomic_load_acq(&domain_self->interruptor.interrupt_pending); +} + +/* must NOT be called with s->lock held */ +static void stw_handler(caml_domain_state* domain); +static uintnat handle_incoming(struct interruptor* s) +{ + uintnat handled = atomic_load_acq(&s->interrupt_pending); + CAMLassert (s->running); + if (handled) { + atomic_store_rel(&s->interrupt_pending, 0); + + stw_handler(domain_self->state); + } + return handled; +} + +static void handle_incoming_otherwise_relax (struct interruptor* self) +{ + if (!handle_incoming(self)) + cpu_relax(); +} + +void caml_handle_incoming_interrupts(void) +{ + handle_incoming(&domain_self->interruptor); +} + +int caml_send_interrupt(struct interruptor* target) +{ + /* signal that there is an interrupt pending */ + CAMLassert(!atomic_load_acq(&target->interrupt_pending)); + atomic_store_rel(&target->interrupt_pending, 1); + + /* Signal the condition variable, in case the target is + itself waiting for an interrupt to be processed elsewhere */ + caml_plat_lock(&target->lock); + caml_plat_broadcast(&target->cond); // OPT before/after unlock? elide? 
+ caml_plat_unlock(&target->lock); + + interrupt_domain(target); + + return 1; +} + +static void caml_wait_interrupt_serviced(struct interruptor* target) +{ + int i; + + /* Often, interrupt handlers are fast, so spin for a bit before waiting */ + for (i=0; i<1000; i++) { + if (!atomic_load_acq(&target->interrupt_pending)) { + return; + } + cpu_relax(); + } + + { + SPIN_WAIT { + if (!atomic_load_acq(&target->interrupt_pending)) + return; + } + } +} + +#define MAX_DOMAIN_NAME_LENGTH 16 +void caml_domain_set_name(char *name) +{ + char thread_name[MAX_DOMAIN_NAME_LENGTH]; + snprintf(thread_name, MAX_DOMAIN_NAME_LENGTH, + "%s%d", name, Caml_state->id); + caml_thread_setname(thread_name); +} + +asize_t caml_norm_minor_heap_size (intnat wsize) +{ + asize_t bs, max; + if (wsize < Minor_heap_min) wsize = Minor_heap_min; + bs = caml_mem_round_up_pages(Bsize_wsize (wsize)); + + max = Bsize_wsize(Minor_heap_max); + + if (bs > max) bs = max; + + return Wsize_bsize(bs); +} + +int caml_reallocate_minor_heap(asize_t wsize) +{ + caml_domain_state* domain_state = Caml_state; + CAMLassert(domain_state->young_ptr == domain_state->young_end); + + /* free old minor heap. + instead of unmapping the heap, we decommit it, so there's + no race whereby other code could attempt to reuse the memory. 
*/ + caml_mem_decommit( + (void*)domain_self->minor_heap_area, + domain_self->minor_heap_area_end - domain_self->minor_heap_area); + + wsize = caml_norm_minor_heap_size(wsize); + + if (!caml_mem_commit( + (void*)domain_self->minor_heap_area, Bsize_wsize(wsize))) { + return -1; + } + +#ifdef DEBUG + { + uintnat* p = (uintnat*)domain_self->minor_heap_area; + for (; + p < (uintnat*)(domain_self->minor_heap_area + Bsize_wsize(wsize)); + p++) + *p = Debug_uninit_align; + } +#endif + + domain_state->minor_heap_wsz = wsize; + + domain_state->young_start = (value*)domain_self->minor_heap_area; + domain_state->young_end = + (value*)(domain_self->minor_heap_area + Bsize_wsize(wsize)); + domain_state->young_limit = (uintnat) domain_state->young_start; + domain_state->young_ptr = domain_state->young_end; + return 0; +} + +/* must be run on the domain's thread */ +static void create_domain(uintnat initial_minor_heap_wsize) { + dom_internal* d = 0; + CAMLassert (domain_self == 0); + + /* take the all_domains_lock so that we can alter the STW participant + set atomically */ + caml_plat_lock(&all_domains_lock); + + /* wait until any in-progress STW sections end */ + while (atomic_load_acq(&stw_leader)) caml_plat_wait(&all_domains_cond); + + d = next_free_domain(); + if (d) { + struct interruptor* s = &d->interruptor; + CAMLassert(!s->running); + CAMLassert(!s->interrupt_pending); + if (!s->interrupt_word) { + caml_domain_state* domain_state; + atomic_uintnat* young_limit; + /* never been started before, so set up minor heap */ + if (!caml_mem_commit( + (void*)d->tls_area, (d->tls_area_end - d->tls_area))) { + /* give up now */ + d = 0; + goto domain_init_complete; + } + domain_state = (caml_domain_state*)(d->tls_area); + young_limit = (atomic_uintnat*)&domain_state->young_limit; + s->interrupt_word = young_limit; + atomic_store_rel(young_limit, (uintnat)domain_state->young_start); + } + s->running = 1; + atomic_fetch_add(&caml_num_domains_running, 1); + } + + if (d) { + 
caml_domain_state* domain_state; + domain_self = d; + SET_Caml_state((void*)(d->tls_area)); + domain_state = (caml_domain_state*)(d->tls_area); + caml_plat_lock(&d->domain_lock); + + domain_state->id = d->id; + domain_state->unique_id = d->interruptor.unique_id; + d->state = domain_state; + CAMLassert(!d->interruptor.interrupt_pending); + + domain_state->extra_heap_resources = 0.0; + domain_state->extra_heap_resources_minor = 0.0; + + domain_state->dependent_size = 0; + domain_state->dependent_allocated = 0; + + if (caml_init_signal_stack() < 0) { + goto init_signal_stack_failure; + } + + domain_state->young_start = domain_state->young_end = + domain_state->young_ptr = 0; + domain_state->minor_tables = caml_alloc_minor_tables(); + if(domain_state->minor_tables == NULL) { + goto alloc_minor_tables_failure; + } + + d->state->shared_heap = caml_init_shared_heap(); + if(d->state->shared_heap == NULL) { + goto init_shared_heap_failure; + } + + if (caml_init_major_gc(domain_state) < 0) { + goto init_major_gc_failure; + } + + if(caml_reallocate_minor_heap(initial_minor_heap_wsize) < 0) { + goto reallocate_minor_heap_failure; + } + + domain_state->dls_root = Val_unit; + caml_register_generational_global_root(&domain_state->dls_root); + + domain_state->stack_cache = caml_alloc_stack_cache(); + if(domain_state->stack_cache == NULL) { + goto create_stack_cache_failure; + } + + domain_state->extern_state = NULL; + + domain_state->intern_state = NULL; + + domain_state->current_stack = + caml_alloc_main_stack(Stack_size / sizeof(value)); + if(domain_state->current_stack == NULL) { + goto alloc_main_stack_failure; + } + + domain_state->c_stack = NULL; + domain_state->exn_handler = NULL; + + domain_state->gc_regs_buckets = NULL; + domain_state->gc_regs = NULL; + domain_state->gc_regs_slot = NULL; + + domain_state->allocated_words = 0; + domain_state->swept_words = 0; + + domain_state->local_roots = NULL; + + domain_state->backtrace_buffer = NULL; + domain_state->backtrace_last_exn 
= Val_unit; + domain_state->backtrace_active = 0; + caml_register_generational_global_root(&domain_state->backtrace_last_exn); + + domain_state->compare_unordered = 0; + domain_state->oo_next_id_local = 0; + + domain_state->requested_major_slice = 0; + domain_state->requested_minor_gc = 0; + domain_state->requested_external_interrupt = 0; + + domain_state->parser_trace = 0; + + if (caml_params->backtrace_enabled) { + caml_record_backtraces(1); + } + +#ifndef NATIVE_CODE + domain_state->external_raise = NULL; + domain_state->trap_sp_off = 1; + domain_state->trap_barrier_off = 0; +#endif + + add_to_stw_domains(domain_self); + goto domain_init_complete; + +alloc_main_stack_failure: +create_stack_cache_failure: + caml_remove_generational_global_root(&domain_state->dls_root); +reallocate_minor_heap_failure: + caml_teardown_major_gc(); +init_major_gc_failure: + caml_teardown_shared_heap(d->state->shared_heap); +init_shared_heap_failure: + caml_free_minor_tables(domain_state->minor_tables); + domain_state->minor_tables = NULL; +alloc_minor_tables_failure: + caml_free_signal_stack(); +init_signal_stack_failure: + domain_self = NULL; + + } +domain_init_complete: + caml_plat_unlock(&all_domains_lock); +} + +CAMLexport void caml_reset_domain_lock(void) +{ + dom_internal* self = domain_self; + // This is only used to reset the domain_lock state on fork. 
+ caml_plat_mutex_init(&self->domain_lock); + caml_plat_cond_init(&self->domain_cond, &self->domain_lock); + + return; +} + +void caml_init_domains(uintnat minor_heap_wsz) { + int i; + uintnat size; + uintnat tls_size; + uintnat tls_areas_size; + void* heaps_base; + void* tls_base; + + /* sanity check configuration */ + if (caml_mem_round_up_pages(Bsize_wsize(Minor_heap_max)) + != Bsize_wsize(Minor_heap_max)) + caml_fatal_error("Minor_heap_max misconfigured for this platform"); + + /* reserve memory space for minor heaps and tls_areas */ + size = (uintnat)Bsize_wsize(Minor_heap_max) * Max_domains; + tls_size = caml_mem_round_up_pages(sizeof(caml_domain_state)); + tls_areas_size = tls_size * Max_domains; + + heaps_base = caml_mem_map(size, size, 1 /* reserve_only */); + tls_base = + caml_mem_map(tls_areas_size, tls_areas_size, 1 /* reserve_only */); + if (!heaps_base || !tls_base) + caml_fatal_error("Not enough heap memory to start up"); + + caml_minor_heaps_base = (uintnat) heaps_base; + caml_minor_heaps_end = (uintnat) heaps_base + size; + caml_tls_areas_base = (uintnat) tls_base; + + for (i = 0; i < Max_domains; i++) { + struct dom_internal* dom = &all_domains[i]; + uintnat domain_minor_heap_base; + uintnat domain_tls_base; + + stw_domains.domains[i] = dom; + + dom->id = i; + + dom->interruptor.interrupt_word = 0; + caml_plat_mutex_init(&dom->interruptor.lock); + caml_plat_cond_init(&dom->interruptor.cond, + &dom->interruptor.lock); + dom->interruptor.running = 0; + dom->interruptor.terminating = 0; + dom->interruptor.unique_id = i; + dom->interruptor.interrupt_pending = 0; + + caml_plat_mutex_init(&dom->domain_lock); + caml_plat_cond_init(&dom->domain_cond, &dom->domain_lock); + dom->backup_thread_running = 0; + dom->backup_thread_msg = BT_INIT; + + domain_minor_heap_base = caml_minor_heaps_base + + (uintnat)Bsize_wsize(Minor_heap_max) * (uintnat)i; + domain_tls_base = caml_tls_areas_base + tls_size * (uintnat)i; + dom->tls_area = domain_tls_base; + 
dom->tls_area_end = domain_tls_base + tls_size; + dom->minor_heap_area = domain_minor_heap_base; + dom->minor_heap_area_end = + domain_minor_heap_base + Bsize_wsize(Minor_heap_max); + } + + create_domain(minor_heap_wsz); + if (!domain_self) caml_fatal_error("Failed to create main domain"); + + caml_init_signal_handling(); + + CAML_EVENTLOG_INIT(); + caml_domain_set_name("Domain"); +} + +void caml_init_domain_self(int domain_id) { + CAMLassert (domain_id >= 0 && domain_id < Max_domains); + domain_self = &all_domains[domain_id]; + SET_Caml_state(domain_self->state); +} + +enum domain_status { Dom_starting, Dom_started, Dom_failed }; + +struct domain_ml_values { + value callback; + value mutex; + /* this mutex is taken when a domain starts and released when it terminates + which provides a simple way to block domains attempting to join this domain + */ +}; + +static void init_domain_ml_values( + struct domain_ml_values* ml_values, + value callback, + value mutex) +{ + ml_values->callback = callback; + ml_values->mutex = mutex; + caml_register_generational_global_root(&ml_values->callback); + caml_register_generational_global_root(&ml_values->mutex); +} + +static void free_domain_ml_values(struct domain_ml_values* ml_values) { + caml_remove_generational_global_root(&ml_values->callback); + caml_remove_generational_global_root(&ml_values->mutex); + caml_stat_free(ml_values); +} + +struct domain_startup_params { + struct interruptor* parent; + enum domain_status status; + struct domain_ml_values* ml_values; + dom_internal* newdom; + uintnat unique_id; +#ifndef _WIN32 + /* signal mask to set after it is safe to do so */ + sigset_t mask; +#endif +}; + +static void* backup_thread_func(void* v) +{ + dom_internal* di = (dom_internal*)v; + uintnat msg; + struct interruptor* s = &di->interruptor; + + domain_self = di; + SET_Caml_state((void*)(di->tls_area)); + + caml_domain_set_name("BackupThread"); + + CAML_EVENTLOG_IS_BACKUP_THREAD(); + + /* TODO: how does the backup thread 
interact with the eventlog infra? + * caml_ev_tag_self_as_backup_thread(); */ + + msg = atomic_load_acq (&di->backup_thread_msg); + while (msg != BT_TERMINATE) { + CAMLassert (msg <= BT_TERMINATE); + switch (msg) { + case BT_IN_BLOCKING_SECTION: + /* Handle interrupts on behalf of the main thread: + * - must hold domain_lock to handle interrupts + * - need to guarantee no blocking so that backup thread + * can be signalled from caml_leave_blocking_section + */ + if (caml_incoming_interrupts_queued()) { + if (caml_plat_try_lock(&di->domain_lock)) { + caml_handle_incoming_interrupts(); + caml_plat_unlock(&di->domain_lock); + } + } + /* Wait safely if there is nothing to do. + * Will be woken from caml_leave_blocking_section + */ + caml_plat_lock(&s->lock); + msg = atomic_load_acq (&di->backup_thread_msg); + if (msg == BT_IN_BLOCKING_SECTION && + !caml_incoming_interrupts_queued()) + caml_plat_wait(&s->cond); + caml_plat_unlock(&s->lock); + break; + case BT_ENTERING_OCAML: + /* Main thread wants to enter OCaml + * Will be woken from caml_bt_exit_ocaml + * or domain_terminate + */ + caml_plat_lock(&di->domain_lock); + msg = atomic_load_acq (&di->backup_thread_msg); + if (msg == BT_ENTERING_OCAML) + caml_plat_wait(&di->domain_cond); + caml_plat_unlock(&di->domain_lock); + break; + default: + cpu_relax(); + break; + }; + msg = atomic_load_acq (&di->backup_thread_msg); + } + + /* doing terminate */ + atomic_store_rel(&di->backup_thread_msg, BT_INIT); + + return 0; +} + +static void install_backup_thread (dom_internal* di) +{ + int err; +#ifndef _WIN32 + sigset_t mask, old_mask; +#endif + + if (di->backup_thread_running == 0) { + CAMLassert (di->backup_thread_msg == BT_INIT || /* Using fresh domain */ + di->backup_thread_msg == BT_TERMINATE); /* Reusing domain */ + + while (atomic_load_acq(&di->backup_thread_msg) != BT_INIT) { + /* Give a chance for backup thread on this domain to terminate */ + caml_plat_unlock (&di->domain_lock); + cpu_relax (); + caml_plat_lock 
(&di->domain_lock); + } + +#ifndef _WIN32 + /* No signals on the backup thread */ + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, &old_mask); +#endif + + atomic_store_rel(&di->backup_thread_msg, BT_ENTERING_OCAML); + err = pthread_create(&di->backup_thread, 0, backup_thread_func, (void*)di); + +#ifndef _WIN32 + pthread_sigmask(SIG_SETMASK, &old_mask, NULL); +#endif + + if (err) + caml_failwith("failed to create domain backup thread"); + di->backup_thread_running = 1; + pthread_detach(di->backup_thread); + } +} + +static void caml_domain_stop_default(void) +{ + return; +} + +static void caml_domain_start_default(void) +{ + return; +} + +static void caml_domain_external_interrupt_hook_default(void) +{ + return; +} + +CAMLexport void (*caml_domain_start_hook)(void) = + caml_domain_start_default; + +CAMLexport void (*caml_domain_stop_hook)(void) = + caml_domain_stop_default; + +CAMLexport void (*caml_domain_external_interrupt_hook)(void) = + caml_domain_external_interrupt_hook_default; + +static void domain_terminate(); - Caml_state = - (caml_domain_state*)caml_stat_alloc_noexc(sizeof(caml_domain_state)); - if (Caml_state == NULL) - caml_fatal_error ("cannot initialize domain state"); - - Caml_state->young_limit = NULL; - Caml_state->exception_pointer = NULL; - - Caml_state->young_ptr = NULL; - Caml_state->young_base = NULL; - Caml_state->young_start = NULL; - Caml_state->young_end = NULL; - Caml_state->young_alloc_start = NULL; - Caml_state->young_alloc_mid = NULL; - Caml_state->young_alloc_end = NULL; - Caml_state->young_trigger = NULL; - Caml_state->minor_heap_wsz = 0; - Caml_state->in_minor_collection = 0; - Caml_state->extra_heap_resources_minor = 0; - caml_alloc_minor_tables(); - - Caml_state->stack_low = NULL; - Caml_state->stack_high = NULL; - Caml_state->stack_threshold = NULL; - Caml_state->extern_sp = NULL; - Caml_state->trapsp = NULL; - Caml_state->trap_barrier = NULL; - Caml_state->external_raise = NULL; - Caml_state->exn_bucket = Val_unit; - - 
Caml_state->top_of_stack = NULL; - Caml_state->bottom_of_stack = NULL; /* no stack initially */ - Caml_state->last_return_address = 1; /* not in OCaml code initially */ - Caml_state->gc_regs = NULL; - - Caml_state->stat_minor_words = 0.0; - Caml_state->stat_promoted_words = 0.0; - Caml_state->stat_major_words = 0.0; - Caml_state->stat_minor_collections = 0; - Caml_state->stat_major_collections = 0; - Caml_state->stat_heap_wsz = 0; - Caml_state->stat_top_heap_wsz = 0; - Caml_state->stat_compactions = 0; - Caml_state->stat_forced_major_collections = 0; - Caml_state->stat_heap_chunks = 0; - - Caml_state->backtrace_active = 0; - Caml_state->backtrace_pos = 0; - Caml_state->backtrace_buffer = NULL; - Caml_state->backtrace_last_exn = Val_unit; - - Caml_state->compare_unordered = 0; - Caml_state->local_roots = NULL; - Caml_state->requested_major_slice = 0; - Caml_state->requested_minor_gc = 0; - - Caml_state->eventlog_enabled = 0; - Caml_state->eventlog_paused = 0; - Caml_state->eventlog_startup_pid = 0; - Caml_state->eventlog_startup_timestamp = 0; - Caml_state->eventlog_out = NULL; - -#if defined(NAKED_POINTERS_CHECKER) && !defined(_WIN32) - Caml_state->checking_pointer_pc = NULL; +static void* domain_thread_func(void* v) +{ + sync_mutex terminate_mutex = NULL; + struct domain_startup_params* p = v; + struct domain_ml_values *ml_values = p->ml_values; + + create_domain(caml_params->init_minor_heap_wsz); + /* this domain is now part of the STW participant set */ + p->newdom = domain_self; + + /* handshake with the parent domain */ + caml_plat_lock(&p->parent->lock); + if (domain_self) { + /* this domain is part of STW sections, so can read ml_values */ + terminate_mutex = Mutex_val(ml_values->mutex); + /* we lock terminate_mutex here and unlock when the domain is torn down + this provides a simple block for domains attempting to join */ + sync_mutex_lock(terminate_mutex); + p->status = Dom_started; + p->unique_id = domain_self->interruptor.unique_id; + } else { + 
p->status = Dom_failed; + } + caml_plat_broadcast(&p->parent->cond); + caml_plat_unlock(&p->parent->lock); + /* Cannot access p below here. */ + + if (domain_self) { + install_backup_thread(domain_self); + +#ifndef _WIN32 + /* It is now safe for us to handle signals */ + pthread_sigmask(SIG_SETMASK, &p->mask, NULL); +#endif + + caml_gc_log("Domain starting (unique_id = %"ARCH_INTNAT_PRINTF_FORMAT"u)", + domain_self->interruptor.unique_id); + caml_domain_set_name("Domain"); + caml_domain_start_hook(); + caml_callback(ml_values->callback, Val_unit); + domain_terminate(); + /* Joining domains will lock/unlock the terminate_mutex so this unlock will + release them if any domains are waiting. */ + sync_mutex_unlock(terminate_mutex); + free_domain_ml_values(ml_values); + } else { + caml_gc_log("Failed to create domain"); + } + return 0; +} + +CAMLprim value caml_domain_spawn(value callback, value mutex) +{ + CAMLparam2 (callback, mutex); + struct domain_startup_params p; + pthread_t th; + int err; +#ifndef _WIN32 + sigset_t mask, old_mask; +#endif + + CAML_EV_BEGIN(EV_DOMAIN_SPAWN); + p.parent = &domain_self->interruptor; + p.status = Dom_starting; + + p.ml_values = + (struct domain_ml_values*) caml_stat_alloc_noexc( + sizeof(struct domain_ml_values)); + if (!p.ml_values) { + caml_failwith("failed to create ml values for domain thread"); + } + init_domain_ml_values(p.ml_values, callback, mutex); + +/* We block all signals while we spawn the new domain. This is because + pthread_create inherits the current signals set, and we want to avoid a + signal handler being triggered in the new domain before the domain_state is + fully populated. */ +#ifndef _WIN32 + /* FIXME Spawning threads -> unix.c/win32.c */ + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, &old_mask); + p.mask = old_mask; +#endif + err = pthread_create(&th, 0, domain_thread_func, (void*)&p); +#ifndef _WIN32 + /* We can restore the signal mask we had initially now. 
*/ + pthread_sigmask(SIG_SETMASK, &old_mask, NULL); +#endif + + if (err) { + caml_failwith("failed to create domain thread"); + } + + /* While waiting for the child thread to start up, we need to service any + stop-the-world requests as they come in. */ + caml_plat_lock(&domain_self->interruptor.lock); + while (p.status == Dom_starting) { + if (caml_incoming_interrupts_queued()) { + caml_plat_unlock(&domain_self->interruptor.lock); + handle_incoming(&domain_self->interruptor); + caml_plat_lock(&domain_self->interruptor.lock); + } else { + caml_plat_wait(&domain_self->interruptor.cond); + } + } + caml_plat_unlock(&domain_self->interruptor.lock); + + if (p.status == Dom_started) { + /* successfully created a domain. + p.ml_values is now owned by that domain */ + pthread_detach(th); + } else { + CAMLassert (p.status == Dom_failed); + /* failed */ + pthread_join(th, 0); + free_domain_ml_values(p.ml_values); + caml_failwith("failed to allocate domain"); + } + /* When domain 0 first spawns a domain, the backup thread is not active, we + ensure it is started here. 
*/ + install_backup_thread(domain_self); + CAML_EV_END(EV_DOMAIN_SPAWN); + CAMLreturn (Val_long(p.unique_id)); +} + +CAMLprim value caml_ml_domain_id(value unit) +{ + CAMLnoalloc; + return Val_int(domain_self->interruptor.unique_id); +} + +CAMLprim value caml_ml_domain_unique_token (value unit) +{ + return Val_unit; +} + +/* sense-reversing barrier */ +#define BARRIER_SENSE_BIT 0x100000 + +barrier_status caml_global_barrier_begin(void) +{ + uintnat b = 1 + atomic_fetch_add(&stw_request.barrier, 1); + return b; +} + +int caml_global_barrier_is_final(barrier_status b) +{ + return ((b & ~BARRIER_SENSE_BIT) == stw_request.num_domains); +} + +void caml_global_barrier_end(barrier_status b) +{ + uintnat sense = b & BARRIER_SENSE_BIT; + if (caml_global_barrier_is_final(b)) { + /* last domain into the barrier, flip sense */ + atomic_store_rel(&stw_request.barrier, sense ^ BARRIER_SENSE_BIT); + } else { + /* wait until another domain flips the sense */ + SPIN_WAIT { + uintnat barrier = atomic_load_acq(&stw_request.barrier); + if ((barrier & BARRIER_SENSE_BIT) != sense) break; + } + } +} + +void caml_global_barrier(void) +{ + barrier_status b = caml_global_barrier_begin(); + caml_global_barrier_end(b); +} + +int caml_global_barrier_num_domains(void) +{ + return stw_request.num_domains; +} + +static void decrement_stw_domains_still_processing(void) +{ + /* we check if we are the last to leave a stw section + if so, clear the stw_leader to allow the new stw sections to start. 
+ */ + intnat am_last = + atomic_fetch_add(&stw_request.num_domains_still_processing, -1) == 1; + + if( am_last ) { + /* release the STW lock to allow new STW sections */ + caml_plat_lock(&all_domains_lock); + atomic_store_rel(&stw_leader, 0); + caml_plat_broadcast(&all_domains_cond); + caml_gc_log("clearing stw leader"); + caml_plat_unlock(&all_domains_lock); + } +} + +static void caml_poll_gc_work(void); +static void stw_handler(caml_domain_state* domain) +{ + CAML_EV_BEGIN(EV_STW_HANDLER); + CAML_EV_BEGIN(EV_STW_API_BARRIER); + { + SPIN_WAIT { + if (atomic_load_acq(&stw_request.domains_still_running) == 0) + break; + + if (stw_request.enter_spin_callback) + stw_request.enter_spin_callback(domain, stw_request.enter_spin_data); + } + } + CAML_EV_END(EV_STW_API_BARRIER); + + #ifdef DEBUG + Caml_state->inside_stw_handler = 1; + #endif + stw_request.callback( + domain, + stw_request.data, + stw_request.num_domains, + stw_request.participating); + #ifdef DEBUG + Caml_state->inside_stw_handler = 0; + #endif + + decrement_stw_domains_still_processing(); + + CAML_EV_END(EV_STW_HANDLER); + + /* poll the GC to check for deferred work + we do this here because blocking or waiting threads only execute + the interrupt handler and do not poll for deferred work*/ + caml_poll_gc_work(); +} + + +#ifdef DEBUG +int caml_domain_is_in_stw(void) { + return Caml_state->inside_stw_handler; +} +#endif + +int caml_try_run_on_all_domains_with_spin_work( + void (*handler)(caml_domain_state*, void*, int, caml_domain_state**), + void* data, + void (*leader_setup)(caml_domain_state*), + void (*enter_spin_callback)(caml_domain_state*, void*), + void* enter_spin_data) +{ + int i; + caml_domain_state* domain_state = domain_self->state; + + caml_gc_log("requesting STW"); + + /* Don't touch the lock if there's already a stw leader + OR we can't get the lock */ + if (atomic_load_acq(&stw_leader) || + !caml_plat_try_lock(&all_domains_lock)) { + caml_handle_incoming_interrupts(); + return 0; + } + + 
/* see if there is a stw_leader already */ + if (atomic_load_acq(&stw_leader)) { + caml_plat_unlock(&all_domains_lock); + caml_handle_incoming_interrupts(); + return 0; + } + + /* we have the lock and can claim the stw_leader */ + atomic_store_rel(&stw_leader, (uintnat)domain_self); + + CAML_EV_BEGIN(EV_STW_LEADER); + caml_gc_log("causing STW"); + + /* setup all fields for this stw_request, must have those needed + for domains waiting at the enter spin barrier */ + stw_request.enter_spin_callback = enter_spin_callback; + stw_request.enter_spin_data = enter_spin_data; + stw_request.callback = handler; + stw_request.data = data; + atomic_store_rel(&stw_request.barrier, 0); + atomic_store_rel(&stw_request.domains_still_running, 1); + stw_request.num_domains = stw_domains.participating_domains; + atomic_store_rel(&stw_request.num_domains_still_processing, + stw_domains.participating_domains); + + if( leader_setup ) { + leader_setup(domain_state); + } + +#ifdef DEBUG + { + int domains_participating = 0; + for(i=0; i 0); + } +#endif + + /* Next, interrupt all domains */ + for(i = 0; i < stw_domains.participating_domains; i++) { + caml_domain_state* d = stw_domains.domains[i]->state; + stw_request.participating[i] = d; + if (d != domain_state) { + caml_send_interrupt(&stw_domains.domains[i]->interruptor); + } else { + CAMLassert(!domain_self->interruptor.interrupt_pending); + } + } + + /* domains now know they are part of the STW */ + caml_plat_unlock(&all_domains_lock); + + for(i = 0; i < stw_request.num_domains; i++) { + int id = stw_request.participating[i]->id; + caml_wait_interrupt_serviced(&all_domains[id].interruptor); + } + + /* release from the enter barrier */ + atomic_store_rel(&stw_request.domains_still_running, 0); + + #ifdef DEBUG + domain_state->inside_stw_handler = 1; + #endif + handler(domain_state, data, + stw_request.num_domains, stw_request.participating); + #ifdef DEBUG + domain_state->inside_stw_handler = 0; #endif + + 
decrement_stw_domains_still_processing(); + + CAML_EV_END(EV_STW_LEADER); + + return 1; +} + +int caml_try_run_on_all_domains( + void (*handler)(caml_domain_state*, void*, int, caml_domain_state**), + void* data, + void (*leader_setup)(caml_domain_state*)) +{ + return + caml_try_run_on_all_domains_with_spin_work(handler, + data, + leader_setup, 0, 0); +} + +void caml_interrupt_self(void) { + interrupt_domain(&domain_self->interruptor); +} + +static void caml_poll_gc_work(void) +{ + CAMLalloc_point_here; + + if (((uintnat)Caml_state->young_ptr - Bhsize_wosize(Max_young_wosize) < + (uintnat)Caml_state->young_start) || + Caml_state->requested_minor_gc) { + /* out of minor heap or collection forced */ + CAML_EV_BEGIN(EV_MINOR); + Caml_state->requested_minor_gc = 0; + caml_empty_minor_heaps_once(); + CAML_EV_END(EV_MINOR); + + /* FIXME: a domain will only ever call finalizers if its minor + heap triggers the minor collection + Care may be needed with finalizers running when the domain + is waiting in a blocking section and serviced by the backup + thread. 
+ */ + CAML_EV_BEGIN(EV_MINOR_FINALIZED); + caml_final_do_calls(); + CAML_EV_END(EV_MINOR_FINALIZED); + } + + if (Caml_state->requested_major_slice) { + CAML_EV_BEGIN(EV_MAJOR); + Caml_state->requested_major_slice = 0; + caml_major_collection_slice(AUTO_TRIGGERED_MAJOR_SLICE); + CAML_EV_END(EV_MAJOR); + } + + if (atomic_load_acq( + (atomic_uintnat*)&Caml_state->requested_external_interrupt)) { + caml_domain_external_interrupt_hook(); + } + +} + +CAMLexport int caml_check_pending_actions (void) +{ + atomic_uintnat* young_limit = domain_self->interruptor.interrupt_word; + + return atomic_load_acq(young_limit) == INTERRUPT_MAGIC; +} + +static void handle_gc_interrupt() { + atomic_uintnat* young_limit = domain_self->interruptor.interrupt_word; + CAMLalloc_point_here; + + CAML_EV_BEGIN(EV_INTERRUPT_GC); + if (caml_check_pending_actions()) { + /* interrupt */ + CAML_EV_BEGIN(EV_INTERRUPT_REMOTE); + while (caml_check_pending_actions()) { + uintnat i = INTERRUPT_MAGIC; + atomic_compare_exchange_strong( + young_limit, &i, (uintnat)Caml_state->young_start); + } + caml_handle_incoming_interrupts(); + CAML_EV_END(EV_INTERRUPT_REMOTE); + } + + caml_poll_gc_work(); + + CAML_EV_END(EV_INTERRUPT_GC); +} + +CAMLexport void caml_process_pending_actions(void) +{ + handle_gc_interrupt(); + caml_process_pending_signals(); +} + +void caml_handle_gc_interrupt_no_async_exceptions(void) +{ + handle_gc_interrupt(); +} + +void caml_handle_gc_interrupt(void) +{ + handle_gc_interrupt(); +} + +CAMLexport int caml_bt_is_in_blocking_section(void) +{ + dom_internal* self = domain_self; + uintnat status = atomic_load_acq(&self->backup_thread_msg); + if (status == BT_IN_BLOCKING_SECTION) + return 1; + else + return 0; + +} + +CAMLexport intnat caml_domain_is_multicore (void) +{ + dom_internal *self = domain_self; + return (!caml_domain_alone() || self->backup_thread_running); +} + +CAMLexport void caml_acquire_domain_lock(void) +{ + dom_internal* self = domain_self; + 
caml_plat_lock(&self->domain_lock); +} + +CAMLexport void caml_bt_enter_ocaml(void) +{ + dom_internal* self = domain_self; + + CAMLassert(caml_domain_alone() || self->backup_thread_running); + + if (self->backup_thread_running) { + atomic_store_rel(&self->backup_thread_msg, BT_ENTERING_OCAML); + } +} + +CAMLexport void caml_release_domain_lock(void) +{ + dom_internal* self = domain_self; + caml_plat_unlock(&self->domain_lock); +} + +CAMLexport void caml_bt_exit_ocaml(void) +{ + dom_internal* self = domain_self; + + CAMLassert(caml_domain_alone() || self->backup_thread_running); + + if (self->backup_thread_running) { + atomic_store_rel(&self->backup_thread_msg, BT_IN_BLOCKING_SECTION); + /* Wakeup backup thread if it is sleeping */ + caml_plat_signal(&self->domain_cond); + } +} + +/* default handler for unix_fork, will be called by unix_fork. */ +static void caml_atfork_default(void) { + caml_reset_domain_lock(); + caml_acquire_domain_lock(); +} + +CAMLexport void (*caml_atfork_hook)(void) = caml_atfork_default; + +static void handover_ephemerons(caml_domain_state* domain_state) +{ + if (domain_state->ephe_info->todo == 0 && + domain_state->ephe_info->live == 0) + return; + + caml_add_to_orphaned_ephe_list(domain_state->ephe_info); + CAMLassert (domain_state->ephe_info->live == 0); + CAMLassert (domain_state->ephe_info->todo == 0); +} + +static void handover_finalisers(caml_domain_state* domain_state) +{ + struct caml_final_info* f = domain_state->final_info; + + if (f->todo_head != NULL || f->first.size != 0 || f->last.size != 0) { + /* have some final structures */ + if (caml_gc_phase != Phase_sweep_and_mark_main) { + /* Force a major GC cycle to simplify constraints for + * handing over finalisers. 
*/ + caml_finish_major_cycle(); + CAMLassert(caml_gc_phase == Phase_sweep_and_mark_main); + } + caml_add_orphaned_finalisers (f); + /* Create a dummy final info */ + domain_state->final_info = caml_alloc_final_info(); + } + caml_final_domain_terminate(domain_state); +} + +int caml_domain_is_terminating (void) +{ + struct interruptor* s = &domain_self->interruptor; + return s->terminating; +} + +static void domain_terminate (void) +{ + caml_domain_state* domain_state = domain_self->state; + struct interruptor* s = &domain_self->interruptor; + int finished = 0; + + caml_gc_log("Domain terminating"); + s->terminating = 1; + + // run the domain termination hook + caml_domain_stop_hook(); + + while (!finished) { + caml_orphan_allocated_words(); + caml_finish_sweeping(); + + caml_empty_minor_heaps_once(); + + caml_finish_marking(); + handover_ephemerons(domain_state); + handover_finalisers(domain_state); + + /* take the all_domains_lock to try and exit the STW participant set + without racing with a STW section being triggered */ + caml_plat_lock(&all_domains_lock); + + /* The interaction of termination and major GC is quite subtle. + * + * At the end of the major GC, we decide the number of domains to mark and + * sweep for the next cycle. If a STW section has been started, it will + * require this domain to participate, which in turn could involve a + * major GC cycle. This would then require finish marking and sweeping + * again in order to decrement the globals [num_domains_to_mark] and + * [num_domains_to_sweep] (see major_gc.c). 
+ */ + + if (!caml_incoming_interrupts_queued() && + domain_state->marking_done && + domain_state->sweeping_done) { + + finished = 1; + s->terminating = 0; + s->running = 0; + s->unique_id += Max_domains; + + /* Remove this domain from stw_domains */ + remove_from_stw_domains(domain_self); + + /* signal the interruptor condition variable + * because the backup thread may be waiting on it + */ + caml_plat_lock(&s->lock); + caml_plat_broadcast(&s->cond); + caml_plat_unlock(&s->lock); + + CAMLassert (domain_self->backup_thread_running); + domain_self->backup_thread_running = 0; + } + caml_plat_unlock(&all_domains_lock); + } + /* We can not touch domain_self->interruptor after here + because it may be reused */ + caml_sample_gc_collect(domain_state); + caml_remove_generational_global_root(&domain_state->dls_root); + caml_remove_generational_global_root(&domain_state->backtrace_last_exn); + + caml_stat_free(domain_state->final_info); + caml_stat_free(domain_state->ephe_info); + caml_free_intern_state(); + caml_free_extern_state(); + caml_teardown_major_gc(); + CAML_EVENTLOG_TEARDOWN(); + caml_teardown_shared_heap(domain_state->shared_heap); + domain_state->shared_heap = 0; + caml_free_minor_tables(domain_state->minor_tables); + domain_state->minor_tables = 0; + caml_free_signal_stack(); + + if(domain_state->current_stack != NULL) { + caml_free_stack(domain_state->current_stack); + } + + /* signal the domain termination to the backup thread + NB: for a program with no additional domains, the backup thread + will not have been started */ + atomic_store_rel(&domain_self->backup_thread_msg, BT_TERMINATE); + caml_plat_signal(&domain_self->domain_cond); + caml_plat_unlock(&domain_self->domain_lock); + + caml_plat_assert_all_locks_unlocked(); + /* This is the last thing we do because we need to be able to rely + on caml_domain_alone (which uses caml_num_domains_running) in at least + the shared_heap lockfree fast paths */ + atomic_fetch_add(&caml_num_domains_running, -1); +} + 
+CAMLprim value caml_ml_domain_cpu_relax(value t) +{ + struct interruptor* self = &domain_self->interruptor; + handle_incoming_otherwise_relax (self); + return Val_unit; +} + +CAMLprim value caml_domain_dls_set(value t) +{ + CAMLnoalloc; + caml_modify_generational_global_root(&Caml_state->dls_root, t); + return Val_unit; +} + +CAMLprim value caml_domain_dls_get(value unused) +{ + CAMLnoalloc; + return Caml_state->dls_root; +} + +CAMLprim value caml_ml_domain_set_name(value name) +{ + CAMLparam1(name); + + if (caml_string_length(name) >= MAX_DOMAIN_NAME_LENGTH) + caml_invalid_argument("caml_ml_domain_set_name"); + caml_thread_setname(String_val(name)); + CAMLreturn(Val_unit); } diff --git a/runtime/dynlink_nat.c b/runtime/dynlink_nat.c index dba30c3848bb..98a45d4ca394 100644 --- a/runtime/dynlink_nat.c +++ b/runtime/dynlink_nat.c @@ -25,6 +25,8 @@ #include "caml/intext.h" #include "caml/osdeps.h" #include "caml/fail.h" +#include "caml/frame_descriptors.h" +#include "caml/globroots.h" #include "caml/signals.h" #include "caml/hooks.h" @@ -51,6 +53,8 @@ static void *getsym(void *handle, const char *module, const char *name){ return sym; } +extern char caml_globals_map[]; + CAMLprim value caml_natdynlink_getmap(value unit) { return caml_input_value_from_block(caml_globals_map, INT_MAX); @@ -111,16 +115,12 @@ CAMLprim value caml_natdynlink_run(value handle_v, value symbol) { sym = optsym("__gc_roots"); if (NULL != sym) caml_register_dyn_global(sym); - sym = optsym("__data_begin"); - sym2 = optsym("__data_end"); - if (NULL != sym && NULL != sym2) - caml_page_table_add(In_static_data, sym, sym2); - sym = optsym("__code_begin"); sym2 = optsym("__code_end"); - if (NULL != sym && NULL != sym2) + if (NULL != sym && NULL != sym2) { caml_register_code_fragment((char *) sym, (char *) sym2, DIGEST_LATER, NULL); + } if( caml_natdynlink_hook != NULL ) caml_natdynlink_hook(handle,unit); diff --git a/runtime/eventlog.c b/runtime/eventlog.c index d9aca44aed77..8653d2826bb8 100644 --- 
a/runtime/eventlog.c +++ b/runtime/eventlog.c @@ -40,7 +40,7 @@ #ifdef CAML_INSTR #define CTF_MAGIC 0xc1fc1fc1 -#define CAML_TRACE_VERSION 0x1 +#define CAML_TRACE_VERSION 0x100 /* Multicore OCaml WIP */ struct ctf_stream_header { uint32_t magic; @@ -57,7 +57,6 @@ static struct ctf_stream_header header = { #pragma pack(1) struct ctf_event_header { uint64_t timestamp; - uint32_t pid; uint32_t id; }; @@ -67,15 +66,30 @@ struct event { uint16_t counter_kind; /* misc counter name */ uint8_t alloc_bucket; /* for alloc counters */ uint64_t count; /* for misc counters */ + uint8_t is_backup_thread; /* is this event from a backup thread ? */ }; -#define EVENT_BUF_SIZE 4096 +#define EVENT_BUF_SIZE 32768 struct event_buffer { + uintnat ev_flushed; + int domain_unique_id; uintnat ev_generated; + uint64_t alloc_buckets [20]; /* allocation stats, by size */ struct event events[EVENT_BUF_SIZE]; }; -static struct event_buffer* evbuf; +#define evbuf Caml_state->eventlog_buffer + +static time_t startup_timestamp; +static atomic_uintnat eventlog_enabled = 0; +static atomic_uintnat eventlog_paused = 0; + +static __thread uint8_t is_backup_thread = 0; + +void caml_eventlog_is_backup_thread (void) +{ + is_backup_thread = 1; +} static int64_t time_counter(void) { @@ -120,33 +134,48 @@ static int64_t time_counter(void) #endif } -static void setup_evbuf() +static void flush_events(FILE* out, struct event_buffer* eb); + +void caml_eventlog_teardown(void) { - CAMLassert(!evbuf); - evbuf = caml_stat_alloc_noexc(sizeof(*evbuf)); + if(!evbuf) return; - if (evbuf == NULL) - caml_fatal_error("eventlog: could not allocate event buffer"); + if (Caml_state->eventlog_out != NULL) { + flush_events(Caml_state->eventlog_out, evbuf); + fflush(Caml_state->eventlog_out); + fclose(Caml_state->eventlog_out); + Caml_state->eventlog_out = NULL; + }; - evbuf->ev_generated = 0; + free(evbuf); + + evbuf = NULL; } #define OUTPUT_FILE_LEN 4096 -static void setup_eventlog_file() +static void 
thread_setup_eventlog_file(int unique_id) { char_os output_file[OUTPUT_FILE_LEN]; char_os *eventlog_filename = NULL; + long pid; + +#ifdef _WIN32 + pid = _getpid(); +#else + pid = getpid(); +#endif eventlog_filename = caml_secure_getenv(T("OCAML_EVENTLOG_PREFIX")); if (eventlog_filename) { - int ret = snprintf_os(output_file, OUTPUT_FILE_LEN, T("%s.%ld.eventlog"), - eventlog_filename, Caml_state->eventlog_startup_pid); + int ret = snprintf_os(output_file, OUTPUT_FILE_LEN, + T("%s-caml-%ld-%d.eventlog"), + eventlog_filename, pid, unique_id); if (ret > OUTPUT_FILE_LEN) caml_fatal_error("eventlog: specified OCAML_EVENTLOG_PREFIX is too long"); } else { - snprintf_os(output_file, OUTPUT_FILE_LEN, T("caml-%ld.eventlog"), - Caml_state->eventlog_startup_pid); + snprintf_os(output_file, OUTPUT_FILE_LEN, T("caml-%ld-%d.eventlog"), + pid, unique_id); } Caml_state->eventlog_out = fopen_os(output_file, T("wb")); @@ -156,36 +185,63 @@ static void setup_eventlog_file() 1, Caml_state->eventlog_out); if (ret != 1) caml_eventlog_disable(); - fflush(Caml_state->eventlog_out); } else { caml_fatal_error("eventlog: could not open trace for writing"); } + + } #undef OUTPUT_FILE_LEN +static void thread_setup_evbuf() +{ + CAMLassert(!evbuf); + + evbuf = caml_stat_alloc_noexc(sizeof(*evbuf)); + + if (evbuf == NULL) + caml_fatal_error("eventlog: could not allocate event buffer"); + + evbuf->ev_flushed = 0; + evbuf->ev_generated = 0; + evbuf->domain_unique_id = 2 * Caml_state->unique_id; + + Caml_state->eventlog_out = NULL; + + if (!Caml_state->eventlog_out) + thread_setup_eventlog_file(evbuf->domain_unique_id); + + /* reset alloc_buckets */ + for (int i = 1; i < 20; i++) { + evbuf->alloc_buckets[i] = 0; + } + + return ; +} + #define FWRITE_EV(item, size) \ if (fwrite(item, size, 1, out) != 1) \ goto fwrite_failure; +#define FWRITE_HEADER(header, tid, is_bt) \ + FWRITE_EV(header, sizeof(struct ctf_event_header));\ + FWRITE_EV(tid, sizeof(uint32_t));\ + FWRITE_EV(is_bt, sizeof(uint8_t)); + 
static void flush_events(FILE* out, struct event_buffer* eb) { uintnat i; uint64_t flush_duration; - uintnat n = eb->ev_generated; + uintnat n = evbuf->ev_generated; + struct ctf_event_header header; // header for alloc event and flush events - struct ctf_event_header ev_flush; - ev_flush.id = EV_FLUSH; - ev_flush.timestamp = time_counter() - - Caml_state->eventlog_startup_timestamp; - ev_flush.pid = Caml_state->eventlog_startup_pid; + header.timestamp = time_counter() - startup_timestamp; for (i = 0; i < n; i++) { struct event ev = eb->events[i]; - ev.header.pid = Caml_state->eventlog_startup_pid; - - FWRITE_EV(&ev.header, sizeof(struct ctf_event_header)); - switch (ev.header.id) + FWRITE_HEADER(&ev.header, &eb->domain_unique_id, &ev.is_backup_thread); + switch (ev.header.id) /* event payload */ { case EV_ENTRY: FWRITE_EV(&ev.phase, sizeof(uint16_t)); @@ -197,21 +253,27 @@ static void flush_events(FILE* out, struct event_buffer* eb) FWRITE_EV(&ev.count, sizeof(uint64_t)); FWRITE_EV(&ev.counter_kind, sizeof(uint16_t)); break; - case EV_ALLOC: - FWRITE_EV(&ev.count, sizeof(uint64_t)); - FWRITE_EV(&ev.alloc_bucket, sizeof(uint8_t)); - break; default: break; } } - flush_duration = - (time_counter() - Caml_state->eventlog_startup_timestamp) - - ev_flush.timestamp; + // flush alloc counters + for (i = 1; i < 20; i++) { + if (eb->alloc_buckets[i] != 0) { + header.id = EV_ALLOC; + FWRITE_HEADER(&header, &eb->domain_unique_id, &is_backup_thread); + FWRITE_EV(&eb->alloc_buckets[i], sizeof(uint64_t)); + FWRITE_EV(&i, sizeof(uint8_t)); + }; + evbuf->alloc_buckets[i] = 0; + }; + + header.id = EV_FLUSH; + flush_duration = (time_counter() - startup_timestamp) - header.timestamp; - FWRITE_EV(&ev_flush, sizeof(struct ctf_event_header)); - FWRITE_EV(&flush_duration, sizeof(uint64_t)); + FWRITE_HEADER(&header, &eb->domain_unique_id, &is_backup_thread) + FWRITE_EV(&flush_duration, sizeof(int64_t)); return; @@ -239,27 +301,19 @@ static void teardown_eventlog(void) } } -void 
caml_eventlog_init() +void caml_eventlog_init (void) { char_os *toggle = caml_secure_getenv(T("OCAML_EVENTLOG_ENABLED")); if (toggle != NULL) { - Caml_state->eventlog_enabled = 1; + eventlog_enabled = 1; if (*toggle == 'p') - Caml_state->eventlog_paused = 1; + atomic_store_rel(&eventlog_paused, 1); }; - if (!Caml_state->eventlog_enabled) return; + if (!eventlog_enabled) return; - Caml_state->eventlog_startup_timestamp = time_counter(); -#ifdef _WIN32 - Caml_state->eventlog_startup_pid = _getpid(); -#else - Caml_state->eventlog_startup_pid = getpid(); -#endif - - setup_eventlog_file(); - setup_evbuf(); + startup_timestamp = time_counter(); atexit(&teardown_eventlog); } @@ -270,8 +324,9 @@ static void post_event(ev_gc_phase phase, ev_gc_counter counter_kind, uintnat i; struct event* ev; - if (!Caml_state->eventlog_enabled) return; - if (Caml_state->eventlog_paused) return; + if (!eventlog_enabled) return; + if (eventlog_paused) return; + if (!evbuf) thread_setup_evbuf(); i = evbuf->ev_generated; CAMLassert(i <= EVENT_BUF_SIZE); @@ -286,8 +341,8 @@ static void post_event(ev_gc_phase phase, ev_gc_counter counter_kind, ev->counter_kind = counter_kind; ev->alloc_bucket = bucket; ev->phase = phase; - ev->header.timestamp = time_counter() - - Caml_state->eventlog_startup_timestamp; + ev->is_backup_thread = is_backup_thread; + ev->header.timestamp = time_counter() - startup_timestamp; evbuf->ev_generated = i + 1; } @@ -306,49 +361,35 @@ void caml_ev_counter(ev_gc_counter counter, uint64_t val) post_event(0, counter, 0, val, EV_COUNTER); } -static uint64_t alloc_buckets [20] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - /* This function records allocations in caml_alloc_shr_aux in given bucket sizes These buckets are meant to be flushed explicitly by the caller through the caml_ev_alloc_flush function. Until then the buckets are just updated until flushed. + TODO(engil): we actually need to record these in Multicore. 
+ Since the allocator is much different in Multicore it is unclear where and if + we want these. */ void caml_ev_alloc(uint64_t sz) { - if (!Caml_state->eventlog_enabled) return; - if (Caml_state->eventlog_paused) return; + if (!eventlog_enabled) return; + if (eventlog_paused) return; + + if (evbuf == NULL) + thread_setup_evbuf(); if (sz < 10) { - ++alloc_buckets[sz]; + ++evbuf->alloc_buckets[sz]; } else if (sz < 100) { - ++alloc_buckets[sz/10 + 9]; + ++evbuf->alloc_buckets[sz/10 + 9]; } else { - ++alloc_buckets[19]; - } -} - -/* Note that this function does not trigger an actual disk flush, it just - pushes events in the event buffer. -*/ -void caml_ev_alloc_flush() -{ - int i; - - if (!Caml_state->eventlog_enabled) return; - if (Caml_state->eventlog_paused) return; - - for (i = 1; i < 20; i++) { - if (alloc_buckets[i] != 0) { - post_event(0, 0, i, alloc_buckets[i], EV_ALLOC); - }; - alloc_buckets[i] = 0; + ++evbuf->alloc_buckets[19]; } } -void caml_ev_flush() +void caml_ev_flush (void) { - if (!Caml_state->eventlog_enabled) return; - if (Caml_state->eventlog_paused) return; + if (!eventlog_enabled) return; + if (eventlog_paused) return; if (Caml_state->eventlog_out) { if (evbuf) @@ -357,28 +398,24 @@ void caml_ev_flush() }; } -void caml_eventlog_disable() +void caml_eventlog_disable (void) { - Caml_state->eventlog_enabled = 0; - teardown_eventlog(); + atomic_store_rel(&eventlog_enabled, 0); } CAMLprim value caml_eventlog_resume(value v) { CAMLassert(v == Val_unit); - if (Caml_state->eventlog_enabled) - Caml_state->eventlog_paused = 0; + if (eventlog_enabled) + atomic_store_rel(&eventlog_paused, 0); return Val_unit; } CAMLprim value caml_eventlog_pause(value v) { CAMLassert(v == Val_unit); - if (Caml_state->eventlog_enabled) { - Caml_state->eventlog_paused = 1; - if (evbuf && Caml_state->eventlog_out) - flush_events(Caml_state->eventlog_out, evbuf); - }; + if (eventlog_enabled) + atomic_store_rel(&eventlog_paused, 1); return Val_unit; } diff --git 
a/runtime/extern.c b/runtime/extern.c index 12d3c41e13ea..1cd7eb1b8014 100644 --- a/runtime/extern.c +++ b/runtime/extern.c @@ -32,10 +32,7 @@ #include "caml/misc.h" #include "caml/mlvalues.h" #include "caml/reverse.h" - -static uintnat obj_counter; /* Number of objects emitted so far */ -static uintnat size_32; /* Size in words of 32-bit block for struct. */ -static uintnat size_64; /* Size in words of 64-bit block for struct. */ +#include "caml/shared_heap.h" /* Flags affecting marshaling */ @@ -46,20 +43,12 @@ enum { be read back on a 32-bit platform */ }; -static int extern_flags; /* logical or of some of the flags above */ - /* Stack for pending values to marshal */ -struct extern_item { value * v; mlsize_t count; }; - #define EXTERN_STACK_INIT_SIZE 256 #define EXTERN_STACK_MAX_SIZE (1024*1024*100) -static struct extern_item extern_stack_init[EXTERN_STACK_INIT_SIZE]; - -static struct extern_item * extern_stack = extern_stack_init; -static struct extern_item * extern_stack_limit = extern_stack_init - + EXTERN_STACK_INIT_SIZE; +struct extern_item { value * v; mlsize_t count; }; /* Hash table to record already-marshaled objects and their positions */ @@ -93,61 +82,126 @@ struct position_table { #define POS_TABLE_INIT_SIZE_LOG2 8 #define POS_TABLE_INIT_SIZE (1 << POS_TABLE_INIT_SIZE_LOG2) -static uintnat pos_table_present_init[Bitvect_size(POS_TABLE_INIT_SIZE)]; -static struct object_position pos_table_entries_init[POS_TABLE_INIT_SIZE]; +struct output_block { + struct output_block * next; + char * end; + char data[SIZE_EXTERN_OUTPUT_BLOCK]; +}; + +struct caml_extern_state { + + int extern_flags; /* logical or of some of the flags */ -static struct position_table pos_table; + uintnat obj_counter; /* Number of objects emitted so far */ + uintnat size_32; /* Size in words of 32-bit block for struct. */ + uintnat size_64; /* Size in words of 64-bit block for struct. 
*/ + + /* Stack for pending value to marshal */ + struct extern_item extern_stack_init[EXTERN_STACK_INIT_SIZE]; + struct extern_item * extern_stack; + struct extern_item * extern_stack_limit; + + /* Hash table to record already marshalled objects */ + uintnat pos_table_present_init[Bitvect_size(POS_TABLE_INIT_SIZE)]; + struct object_position pos_table_entries_init[POS_TABLE_INIT_SIZE]; + struct position_table pos_table; + + /* To buffer the output */ + + char * extern_userprovided_output; + char * extern_ptr; + char * extern_limit; + + struct output_block * extern_output_first; + struct output_block * extern_output_block; +}; + +static struct caml_extern_state* get_extern_state (void) +{ + struct caml_extern_state* extern_state; + + if (Caml_state->extern_state != NULL) + return Caml_state->extern_state; + + extern_state = + caml_stat_alloc_noexc(sizeof(struct caml_extern_state)); + if (extern_state == NULL) { + return NULL; + } + + extern_state->extern_flags = 0; + extern_state->obj_counter = 0; + extern_state->size_32 = 0; + extern_state->size_64 = 0; + extern_state->extern_stack = extern_state->extern_stack_init; + extern_state->extern_stack_limit = + extern_state->extern_stack + EXTERN_STACK_INIT_SIZE; + + Caml_state->extern_state = extern_state; + return extern_state; +} + +void caml_free_extern_state (void) +{ + if (Caml_state->extern_state != NULL) { + caml_stat_free(Caml_state->extern_state); + Caml_state->extern_state = NULL; + } +} /* Forward declarations */ CAMLnoreturn_start -static void extern_out_of_memory(void) +static void extern_out_of_memory(struct caml_extern_state* s) CAMLnoreturn_end; CAMLnoreturn_start -static void extern_invalid_argument(char *msg) +static void extern_invalid_argument(struct caml_extern_state* s, char *msg) CAMLnoreturn_end; CAMLnoreturn_start -static void extern_failwith(char *msg) +static void extern_failwith(struct caml_extern_state* s, char *msg) CAMLnoreturn_end; CAMLnoreturn_start -static void 
extern_stack_overflow(void) +static void extern_stack_overflow(struct caml_extern_state* s) CAMLnoreturn_end; -static void free_extern_output(void); +static void free_extern_output(struct caml_extern_state* s); /* Free the extern stack if needed */ -static void extern_free_stack(void) +static void extern_free_stack(struct caml_extern_state* s) { - if (extern_stack != extern_stack_init) { - caml_stat_free(extern_stack); + if (s->extern_stack != s->extern_stack_init) { + caml_stat_free(s->extern_stack); /* Reinitialize the globals for next time around */ - extern_stack = extern_stack_init; - extern_stack_limit = extern_stack + EXTERN_STACK_INIT_SIZE; + s->extern_stack = s->extern_stack_init; + s->extern_stack_limit = s->extern_stack + EXTERN_STACK_INIT_SIZE; } } -static struct extern_item * extern_resize_stack(struct extern_item * sp) + +static struct extern_item * extern_resize_stack(struct caml_extern_state* s, + struct extern_item * sp) { - asize_t newsize = 2 * (extern_stack_limit - extern_stack); - asize_t sp_offset = sp - extern_stack; + asize_t newsize = 2 * (s->extern_stack_limit - s->extern_stack); + asize_t sp_offset = sp - s->extern_stack; struct extern_item * newstack; - if (newsize >= EXTERN_STACK_MAX_SIZE) extern_stack_overflow(); - if (extern_stack == extern_stack_init) { - newstack = caml_stat_alloc_noexc(sizeof(struct extern_item) * newsize); - if (newstack == NULL) extern_stack_overflow(); - memcpy(newstack, extern_stack_init, - sizeof(struct extern_item) * EXTERN_STACK_INIT_SIZE); - } else { - newstack = caml_stat_resize_noexc(extern_stack, - sizeof(struct extern_item) * newsize); - if (newstack == NULL) extern_stack_overflow(); - } - extern_stack = newstack; - extern_stack_limit = newstack + newsize; + if (newsize >= EXTERN_STACK_MAX_SIZE) extern_stack_overflow(s); + newstack = caml_stat_calloc_noexc(newsize, sizeof(struct extern_item)); + if (newstack == NULL) extern_stack_overflow(s); + + /* Copy items from the old stack to the new stack */ + 
memcpy (newstack, s->extern_stack, + sizeof(struct extern_item) * sp_offset); + + /* Free the old stack if it is not the initial stack */ + if (s->extern_stack != s->extern_stack_init) + caml_stat_free(s->extern_stack); + + s->extern_stack = newstack; + s->extern_stack_limit = newstack + newsize; return newstack + sp_offset; } @@ -159,34 +213,37 @@ static struct extern_item * extern_resize_stack(struct extern_item * sp) #else #define HASH_FACTOR 2654435769UL #endif -#define Hash(v) (((uintnat)(v) * HASH_FACTOR) >> pos_table.shift) +#define Hash(v,shift) (((uintnat)(v) * HASH_FACTOR) >> (shift)) /* When the table becomes 2/3 full, its size is increased. */ #define Threshold(sz) (((sz) * 2) / 3) /* Initialize the position table */ -static void extern_init_position_table(void) +static void extern_init_position_table(struct caml_extern_state* s) { - if (extern_flags & NO_SHARING) return; - pos_table.size = POS_TABLE_INIT_SIZE; - pos_table.shift = 8 * sizeof(value) - POS_TABLE_INIT_SIZE_LOG2; - pos_table.mask = POS_TABLE_INIT_SIZE - 1; - pos_table.threshold = Threshold(POS_TABLE_INIT_SIZE); - pos_table.present = pos_table_present_init; - pos_table.entries = pos_table_entries_init; - memset(pos_table_present_init, 0, sizeof(pos_table_present_init)); + if (s->extern_flags & NO_SHARING) return; + s->pos_table.size = POS_TABLE_INIT_SIZE; + s->pos_table.shift = 8 * sizeof(value) - POS_TABLE_INIT_SIZE_LOG2; + s->pos_table.mask = POS_TABLE_INIT_SIZE - 1; + s->pos_table.threshold = Threshold(POS_TABLE_INIT_SIZE); + s->pos_table.present = s->pos_table_present_init; + s->pos_table.entries = s->pos_table_entries_init; + memset(s->pos_table_present_init, 0, + Bitvect_size(POS_TABLE_INIT_SIZE) * sizeof(uintnat)); } /* Free the position table */ -static void extern_free_position_table(void) +static void extern_free_position_table(struct caml_extern_state* s) { - if (pos_table.present != pos_table_present_init) { - caml_stat_free(pos_table.present); - 
caml_stat_free(pos_table.entries); + if (s->extern_flags & NO_SHARING) return; + if (s->pos_table.present != s->pos_table_present_init) { + caml_stat_free(s->pos_table.present); + caml_stat_free(s->pos_table.entries); /* Protect against repeated calls to extern_free_position_table */ - pos_table.present = pos_table_present_init; + s->pos_table.present = s->pos_table_present_init; + s->pos_table.entries = s->pos_table_entries_init; } } @@ -204,14 +261,14 @@ Caml_inline void bitvect_set(uintnat * bv, uintnat i) /* Grow the position table */ -static void extern_resize_position_table(void) +static void extern_resize_position_table(struct caml_extern_state *s) { mlsize_t new_size, new_byte_size; int new_shift; uintnat * new_present; struct object_position * new_entries; uintnat i, h; - struct position_table old = pos_table; + struct position_table old = s->pos_table; /* Grow the table quickly (x 8) up to 10^6 entries, more slowly (x 2) afterwards. */ @@ -225,35 +282,35 @@ static void extern_resize_position_table(void) if (new_size == 0 || caml_umul_overflow(new_size, sizeof(struct object_position), &new_byte_size)) - extern_out_of_memory(); + extern_out_of_memory(s); new_entries = caml_stat_alloc_noexc(new_byte_size); - if (new_entries == NULL) extern_out_of_memory(); + if (new_entries == NULL) extern_out_of_memory(s); new_present = caml_stat_calloc_noexc(Bitvect_size(new_size), sizeof(uintnat)); if (new_present == NULL) { caml_stat_free(new_entries); - extern_out_of_memory(); + extern_out_of_memory(s); } - pos_table.size = new_size; - pos_table.shift = new_shift; - pos_table.mask = new_size - 1; - pos_table.threshold = Threshold(new_size); - pos_table.present = new_present; - pos_table.entries = new_entries; + s->pos_table.size = new_size; + s->pos_table.shift = new_shift; + s->pos_table.mask = new_size - 1; + s->pos_table.threshold = Threshold(new_size); + s->pos_table.present = new_present; + s->pos_table.entries = new_entries; /* Insert every entry of the old table 
in the new table */ for (i = 0; i < old.size; i++) { if (! bitvect_test(old.present, i)) continue; - h = Hash(old.entries[i].obj); + h = Hash(old.entries[i].obj, s->pos_table.shift); while (bitvect_test(new_present, h)) { - h = (h + 1) & pos_table.mask; + h = (h + 1) & s->pos_table.mask; } bitvect_set(new_present, h); new_entries[h] = old.entries[i]; } - /* Free the old tables if not statically allocated */ - if (old.present != pos_table_present_init) { + /* Free the old tables if they are not the initial ones */ + if (old.present != s->pos_table_present_init) { caml_stat_free(old.present); caml_stat_free(old.entries); } @@ -264,20 +321,20 @@ static void extern_resize_position_table(void) If not, set [*h_out] to the hash value appropriate for [extern_record_location] and return 0. */ -Caml_inline int extern_lookup_position(value obj, +Caml_inline int extern_lookup_position(struct caml_extern_state *s, value obj, uintnat * pos_out, uintnat * h_out) { - uintnat h = Hash(obj); + uintnat h = Hash(obj, s->pos_table.shift); while (1) { - if (! bitvect_test(pos_table.present, h)) { + if (! bitvect_test(s->pos_table.present, h)) { *h_out = h; return 0; } - if (pos_table.entries[h].obj == obj) { - *pos_out = pos_table.entries[h].pos; + if (s->pos_table.entries[h].obj == obj) { + *pos_out = s->pos_table.entries[h].pos; return 1; } - h = (h + 1) & pos_table.mask; + h = (h + 1) & s->pos_table.mask; } } @@ -285,93 +342,85 @@ Caml_inline int extern_lookup_position(value obj, /* The [h] parameter is the index in the hash table where the object must be inserted. It was determined during lookup. 
*/ -static void extern_record_location(value obj, uintnat h) +static void extern_record_location(struct caml_extern_state* s, + value obj, uintnat h) { - if (extern_flags & NO_SHARING) return; - bitvect_set(pos_table.present, h); - pos_table.entries[h].obj = obj; - pos_table.entries[h].pos = obj_counter; - obj_counter++; - if (obj_counter >= pos_table.threshold) extern_resize_position_table(); + if (s->extern_flags & NO_SHARING) return; + bitvect_set(s->pos_table.present, h); + s->pos_table.entries[h].obj = obj; + s->pos_table.entries[h].pos = s->obj_counter; + s->obj_counter++; + if (s->obj_counter >= s->pos_table.threshold) + extern_resize_position_table(s); } /* To buffer the output */ -static char * extern_userprovided_output; -static char * extern_ptr, * extern_limit; - -struct output_block { - struct output_block * next; - char * end; - char data[SIZE_EXTERN_OUTPUT_BLOCK]; -}; - -static struct output_block * extern_output_first, * extern_output_block; - -static void init_extern_output(void) +static void init_extern_output(struct caml_extern_state* s) { - extern_userprovided_output = NULL; - extern_output_first = caml_stat_alloc_noexc(sizeof(struct output_block)); - if (extern_output_first == NULL) caml_raise_out_of_memory(); - extern_output_block = extern_output_first; - extern_output_block->next = NULL; - extern_ptr = extern_output_block->data; - extern_limit = extern_output_block->data + SIZE_EXTERN_OUTPUT_BLOCK; + s->extern_userprovided_output = NULL; + s->extern_output_first = caml_stat_alloc_noexc(sizeof(struct output_block)); + if (s->extern_output_first == NULL) caml_raise_out_of_memory(); + s->extern_output_block = s->extern_output_first; + s->extern_output_block->next = NULL; + s->extern_ptr = s->extern_output_block->data; + s->extern_limit = s->extern_output_block->data + SIZE_EXTERN_OUTPUT_BLOCK; } -static void close_extern_output(void) +static void close_extern_output(struct caml_extern_state* s) { - if (extern_userprovided_output == NULL){ - 
extern_output_block->end = extern_ptr; + if (s->extern_userprovided_output == NULL){ + s->extern_output_block->end = s->extern_ptr; } } -static void free_extern_output(void) +static void free_extern_output(struct caml_extern_state* s) { struct output_block * blk, * nextblk; - if (extern_userprovided_output == NULL) { - for (blk = extern_output_first; blk != NULL; blk = nextblk) { + if (s->extern_userprovided_output == NULL) { + for (blk = s->extern_output_first; blk != NULL; blk = nextblk) { nextblk = blk->next; caml_stat_free(blk); } - extern_output_first = NULL; + s->extern_output_first = NULL; } - extern_free_stack(); - extern_free_position_table(); + extern_free_stack(s); + extern_free_position_table(s); } -static void grow_extern_output(intnat required) +static void grow_extern_output(struct caml_extern_state *s, intnat required) { struct output_block * blk; intnat extra; - if (extern_userprovided_output != NULL) { - extern_failwith("Marshal.to_buffer: buffer overflow"); + if (s->extern_userprovided_output != NULL) { + extern_failwith(s, "Marshal.to_buffer: buffer overflow"); } - extern_output_block->end = extern_ptr; + s->extern_output_block->end = s->extern_ptr; if (required <= SIZE_EXTERN_OUTPUT_BLOCK / 2) extra = 0; else extra = required; blk = caml_stat_alloc_noexc(sizeof(struct output_block) + extra); - if (blk == NULL) extern_out_of_memory(); - extern_output_block->next = blk; - extern_output_block = blk; - extern_output_block->next = NULL; - extern_ptr = extern_output_block->data; - extern_limit = extern_output_block->data + SIZE_EXTERN_OUTPUT_BLOCK + extra; + if (blk == NULL) extern_out_of_memory(s); + s->extern_output_block->next = blk; + s->extern_output_block = blk; + s->extern_output_block->next = NULL; + s->extern_ptr = s->extern_output_block->data; + s->extern_limit = + s->extern_output_block->data + SIZE_EXTERN_OUTPUT_BLOCK + extra; } -static intnat extern_output_length(void) +static intnat extern_output_length(struct caml_extern_state* s) { 
struct output_block * blk; intnat len; - if (extern_userprovided_output != NULL) { - return extern_ptr - extern_userprovided_output; + if (s->extern_userprovided_output != NULL) { + return s->extern_ptr - s->extern_userprovided_output; } else { - for (len = 0, blk = extern_output_first; blk != NULL; blk = blk->next) + for (len = 0, blk = s->extern_output_first; blk != NULL; blk = blk->next) len += blk->end - blk->data; return len; } @@ -379,28 +428,28 @@ static intnat extern_output_length(void) /* Exception raising, with cleanup */ -static void extern_out_of_memory(void) +static void extern_out_of_memory(struct caml_extern_state* s) { - free_extern_output(); + free_extern_output(s); caml_raise_out_of_memory(); } -static void extern_invalid_argument(char *msg) +static void extern_invalid_argument(struct caml_extern_state *s, char *msg) { - free_extern_output(); + free_extern_output(s); caml_invalid_argument(msg); } -static void extern_failwith(char *msg) +static void extern_failwith(struct caml_extern_state* s, char *msg) { - free_extern_output(); + free_extern_output(s); caml_failwith(msg); } -static void extern_stack_overflow(void) +static void extern_stack_overflow(struct caml_extern_state* s) { caml_gc_message (0x04, "Stack overflow in marshaling value\n"); - free_extern_output(); + free_extern_output(s); caml_raise_out_of_memory(); } @@ -424,180 +473,190 @@ Caml_inline void store64(char * dst, int64_t n) /* Write characters, integers, and blocks in the output buffer */ -Caml_inline void write(int c) +Caml_inline void write(struct caml_extern_state* s, int c) { - if (extern_ptr >= extern_limit) grow_extern_output(1); - *extern_ptr++ = c; + if (s->extern_ptr >= s->extern_limit) grow_extern_output(s, 1); + *s->extern_ptr++ = c; } -static void writeblock(const char * data, intnat len) +static void writeblock(struct caml_extern_state* s, const char * data, + intnat len) { - if (extern_ptr + len > extern_limit) grow_extern_output(len); - memcpy(extern_ptr, data, 
len); - extern_ptr += len; + if (s->extern_ptr + len > s->extern_limit) grow_extern_output(s, len); + memcpy(s->extern_ptr, data, len); + s->extern_ptr += len; } -Caml_inline void writeblock_float8(const double * data, intnat ndoubles) +Caml_inline void writeblock_float8(struct caml_extern_state* s, + const double * data, intnat ndoubles) { #if ARCH_FLOAT_ENDIANNESS == 0x01234567 || ARCH_FLOAT_ENDIANNESS == 0x76543210 - writeblock((const char *) data, ndoubles * 8); + writeblock(s, (const char *) data, ndoubles * 8); #else caml_serialize_block_float_8(data, ndoubles); #endif } -static void writecode8(int code, intnat val) +static void writecode8(struct caml_extern_state* s, + int code, intnat val) { - if (extern_ptr + 2 > extern_limit) grow_extern_output(2); - extern_ptr[0] = code; - extern_ptr[1] = val; - extern_ptr += 2; + if (s->extern_ptr + 2 > s->extern_limit) grow_extern_output(s, 2); + s->extern_ptr[0] = code; + s->extern_ptr[1] = val; + s->extern_ptr += 2; } -static void writecode16(int code, intnat val) +static void writecode16(struct caml_extern_state* s, + int code, intnat val) { - if (extern_ptr + 3 > extern_limit) grow_extern_output(3); - extern_ptr[0] = code; - store16(extern_ptr + 1, (int) val); - extern_ptr += 3; + if (s->extern_ptr + 3 > s->extern_limit) grow_extern_output(s, 3); + s->extern_ptr[0] = code; + store16(s->extern_ptr + 1, (int) val); + s->extern_ptr += 3; } -static void writecode32(int code, intnat val) +static void writecode32(struct caml_extern_state* s, + int code, intnat val) { - if (extern_ptr + 5 > extern_limit) grow_extern_output(5); - extern_ptr[0] = code; - store32(extern_ptr + 1, val); - extern_ptr += 5; + if (s->extern_ptr + 5 > s->extern_limit) grow_extern_output(s, 5); + s->extern_ptr[0] = code; + store32(s->extern_ptr + 1, val); + s->extern_ptr += 5; } #ifdef ARCH_SIXTYFOUR -static void writecode64(int code, intnat val) +static void writecode64(struct caml_extern_state* s, + int code, intnat val) { - if (extern_ptr + 9 > 
extern_limit) grow_extern_output(9); - extern_ptr[0] = code; - store64(extern_ptr + 1, val); - extern_ptr += 9; + if (s->extern_ptr + 9 > s->extern_limit) grow_extern_output(s, 9); + s->extern_ptr[0] = code; + store64(s->extern_ptr + 1, val); + s->extern_ptr += 9; } #endif /* Marshaling integers */ -Caml_inline void extern_int(intnat n) +Caml_inline void extern_int(struct caml_extern_state* s, intnat n) { if (n >= 0 && n < 0x40) { - write(PREFIX_SMALL_INT + n); + write(s, PREFIX_SMALL_INT + n); } else if (n >= -(1 << 7) && n < (1 << 7)) { - writecode8(CODE_INT8, n); + writecode8(s, CODE_INT8, n); } else if (n >= -(1 << 15) && n < (1 << 15)) { - writecode16(CODE_INT16, n); + writecode16(s, CODE_INT16, n); #ifdef ARCH_SIXTYFOUR } else if (n < -((intnat)1 << 30) || n >= ((intnat)1 << 30)) { - if (extern_flags & COMPAT_32) - extern_failwith("output_value: integer cannot be read back on " + if (s->extern_flags & COMPAT_32) + extern_failwith(s, "output_value: integer cannot be read back on " "32-bit platform"); - writecode64(CODE_INT64, n); + writecode64(s, CODE_INT64, n); #endif } else { - writecode32(CODE_INT32, n); + writecode32(s, CODE_INT32, n); } } /* Marshaling references to previously-marshaled blocks */ -Caml_inline void extern_shared_reference(uintnat d) +Caml_inline void extern_shared_reference(struct caml_extern_state* s, + uintnat d) { if (d < 0x100) { - writecode8(CODE_SHARED8, d); + writecode8(s, CODE_SHARED8, d); } else if (d < 0x10000) { - writecode16(CODE_SHARED16, d); + writecode16(s, CODE_SHARED16, d); #ifdef ARCH_SIXTYFOUR } else if (d >= (uintnat)1 << 32) { - writecode64(CODE_SHARED64, d); + writecode64(s, CODE_SHARED64, d); #endif } else { - writecode32(CODE_SHARED32, d); + writecode32(s, CODE_SHARED32, d); } } /* Marshaling block headers */ -Caml_inline void extern_header(mlsize_t sz, tag_t tag) +Caml_inline void extern_header(struct caml_extern_state* s, + mlsize_t sz, tag_t tag) { if (tag < 16 && sz < 8) { - write(PREFIX_SMALL_BLOCK + tag + (sz 
<< 4)); + write(s, PREFIX_SMALL_BLOCK + tag + (sz << 4)); } else { - header_t hd = Make_header(sz, tag, Caml_white); + header_t hd = Make_header(sz, tag, NOT_MARKABLE); #ifdef ARCH_SIXTYFOUR - if (sz > 0x3FFFFF && (extern_flags & COMPAT_32)) - extern_failwith("output_value: array cannot be read back on " + if (sz > 0x3FFFFF && (s->extern_flags & COMPAT_32)) + extern_failwith(s, "output_value: array cannot be read back on " "32-bit platform"); if (hd < (uintnat)1 << 32) - writecode32(CODE_BLOCK32, hd); + writecode32(s, CODE_BLOCK32, hd); else - writecode64(CODE_BLOCK64, hd); + writecode64(s, CODE_BLOCK64, hd); #else - writecode32(CODE_BLOCK32, hd); + writecode32(s, CODE_BLOCK32, hd); #endif } } /* Marshaling strings */ -Caml_inline void extern_string(value v, mlsize_t len) +Caml_inline void extern_string(struct caml_extern_state *s, + value v, mlsize_t len) { if (len < 0x20) { - write(PREFIX_SMALL_STRING + len); + write(s, PREFIX_SMALL_STRING + len); } else if (len < 0x100) { - writecode8(CODE_STRING8, len); + writecode8(s, CODE_STRING8, len); } else { #ifdef ARCH_SIXTYFOUR - if (len > 0xFFFFFB && (extern_flags & COMPAT_32)) - extern_failwith("output_value: string cannot be read back on " + if (len > 0xFFFFFB && (s->extern_flags & COMPAT_32)) + extern_failwith(s, "output_value: string cannot be read back on " "32-bit platform"); if (len < (uintnat)1 << 32) - writecode32(CODE_STRING32, len); + writecode32(s, CODE_STRING32, len); else - writecode64(CODE_STRING64, len); + writecode64(s, CODE_STRING64, len); #else - writecode32(CODE_STRING32, len); + writecode32(s, CODE_STRING32, len); #endif } - writeblock(String_val(v), len); + writeblock(s, String_val(v), len); } /* Marshaling FP numbers */ -Caml_inline void extern_double(value v) +Caml_inline void extern_double(struct caml_extern_state* s, value v) { - write(CODE_DOUBLE_NATIVE); - writeblock_float8((double *) v, 1); + write(s, CODE_DOUBLE_NATIVE); + writeblock_float8(s, (double *) v, 1); } /* Marshaling FP arrays */ 
-Caml_inline void extern_double_array(value v, mlsize_t nfloats) +Caml_inline void extern_double_array(struct caml_extern_state* s, + value v, mlsize_t nfloats) { if (nfloats < 0x100) { - writecode8(CODE_DOUBLE_ARRAY8_NATIVE, nfloats); + writecode8(s, CODE_DOUBLE_ARRAY8_NATIVE, nfloats); } else { #ifdef ARCH_SIXTYFOUR - if (nfloats > 0x1FFFFF && (extern_flags & COMPAT_32)) - extern_failwith("output_value: float array cannot be read back on " + if (nfloats > 0x1FFFFF && (s->extern_flags & COMPAT_32)) + extern_failwith(s, "output_value: float array cannot be read back on " "32-bit platform"); if (nfloats < (uintnat) 1 << 32) - writecode32(CODE_DOUBLE_ARRAY32_NATIVE, nfloats); + writecode32(s, CODE_DOUBLE_ARRAY32_NATIVE, nfloats); else - writecode64(CODE_DOUBLE_ARRAY64_NATIVE, nfloats); + writecode64(s, CODE_DOUBLE_ARRAY64_NATIVE, nfloats); #else - writecode32(CODE_DOUBLE_ARRAY32_NATIVE, nfloats); + writecode32(s, CODE_DOUBLE_ARRAY32_NATIVE, nfloats); #endif } - writeblock_float8((double *) v, nfloats); + writeblock_float8(s, (double *) v, nfloats); } /* Marshaling custom blocks */ -Caml_inline void extern_custom(value v, +Caml_inline void extern_custom(struct caml_extern_state* s, value v, /*out*/ uintnat * sz_32, /*out*/ uintnat * sz_64) { @@ -608,21 +667,21 @@ Caml_inline void extern_custom(value v, const struct custom_fixed_length* fixed_length = Custom_ops_val(v)->fixed_length; if (serialize == NULL) - extern_invalid_argument("output_value: abstract value (Custom)"); + extern_invalid_argument(s, "output_value: abstract value (Custom)"); if (fixed_length == NULL) { - write(CODE_CUSTOM_LEN); - writeblock(ident, strlen(ident) + 1); + write(s, CODE_CUSTOM_LEN); + writeblock(s, ident, strlen(ident) + 1); /* Reserve 12 bytes for the lengths (sz_32 and sz_64). 
*/ - if (extern_ptr + 12 >= extern_limit) grow_extern_output(12); - size_header = extern_ptr; - extern_ptr += 12; + if (s->extern_ptr + 12 >= s->extern_limit) grow_extern_output(s, 12); + size_header = s->extern_ptr; + s->extern_ptr += 12; serialize(v, sz_32, sz_64); /* Store length before serialized block */ store32(size_header, *sz_32); store64(size_header + 4, *sz_64); } else { - write(CODE_CUSTOM_FIXED); - writeblock(ident, strlen(ident) + 1); + write(s, CODE_CUSTOM_FIXED); + writeblock(s, ident, strlen(ident) + 1); serialize(v, sz_32, sz_64); if (*sz_32 != fixed_length->bsize_32 || *sz_64 != fixed_length->bsize_64) @@ -634,29 +693,30 @@ Caml_inline void extern_custom(value v, /* Marshaling code pointers */ -static void extern_code_pointer(char * codeptr) +static void extern_code_pointer(struct caml_extern_state* s, char * codeptr) { struct code_fragment * cf; const char * digest; cf = caml_find_code_fragment_by_pc(codeptr); if (cf != NULL) { - if ((extern_flags & CLOSURES) == 0) - extern_invalid_argument("output_value: functional value"); + if ((s->extern_flags & CLOSURES) == 0) + extern_invalid_argument(s, "output_value: functional value"); digest = (const char *) caml_digest_of_code_fragment(cf); if (digest == NULL) - extern_invalid_argument("output_value: private function"); - writecode32(CODE_CODEPOINTER, codeptr - cf->code_start); - writeblock(digest, 16); + extern_invalid_argument(s, "output_value: private function"); + CAMLassert(cf == caml_find_code_fragment_by_digest((unsigned char*)digest)); + writecode32(s, CODE_CODEPOINTER, codeptr - cf->code_start); + writeblock(s, digest, 16); } else { - extern_invalid_argument("output_value: abstract value (outside heap)"); + extern_invalid_argument(s, "output_value: abstract value (outside heap)"); } } /* Marshaling the non-environment part of closures */ -#ifdef NO_NAKED_POINTERS -Caml_inline mlsize_t extern_closure_up_to_env(value v) +Caml_inline mlsize_t extern_closure_up_to_env(struct caml_extern_state* s, 
+ value v) { mlsize_t startenv, i; value info; @@ -665,41 +725,35 @@ Caml_inline mlsize_t extern_closure_up_to_env(value v) i = 0; do { /* The infix header */ - if (i > 0) extern_int(Long_val(Field(v, i++))); + if (i > 0) extern_int(s, Long_val(Field(v, i++))); /* The default entry point */ - extern_code_pointer((char *) Field(v, i++)); + extern_code_pointer(s, (char *) Field(v, i++)); /* The closure info. */ info = Field(v, i++); - extern_int(Long_val(info)); + extern_int(s, Long_val(info)); /* The direct entry point if arity is neither 0 nor 1 */ if (Arity_closinfo(info) != 0 && Arity_closinfo(info) != 1) { - extern_code_pointer((char *) Field(v, i++)); + extern_code_pointer(s, (char *) Field(v, i++)); } } while (i < startenv); CAMLassert(i == startenv); return startenv; } -#endif /* Marshal the given value in the output buffer */ -static void extern_rec(value v) +static void extern_rec(struct caml_extern_state* s, value v) { struct extern_item * sp; uintnat h = 0; uintnat pos = 0; - extern_init_position_table(); - sp = extern_stack; + extern_init_position_table(s); + sp = s->extern_stack; while(1) { if (Is_long(v)) { - extern_int(Long_val(v)); - } - else if (! (Is_in_value_area(v))) { - /* Naked pointer outside the heap: try to marshal it as a code pointer, - otherwise fail. */ - extern_code_pointer((char *) v); + extern_int(s, Long_val(v)); } else { header_t hd = Hd_val(v); @@ -709,8 +763,9 @@ static void extern_rec(value v) if (tag == Forward_tag) { value f = Forward_val (v); if (Is_block (f) - && (!Is_in_value_area(f) || Tag_val (f) == Forward_tag + && ( Tag_val (f) == Forward_tag || Tag_val (f) == Lazy_tag + || Tag_val (f) == Forcing_tag #ifdef FLAT_FLOAT_ARRAY || Tag_val (f) == Double_tag #endif @@ -724,13 +779,13 @@ static void extern_rec(value v) /* Atoms are treated specially for two reasons: they are not allocated in the externed block, and they are automatically shared. 
*/ if (sz == 0) { - extern_header(0, tag); + extern_header(s, 0, tag); goto next_item; } /* Check if object already seen */ - if (! (extern_flags & NO_SHARING)) { - if (extern_lookup_position(v, &pos, &h)) { - extern_shared_reference(obj_counter - pos); + if (! (s->extern_flags & NO_SHARING)) { + if (extern_lookup_position(s, v, &pos, &h)) { + extern_shared_reference(s, s->obj_counter - pos); goto next_item; } } @@ -738,58 +793,58 @@ static void extern_rec(value v) switch(tag) { case String_tag: { mlsize_t len = caml_string_length(v); - extern_string(v, len); - size_32 += 1 + (len + 4) / 4; - size_64 += 1 + (len + 8) / 8; - extern_record_location(v, h); + extern_string(s, v, len); + s->size_32 += 1 + (len + 4) / 4; + s->size_64 += 1 + (len + 8) / 8; + extern_record_location(s, v, h); break; } case Double_tag: { CAMLassert(sizeof(double) == 8); - extern_double(v); - size_32 += 1 + 2; - size_64 += 1 + 1; - extern_record_location(v, h); + extern_double(s, v); + s->size_32 += 1 + 2; + s->size_64 += 1 + 1; + extern_record_location(s, v, h); break; } case Double_array_tag: { mlsize_t nfloats; CAMLassert(sizeof(double) == 8); nfloats = Wosize_val(v) / Double_wosize; - extern_double_array(v, nfloats); - size_32 += 1 + nfloats * 2; - size_64 += 1 + nfloats; - extern_record_location(v, h); + extern_double_array(s, v, nfloats); + s->size_32 += 1 + nfloats * 2; + s->size_64 += 1 + nfloats; + extern_record_location(s, v, h); break; } case Abstract_tag: - extern_invalid_argument("output_value: abstract value (Abstract)"); + extern_invalid_argument(s, "output_value: abstract value (Abstract)"); break; case Infix_tag: - writecode32(CODE_INFIXPOINTER, Infix_offset_hd(hd)); + writecode32(s, CODE_INFIXPOINTER, Infix_offset_hd(hd)); v = v - Infix_offset_hd(hd); /* PR#5772 */ continue; case Custom_tag: { uintnat sz_32, sz_64; - extern_custom(v, &sz_32, &sz_64); - size_32 += 2 + ((sz_32 + 3) >> 2); /* header + ops + data */ - size_64 += 2 + ((sz_64 + 7) >> 3); - 
extern_record_location(v, h); + extern_custom(s, v, &sz_32, &sz_64); + s->size_32 += 2 + ((sz_32 + 3) >> 2); /* header + ops + data */ + s->size_64 += 2 + ((sz_64 + 7) >> 3); + extern_record_location(s, v, h); break; } -#ifdef NO_NAKED_POINTERS case Closure_tag: { mlsize_t i; - extern_header(sz, tag); - size_32 += 1 + sz; - size_64 += 1 + sz; - extern_record_location(v, h); - i = extern_closure_up_to_env(v); + extern_header(s, sz, tag); + s->size_32 += 1 + sz; + s->size_64 += 1 + sz; + extern_record_location(s, v, h); + i = extern_closure_up_to_env(s, v); if (i >= sz) goto next_item; /* Remember that we still have to serialize fields i + 1 ... sz - 1 */ if (i < sz - 1) { sp++; - if (sp >= extern_stack_limit) sp = extern_resize_stack(sp); + if (sp >= s->extern_stack_limit) + sp = extern_resize_stack(s, sp); sp->v = &Field(v, i + 1); sp->count = sz - i - 1; } @@ -797,16 +852,16 @@ static void extern_rec(value v) v = Field(v, i); continue; } -#endif default: { - extern_header(sz, tag); - size_32 += 1 + sz; - size_64 += 1 + sz; - extern_record_location(v, h); + extern_header(s, sz, tag); + s->size_32 += 1 + sz; + s->size_64 += 1 + sz; + extern_record_location(s, v, h); /* Remember that we still have to serialize fields 1 ... sz - 1 */ if (sz > 1) { sp++; - if (sp >= extern_stack_limit) sp = extern_resize_stack(sp); + if (sp >= s->extern_stack_limit) + sp = extern_resize_stack(s, sp); sp->v = &Field(v, 1); sp->count = sz - 1; } @@ -818,10 +873,10 @@ static void extern_rec(value v) } next_item: /* Pop one more item to marshal, if any */ - if (sp == extern_stack) { + if (sp == s->extern_stack) { /* We are done. 
Cleanup the stack and leave the function */ - extern_free_stack(); - extern_free_position_table(); + extern_free_stack(s); + extern_free_position_table(s); return; } v = *((sp->v)++); @@ -832,38 +887,38 @@ static void extern_rec(value v) static int extern_flag_values[] = { NO_SHARING, CLOSURES, COMPAT_32 }; -static intnat extern_value(value v, value flags, +static intnat extern_value(struct caml_extern_state* s, value v, value flags, /*out*/ char header[32], /*out*/ int * header_len) { intnat res_len; /* Parse flag list */ - extern_flags = caml_convert_flag_list(flags, extern_flag_values); + s->extern_flags = caml_convert_flag_list(flags, extern_flag_values); /* Initializations */ - obj_counter = 0; - size_32 = 0; - size_64 = 0; + s->obj_counter = 0; + s->size_32 = 0; + s->size_64 = 0; /* Marshal the object */ - extern_rec(v); + extern_rec(s, v); /* Record end of output */ - close_extern_output(); + close_extern_output(s); /* Write the header */ - res_len = extern_output_length(); + res_len = extern_output_length(s); #ifdef ARCH_SIXTYFOUR if (res_len >= ((intnat)1 << 32) || - size_32 >= ((intnat)1 << 32) || size_64 >= ((intnat)1 << 32)) { + s->size_32 >= ((intnat)1 << 32) || s->size_64 >= ((intnat)1 << 32)) { /* The object is too big for the small header format. Fail if we are in compat32 mode, or use big header. 
*/ - if (extern_flags & COMPAT_32) { - free_extern_output(); + if (s->extern_flags & COMPAT_32) { + free_extern_output(s); caml_failwith("output_value: object too big to be read back on " "32-bit platform"); } store32(header, Intext_magic_number_big); store32(header + 4, 0); store64(header + 8, res_len); - store64(header + 16, obj_counter); - store64(header + 24, size_64); + store64(header + 16, s->obj_counter); + store64(header + 24, s->size_64); *header_len = 32; return res_len; } @@ -871,9 +926,9 @@ static intnat extern_value(value v, value flags, /* Use the small header format */ store32(header, Intext_magic_number_small); store32(header + 4, res_len); - store32(header + 8, obj_counter); - store32(header + 12, size_32); - store32(header + 16, size_64); + store32(header + 8, s->obj_counter); + store32(header + 12, s->size_32); + store32(header + 16, s->size_64); *header_len = 20; return res_len; } @@ -883,15 +938,16 @@ void caml_output_val(struct channel *chan, value v, value flags) char header[32]; int header_len; struct output_block * blk, * nextblk; + struct caml_extern_state* s = get_extern_state (); if (! caml_channel_binary_mode(chan)) caml_failwith("output_value: not a binary channel"); - init_extern_output(); - extern_value(v, flags, header, &header_len); + init_extern_output(s); + extern_value(s, v, flags, header, &header_len); /* During [caml_really_putblock], concurrent [caml_output_val] operations can take place (via signal handlers or context switching in systhreads), and [extern_output_first] may change. So, save it in a local variable. 
*/ - blk = extern_output_first; + blk = s->extern_output_first; caml_really_putblock(chan, header, header_len); while (blk != NULL) { caml_really_putblock(chan, blk->data, blk->end - blk->data); @@ -920,12 +976,13 @@ CAMLprim value caml_output_value_to_bytes(value v, value flags) intnat data_len, ofs; value res; struct output_block * blk, * nextblk; + struct caml_extern_state* s = get_extern_state (); - init_extern_output(); - data_len = extern_value(v, flags, header, &header_len); + init_extern_output(s); + data_len = extern_value(s, v, flags, header, &header_len); /* PR#4030: it is prudent to save extern_output_first before allocating the result, as in caml_output_val */ - blk = extern_output_first; + blk = s->extern_output_first; res = caml_alloc_string(header_len + data_len); ofs = 0; memcpy(&Byte(res, ofs), header, header_len); @@ -952,12 +1009,14 @@ CAMLexport intnat caml_output_value_to_block(value v, value flags, char header[32]; int header_len; intnat data_len; + struct caml_extern_state* s = get_extern_state (); + /* At this point we don't know the size of the header. Guess that it is small, and fix up later if not. */ - extern_userprovided_output = buf + 20; - extern_ptr = extern_userprovided_output; - extern_limit = buf + len; - data_len = extern_value(v, flags, header, &header_len); + s->extern_userprovided_output = buf + 20; + s->extern_ptr = s->extern_userprovided_output; + s->extern_limit = buf + len; + data_len = extern_value(s, v, flags, header, &header_len); if (header_len != 20) { /* Bad guess! Need to shift the output to make room for big header. Make sure there is room. 
*/ @@ -987,16 +1046,17 @@ CAMLexport void caml_output_value_to_malloc(value v, value flags, intnat data_len; char * res; struct output_block * blk, * nextblk; + struct caml_extern_state* s = get_extern_state (); - init_extern_output(); - data_len = extern_value(v, flags, header, &header_len); + init_extern_output(s); + data_len = extern_value(s, v, flags, header, &header_len); res = caml_stat_alloc_noexc(header_len + data_len); - if (res == NULL) extern_out_of_memory(); + if (res == NULL) extern_out_of_memory(s); *buf = res; *len = header_len + data_len; memcpy(res, header, header_len); res += header_len; - for (blk = extern_output_first; blk != NULL; blk = nextblk) { + for (blk = s->extern_output_first; blk != NULL; blk = nextblk) { intnat n = blk->end - blk->data; memcpy(res, blk->data, n); res += n; @@ -1009,30 +1069,34 @@ CAMLexport void caml_output_value_to_malloc(value v, value flags, CAMLexport void caml_serialize_int_1(int i) { - if (extern_ptr + 1 > extern_limit) grow_extern_output(1); - extern_ptr[0] = i; - extern_ptr += 1; + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 1 > s->extern_limit) grow_extern_output(s, 1); + s->extern_ptr[0] = i; + s->extern_ptr += 1; } CAMLexport void caml_serialize_int_2(int i) { - if (extern_ptr + 2 > extern_limit) grow_extern_output(2); - store16(extern_ptr, i); - extern_ptr += 2; + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 2 > s->extern_limit) grow_extern_output(s, 2); + store16(s->extern_ptr, i); + s->extern_ptr += 2; } CAMLexport void caml_serialize_int_4(int32_t i) { - if (extern_ptr + 4 > extern_limit) grow_extern_output(4); - store32(extern_ptr, i); - extern_ptr += 4; + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 4 > s->extern_limit) grow_extern_output(s, 4); + store32(s->extern_ptr, i); + s->extern_ptr += 4; } CAMLexport void caml_serialize_int_8(int64_t i) { - if (extern_ptr + 8 > extern_limit) grow_extern_output(8); - 
store64(extern_ptr, i); - extern_ptr += 8; + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 8 > s->extern_limit) grow_extern_output(s, 8); + store64(s->extern_ptr, i); + s->extern_ptr += 8; } CAMLexport void caml_serialize_float_4(float f) @@ -1047,83 +1111,91 @@ CAMLexport void caml_serialize_float_8(double f) CAMLexport void caml_serialize_block_1(void * data, intnat len) { - if (extern_ptr + len > extern_limit) grow_extern_output(len); - memcpy(extern_ptr, data, len); - extern_ptr += len; + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + len > s->extern_limit) grow_extern_output(s, len); + memcpy(s->extern_ptr, data, len); + s->extern_ptr += len; } CAMLexport void caml_serialize_block_2(void * data, intnat len) { - if (extern_ptr + 2 * len > extern_limit) grow_extern_output(2 * len); + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 2 * len > s->extern_limit) + grow_extern_output(s, 2 * len); #ifndef ARCH_BIG_ENDIAN { unsigned char * p; char * q; - for (p = data, q = extern_ptr; len > 0; len--, p += 2, q += 2) + for (p = data, q = s->extern_ptr; len > 0; len--, p += 2, q += 2) Reverse_16(q, p); - extern_ptr = q; + s->extern_ptr = q; } #else - memcpy(extern_ptr, data, len * 2); - extern_ptr += len * 2; + memcpy(s->extern_ptr, data, len * 2); + s->extern_ptr += len * 2; #endif } CAMLexport void caml_serialize_block_4(void * data, intnat len) { - if (extern_ptr + 4 * len > extern_limit) grow_extern_output(4 * len); + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 4 * len > s->extern_limit) + grow_extern_output(s, 4 * len); #ifndef ARCH_BIG_ENDIAN { unsigned char * p; char * q; - for (p = data, q = extern_ptr; len > 0; len--, p += 4, q += 4) + for (p = data, q = s->extern_ptr; len > 0; len--, p += 4, q += 4) Reverse_32(q, p); - extern_ptr = q; + s->extern_ptr = q; } #else - memcpy(extern_ptr, data, len * 4); - extern_ptr += len * 4; + memcpy(s->extern_ptr, data, len 
* 4); + s->extern_ptr += len * 4; #endif } CAMLexport void caml_serialize_block_8(void * data, intnat len) { - if (extern_ptr + 8 * len > extern_limit) grow_extern_output(8 * len); + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 8 * len > s->extern_limit) + grow_extern_output(s, 8 * len); #ifndef ARCH_BIG_ENDIAN { unsigned char * p; char * q; - for (p = data, q = extern_ptr; len > 0; len--, p += 8, q += 8) + for (p = data, q = s->extern_ptr; len > 0; len--, p += 8, q += 8) Reverse_64(q, p); - extern_ptr = q; + s->extern_ptr = q; } #else - memcpy(extern_ptr, data, len * 8); - extern_ptr += len * 8; + memcpy(s->extern_ptr, data, len * 8); + s->extern_ptr += len * 8; #endif } CAMLexport void caml_serialize_block_float_8(void * data, intnat len) { - if (extern_ptr + 8 * len > extern_limit) grow_extern_output(8 * len); + struct caml_extern_state* s = get_extern_state (); + if (s->extern_ptr + 8 * len > s->extern_limit) grow_extern_output(s, 8 * len); #if ARCH_FLOAT_ENDIANNESS == 0x01234567 - memcpy(extern_ptr, data, len * 8); - extern_ptr += len * 8; + memcpy(s->extern_ptr, data, len * 8); + s->extern_ptr += len * 8; #elif ARCH_FLOAT_ENDIANNESS == 0x76543210 { unsigned char * p; char * q; - for (p = data, q = extern_ptr; len > 0; len--, p += 8, q += 8) + for (p = data, q = s->extern_ptr; len > 0; len--, p += 8, q += 8) Reverse_64(q, p); - extern_ptr = q; + s->extern_ptr = q; } #else { unsigned char * p; char * q; - for (p = data, q = extern_ptr; len > 0; len--, p += 8, q += 8) + for (p = data, q = s->extern_ptr; len > 0; len--, p += 8, q += 8) Permute_64(q, 0x01234567, p, ARCH_FLOAT_ENDIANNESS); - extern_ptr = q; + s->extern_ptr = q; } #endif } @@ -1133,22 +1205,21 @@ CAMLprim value caml_obj_reachable_words(value v) intnat size; struct extern_item * sp; uintnat h = 0; - uintnat pos; + uintnat pos = 0; + struct caml_extern_state *s = get_extern_state (); - obj_counter = 0; - extern_init_position_table(); - sp = extern_stack; + s->obj_counter = 
0; + s->extern_flags = 0; + extern_init_position_table(s); + sp = s->extern_stack; size = 0; + + /* In Multicore OCaml, we don't distinguish between major heap blocks and + * out-of-heap blocks, so we end up counting out-of-heap blocks too. */ while (1) { if (Is_long(v)) { /* Tagged integers contribute 0 to the size, nothing to do */ - } else if (! Is_in_heap_or_young(v)) { - /* Out-of-heap blocks contribute 0 to the size, nothing to do */ - /* However, in no-naked-pointers mode, we don't distinguish - between major heap blocks and out-of-heap blocks, - and the test above is always false, - so we end up counting out-of-heap blocks too. */ - } else if (extern_lookup_position(v, &pos, &h)) { + } else if (extern_lookup_position(s, v, &pos, &h)) { /* Already seen and counted, nothing to do */ } else { header_t hd = Hd_val(v); @@ -1160,7 +1231,7 @@ CAMLprim value caml_obj_reachable_words(value v) continue; } /* Remember that we've visited this block */ - extern_record_location(v, h); + extern_record_location(s, v, h); /* The block contributes to the total size */ size += 1 + sz; /* header word included */ if (tag < No_scan_tag) { @@ -1171,7 +1242,8 @@ CAMLprim value caml_obj_reachable_words(value v) if (i < sz - 1) { /* Remember that we need to count fields i + 1 ... 
sz - 1 */ sp++; - if (sp >= extern_stack_limit) sp = extern_resize_stack(sp); + if (sp >= s->extern_stack_limit) + sp = extern_resize_stack(s, sp); sp->v = &Field(v, i + 1); sp->count = sz - i - 1; } @@ -1182,11 +1254,11 @@ CAMLprim value caml_obj_reachable_words(value v) } } /* Pop one more item to traverse, if any */ - if (sp == extern_stack) break; + if (sp == s->extern_stack) break; v = *((sp->v)++); if (--(sp->count) == 0) sp--; } - extern_free_stack(); - extern_free_position_table(); + extern_free_stack(s); + extern_free_position_table(s); return Val_long(size); } diff --git a/runtime/fail_byt.c b/runtime/fail_byt.c index 0d0d2b05afa0..e9b2063d9ac3 100644 --- a/runtime/fail_byt.c +++ b/runtime/fail_byt.c @@ -29,7 +29,7 @@ #include "caml/mlvalues.h" #include "caml/printexc.h" #include "caml/signals.h" -#include "caml/stacks.h" +#include "caml/fiber.h" CAMLexport void caml_raise(value v) { @@ -37,13 +37,18 @@ CAMLexport void caml_raise(value v) CAMLassert(!Is_exception_result(v)); // avoid calling caml_raise recursively - v = caml_process_pending_actions_with_root_exn(v); + v = caml_process_pending_signals_with_root_exn(v); if (Is_exception_result(v)) v = Extract_exception(v); - Caml_state->exn_bucket = v; if (Caml_state->external_raise == NULL) caml_fatal_uncaught_exception(v); - siglongjmp(Caml_state->external_raise->buf, 1); + *Caml_state->external_raise->exn_bucket = v; + + while(Caml_state->local_roots != Caml_state->external_raise->local_roots) { + Caml_state->local_roots = Caml_state->local_roots->next; + } + + siglongjmp(Caml_state->external_raise->jmp->buf, 1); } CAMLexport void caml_raise_constant(value tag) @@ -98,15 +103,16 @@ CAMLexport void caml_raise_with_string(value tag, char const *msg) */ static void check_global_data(char const *exception_name) { - if (caml_global_data == 0) { - fprintf(stderr, "Fatal error: exception %s\n", exception_name); + if (caml_global_data == 0 || !Is_block(caml_global_data)) { + fprintf(stderr, "Fatal error: 
exception %s during initialisation\n", + exception_name); exit(2); } } static void check_global_data_param(char const *exception_name, char const *msg) { - if (caml_global_data == 0) { + if (caml_global_data == 0 || !Is_block(caml_global_data)) { fprintf(stderr, "Fatal error: exception %s(\"%s\")\n", exception_name, msg); exit(2); } @@ -197,6 +203,12 @@ CAMLexport void caml_raise_sys_blocked_io(void) caml_raise_constant(Field(caml_global_data, SYS_BLOCKED_IO)); } +CAMLexport void caml_raise_continuation_already_taken(void) +{ + check_global_data("Continuation_already_taken"); + caml_raise_constant(Field(caml_global_data, CONTINUATION_ALREADY_TAKEN_EXN)); +} + CAMLexport value caml_raise_if_exception(value res) { if (Is_exception_result(res)) caml_raise(Extract_exception(res)); @@ -208,8 +220,15 @@ int caml_is_special_exception(value exn) { a more readable textual representation of some exceptions. It is better to fall back to the general, less readable representation than to abort with a fatal error as above. 
*/ - if (caml_global_data == 0) return 0; - return exn == Field(caml_global_data, MATCH_FAILURE_EXN) - || exn == Field(caml_global_data, ASSERT_FAILURE_EXN) - || exn == Field(caml_global_data, UNDEFINED_RECURSIVE_MODULE_EXN); + + value f; + + if (caml_global_data == 0 || !Is_block(caml_global_data)) { + return 0; + } + + f = caml_global_data; + return exn == Field(f, MATCH_FAILURE_EXN) + || exn == Field(f, ASSERT_FAILURE_EXN) + || exn == Field(f, UNDEFINED_RECURSIVE_MODULE_EXN); } diff --git a/runtime/fail_nat.c b/runtime/fail_nat.c index d3a69c635892..8516ce13721d 100644 --- a/runtime/fail_nat.c +++ b/runtime/fail_nat.c @@ -20,7 +20,6 @@ #include #include #include "caml/alloc.h" -#include "caml/domain.h" #include "caml/fail.h" #include "caml/io.h" #include "caml/gc.h" @@ -49,7 +48,9 @@ extern caml_generated_constant caml_exn_Sys_blocked_io, caml_exn_Stack_overflow, caml_exn_Assert_failure, - caml_exn_Undefined_recursive_module; + caml_exn_Undefined_recursive_module, + caml_exn_Unhandled, + caml_exn_Continuation_already_taken; /* Exception raising */ @@ -62,22 +63,23 @@ CAMLnoreturn_end; CAMLno_asan void caml_raise(value v) { + char* exception_pointer; + Unlock_exn(); CAMLassert(!Is_exception_result(v)); // avoid calling caml_raise recursively - v = caml_process_pending_actions_with_root_exn(v); + v = caml_process_pending_signals_with_root_exn(v); if (Is_exception_result(v)) v = Extract_exception(v); - if (Caml_state->exception_pointer == NULL) { - caml_terminate_signals(); - caml_fatal_uncaught_exception(v); - } + exception_pointer = (char*)Caml_state->c_stack; + + if (exception_pointer == NULL) caml_fatal_uncaught_exception(v); while (Caml_state->local_roots != NULL && - (char *) Caml_state->local_roots < Caml_state->exception_pointer) { + (char *) Caml_state->local_roots < exception_pointer) { Caml_state->local_roots = Caml_state->local_roots->next; } @@ -111,8 +113,7 @@ void caml_raise_with_args(value tag, int nargs, value args[]) value bucket; int i; - 
CAMLassert(1 + nargs <= Max_young_wosize); - bucket = caml_alloc_small (1 + nargs, 0); + bucket = caml_alloc (1 + nargs, 0); Field(bucket, 0) = tag; for (i = 0; i < nargs; i++) Field(bucket, 1 + i) = args[i]; caml_raise(bucket); @@ -185,6 +186,11 @@ void caml_raise_sys_blocked_io(void) caml_raise_constant((value) caml_exn_Sys_blocked_io); } +void caml_raise_continuation_already_taken(void) +{ + caml_raise_constant((value) caml_exn_Continuation_already_taken); +} + CAMLexport value caml_raise_if_exception(value res) { if (Is_exception_result(res)) caml_raise(Extract_exception(res)); @@ -195,20 +201,20 @@ CAMLexport value caml_raise_if_exception(value res) do a GC before the exception is raised (lack of stack descriptors for the ccall to [caml_array_bound_error]). */ -static const value * caml_array_bound_error_exn = NULL; - void caml_array_bound_error(void) { - if (caml_array_bound_error_exn == NULL) { - caml_array_bound_error_exn = - caml_named_value("Pervasives.array_bound_error"); - if (caml_array_bound_error_exn == NULL) { + static atomic_uintnat exn_cache = ATOMIC_UINTNAT_INIT(0); + const value* exn = (const value*)atomic_load_acq(&exn_cache); + if (!exn) { + exn = caml_named_value("Pervasives.array_bound_error"); + if (!exn) { fprintf(stderr, "Fatal error: exception " - "Invalid_argument(\"index out of bounds\")\n"); + "Invalid_argument(\"index out of bounds\")\n"); exit(2); } + atomic_store_rel(&exn_cache, (uintnat)exn); } - caml_raise(*caml_array_bound_error_exn); + caml_raise(*exn); } int caml_is_special_exception(value exn) { diff --git a/runtime/fiber.c b/runtime/fiber.c new file mode 100644 index 000000000000..04d1d4f9fd64 --- /dev/null +++ b/runtime/fiber.c @@ -0,0 +1,537 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Tom Kelly, OCaml Labs Consultancy */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 
2021 Indian Institute of Technology, Madras */ +/* Copyright 2021 OCaml Labs Consultancy */ +/* Copyright 2019 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#define CAML_INTERNALS + +#include +#include +#include "caml/alloc.h" +#include "caml/codefrag.h" +#include "caml/fail.h" +#include "caml/fiber.h" +#include "caml/gc_ctrl.h" +#include "caml/platform.h" +#include "caml/minor_gc.h" +#include "caml/misc.h" +#include "caml/major_gc.h" +#include "caml/memory.h" +#include "caml/startup_aux.h" +#ifdef NATIVE_CODE +#include "caml/stack.h" +#include "caml/frame_descriptors.h" +#endif + +#ifdef DEBUG +#define fiber_debug_log(...) caml_gc_log(__VA_ARGS__) +#else +#define fiber_debug_log(...) +#endif + +void caml_change_max_stack_size (uintnat new_max_size) +{ + struct stack_info *current_stack = Caml_state->current_stack; + asize_t size = Stack_high(current_stack) - (value*)current_stack->sp + + Stack_threshold / sizeof (value); + + if (new_max_size < size) new_max_size = size; + if (new_max_size != caml_max_stack_size){ + caml_gc_log ("Changing stack limit to %" + ARCH_INTNAT_PRINTF_FORMAT "uk bytes", + new_max_size * sizeof (value) / 1024); + } + caml_max_stack_size = new_max_size; +} + +struct stack_info** caml_alloc_stack_cache (void) +{ + int i; + + struct stack_info** stack_cache = + (struct stack_info**)caml_stat_alloc_noexc(sizeof(struct stack_info*) * + NUM_STACK_SIZE_CLASSES); + if (stack_cache == NULL) + return NULL; + + for(i = 0; i < NUM_STACK_SIZE_CLASSES; i++) + stack_cache[i] = NULL; + + return stack_cache; +} + +Caml_inline struct stack_info* alloc_for_stack (mlsize_t wosize) +{ + return caml_stat_alloc_noexc(sizeof(struct stack_info) + + sizeof(value) * wosize + + 8 /* for 
alignment */ + + sizeof(struct stack_handler)); +} + +Caml_inline struct stack_info** stack_cache_bucket (mlsize_t wosize) { + mlsize_t size_bucket_wsz = caml_fiber_wsz; + struct stack_info** size_bucket = Caml_state->stack_cache; + struct stack_info** end = size_bucket + NUM_STACK_SIZE_CLASSES; + + /* wosize is in stack cache bucket n iff wosize == caml_fiber_wsz * 2**n */ + while (size_bucket < end) { + if (wosize == size_bucket_wsz) + return size_bucket; + ++size_bucket; + size_bucket_wsz += size_bucket_wsz; + } + + return NULL; +} + +static struct stack_info* +alloc_size_class_stack_noexc(mlsize_t wosize, struct stack_info** size_bucket, + value hval, value hexn, value heff) +{ + struct stack_info* stack; + struct stack_handler* hand; + + CAML_STATIC_ASSERT(sizeof(struct stack_info) % sizeof(value) == 0); + CAML_STATIC_ASSERT(sizeof(struct stack_handler) % sizeof(value) == 0); + + if (size_bucket != NULL && *size_bucket != NULL) { + stack = *size_bucket; + *size_bucket = (struct stack_info*)stack->exception_ptr; + CAMLassert(stack->size_bucket == stack_cache_bucket(wosize)); + hand = stack->handler; + } else { + /* couldn't get a cached stack, so have to create one */ + stack = alloc_for_stack(wosize); + if (stack == NULL) { + return NULL; + } + + stack->size_bucket = size_bucket; + + /* Ensure 16-byte alignment because some architectures require it */ + hand = (struct stack_handler*) + (((uintnat)stack + sizeof(struct stack_info) + sizeof(value) * wosize + 8) + & ((uintnat)-1 << 4)); + stack->handler = hand; + } + + hand->handle_value = hval; + hand->handle_exn = hexn; + hand->handle_effect = heff; + hand->parent = NULL; + stack->sp = (value*)hand; + stack->exception_ptr = NULL; +#ifdef DEBUG + stack->magic = 42; +#endif + CAMLassert(Stack_high(stack) - Stack_base(stack) == wosize || + Stack_high(stack) - Stack_base(stack) == wosize + 1); + return stack; + +} + +/* allocate a stack with at least "wosize" usable words of stack */ +static struct stack_info* 
alloc_stack_noexc(mlsize_t wosize, value hval, + value hexn, value heff) +{ + struct stack_info** size_bucket = stack_cache_bucket (wosize); + return alloc_size_class_stack_noexc(wosize, size_bucket, hval, hexn, heff); +} + +#ifdef NATIVE_CODE + +value caml_alloc_stack (value hval, value hexn, value heff) { + struct stack_info* stack = + alloc_size_class_stack_noexc(caml_fiber_wsz, Caml_state->stack_cache, + hval, hexn, heff); + + if (!stack) caml_raise_out_of_memory(); + + fiber_debug_log ("Allocate stack=%p of %" ARCH_INTNAT_PRINTF_FORMAT + "u words", stack, caml_fiber_wsz); + + return Val_ptr(stack); +} + +void caml_get_stack_sp_pc (struct stack_info* stack, + char** sp /* out */, uintnat* pc /* out */) +{ + char* p = (char*)stack->sp; + + p += sizeof(value); + *sp = p; + *pc = Saved_return_address(*sp); +} + +Caml_inline void scan_stack_frames(scanning_action f, void* fdata, + struct stack_info* stack, value* gc_regs) +{ + char * sp; + uintnat retaddr; + value * regs; + frame_descr * d; + uintnat h; + int n, ofs; + unsigned short * p; + value *root; + caml_frame_descrs fds = caml_get_frame_descrs(); + + sp = (char*)stack->sp; + regs = gc_regs; + +next_chunk: + if (sp == (char*)Stack_high(stack)) return; + + retaddr = *(uintnat*)sp; + sp += sizeof(value); + + while(1) { + /* Find the descriptor corresponding to the return address */ + h = Hash_retaddr(retaddr, fds.mask); + while(1) { + d = fds.descriptors[h]; + if (d->retaddr == retaddr) break; + h = (h+1) & fds.mask; + } + if (d->frame_size != 0xFFFF) { + /* Scan the roots in this frame */ + for (p = d->live_ofs, n = d->num_live; n > 0; n--, p++) { + ofs = *p; + if (ofs & 1) { + root = regs + (ofs >> 1); + } else { + root = (value *)(sp + ofs); + } + f (fdata, *root, root); + } + /* Move to next frame */ + sp += (d->frame_size & 0xFFFC); + retaddr = Saved_return_address(sp); + /* XXX KC: disabled already scanned optimization. */ + } else { + /* This marks the top of an ML stack chunk. 
Move sp to the previous + * stack chunk. */ + sp += 3 * sizeof(value); /* trap frame & DWARF pointer */ + regs = *(value**)sp; /* update gc_regs */ + sp += 1 * sizeof(value); /* gc_regs */ + goto next_chunk; + } + } +} + +void caml_scan_stack(scanning_action f, void* fdata, + struct stack_info* stack, value* gc_regs) +{ + while (stack != NULL) { + scan_stack_frames(f, fdata, stack, gc_regs); + + f(fdata, Stack_handle_value(stack), &Stack_handle_value(stack)); + f(fdata, Stack_handle_exception(stack), &Stack_handle_exception(stack)); + f(fdata, Stack_handle_effect(stack), &Stack_handle_effect(stack)); + + stack = Stack_parent(stack); + } +} + +void caml_maybe_expand_stack (void) +{ + struct stack_info* stk = Caml_state->current_stack; + uintnat stack_available = + (value*)stk->sp - Stack_base(stk); + uintnat stack_needed = + Stack_threshold / sizeof(value) + + 8 /* for words pushed by caml_start_program */; + + if (stack_available < stack_needed) + if (!caml_try_realloc_stack (stack_needed)) + caml_raise_stack_overflow(); + + if (Caml_state->gc_regs_buckets == NULL) { + /* ensure there is at least one gc_regs bucket available before + running any OCaml code */ + value* bucket = caml_stat_alloc(sizeof(value) * Wosize_gc_regs); + bucket[0] = 0; /* no next bucket */ + Caml_state->gc_regs_buckets = bucket; + } +} + +#else /* End NATIVE_CODE, begin BYTE_CODE */ + +value caml_global_data; + +CAMLprim value caml_alloc_stack(value hval, value hexn, value heff) +{ + value* sp; + struct stack_info* stack = + alloc_size_class_stack_noexc(caml_fiber_wsz, Caml_state->stack_cache, + hval, hexn, heff); + + if (!stack) caml_raise_out_of_memory(); + + sp = Stack_high(stack); + sp -= 1; + sp[0] = Val_long(1); + + stack->sp = sp; + + return Val_ptr(stack); +} + +CAMLprim value caml_ensure_stack_capacity(value required_space) +{ + asize_t req = Long_val(required_space); + if (Caml_state->current_stack->sp - req < + Stack_base(Caml_state->current_stack)) + if 
(!caml_try_realloc_stack(req)) + caml_raise_stack_overflow(); + return Val_unit; +} + +/* + Root scanning. + + Used by the GC to find roots on the stacks of running or runnable fibers. +*/ + +Caml_inline int is_block_and_not_code_frag(value v) { + return Is_block(v) && caml_find_code_fragment_by_pc((char *) v) == NULL; +} + +void caml_scan_stack(scanning_action f, void* fdata, + struct stack_info* stack, value* v_gc_regs) +{ + value *low, *high, *sp; + + while (stack != NULL) { + CAMLassert(stack->magic == 42); + + high = Stack_high(stack); + low = stack->sp; + for (sp = low; sp < high; sp++) { + /* Code pointers inside the stack are naked pointers. + We must avoid passing them to function [f]. */ + value v = *sp; + if (is_block_and_not_code_frag(v)) { + f(fdata, v, sp); + } + } + + if (is_block_and_not_code_frag(Stack_handle_value(stack))) + f(fdata, Stack_handle_value(stack), &Stack_handle_value(stack)); + if (is_block_and_not_code_frag(Stack_handle_exception(stack))) + f(fdata, Stack_handle_exception(stack), &Stack_handle_exception(stack)); + if (is_block_and_not_code_frag(Stack_handle_effect(stack))) + f(fdata, Stack_handle_effect(stack), &Stack_handle_effect(stack)); + + stack = Stack_parent(stack); + } +} + +#endif /* end BYTE_CODE */ + +/* + Stack management. + + Used by the interpreter to allocate stack space. 
+*/ + +#ifdef NATIVE_CODE +/* Update absolute exception pointers for new stack*/ +static void +rewrite_exception_stack(struct stack_info *old_stack, + value** exn_ptr, struct stack_info *new_stack) +{ + fiber_debug_log("Old [%p, %p]", Stack_base(old_stack), Stack_high(old_stack)); + fiber_debug_log("New [%p, %p]", Stack_base(new_stack), Stack_high(new_stack)); + if(exn_ptr) { + fiber_debug_log ("*exn_ptr=%p", *exn_ptr); + + while (Stack_base(old_stack) < *exn_ptr && + *exn_ptr <= Stack_high(old_stack)) { +#ifdef DEBUG + value* old_val = *exn_ptr; +#endif + *exn_ptr = Stack_high(new_stack) - (Stack_high(old_stack) - *exn_ptr); + + fiber_debug_log ("Rewriting %p to %p", old_val, *exn_ptr); + + CAMLassert(Stack_base(new_stack) < *exn_ptr); + CAMLassert((value*)*exn_ptr <= Stack_high(new_stack)); + + exn_ptr = (value**)*exn_ptr; + } + fiber_debug_log ("finished with *exn_ptr=%p", *exn_ptr); + } else { + fiber_debug_log ("exn_ptr is null"); + } +} +#endif + +int caml_try_realloc_stack(asize_t required_space) +{ + struct stack_info *old_stack, *new_stack; + asize_t size; + int stack_used; + CAMLnoalloc; + + old_stack = Caml_state->current_stack; + stack_used = Stack_high(old_stack) - (value*)old_stack->sp; + size = Stack_high(old_stack) - Stack_base(old_stack); + do { + if (size >= caml_max_stack_size) return 0; + size *= 2; + } while (size < stack_used + required_space); + + if (size > 4096 / sizeof(value)) { + caml_gc_log ("Growing stack to %" + ARCH_INTNAT_PRINTF_FORMAT "uk bytes", + (uintnat) size * sizeof(value) / 1024); + } else { + caml_gc_log ("Growing stack to %" + ARCH_INTNAT_PRINTF_FORMAT "u bytes", + (uintnat) size * sizeof(value)); + } + + new_stack = alloc_stack_noexc(size, + Stack_handle_value(old_stack), + Stack_handle_exception(old_stack), + Stack_handle_effect(old_stack)); + if (!new_stack) return 0; + memcpy(Stack_high(new_stack) - stack_used, + Stack_high(old_stack) - stack_used, + stack_used * sizeof(value)); + new_stack->sp = Stack_high(new_stack) - 
stack_used; + Stack_parent(new_stack) = Stack_parent(old_stack); +#ifdef NATIVE_CODE + rewrite_exception_stack(old_stack, (value**)&Caml_state->exn_handler, + new_stack); +#endif + + /* Update stack pointers in Caml_state->c_stack. It is possible to have + * multiple c_stack_links to point to the same stack since callbacks are run + * on existing stacks. */ + { + struct c_stack_link* link; + for (link = Caml_state->c_stack; link; link = link->prev) { + if (link->stack == old_stack) { + link->stack = new_stack; + link->sp = (void*)((char*)Stack_high(new_stack) - + ((char*)Stack_high(old_stack) - (char*)link->sp)); + } + } + } + + caml_free_stack(old_stack); + Caml_state->current_stack = new_stack; + return 1; +} + +struct stack_info* caml_alloc_main_stack (uintnat init_size) +{ + struct stack_info* stk = + alloc_stack_noexc(init_size, Val_unit, Val_unit, Val_unit); + return stk; +} + +void caml_free_stack (struct stack_info* stack) +{ + CAMLnoalloc; + CAMLassert(stack->magic == 42); + if (stack->size_bucket != NULL) { + stack->exception_ptr = (void*)(*stack->size_bucket); + *stack->size_bucket = stack; +#ifdef DEBUG + memset(Stack_base(stack), 0x42, + (Stack_high(stack)-Stack_base(stack))*sizeof(value)); +#endif + } else { +#ifdef DEBUG + memset(stack, 0x42, (char*)stack->handler - (char*)stack); +#endif + caml_stat_free(stack); + } +} + +CAMLprim value caml_continuation_use_noexc (value cont) +{ + value v; + value null_stk = Val_ptr(NULL); + CAMLnoalloc; + + fiber_debug_log("cont: is_block(%d) tag_val(%ul) is_young(%d)", + Is_block(cont), Tag_val(cont), Is_young(cont)); + CAMLassert(Is_block(cont) && Tag_val(cont) == Cont_tag); + + /* this forms a barrier between execution and any other domains + that might be marking this continuation */ + if (!Is_young(cont) ) caml_darken_cont(cont); + + /* at this stage the stack is assured to be marked */ + v = Field(cont, 0); + + if (caml_domain_alone()) { + Field(cont, 0) = null_stk; + return v; + } + + if 
(atomic_compare_exchange_strong(Op_atomic_val(cont), &v, null_stk)) { + return v; + } else { + return null_stk; + } +} + +CAMLprim value caml_continuation_use (value cont) +{ + value v = caml_continuation_use_noexc(cont); + if (v == Val_ptr(NULL)) + caml_raise_continuation_already_taken(); + return v; +} + +CAMLprim value caml_continuation_use_and_update_handler_noexc + (value cont, value hval, value hexn, value heff) +{ + value stack; + struct stack_info* stk; + + stack = caml_continuation_use_noexc (cont); + stk = Ptr_val(stack); + if (stk == NULL) { + /* The continuation has already been taken */ + return stack; + } + while (Stack_parent(stk) != NULL) stk = Stack_parent(stk); + Stack_handle_value(stk) = hval; + Stack_handle_exception(stk) = hexn; + Stack_handle_effect(stk) = heff; + return stack; +} + +void caml_continuation_replace(value cont, struct stack_info* stk) +{ + value n = Val_ptr(NULL); + int b = atomic_compare_exchange_strong(Op_atomic_val(cont), &n, Val_ptr(stk)); + CAMLassert(b); + (void)b; /* squash unused warning */ +} + +CAMLprim value caml_drop_continuation (value cont) +{ + struct stack_info* stk = Ptr_val(caml_continuation_use(cont)); + caml_free_stack(stk); + return Val_unit; +} diff --git a/runtime/finalise.c b/runtime/finalise.c index 46e1b7dd4fa1..d3f74789f2d4 100644 --- a/runtime/finalise.c +++ b/runtime/finalise.c @@ -15,90 +15,51 @@ #define CAML_INTERNALS -/* Handling of finalised values. 
*/ +#include #include "caml/callback.h" -#include "caml/compact.h" +#include "caml/eventlog.h" #include "caml/fail.h" #include "caml/finalise.h" +#include "caml/memory.h" #include "caml/minor_gc.h" -#include "caml/mlvalues.h" +#include "caml/misc.h" #include "caml/roots.h" -#include "caml/signals.h" - -struct final { - value fun; - value val; - int offset; -}; - -struct finalisable { - struct final *table; - uintnat old; - uintnat young; - uintnat size; -}; -/* [0..old) : finalisable set, the values are in the major heap - [old..young) : recent set, the values could be in the minor heap - [young..size) : free space - - The element of the finalisable set are moved to the finalising set - below when the value are unreachable (for the first or last time). - -*/ - -static struct finalisable finalisable_first = {NULL,0,0,0}; -static struct finalisable finalisable_last = {NULL,0,0,0}; - -struct to_do { - struct to_do *next; - int size; - struct final item[1]; /* variable size */ -}; - -static struct to_do *to_do_hd = NULL; -static struct to_do *to_do_tl = NULL; -/* - to_do_hd: head of the list of finalisation functions that can be run. - to_do_tl: tail of the list of finalisation functions that can be run. - - It is the finalising set. 
-*/ - -static int running_finalisation_function = 0; +#include "caml/shared_heap.h" /* [size] is a number of elements for the [to_do.item] array */ -static void alloc_to_do (int size) +static void alloc_todo (caml_domain_state* d, int size) { - struct to_do *result = caml_stat_alloc_noexc (sizeof (struct to_do) + - size * sizeof (struct final)); + struct final_todo *result = + caml_stat_alloc_noexc (sizeof (struct final_todo) + + size * sizeof (struct final)); + struct caml_final_info *f = d->final_info; if (result == NULL) caml_fatal_error ("out of memory"); result->next = NULL; result->size = size; - if (to_do_tl == NULL){ - to_do_hd = result; - to_do_tl = result; - if(!running_finalisation_function) caml_set_action_pending(); - }else{ - CAMLassert (to_do_tl->next == NULL); - to_do_tl->next = result; - to_do_tl = result; + if (f->todo_tail == NULL) { + f->todo_head = result; + f->todo_tail = result; + } else { + CAMLassert (f->todo_tail->next == NULL); + f->todo_tail->next = result; + f->todo_tail = result; } } /* Find white finalisable values, move them to the finalising set, and - darken them (if darken_value is true). -*/ -static void generic_final_update (struct finalisable * final, int darken_value) + darken them (if darken_value is true). */ +static void generic_final_update + (caml_domain_state* d, struct finalisable *final, int darken_value) { uintnat i, j, k; uintnat todo_count = 0; + struct caml_final_info *f = d->final_info; CAMLassert (final->old <= final->young); - for (i = 0; i < final->old; i++){ + for (i = 0; i < final->old; i++) { CAMLassert (Is_block (final->table[i].val)); - CAMLassert (Is_in_heap (final->table[i].val)); - if (Is_white_val (final->table[i].val)){ + if (is_unmarked (final->table[i].val)) { ++ todo_count; } } @@ -112,24 +73,23 @@ static void generic_final_update (struct finalisable * final, int darken_value) (alive or in the minor heap), next available slot. - k : index in to_do_tl, next available slot. 
*/ - if (todo_count > 0){ - alloc_to_do (todo_count); + if (todo_count > 0) { + alloc_todo (d, todo_count); j = k = 0; for (i = 0; i < final->old; i++){ CAMLassert (Is_block (final->table[i].val)); - CAMLassert (Is_in_heap (final->table[i].val)); CAMLassert (Tag_val (final->table[i].val) != Forward_tag); - if(Is_white_val (final->table[i].val)){ + if (is_unmarked (final->table[i].val)) { /** dead */ - to_do_tl->item[k] = final->table[i]; - if(!darken_value){ + f->todo_tail->item[k] = final->table[i]; + if (!darken_value) { /* The value is not darken so the finalisation function is called with unit not with the value */ - to_do_tl->item[k].val = Val_unit; - to_do_tl->item[k].offset = 0; + f->todo_tail->item[k].val = Val_unit; + f->todo_tail->item[k].offset = 0; }; k++; - }else{ + } else { /** alive */ final->table[j++] = final->table[i]; } @@ -137,146 +97,142 @@ static void generic_final_update (struct finalisable * final, int darken_value) CAMLassert (i == final->old); CAMLassert (k == todo_count); final->old = j; - for(;i < final->young; i++){ + for ( ; i < final->young; i++) { final->table[j++] = final->table[i]; } final->young = j; - to_do_tl->size = k; - if(darken_value){ - for (i = 0; i < k; i++){ + f->todo_tail->size = k; + if (darken_value) { + for (i = 0; i < k; i++) { /* Note that item may already be dark due to multiple entries in the final table. 
*/ - caml_darken (to_do_tl->item[i].val, NULL); + caml_darken (NULL, f->todo_tail->item[i].val, NULL); } } } } -void caml_final_update_mark_phase (){ - generic_final_update(&finalisable_first, /* darken_value */ 1); +int caml_final_update_first (caml_domain_state* d) +{ + struct caml_final_info *f = d->final_info; + if (!f->updated_first) { + CAML_EV_BEGIN(EV_FINALISE_UPDATE_FIRST); + generic_final_update (d, &f->first, /* darken_value */ 1); + CAML_EV_END(EV_FINALISE_UPDATE_FIRST); + f->updated_first = 1; + return 1; + } + return 0; } -void caml_final_update_clean_phase (){ - generic_final_update(&finalisable_last, /* darken_value */ 0); +int caml_final_update_last (caml_domain_state* d) +{ + struct caml_final_info *f = d->final_info; + if (!f->updated_last) { + CAML_EV_BEGIN(EV_FINALISE_UPDATE_LAST); + generic_final_update (d, &f->last, /* darken_value */ 0); + CAML_EV_END(EV_FINALISE_UPDATE_LAST); + f->updated_last = 1; + return 1; + } + return 0; } -/* Call the finalisation functions for the finalising set. - Note that this function must be reentrant. 
-*/ -value caml_final_do_calls_exn (void) +void caml_final_do_calls (void) { struct final f; value res; + struct caml_final_info *fi = Caml_state->final_info; - if (!running_finalisation_function && to_do_hd != NULL){ - if (caml_finalise_begin_hook != NULL) (*caml_finalise_begin_hook) (); + if (fi->running_finalisation_function) return; + if (fi->todo_head != NULL) { caml_gc_message (0x80, "Calling finalisation functions.\n"); - while (1){ - while (to_do_hd != NULL && to_do_hd->size == 0){ - struct to_do *next_hd = to_do_hd->next; - caml_stat_free (to_do_hd); - to_do_hd = next_hd; - if (to_do_hd == NULL) to_do_tl = NULL; + while (1) { + while (fi->todo_head != NULL && fi->todo_head->size == 0) { + struct final_todo *next_head = fi->todo_head->next; + caml_stat_free (fi->todo_head); + fi->todo_head = next_head; + if (fi->todo_head == NULL) fi->todo_tail = NULL; } - if (to_do_hd == NULL) break; - CAMLassert (to_do_hd->size > 0); - -- to_do_hd->size; - f = to_do_hd->item[to_do_hd->size]; - running_finalisation_function = 1; + if (fi->todo_head == NULL) break; + CAMLassert (fi->todo_head->size > 0); + --fi->todo_head->size; + f = fi->todo_head->item[fi->todo_head->size]; + fi->running_finalisation_function = 1; res = caml_callback_exn (f.fun, f.val + f.offset); - running_finalisation_function = 0; - if (Is_exception_result (res)) return res; + fi->running_finalisation_function = 0; + if (Is_exception_result(res)) caml_raise (Extract_exception (res)); } caml_gc_message (0x80, "Done calling finalisation functions.\n"); - if (caml_finalise_end_hook != NULL) (*caml_finalise_end_hook) (); } - return Val_unit; } /* Call a scanning_action [f] on [x]. */ -#define Call_action(f,x) (*(f)) ((x), &(x)) - -/* Call [*f] on the closures of the finalisable set and - the closures and values of the finalising set. 
- This is called by the major GC [caml_darken_all_roots] - and by the compactor through [caml_do_roots] -*/ -void caml_final_do_roots (scanning_action f) +#define Call_action(f,d,x) (*(f)) ((d), (x), &(x)) + +/* Called my major_gc for marking roots */ +void caml_final_do_roots + (scanning_action act, void* fdata, caml_domain_state* d, int do_val) { uintnat i; - struct to_do *todo; - - CAMLassert (finalisable_first.old <= finalisable_first.young); - for (i = 0; i < finalisable_first.young; i++){ - Call_action (f, finalisable_first.table[i].fun); - }; - - CAMLassert (finalisable_last.old <= finalisable_last.young); - for (i = 0; i < finalisable_last.young; i++){ - Call_action (f, finalisable_last.table[i].fun); - }; - - for (todo = to_do_hd; todo != NULL; todo = todo->next){ - for (i = 0; i < todo->size; i++){ - Call_action (f, todo->item[i].fun); - Call_action (f, todo->item[i].val); - } + struct final_todo *todo; + struct caml_final_info *f = d->final_info; + + CAMLassert (f->first.old <= f->first.young); + for (i = 0; i < f->first.young; i++) { + Call_action (act, fdata, f->first.table[i].fun); + if (do_val) + Call_action (act, fdata, f->first.table[i].val); } -} -/* Call caml_invert_root on the values of the finalisable set. This is called - directly by the compactor. 
-*/ -void caml_final_invert_finalisable_values () -{ - uintnat i; + CAMLassert (f->last.old <= f->last.young); + for (i = 0; i < f->last.young; i++) { + Call_action (act, fdata, f->last.table[i].fun); + if (do_val) + Call_action (act, fdata, f->last.table[i].val); + } - CAMLassert (finalisable_first.old <= finalisable_first.young); - for (i = 0; i < finalisable_first.young; i++){ - caml_invert_root(finalisable_first.table[i].val, - &finalisable_first.table[i].val); - }; - - CAMLassert (finalisable_last.old <= finalisable_last.young); - for (i = 0; i < finalisable_last.young; i++){ - caml_invert_root(finalisable_last.table[i].val, - &finalisable_last.table[i].val); - }; + for (todo = f->todo_head; todo != NULL; todo = todo->next) { + for (i = 0; i < todo->size; i++) { + Call_action (act, fdata, todo->item[i].fun); + Call_action (act, fdata, todo->item[i].val); + } + } } -/* Call [caml_oldify_one] on the closures and values of the recent set. - This is called by the minor GC through [caml_oldify_local_roots]. 
-*/ -void caml_final_oldify_young_roots () +/* Called by minor gc for marking roots */ +void caml_final_do_young_roots + (scanning_action act, void* fdata, caml_domain_state* d, int do_last_val) { uintnat i; + struct caml_final_info *f = d->final_info; - CAMLassert (finalisable_first.old <= finalisable_first.young); - for (i = finalisable_first.old; i < finalisable_first.young; i++){ - caml_oldify_one(finalisable_first.table[i].fun, - &finalisable_first.table[i].fun); - caml_oldify_one(finalisable_first.table[i].val, - &finalisable_first.table[i].val); + CAMLassert (f->first.old <= f->first.young); + for (i = f->first.old; i < f->first.young; i++) { + Call_action (act, fdata, f->first.table[i].fun); + Call_action (act, fdata, f->first.table[i].val); } - CAMLassert (finalisable_last.old <= finalisable_last.young); - for (i = finalisable_last.old; i < finalisable_last.young; i++){ - caml_oldify_one(finalisable_last.table[i].fun, - &finalisable_last.table[i].fun); + CAMLassert (f->last.old <= f->last.old); + for (i = f->last.old; i < f->last.young; i++) { + Call_action (act, fdata, f->last.table[i].fun); + if (do_last_val) + Call_action (act, fdata, f->last.table[i].val); } - } -static void generic_final_minor_update (struct finalisable * final) +static void generic_final_minor_update + (caml_domain_state* d, struct finalisable * final) { uintnat i, j, k; uintnat todo_count = 0; + struct caml_final_info *fi = d->final_info; CAMLassert (final->old <= final->young); for (i = final->old; i < final->young; i++){ CAMLassert (Is_block (final->table[i].val)); - CAMLassert (Is_in_heap_or_young (final->table[i].val)); - if (Is_young(final->table[i].val) && Hd_val(final->table[i].val) != 0){ + if (Is_young(final->table[i].val) && + caml_get_header_val(final->table[i].val) != 0){ ++ todo_count; } } @@ -289,22 +245,22 @@ static void generic_final_minor_update (struct finalisable * final) next available slot. - k : index in to_do_tl, next available slot. 
*/ - if (todo_count > 0){ - alloc_to_do (todo_count); + if (todo_count > 0) { + alloc_todo (d, todo_count); k = 0; j = final->old; - for (i = final->old; i < final->young; i++){ + for (i = final->old; i < final->young; i++) { CAMLassert (Is_block (final->table[i].val)); - CAMLassert (Is_in_heap_or_young (final->table[i].val)); CAMLassert (Tag_val (final->table[i].val) != Forward_tag); - if(Is_young(final->table[i].val) && Hd_val(final->table[i].val) != 0){ + if (Is_young(final->table[j].val) && + caml_get_header_val(final->table[i].val) != 0) { /** dead */ - to_do_tl->item[k] = final->table[i]; + fi->todo_tail->item[k] = final->table[i]; /* The finalisation function is called with unit not with the value */ - to_do_tl->item[k].val = Val_unit; - to_do_tl->item[k].offset = 0; + fi->todo_tail->item[k].val = Val_unit; + fi->todo_tail->item[k].offset = 0; k++; - }else{ + } else { /** alive */ final->table[j++] = final->table[i]; } @@ -312,119 +268,143 @@ static void generic_final_minor_update (struct finalisable * final) CAMLassert (i == final->young); CAMLassert (k == todo_count); final->young = j; - to_do_tl->size = todo_count; + fi->todo_tail->size = todo_count; } /** update the minor value to the copied major value */ - for (i = final->old; i < final->young; i++){ + for (i = final->old; i < final->young; i++) { CAMLassert (Is_block (final->table[i].val)); - CAMLassert (Is_in_heap_or_young (final->table[i].val)); if (Is_young(final->table[i].val)) { - CAMLassert (Hd_val(final->table[i].val) == 0); - final->table[i].val = Field(final->table[i].val,0); + CAMLassert (caml_get_header_val(final->table[i].val) == 0); + final->table[i].val = Field(final->table[i].val, 0); } } +} - /** check invariant */ - CAMLassert (final->old <= final->young); - for (i = 0; i < final->young; i++){ - CAMLassert( Is_in_heap(final->table[i].val) ); - }; - +void caml_final_update_last_minor (caml_domain_state* d) +{ + generic_final_minor_update(d, &d->final_info->last); } -/* At the end of 
minor collection update the finalise_last roots in - minor heap when moved to major heap or moved them to the finalising - set when dead. -*/ -void caml_final_update_minor_roots () +void caml_final_empty_young (caml_domain_state* d) { - generic_final_minor_update(&finalisable_last); + struct caml_final_info *f = d->final_info; + f->first.old = f->first.young; + f->last.old = f->last.young; } -/* Empty the recent set into the finalisable set. - This is called at the end of each minor collection. - The minor heap must be empty when this is called. -*/ -void caml_final_empty_young (void) +void caml_final_merge_finalisable + (struct finalisable *source, struct finalisable *target) { - finalisable_first.old = finalisable_first.young; - finalisable_last.old = finalisable_last.young; + uintnat new_size; + + CAMLassert (target->old <= target->young); + /* to merge the source structure, all its values are in the major heap */ + CAMLassert (source->old == source->young); + if (target->young + source->young >= target->size) { + new_size = 2 * (target->young + source->young); + if (target->table == NULL) { + target->table = caml_stat_alloc (new_size * sizeof (struct final)); + CAMLassert (target->old == 0); + CAMLassert (target->young == 0); + target->size = new_size; + } else { + target->table = caml_stat_resize (target->table, + new_size * sizeof (struct final)); + target->size = new_size; + } + } + /* all values from the source are old, we will prepend them + into the old area of the target */ + memmove(target->table + source->young, target->table, + target->young * sizeof (struct final)); + memcpy(target->table, source->table, + source->young * sizeof (struct final)); + /* adjust indices for the prepended values from the source */ + target->old += source->young; + target->young += source->young; + +#ifdef DEBUG + { + /** check target is well formed on the values */ + int i; + for (i = 0; i < target->old; i++) { + CAMLassert (target->table[i].val); /* no null ptrs */ + 
CAMLassert (Is_block(target->table[i].val)); + CAMLassert (!Is_young(target->table[i].val)); + }; + for (; i < target->young; i++) { + CAMLassert (target->table[i].val); /* no null ptrs */ + CAMLassert (Is_block(target->table[i].val)); + } + } +#endif } -/* Put (f,v) in the recent set. */ static void generic_final_register (struct finalisable *final, value f, value v) { - if (!Is_block (v) - || !Is_in_heap_or_young(v) - || Tag_val (v) == Lazy_tag + uintnat new_size; + + if (!Is_block(v) || Tag_val(v) == Lazy_tag #ifdef FLAT_FLOAT_ARRAY - || Tag_val (v) == Double_tag + || Tag_val(v) == Double_tag #endif - || Tag_val (v) == Forward_tag) { + || Tag_val(v) == Forcing_tag + || Tag_val(v) == Forward_tag) { caml_invalid_argument ("Gc.finalise"); } CAMLassert (final->old <= final->young); - if (final->young >= final->size){ - if (final->table == NULL){ - uintnat new_size = 30; + if (final->young >= final->size) { + if (final->table == NULL) { + new_size = 30; final->table = caml_stat_alloc (new_size * sizeof (struct final)); CAMLassert (final->old == 0); CAMLassert (final->young == 0); final->size = new_size; - }else{ - uintnat new_size = final->size * 2; + } else { + new_size = final->size * 2; final->table = caml_stat_resize (final->table, - new_size * sizeof (struct final)); + new_size * sizeof (struct final)); final->size = new_size; } } CAMLassert (final->young < final->size); final->table[final->young].fun = f; - if (Tag_val (v) == Infix_tag){ + if (Tag_val(v) == Infix_tag) { final->table[final->young].offset = Infix_offset_val (v); final->table[final->young].val = v - Infix_offset_val (v); - }else{ + } else { final->table[final->young].offset = 0; final->table[final->young].val = v; } ++ final->young; - } -CAMLprim value caml_final_register (value f, value v){ - generic_final_register(&finalisable_first, f, v); + +CAMLprim value caml_final_register (value f, value v) +{ + generic_final_register (&Caml_state->final_info->first, f, v); return Val_unit; } -CAMLprim 
value caml_final_register_called_without_value (value f, value v){ - generic_final_register(&finalisable_last, f, v); +CAMLprim value caml_final_register_called_without_value (value f, value v) +{ + generic_final_register (&Caml_state->final_info->last, f, v); return Val_unit; } CAMLprim value caml_final_release (value unit) { - running_finalisation_function = 0; - /* Some finalisers might be waiting. */ - if (to_do_tl != NULL) - caml_set_action_pending(); + Caml_state->final_info->running_finalisation_function = 0; return Val_unit; } -static void gen_final_invariant_check(struct finalisable *final){ - uintnat i; - - CAMLassert (final->old <= final->young); - for (i = 0; i < final->old; i++){ - CAMLassert( Is_in_heap(final->table[i].val) ); - }; - for (i = final->old; i < final->young; i++){ - CAMLassert( Is_in_heap_or_young(final->table[i].val) ); - }; -} - -void caml_final_invariant_check(void){ - gen_final_invariant_check(&finalisable_first); - gen_final_invariant_check(&finalisable_last); +struct caml_final_info* caml_alloc_final_info (void) +{ + struct caml_final_info* f = + caml_stat_alloc_noexc (sizeof(struct caml_final_info)); + if(f != NULL) + memset (f, 0, sizeof(struct caml_final_info)); + return f; } diff --git a/runtime/fix_code.c b/runtime/fix_code.c index aa059be5dfd7..76802d4d2f48 100644 --- a/runtime/fix_code.c +++ b/runtime/fix_code.c @@ -106,7 +106,8 @@ int* caml_init_opcode_nargs(void) l[BRANCH] = l[BRANCHIF] = l[BRANCHIFNOT] = l[PUSHTRAP] = l[C_CALL1] = l[C_CALL2] = l[C_CALL3] = l[C_CALL4] = l[C_CALL5] = l[CONSTINT] = l[PUSHCONSTINT] = l[OFFSETINT] = - l[OFFSETREF] = l[OFFSETCLOSURE] = l[PUSHOFFSETCLOSURE] = 1; + l[OFFSETREF] = l[OFFSETCLOSURE] = l[PUSHOFFSETCLOSURE] = + l[RESUMETERM] = l[REPERFORMTERM] = 1; /* Instructions with two operands */ l[APPTERM] = l[CLOSURE] = l[PUSHGETGLOBALFIELD] = diff --git a/runtime/floats.c b/runtime/floats.c index 7561bfba81e6..11c632be07f7 100644 --- a/runtime/floats.c +++ b/runtime/floats.c @@ -36,7 +36,7 @@ 
#include "caml/mlvalues.h" #include "caml/misc.h" #include "caml/reverse.h" -#include "caml/stacks.h" +#include "caml/fiber.h" #if defined(HAS_LOCALE) || defined(__MINGW32__) @@ -153,11 +153,8 @@ CAMLexport value caml_copy_double(double d) { value res; -#define Setup_for_gc -#define Restore_after_gc - Alloc_small(res, Double_wosize, Double_tag); -#undef Setup_for_gc -#undef Restore_after_gc + Alloc_small(res, Double_wosize, Double_tag, + { caml_handle_gc_interrupt_no_async_exceptions(); }); Store_double_val(res, d); return res; } diff --git a/runtime/frame_descriptors.c b/runtime/frame_descriptors.c new file mode 100644 index 000000000000..67e4270247d6 --- /dev/null +++ b/runtime/frame_descriptors.c @@ -0,0 +1,199 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Tom Kelly, OCaml Labs Consultancy */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2019 Indian Institute of Technology, Madras */ +/* Copyright 2021 OCaml Labs Consultancy Ltd */ +/* Copyright 2019 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ + +#define CAML_INTERNALS + +#include "caml/platform.h" +#include "caml/frame_descriptors.h" +#include "caml/major_gc.h" /* for caml_major_cycles_completed */ +#include "caml/memory.h" +#include + +/* Defined in code generated by ocamlopt */ +extern intnat * caml_frametable[]; + +typedef struct link { + intnat* frametable; + struct link *next; +} link; + +#define iter_list(list,lnk) \ + for (lnk = list; lnk != NULL; lnk = lnk->next) + +static frame_descr * next_frame_descr(frame_descr * d) { + unsigned char num_allocs = 0, *p; + CAMLassert(d->retaddr >= 4096); + /* Skip to end of live_ofs */ + p = (unsigned char*)&d->live_ofs[d->num_live]; + /* Skip alloc_lengths if present */ + if (d->frame_size & 2) { + num_allocs = *p; + p += num_allocs + 1; + } + /* Skip debug info if present */ + if (d->frame_size & 1 && + d->frame_size != (unsigned short)-1) { + /* Align to 32 bits */ + p = Align_to(p, uint32_t); + p += sizeof(uint32_t) * (d->frame_size & 2 ? 
num_allocs : 1); + } + /* Align to word size */ + p = Align_to(p, void*); + return ((frame_descr*) p); +} + +static caml_frame_descrs build_frame_descriptors(link* frametables) +{ + intnat num_descr, tblsize, i, j, len; + intnat * tbl; + frame_descr * d; + uintnat h; + link *lnk; + caml_frame_descrs table; + + /* Count the frame descriptors */ + num_descr = 0; + iter_list(frametables,lnk) { + num_descr += *lnk->frametable; + } + + /* The size of the hashtable is a power of 2 greater or equal to + 2 times the number of descriptors */ + tblsize = 4; + while (tblsize < 2 * num_descr) tblsize *= 2; + + /* Allocate the hash table */ + table.descriptors = caml_stat_alloc(tblsize * sizeof(frame_descr*)); + table.mask = tblsize - 1; + for (i = 0; i < tblsize; i++) table.descriptors[i] = NULL; + + /* Fill the hash table */ + iter_list(frametables,lnk) { + tbl = lnk->frametable; + len = *tbl; + d = (frame_descr *)(tbl + 1); + for (j = 0; j < len; j++) { + h = Hash_retaddr(d->retaddr, tblsize - 1); + while (table.descriptors[h] != NULL) { + h = (h+1) & table.mask; + } + table.descriptors[h] = d; + d = next_frame_descr(d); + } + } + return table; +} + +static caml_plat_mutex descr_mutex; +static link* frametables; + +/* Memory used by frametables is only freed once a GC cycle has + completed, because other threads access the frametable at + unpredictable times. */ +struct frametable_version { + caml_frame_descrs table; + + /* after this cycle has completed, + the previous table should be deallocated. 
+ Set to No_need_to_free after prev is freed */ + atomic_uintnat free_prev_after_cycle; + struct frametable_version* prev; +}; +#define No_need_to_free ((uintnat)(-1)) + +/* Only modified when holding descr_mutex, but read without locking */ +static atomic_uintnat current_frametable = ATOMIC_UINTNAT_INIT(0); + +static link *cons(intnat *frametable, link *tl) { + link *lnk = caml_stat_alloc(sizeof(link)); + lnk->frametable = frametable; + lnk->next = tl; + return lnk; +} + +void caml_init_frame_descriptors(void) +{ + int i; + struct frametable_version *ft; + + caml_plat_mutex_init(&descr_mutex); + + caml_plat_lock(&descr_mutex); + for (i = 0; caml_frametable[i] != 0; i++) + frametables = cons(caml_frametable[i], frametables); + + ft = caml_stat_alloc(sizeof(*ft)); + ft->table = build_frame_descriptors(frametables); + atomic_store_rel(&ft->free_prev_after_cycle, No_need_to_free); + ft->prev = 0; + atomic_store_rel(¤t_frametable, (uintnat)ft); + caml_plat_unlock(&descr_mutex); +} + +void caml_register_frametable(intnat *table) +{ + struct frametable_version *ft, *old; + + caml_plat_lock(&descr_mutex); + + frametables = cons(table, frametables); + old = (struct frametable_version*)atomic_load_acq(¤t_frametable); + CAMLassert(old != NULL); + ft = caml_stat_alloc(sizeof(*ft)); + ft->table = build_frame_descriptors(frametables); + atomic_store_rel(&ft->free_prev_after_cycle, caml_major_cycles_completed); + ft->prev = old; + atomic_store_rel(¤t_frametable, (uintnat)ft); + + caml_plat_unlock(&descr_mutex); +} + +caml_frame_descrs caml_get_frame_descrs(void) +{ + struct frametable_version *ft = + (struct frametable_version*)atomic_load_acq(¤t_frametable); + CAMLassert(ft); + if (atomic_load_acq(&ft->free_prev_after_cycle) < caml_major_cycles_completed) + { + /* it's now safe to free the old table */ + caml_plat_lock(&descr_mutex); + if (ft->prev != NULL) { + caml_stat_free(ft->prev->table.descriptors); + caml_stat_free(ft->prev); + ft->prev = NULL; + 
atomic_store_rel(&ft->free_prev_after_cycle, No_need_to_free); + } + caml_plat_unlock(&descr_mutex); + } + return ft->table; +} + +frame_descr* caml_find_frame_descr(caml_frame_descrs fds, uintnat pc) +{ + frame_descr * d; + uintnat h; + + h = Hash_retaddr(pc, fds.mask); + while (1) { + d = fds.descriptors[h]; + if (d == 0) return NULL; /* can happen if some code compiled without -g */ + if (d->retaddr == pc) break; + h = (h+1) & fds.mask; + } + return d; +} diff --git a/runtime/freelist.c b/runtime/freelist.c deleted file mode 100644 index 8e8d5c9160f7..000000000000 --- a/runtime/freelist.c +++ /dev/null @@ -1,1856 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Damien Doligez, projet Para, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. */ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -#define FREELIST_DEBUG 0 -#if FREELIST_DEBUG -#include -#endif - -#include - -#include "caml/config.h" -#include "caml/custom.h" -#include "caml/freelist.h" -#include "caml/gc.h" -#include "caml/gc_ctrl.h" -#include "caml/memory.h" -#include "caml/major_gc.h" -#include "caml/misc.h" -#include "caml/mlvalues.h" -#include "caml/eventlog.h" - -/*************** declarations common to all policies ******************/ - -/* A block in a small free list is a [value] (integer representing a - pointer to the first word after the block's header). The end of the - list is NULL. -*/ -#define Val_NULL ((value) NULL) - -asize_t caml_fl_cur_wsz = 0; /* Number of words in the free set, - including headers but not fragments. 
*/ - -value caml_fl_merge = Val_NULL; /* Current insertion pointer. Managed - jointly with [sweep_slice]. */ - -/* Next in list */ -#define Next_small(v) Field ((v), 0) - -/* Next in memory order */ -Caml_inline value Next_in_mem (value v) { - return (value) &Field ((v), Whsize_val (v)); -} - -#ifdef CAML_INSTR - -/* number of pointers followed to allocate from the free set */ -uintnat caml_instr_alloc_jump = 0; - -#define EV_ALLOC_JUMP(n) (caml_instr_alloc_jump += (n)) - -#endif /*CAML_INSTR*/ - - - -/********************* next-fit allocation policy *********************/ - -/* The free-list is kept sorted by increasing addresses. - This makes the merging of adjacent free blocks possible. - (See [nf_merge_block].) -*/ - -/* The sentinel can be located anywhere in memory, but it must not be - adjacent to any heap object. */ -static struct { - value filler1; /* Make sure the sentinel is never adjacent to any block. */ - header_t h; - value first_field; - value filler2; /* Make sure the sentinel is never adjacent to any block. */ -} nf_sentinel = {0, Make_header (0, 0, Caml_blue), Val_NULL, 0}; - -#define Nf_head (Val_bp (&(nf_sentinel.first_field))) - -static value nf_prev = Nf_head; /* Current allocation pointer. */ -static value nf_last = Val_NULL; /* Last block in the list. Only valid - just after [nf_allocate] returns NULL. */ - -#if defined (DEBUG) || FREELIST_DEBUG -static void nf_check (void) -{ - value cur; - int prev_found = 0, merge_found = 0; - uintnat size_found = 0; - - cur = Next_small (Nf_head); - while (cur != Val_NULL){ - size_found += Whsize_bp (cur); - CAMLassert (Is_in_heap (cur)); - if (cur == nf_prev) prev_found = 1; - if (cur == caml_fl_merge) merge_found = 1; - cur = Next_small (cur); - } - CAMLassert (prev_found || nf_prev == Nf_head); - CAMLassert (merge_found || caml_fl_merge == Nf_head); - CAMLassert (size_found == caml_fl_cur_wsz); -} - -#endif /* DEBUG || FREELIST_DEBUG */ - -/* [nf_allocate_block] is called by [nf_allocate]. 
Given a suitable free - block and the requested size, it allocates a new block from the free - block. There are three cases: - 0. The free block has the requested size. Detach the block from the - free-list and return it. - 1. The free block is 1 word longer than the requested size. Detach - the block from the free list. The remaining word cannot be linked: - turn it into an empty block (header only), and return the rest. - 2. The free block is large enough. Split it in two and return the right - block. - In all cases, the allocated block is right-justified in the free block: - it is located in the high-address words of the free block, so that - the linking of the free-list does not change in case 2. -*/ -static header_t *nf_allocate_block (mlsize_t wh_sz, value prev, value cur) -{ - header_t h = Hd_bp (cur); - CAMLassert (Whsize_hd (h) >= wh_sz); - if (Wosize_hd (h) < wh_sz + 1){ /* Cases 0 and 1. */ - caml_fl_cur_wsz -= Whsize_hd (h); - Next_small (prev) = Next_small (cur); - CAMLassert (Is_in_heap (Next_small (prev)) - || Next_small (prev) == Val_NULL); - if (caml_fl_merge == cur) caml_fl_merge = prev; -#ifdef DEBUG - nf_last = Val_NULL; -#endif - /* In case 1, the following creates the empty block correctly. - In case 0, it gives an invalid header to the block. The function - calling [nf_allocate] will overwrite it. */ - Hd_op (cur) = Make_header (0, 0, Caml_white); - }else{ /* Case 2. */ - caml_fl_cur_wsz -= wh_sz; - Hd_op (cur) = Make_header (Wosize_hd (h) - wh_sz, 0, Caml_blue); - } - nf_prev = prev; - return (header_t *) &Field (cur, Wosize_hd (h) - wh_sz); -} - -static header_t *nf_allocate (mlsize_t wo_sz) -{ - value cur = Val_NULL, prev; - CAMLassert (sizeof (char *) == sizeof (value)); - CAMLassert (wo_sz >= 1); - - CAMLassert (nf_prev != Val_NULL); - /* Search from [nf_prev] to the end of the list. 
*/ - prev = nf_prev; - cur = Next_small (prev); - while (cur != Val_NULL){ - CAMLassert (Is_in_heap (cur)); - if (Wosize_bp (cur) >= wo_sz){ - return nf_allocate_block (Whsize_wosize (wo_sz), prev, cur); - } - prev = cur; - cur = Next_small (prev); - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - } - nf_last = prev; - /* Search from the start of the list to [nf_prev]. */ - prev = Nf_head; - cur = Next_small (prev); - while (prev != nf_prev){ - if (Wosize_bp (cur) >= wo_sz){ - return nf_allocate_block (Whsize_wosize (wo_sz), prev, cur); - } - prev = cur; - cur = Next_small (prev); - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - } - /* No suitable block was found. */ - return NULL; -} - -/* Location of the last fragment seen by the sweeping code. - This is a pointer to the first word after the fragment, which is - the header of the next block. - Note that [last_fragment] doesn't point to the fragment itself, - but to the block after it. -*/ -static header_t *nf_last_fragment; - -static void nf_init_merge (void) -{ - CAML_EV_ALLOC_FLUSH(); - nf_last_fragment = NULL; - caml_fl_merge = Nf_head; -#ifdef DEBUG - nf_check (); -#endif -} - -static void nf_init (void) -{ - Next_small (Nf_head) = Val_NULL; - nf_prev = Nf_head; - caml_fl_cur_wsz = 0; -} - -static void nf_reset (void) -{ - nf_init (); -} - -/* Note: the [limit] parameter is unused because we merge blocks one by one. */ -static header_t *nf_merge_block (value bp, char *limit) -{ - value prev, cur, adj; - header_t hd = Hd_val (bp); - mlsize_t prev_wosz; - - caml_fl_cur_wsz += Whsize_hd (hd); - - /* [merge_block] is now responsible for calling the finalization function. 
*/ - if (Tag_hd (hd) == Custom_tag){ - void (*final_fun)(value) = Custom_ops_val(bp)->finalize; - if (final_fun != NULL) final_fun(bp); - } - -#ifdef DEBUG - caml_set_fields (bp, 0, Debug_free_major); -#endif - prev = caml_fl_merge; - cur = Next_small (prev); - /* The sweep code makes sure that this is the right place to insert - this block: */ - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Nf_head); - CAMLassert (Bp_val (cur) > Bp_val (bp) || cur == Val_NULL); - - /* If [last_fragment] and [bp] are adjacent, merge them. */ - if (nf_last_fragment == Hp_val (bp)){ - mlsize_t bp_whsz = Whsize_val (bp); - if (bp_whsz <= Max_wosize){ - hd = Make_header (bp_whsz, 0, Caml_white); - bp = (value) nf_last_fragment; - Hd_val (bp) = hd; - caml_fl_cur_wsz += Whsize_wosize (0); - } - } - - /* If [bp] and [cur] are adjacent, remove [cur] from the free-list - and merge them. */ - adj = Next_in_mem (bp); - if (adj == cur){ - value next_cur = Next_small (cur); - mlsize_t cur_whsz = Whsize_val (cur); - - if (Wosize_hd (hd) + cur_whsz <= Max_wosize){ - Next_small (prev) = next_cur; - if (nf_prev == cur) nf_prev = prev; - hd = Make_header (Wosize_hd (hd) + cur_whsz, 0, Caml_blue); - Hd_val (bp) = hd; - adj = Next_in_mem (bp); -#ifdef DEBUG - nf_last = Val_NULL; - Next_small (cur) = (value) Debug_free_major; - Hd_val (cur) = Debug_free_major; -#endif - cur = next_cur; - } - } - /* If [prev] and [bp] are adjacent merge them, else insert [bp] into - the free-list if it is big enough. */ - prev_wosz = Wosize_val (prev); - if (Next_in_mem (prev) == bp && prev_wosz + Whsize_hd (hd) < Max_wosize){ - Hd_val (prev) = Make_header (prev_wosz + Whsize_hd (hd), 0, Caml_blue); -#ifdef DEBUG - Hd_val (bp) = Debug_free_major; -#endif - CAMLassert (caml_fl_merge == prev); - }else if (Wosize_hd (hd) != 0){ - Hd_val (bp) = Bluehd_hd (hd); - Next_small (bp) = cur; - Next_small (prev) = bp; - caml_fl_merge = bp; - }else{ - /* This is a fragment. 
Leave it in white but remember it for eventual - merging with the next block. */ - nf_last_fragment = (header_t *) bp; - caml_fl_cur_wsz -= Whsize_wosize (0); - } - return Hp_val (adj); -} - -/* This is a heap extension. We have to insert it in the right place - in the free-list. - [nf_add_blocks] can only be called right after a call to - [nf_allocate] that returned Val_NULL. - Most of the heap extensions are expected to be at the end of the - free list. (This depends on the implementation of [malloc].) - - [bp] must point to a list of blocks chained by their field 0, - terminated by Val_NULL, and field 1 of the first block must point to - the last block. -*/ -static void nf_add_blocks (value bp) -{ - value cur = bp; - CAMLassert (nf_last != Val_NULL); - CAMLassert (Next_small (nf_last) == Val_NULL); - do { - caml_fl_cur_wsz += Whsize_bp (cur); - cur = Field(cur, 0); - } while (cur != Val_NULL); - - if (Bp_val (bp) > Bp_val (nf_last)){ - Next_small (nf_last) = bp; - if (nf_last == caml_fl_merge && (char *) bp < caml_gc_sweep_hp){ - caml_fl_merge = Field (bp, 1); - } - }else{ - value prev; - - prev = Nf_head; - cur = Next_small (prev); - while (cur != Val_NULL && Bp_val (cur) < Bp_val (bp)){ - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Nf_head); - prev = cur; - cur = Next_small (prev); - } - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Nf_head); - CAMLassert (Bp_val (cur) > Bp_val (bp) || cur == Val_NULL); - Next_small (Field (bp, 1)) = cur; - Next_small (prev) = bp; - /* When inserting blocks between [caml_fl_merge] and [caml_gc_sweep_hp], - we must advance [caml_fl_merge] to the new block, so that [caml_fl_merge] - is always the last free-list block before [caml_gc_sweep_hp]. 
*/ - if (prev == caml_fl_merge && (char *) bp < caml_gc_sweep_hp){ - caml_fl_merge = Field (bp, 1); - } - } -} - -static void nf_make_free_blocks - (value *p, mlsize_t size, int do_merge, int color) -{ - mlsize_t sz; - - while (size > 0){ - if (size > Whsize_wosize (Max_wosize)){ - sz = Whsize_wosize (Max_wosize); - }else{ - sz = size; - } - *(header_t *)p = Make_header (Wosize_whsize (sz), 0, color); - if (do_merge) nf_merge_block (Val_hp (p), NULL); - size -= sz; - p += sz; - } -} - -/******************** first-fit allocation policy *********************/ - -#define FLP_MAX 1000 -static value flp [FLP_MAX]; -static int flp_size = 0; -static value beyond = Val_NULL; - -/* The sentinel can be located anywhere in memory, but it must not be - adjacent to any heap object. */ -static struct { - value filler1; /* Make sure the sentinel is never adjacent to any block. */ - header_t h; - value first_field; - value filler2; /* Make sure the sentinel is never adjacent to any block. */ -} ff_sentinel = {0, Make_header (0, 0, Caml_blue), Val_NULL, 0}; - -#define Ff_head (Val_bp (&(ff_sentinel.first_field))) -static value ff_last = Val_NULL; /* Last block in the list. Only valid - just after [ff_allocate] returns NULL. 
*/ - - -#if defined (DEBUG) || FREELIST_DEBUG -static void ff_check (void) -{ - value cur; - int flp_found = 0, merge_found = 0; - uintnat size_found = 0; - int sz = 0; - - cur = Next_small (Ff_head); - while (cur != Val_NULL){ - size_found += Whsize_bp (cur); - CAMLassert (Is_in_heap (cur)); - if (Wosize_bp (cur) > sz){ - sz = Wosize_bp (cur); - if (flp_found < flp_size){ - CAMLassert (Next_small (flp[flp_found]) == cur); - ++ flp_found; - }else{ - CAMLassert (beyond == Val_NULL - || Bp_val (cur) >= Bp_val (Next_small (beyond))); - } - } - if (cur == caml_fl_merge) merge_found = 1; - cur = Next_small (cur); - } - CAMLassert (flp_found == flp_size); - CAMLassert (merge_found || caml_fl_merge == Ff_head); - CAMLassert (size_found == caml_fl_cur_wsz); -} -#endif /* DEBUG || FREELIST_DEBUG */ - -/* [ff_allocate_block] is called by [ff_allocate]. Given a suitable free - block and the requested size, it allocates a new block from the free - block. There are three cases: - 0. The free block has the requested size. Detach the block from the - free-list and return it. - 1. The free block is 1 word longer than the requested size. Detach - the block from the free list. The remaining word cannot be linked: - turn it into an empty block (header only), and return the rest. - 2. The free block is large enough. Split it in two and return the right - block. - In all cases, the allocated block is right-justified in the free block: - it is located in the high-address words of the free block, so that - the linking of the free-list does not change in case 2. -*/ -static header_t *ff_allocate_block (mlsize_t wh_sz, int flpi, value prev, - value cur) -{ - header_t h = Hd_bp (cur); - CAMLassert (Whsize_hd (h) >= wh_sz); - if (Wosize_hd (h) < wh_sz + 1){ /* Cases 0 and 1. 
*/ - caml_fl_cur_wsz -= Whsize_hd (h); - Next_small (prev) = Next_small (cur); - CAMLassert (Is_in_heap (Next_small (prev)) - || Next_small (prev) == Val_NULL); - if (caml_fl_merge == cur) caml_fl_merge = prev; -#ifdef DEBUG - ff_last = Val_NULL; -#endif - /* In case 1, the following creates the empty block correctly. - In case 0, it gives an invalid header to the block. The function - calling [ff_allocate] will overwrite it. */ - Hd_op (cur) = Make_header (0, 0, Caml_white); - if (flpi + 1 < flp_size && flp[flpi + 1] == cur){ - flp[flpi + 1] = prev; - }else if (flpi == flp_size - 1){ - beyond = (prev == Ff_head) ? Val_NULL : prev; - -- flp_size; - } - }else{ /* Case 2. */ - caml_fl_cur_wsz -= wh_sz; - Hd_op (cur) = Make_header (Wosize_hd (h) - wh_sz, 0, Caml_blue); - } - return (header_t *) &Field (cur, Wosize_hd (h) - wh_sz); -} - -static header_t *ff_allocate (mlsize_t wo_sz) -{ - value cur = Val_NULL, prev; - header_t *result; - int i; - mlsize_t sz, prevsz; - CAMLassert (sizeof (char *) == sizeof (value)); - CAMLassert (wo_sz >= 1); - - /* Search in the flp array. */ - for (i = 0; i < flp_size; i++){ - sz = Wosize_bp (Next_small (flp[i])); - if (sz >= wo_sz){ -#if FREELIST_DEBUG - if (i > 5) fprintf (stderr, "FLP: found at %d size=%d\n", i, wo_sz); -#endif - result = ff_allocate_block (Whsize_wosize (wo_sz), i, flp[i], - Next_small (flp[i])); - goto update_flp; - } - } - /* Extend the flp array. */ - if (flp_size == 0){ - prev = Ff_head; - prevsz = 0; - }else{ - prev = Next_small (flp[flp_size - 1]); - prevsz = Wosize_bp (prev); - if (beyond != Val_NULL) prev = beyond; - } - while (flp_size < FLP_MAX){ - cur = Next_small (prev); - if (cur == Val_NULL){ - ff_last = prev; - beyond = (prev == Ff_head) ? 
Val_NULL : prev; - return NULL; - }else{ - sz = Wosize_bp (cur); - if (sz > prevsz){ - flp[flp_size] = prev; - ++ flp_size; - if (sz >= wo_sz){ - beyond = cur; - i = flp_size - 1; -#if FREELIST_DEBUG - if (flp_size > 5){ - fprintf (stderr, "FLP: extended to %d\n", flp_size); - } -#endif - result = ff_allocate_block (Whsize_wosize (wo_sz), flp_size - 1, - prev, cur); - goto update_flp; - } - prevsz = sz; - } - } - prev = cur; - } - beyond = cur; - - /* The flp table is full. Do a slow first-fit search. */ -#if FREELIST_DEBUG - fprintf (stderr, "FLP: table is full -- slow first-fit\n"); -#endif - if (beyond != Val_NULL){ - prev = beyond; - }else{ - prev = flp[flp_size - 1]; - } - prevsz = Wosize_bp (Next_small (flp[FLP_MAX-1])); - CAMLassert (prevsz < wo_sz); - cur = Next_small (prev); - while (cur != Val_NULL){ - CAMLassert (Is_in_heap (cur)); - sz = Wosize_bp (cur); - if (sz < prevsz){ - beyond = cur; - }else if (sz >= wo_sz){ - return ff_allocate_block (Whsize_wosize (wo_sz), flp_size, prev, cur); - } - prev = cur; - cur = Next_small (prev); - } - ff_last = prev; - return NULL; - - update_flp: /* (i, sz) */ - /* The block at [i] was removed or reduced. Update the table. 
*/ - CAMLassert (0 <= i && i < flp_size + 1); - if (i < flp_size){ - if (i > 0){ - prevsz = Wosize_bp (Next_small (flp[i-1])); - }else{ - prevsz = 0; - } - if (i == flp_size - 1){ - if (Wosize_bp (Next_small (flp[i])) <= prevsz){ - beyond = Next_small (flp[i]); - -- flp_size; - }else{ - beyond = Val_NULL; - } - }else{ - value buf [FLP_MAX]; - int j = 0; - mlsize_t oldsz = sz; - - prev = flp[i]; - while (prev != flp[i+1] && j < FLP_MAX - i){ - cur = Next_small (prev); - sz = Wosize_bp (cur); - if (sz > prevsz){ - buf[j++] = prev; - prevsz = sz; - if (sz >= oldsz){ - CAMLassert (sz == oldsz); - break; - } - } - prev = cur; - } -#if FREELIST_DEBUG - if (j > 2) fprintf (stderr, "FLP: update; buf size = %d\n", j); -#endif - if (FLP_MAX >= flp_size + j - 1){ - if (j != 1){ - memmove (&flp[i+j], &flp[i+1], sizeof (value) * (flp_size-i-1)); - } - if (j > 0) memmove (&flp[i], &buf[0], sizeof (value) * j); - flp_size += j - 1; - }else{ - if (FLP_MAX > i + j){ - if (j != 1){ - memmove (&flp[i+j], &flp[i+1], sizeof (value) * (FLP_MAX-i-j)); - } - if (j > 0) memmove (&flp[i], &buf[0], sizeof (value) * j); - }else{ - if (i != FLP_MAX){ - memmove (&flp[i], &buf[0], sizeof (value) * (FLP_MAX - i)); - } - } - flp_size = FLP_MAX - 1; - beyond = Next_small (flp[FLP_MAX - 1]); - } - } - } - return result; -} - -/* Location of the last fragment seen by the sweeping code. - This is a pointer to the first word after the fragment, which is - the header of the next block. - Note that [ff_last_fragment] doesn't point to the fragment itself, - but to the block after it. 
-*/ -static header_t *ff_last_fragment; - -static void ff_init_merge (void) -{ - CAML_EV_ALLOC_FLUSH(); - ff_last_fragment = NULL; - caml_fl_merge = Ff_head; -#ifdef DEBUG - ff_check (); -#endif -} - -static void ff_truncate_flp (value changed) -{ - if (changed == Ff_head){ - flp_size = 0; - beyond = Val_NULL; - }else{ - while (flp_size > 0 && - Bp_val (Next_small (flp[flp_size - 1])) >= Bp_val (changed)) - -- flp_size; - if (Bp_val (beyond) >= Bp_val (changed)) beyond = Val_NULL; - } -} - -static void ff_init (void) -{ - Next_small (Ff_head) = Val_NULL; - ff_truncate_flp (Ff_head); - caml_fl_cur_wsz = 0; -} - -static void ff_reset (void) -{ - ff_init (); -} - -/* Note: the [limit] parameter is unused because we merge blocks one by one. */ -static header_t *ff_merge_block (value bp, char *limit) -{ - value prev, cur, adj; - header_t hd = Hd_val (bp); - mlsize_t prev_wosz; - - caml_fl_cur_wsz += Whsize_hd (hd); - - /* [merge_block] is now responsible for calling the finalization function. */ - if (Tag_hd (hd) == Custom_tag){ - void (*final_fun)(value) = Custom_ops_val(bp)->finalize; - if (final_fun != NULL) final_fun(bp); - } - -#ifdef DEBUG - caml_set_fields (bp, 0, Debug_free_major); -#endif - prev = caml_fl_merge; - cur = Next_small (prev); - /* The sweep code makes sure that this is the right place to insert - this block: */ - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Ff_head); - CAMLassert (Bp_val (cur) > Bp_val (bp) || cur == Val_NULL); - - ff_truncate_flp (prev); - - /* If [ff_last_fragment] and [bp] are adjacent, merge them. */ - if (ff_last_fragment == Hp_bp (bp)){ - mlsize_t bp_whsz = Whsize_val (bp); - if (bp_whsz <= Max_wosize){ - hd = Make_header (bp_whsz, 0, Caml_white); - bp = (value) ff_last_fragment; - Hd_val (bp) = hd; - caml_fl_cur_wsz += Whsize_wosize (0); - } - } - - /* If [bp] and [cur] are adjacent, remove [cur] from the free-list - and merge them. 
*/ - adj = Next_in_mem (bp); - if (adj == cur){ - value next_cur = Next_small (cur); - mlsize_t cur_whsz = Whsize_val (cur); - - if (Wosize_hd (hd) + cur_whsz <= Max_wosize){ - Next_small (prev) = next_cur; - hd = Make_header (Wosize_hd (hd) + cur_whsz, 0, Caml_blue); - Hd_val (bp) = hd; - adj = Next_in_mem (bp); -#ifdef DEBUG - ff_last = Val_NULL; - Next_small (cur) = (value) Debug_free_major; - Hd_val (cur) = Debug_free_major; -#endif - cur = next_cur; - } - } - /* If [prev] and [bp] are adjacent merge them, else insert [bp] into - the free-list if it is big enough. */ - prev_wosz = Wosize_val (prev); - if (Next_in_mem (prev) == bp && prev_wosz + Whsize_hd (hd) < Max_wosize){ - Hd_val (prev) = Make_header (prev_wosz + Whsize_hd (hd), 0, Caml_blue); -#ifdef DEBUG - Hd_val (bp) = Debug_free_major; -#endif - CAMLassert (caml_fl_merge == prev); - }else if (Wosize_hd (hd) != 0){ - Hd_val (bp) = Bluehd_hd (hd); - Next_small (bp) = cur; - Next_small (prev) = bp; - caml_fl_merge = bp; - }else{ - /* This is a fragment. Leave it in white but remember it for eventual - merging with the next block. */ - ff_last_fragment = (header_t *) bp; - caml_fl_cur_wsz -= Whsize_wosize (0); - } - return Hp_val (adj); -} - -/* This is a heap extension. We have to insert it in the right place - in the free-list. - [ff_add_blocks] can only be called right after a call to - [ff_allocate] that returned Val_NULL. - Most of the heap extensions are expected to be at the end of the - free list. (This depends on the implementation of [malloc].) - - [bp] must point to a list of blocks chained by their field 0, - terminated by Val_NULL, and field 1 of the first block must point to - the last block. 
-*/ -static void ff_add_blocks (value bp) -{ - value cur = bp; - CAMLassert (ff_last != Val_NULL); - CAMLassert (Next_small (ff_last) == Val_NULL); - do { - caml_fl_cur_wsz += Whsize_bp (cur); - cur = Field(cur, 0); - } while (cur != Val_NULL); - - if (Bp_val (bp) > Bp_val (ff_last)){ - Next_small (ff_last) = bp; - if (ff_last == caml_fl_merge && (char *) bp < caml_gc_sweep_hp){ - caml_fl_merge = Field (bp, 1); - } - if (flp_size < FLP_MAX){ - flp [flp_size++] = ff_last; - } - }else{ - value prev; - - prev = Ff_head; - cur = Next_small (prev); - while (cur != Val_NULL && Bp_val (cur) < Bp_val (bp)){ - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Ff_head); - /* XXX TODO: extend flp on the fly */ - prev = cur; - cur = Next_small (prev); - } - CAMLassert (Bp_val (prev) < Bp_val (bp) || prev == Ff_head); - CAMLassert (Bp_val (cur) > Bp_val (bp) || cur == Val_NULL); - Next_small (Field (bp, 1)) = cur; - Next_small (prev) = bp; - /* When inserting blocks between [caml_fl_merge] and [caml_gc_sweep_hp], - we must advance [caml_fl_merge] to the new block, so that [caml_fl_merge] - is always the last free-list block before [caml_gc_sweep_hp]. */ - if (prev == caml_fl_merge && (char *) bp < caml_gc_sweep_hp){ - caml_fl_merge = Field (bp, 1); - } - ff_truncate_flp (bp); - } -} - -static void ff_make_free_blocks - (value *p, mlsize_t size, int do_merge, int color) -{ - mlsize_t sz; - - while (size > 0){ - if (size > Whsize_wosize (Max_wosize)){ - sz = Whsize_wosize (Max_wosize); - }else{ - sz = size; - } - *(header_t *)p = Make_header (Wosize_whsize (sz), 0, color); - if (do_merge) ff_merge_block (Val_hp (p), NULL); - size -= sz; - p += sz; - } -} - -/********************* best-fit allocation policy *********************/ - -/* quick-fit + FIFO-ordered best fit (Wilson's nomenclature) - We use Standish's data structure (a tree of doubly-linked lists) - with a splay tree (Sleator & Tarjan). 
-*/ - -/* [BF_NUM_SMALL] must be at least 4 for this code to work - and at least 5 for good performance on typical OCaml programs. - For portability reasons, BF_NUM_SMALL cannot be more than 32. -*/ -#define BF_NUM_SMALL 16 - -/* Note that indexing into [bf_small_fl] starts at 1, so the first entry - in this array is unused. -*/ -static struct { - value free; - value *merge; -} bf_small_fl [BF_NUM_SMALL + 1]; -static int bf_small_map = 0; - -/* Small free blocks have only one pointer to the next block. - Large free blocks have 5 fields: - tree fields: - - isnode flag - - left child - - right child - list fields: - - next - - prev -*/ -typedef struct large_free_block { - int isnode; - struct large_free_block *left; - struct large_free_block *right; - struct large_free_block *prev; - struct large_free_block *next; -} large_free_block; - -Caml_inline mlsize_t bf_large_wosize (struct large_free_block *n) { - return Wosize_val((value)(n)); -} - -static struct large_free_block *bf_large_tree; -static struct large_free_block *bf_large_least; -/* [bf_large_least] is either NULL or a pointer to the smallest (leftmost) - block in the tree. In this latter case, the block must be alone in its - doubly-linked list (i.e. have [isnode] true and [prev] and [next] - both pointing back to this block) -*/ - -/* Auxiliary functions for bitmap */ - -/* Find first (i.e. least significant) bit set in a word. */ -#ifdef HAS_FFS -#include -#elif defined(HAS_BITSCANFORWARD) -#include -Caml_inline int ffs (int x) -{ - unsigned long index; - unsigned char result; - result = _BitScanForward (&index, (unsigned long) x); - return result ? 
(int) index + 1 : 0; -} -#else -Caml_inline int ffs (int x) -{ - /* adapted from Hacker's Delight */ - int bnz, b0, b1, b2, b3, b4; - CAMLassert ((x & 0xFFFFFFFF) == x); - x = x & -x; - bnz = x != 0; - b4 = !!(x & 0xFFFF0000) << 4; - b3 = !!(x & 0xFF00FF00) << 3; - b2 = !!(x & 0xF0F0F0F0) << 2; - b1 = !!(x & 0xCCCCCCCC) << 1; - b0 = !!(x & 0xAAAAAAAA); - return bnz + b0 + b1 + b2 + b3 + b4; -} -#endif /* HAS_FFS or HAS_BITSCANFORWARD */ - -/* Indexing starts at 1 because that's the minimum block size. */ -Caml_inline void set_map (int index) -{ - bf_small_map |= (1 << (index - 1)); -} -Caml_inline void unset_map (int index) -{ - bf_small_map &= ~(1 << (index - 1)); -} - - -/* debug functions for checking the data structures */ - -#if defined (DEBUG) || FREELIST_DEBUG - -static mlsize_t bf_check_cur_size = 0; -static asize_t bf_check_subtree (large_free_block *p) -{ - mlsize_t wosz; - large_free_block *cur, *next; - asize_t total_size = 0; - - if (p == NULL) return 0; - - wosz = bf_large_wosize(p); - CAMLassert (p->isnode == 1); - total_size += bf_check_subtree (p->left); - CAMLassert (wosz > BF_NUM_SMALL); - CAMLassert (wosz > bf_check_cur_size); - bf_check_cur_size = wosz; - cur = p; - while (1){ - CAMLassert (bf_large_wosize (cur) == wosz); - CAMLassert (Color_val ((value) cur) == Caml_blue); - CAMLassert ((cur == p && cur->isnode == 1) || cur->isnode == 0); - total_size += Whsize_wosize (wosz); - next = cur->next; - CAMLassert (next->prev == cur); - if (next == p) break; - cur = next; - } - total_size += bf_check_subtree (p->right); - return total_size; -} - -static void bf_check (void) -{ - mlsize_t i; - asize_t total_size = 0; - int map = 0; - - /* check free lists */ - CAMLassert (BF_NUM_SMALL <= 8 * sizeof (int)); - for (i = 1; i <= BF_NUM_SMALL; i++){ - value b; - int col = 0; - int merge_found = 0; - - if (bf_small_fl[i].merge == &bf_small_fl[i].free){ - merge_found = 1; - }else{ - CAMLassert (caml_gc_phase != Phase_sweep - || caml_fl_merge == Val_NULL - 
|| bf_small_fl[i].merge < &Next_small(caml_fl_merge)); - } - CAMLassert (*bf_small_fl[i].merge == Val_NULL - || Color_val (*bf_small_fl[i].merge) == Caml_blue); - if (bf_small_fl[i].free != Val_NULL) map |= 1 << (i-1); - for (b = bf_small_fl[i].free; b != Val_NULL; b = Next_small (b)){ - if (bf_small_fl[i].merge == &Next_small (b)) merge_found = 1; - CAMLassert (Wosize_val (b) == i); - total_size += Whsize_wosize (i); - if (Color_val (b) == Caml_blue){ - col = 1; - CAMLassert (Next_small (b) == Val_NULL - || Bp_val (Next_small (b)) > Bp_val (b)); - }else{ - CAMLassert (col == 0); - CAMLassert (Color_val (b) == Caml_white); - } - } - if (caml_gc_phase == Phase_sweep) CAMLassert (merge_found); - } - CAMLassert (map == bf_small_map); - /* check [caml_fl_merge] */ - CAMLassert (caml_gc_phase != Phase_sweep - || caml_fl_merge == Val_NULL - || Hp_val (caml_fl_merge) < (header_t *) caml_gc_sweep_hp); - /* check the tree */ - bf_check_cur_size = 0; - total_size += bf_check_subtree (bf_large_tree); - /* check the total free set size */ - CAMLassert (total_size == caml_fl_cur_wsz); - /* check the smallest-block pointer */ - if (bf_large_least != NULL){ - large_free_block *x = bf_large_tree; - while (x->left != NULL) x = x->left; - CAMLassert (x == bf_large_least); - CAMLassert (x->isnode == 1); - CAMLassert (x->prev == x); - CAMLassert (x->next == x); - } -} - -#endif /* DEBUG || FREELIST_DEBUG */ - -#if FREELIST_DEBUG -#define FREELIST_DEBUG_bf_check() bf_check () -#else -#define FREELIST_DEBUG_bf_check() -#endif - -/**************************************************************************/ -/* splay trees */ - -/* Our tree is composed of nodes. Each node is the head of a doubly-linked - circular list of blocks, all of the same size. -*/ - -/* Search for the node of the given size. Return a pointer to the pointer - to the node, or a pointer to the NULL where the node should have been - (it can be inserted here). 
-*/ -static large_free_block **bf_search (mlsize_t wosz) -{ - large_free_block **p = &bf_large_tree; - large_free_block *cur; - mlsize_t cursz; - - while (1){ - cur = *p; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (cur == NULL) break; - cursz = bf_large_wosize (cur); - if (cursz == wosz){ - break; - }else if (cursz > wosz){ - p = &(cur->left); - }else{ - CAMLassert (cursz < wosz); - p = &(cur->right); - } - } - return p; -} - -/* Search for the least node that is large enough to accommodate the given - size. Return in [next_lower] an upper bound on either the size of the - next-lower node in the tree, or BF_NUM_SMALL if there is no such node. -*/ -static large_free_block **bf_search_best (mlsize_t wosz, mlsize_t *next_lower) -{ - large_free_block **p = &bf_large_tree; - large_free_block **best = NULL; - mlsize_t lowsz = BF_NUM_SMALL; - large_free_block *cur; - mlsize_t cursz; - - while (1){ - cur = *p; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (cur == NULL){ - *next_lower = lowsz; - break; - } - cursz = bf_large_wosize (cur); - if (cursz == wosz){ - best = p; - *next_lower = wosz; - break; - }else if (cursz > wosz){ - best = p; - p = &(cur->left); - }else{ - CAMLassert (cursz < wosz); - lowsz = cursz; - p = &(cur->right); - } - } - return best; -} - -/* Splay the tree at the given size. If a node of this size exists, it will - become the root. If not, the last visited node will be the root. This is - either the least node larger or the greatest node smaller than the given - size. - We use simple top-down splaying as described in S&T 85. 
-*/ -static void bf_splay (mlsize_t wosz) -{ - large_free_block *x, *y; - mlsize_t xsz; - large_free_block *left_top = NULL; - large_free_block *right_top = NULL; - large_free_block **left_bottom = &left_top; - large_free_block **right_bottom = &right_top; - - x = bf_large_tree; - if (x == NULL) return; - while (1){ - xsz = bf_large_wosize (x); - if (xsz == wosz) break; - if (xsz > wosz){ - /* zig */ - y = x->left; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (y == NULL) break; - if (bf_large_wosize (y) > wosz){ - /* zig-zig: rotate right */ - x->left = y->right; - y->right = x; - x = y; - y = x->left; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - if (y == NULL) break; - } - /* link right */ - *right_bottom = x; - right_bottom = &(x->left); - x = y; - }else{ - CAMLassert (xsz < wosz); - /* zag */ - y = x->right; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (y == NULL) break; - if (bf_large_wosize (y) < wosz){ - /* zag-zag : rotate left */ - x->right = y->left; - y->left = x; - x = y; - y = x->right; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - if (y == NULL) break; - } - /* link left */ - *left_bottom = x; - left_bottom = &(x->right); - x = y; - } - } - /* reassemble the tree */ - *left_bottom = x->left; - *right_bottom = x->right; - x->left = left_top; - x->right = right_top; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - bf_large_tree = x; -} - -/* Splay the subtree at [p] on its leftmost (least) node. After this - operation, the root node of the subtree is the least node and it - has no left child. - The subtree must not be empty. -*/ -static void bf_splay_least (large_free_block **p) -{ - large_free_block *x, *y; - large_free_block *right_top = NULL; - large_free_block **right_bottom = &right_top; - - x = *p; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - CAMLassert (x != NULL); - while (1){ - /* We are always in the zig case. */ - y = x->left; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (y == NULL) break; - /* And in the zig-zig case. 
rotate right */ - x->left = y->right; - y->right = x; - x = y; - y = x->left; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - if (y == NULL) break; - /* link right */ - *right_bottom = x; - right_bottom = &(x->left); - x = y; - } - /* reassemble the tree */ - CAMLassert (x->left == NULL); - *right_bottom = x->right; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - x->right = right_top; - *p = x; -} - -/* Remove the node at [p], if any. */ -static void bf_remove_node (large_free_block **p) -{ - large_free_block *x; - large_free_block *l, *r; - - x = *p; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - if (x == NULL) return; - if (x == bf_large_least) bf_large_least = NULL; - l = x->left; - r = x->right; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - if (l == NULL){ - *p = r; - }else if (r == NULL){ - *p = l; - }else{ - bf_splay_least (&r); - r->left = l; - *p = r; - } -} - -/* Insert a block into the tree, either as a new node or as a block in an - existing list. - Splay if the list is already present. -*/ -static void bf_insert_block (large_free_block *n) -{ - mlsize_t sz = bf_large_wosize (n); - large_free_block **p = bf_search (sz); - large_free_block *x = *p; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (1)); - - if (bf_large_least != NULL){ - mlsize_t least_sz = bf_large_wosize (bf_large_least); - if (sz < least_sz){ - CAMLassert (x == NULL); - bf_large_least = n; - }else if (sz == least_sz){ - CAMLassert (x == bf_large_least); - bf_large_least = NULL; - } - } - - CAMLassert (Color_val ((value) n) == Caml_blue); - CAMLassert (Wosize_val ((value) n) > BF_NUM_SMALL); - if (x == NULL){ - /* add new node */ - n->isnode = 1; - n->left = n->right = NULL; - n->prev = n->next = n; - *p = n; - }else{ - /* insert at tail of doubly-linked list */ - CAMLassert (x->isnode == 1); - n->isnode = 0; -#ifdef DEBUG - n->left = n->right = (large_free_block *) Debug_free_unused; -#endif - n->prev = x->prev; - n->next = x; - x->prev->next = n; - x->prev = n; - CAML_EVENTLOG_DO(EV_ALLOC_JUMP (2)); - bf_splay (sz); - } -} - 
-#if defined (DEBUG) || FREELIST_DEBUG -static int bf_is_in_tree (large_free_block *b) -{ - int wosz = bf_large_wosize (b); - large_free_block **p = bf_search (wosz); - large_free_block *n = *p; - large_free_block *cur = n; - - if (n == NULL) return 0; - while (1){ - if (cur == b) return 1; - cur = cur->next; - if (cur == n) return 0; - } -} -#endif /* DEBUG || FREELIST_DEBUG */ - -/**************************************************************************/ - -/* Add back a remnant into a small free list. The block must be small - and white (or a 0-size fragment). - The block may be left out of the list depending on the sweeper's state. - The free list size is updated accordingly. - - The block will be left out of the list if the GC is in its Sweep phase - and the block is in the still-to-be-swept region because every block of - the free list encountered by the sweeper must be blue and linked in - its proper place in the increasing-addresses order of the list. This is - to ensure that coalescing is always done when two or more free blocks - are adjacent. -*/ -static void bf_insert_remnant_small (value v) -{ - mlsize_t wosz = Wosize_val (v); - - CAMLassert (Color_val (v) == Caml_white); - CAMLassert (wosz <= BF_NUM_SMALL); - if (wosz != 0 - && (caml_gc_phase != Phase_sweep - || (char *) Hp_val (v) < (char *) caml_gc_sweep_hp)){ - caml_fl_cur_wsz += Whsize_wosize (wosz); - Next_small (v) = bf_small_fl[wosz].free; - bf_small_fl[wosz].free = v; - if (bf_small_fl[wosz].merge == &bf_small_fl[wosz].free){ - bf_small_fl[wosz].merge = &Next_small (v); - } - set_map (wosz); - } -} - -/* Add back a remnant into the free set. The block must have the - appropriate color: - - White if it is a fragment or a small block (wosize <= BF_NUM_SMALL) - - Blue if it is a large block (BF_NUM_SMALL < wosize) - The block may be left out or the set, depending on its size and the - sweeper's state. - The free list size is updated accordingly. 
-*/ -static void bf_insert_remnant (value v) -{ - mlsize_t wosz = Wosize_val (v); - - if (wosz <= BF_NUM_SMALL){ - CAMLassert (Color_val (v) == Caml_white); - bf_insert_remnant_small (v); - }else{ - CAMLassert (Color_val (v) == Caml_blue); - bf_insert_block ((large_free_block *) v); - caml_fl_cur_wsz += Whsize_wosize (wosz); - } -} -/* Insert the block into the free set during sweep. The block must be blue. */ -static void bf_insert_sweep (value v) -{ - mlsize_t wosz = Wosize_val (v); - value next; - - CAMLassert (Color_val (v) == Caml_blue); - if (wosz <= BF_NUM_SMALL){ - while (1){ - next = *bf_small_fl[wosz].merge; - if (next == Val_NULL){ - set_map (wosz); - break; - } - if (Bp_val (next) >= Bp_val (v)) break; - bf_small_fl[wosz].merge = &Next_small (next); - } - Next_small (v) = *bf_small_fl[wosz].merge; - *bf_small_fl[wosz].merge = v; - bf_small_fl[wosz].merge = &Next_small (v); - }else{ - bf_insert_block ((large_free_block *) v); - } -} - -/* Remove a given block from the free set. 
*/ -static void bf_remove (value v) -{ - mlsize_t wosz = Wosize_val (v); - - CAMLassert (Color_val (v) == Caml_blue); - if (wosz <= BF_NUM_SMALL){ - while (*bf_small_fl[wosz].merge != v){ - CAMLassert (Bp_val (*bf_small_fl[wosz].merge) < Bp_val (v)); - bf_small_fl[wosz].merge = &Next_small (*bf_small_fl[wosz].merge); - } - *bf_small_fl[wosz].merge = Next_small (v); - if (bf_small_fl[wosz].free == Val_NULL) unset_map (wosz); - }else{ - large_free_block *b = (large_free_block *) v; - CAMLassert (bf_is_in_tree (b)); - CAMLassert (b->prev->next == b); - CAMLassert (b->next->prev == b); - if (b->isnode){ - large_free_block **p = bf_search (bf_large_wosize (b)); - CAMLassert (*p != NULL); - if (b->next == b){ - bf_remove_node (p); - }else{ - large_free_block *n = b->next; - n->prev = b->prev; - b->prev->next = n; - *p = n; - n->isnode = 1; - n->left = b->left; - n->right = b->right; -#ifdef DEBUG - Field ((value) b, 0) = Debug_free_major; - b->left = b->right = b->next = b->prev = - (large_free_block *) Debug_free_major; -#endif - } - }else{ - b->prev->next = b->next; - b->next->prev = b->prev; - } - } -} - -/* Split the given block, return a new block of the given size. - The remnant is still at the same address, its size is changed - and its color becomes white. - The size of the free set is decremented by the whole block size - and the caller must readjust it if the remnant is reinserted or - remains in the free set. - The size of [v] must be strictly greater than [wosz]. -*/ -static header_t *bf_split_small (mlsize_t wosz, value v) -{ - intnat blocksz = Whsize_val (v); - intnat remwhsz = blocksz - Whsize_wosize (wosz); - - CAMLassert (Wosize_val (v) > wosz); - caml_fl_cur_wsz -= blocksz; - Hd_val (v) = Make_header (Wosize_whsize (remwhsz), Abstract_tag, Caml_white); - return (header_t *) &Field (v, Wosize_whsize (remwhsz)); -} - -/* Split the given block, return a new block of the given size. - The original block is at the same address but its size is changed. 
- Its color and tag are changed as appropriate for calling the - insert_remnant* functions. - The size of the free set is decremented by the whole block size - and the caller must readjust it if the remnant is reinserted or - remains in the free set. - The size of [v] must be strictly greater than [wosz]. -*/ -static header_t *bf_split (mlsize_t wosz, value v) -{ - header_t hd = Hd_val (v); - mlsize_t remwhsz = Whsize_hd (hd) - Whsize_wosize (wosz); - - CAMLassert (Wosize_val (v) > wosz); - CAMLassert (remwhsz > 0); - caml_fl_cur_wsz -= Whsize_hd (hd); - if (remwhsz <= Whsize_wosize (BF_NUM_SMALL)){ - /* Same as bf_split_small. */ - Hd_val (v) = Make_header (Wosize_whsize(remwhsz), Abstract_tag, Caml_white); - }else{ - Hd_val (v) = Make_header (Wosize_whsize (remwhsz), 0, Caml_blue); - } - return (header_t *) &Field (v, Wosize_whsize (remwhsz)); -} - -/* Allocate from a large block at [p]. If the node is single and the remaining - size is greater than [bound], it stays at the same place in the tree. - If [set_least] is true, [wosz] is guaranteed to be [<= BF_NUM_SMALL], so - the block has the smallest size in the tree. - In this case, the large block becomes (or remains) the single smallest - in the tree and we set the [bf_large_least] pointer. -*/ -static header_t *bf_alloc_from_large (mlsize_t wosz, large_free_block **p, - mlsize_t bound, int set_least) -{ - large_free_block *n = *p; - large_free_block *b; - header_t *result; - mlsize_t wosize_n = bf_large_wosize (n); - - CAMLassert (bf_large_wosize (n) >= wosz); - if (n->next == n){ - if (wosize_n > bound + Whsize_wosize (wosz)){ - /* TODO splay at [n]? if the remnant is larger than [wosz]? 
*/ - if (set_least){ - CAMLassert (bound == BF_NUM_SMALL); - bf_large_least = n; - } - result = bf_split (wosz, (value) n); - caml_fl_cur_wsz += Whsize_wosize (wosize_n) - Whsize_wosize (wosz); - /* remnant stays in tree */ - return result; - }else{ - bf_remove_node (p); - if (wosize_n == wosz){ - caml_fl_cur_wsz -= Whsize_wosize (wosz); - return Hp_val ((value) n); - }else{ - result = bf_split (wosz, (value) n); - bf_insert_remnant ((value) n); - return result; - } - } - }else{ - b = n->next; - CAMLassert (bf_large_wosize (b) == bf_large_wosize (n)); - n->next = b->next; - b->next->prev = n; - if (wosize_n == wosz){ - caml_fl_cur_wsz -= Whsize_wosize (wosz); - return Hp_val ((value) b); - }else{ - result = bf_split (wosz, (value) b); - bf_insert_remnant ((value) b); - /* TODO: splay at [n] if the remnant is smaller than [wosz] */ - if (set_least){ - CAMLassert (bound == BF_NUM_SMALL); - if (bf_large_wosize (b) > BF_NUM_SMALL){ - bf_large_least = b; - } - } - return result; - } - } -} - -static header_t *bf_allocate_from_tree (mlsize_t wosz, int set_least) -{ - large_free_block **n; - mlsize_t bound; - - n = bf_search_best (wosz, &bound); - if (n == NULL) return NULL; - return bf_alloc_from_large (wosz, n, bound, set_least); -} - -static header_t *bf_allocate (mlsize_t wosz) -{ - value block; - header_t *result; - - CAMLassert (sizeof (char *) == sizeof (value)); - CAMLassert (wosz >= 1); - - if (wosz <= BF_NUM_SMALL){ - if (bf_small_fl[wosz].free != Val_NULL){ - /* fast path: allocate from the corresponding free list */ - block = bf_small_fl[wosz].free; - if (bf_small_fl[wosz].merge == &Next_small (block)){ - bf_small_fl[wosz].merge = &bf_small_fl[wosz].free; - } - bf_small_fl[wosz].free = Next_small (block); - if (bf_small_fl[wosz].free == Val_NULL) unset_map (wosz); - caml_fl_cur_wsz -= Whsize_wosize (wosz); - FREELIST_DEBUG_bf_check (); - return Hp_val (block); - }else{ - /* allocate from the next available size */ - mlsize_t s = ffs (bf_small_map & ((~0U) << 
wosz)); - FREELIST_DEBUG_bf_check (); - if (s != 0){ - block = bf_small_fl[s].free; - CAMLassert (block != Val_NULL); - if (bf_small_fl[s].merge == &Next_small (block)){ - bf_small_fl[s].merge = &bf_small_fl[s].free; - } - bf_small_fl[s].free = Next_small (block); - if (bf_small_fl[s].free == Val_NULL) unset_map (s); - result = bf_split_small (wosz, block); - bf_insert_remnant_small (block); - FREELIST_DEBUG_bf_check (); - return result; - } - } - /* Failed to find a suitable small block: try [bf_large_least]. */ - if (bf_large_least != NULL){ - mlsize_t least_wosz = bf_large_wosize (bf_large_least); - if (least_wosz > BF_NUM_SMALL + Whsize_wosize (wosz)){ - result = bf_split (wosz, (value) bf_large_least); - caml_fl_cur_wsz += Whsize_wosize (least_wosz) - Whsize_wosize (wosz); - /* remnant stays in tree */ - CAMLassert (Color_val ((value) bf_large_least) == Caml_blue); - return result; - } - } - - /* Allocate from the tree and update [bf_large_least]. */ - result = bf_allocate_from_tree (wosz, 1); - FREELIST_DEBUG_bf_check (); - return result; - }else{ - result = bf_allocate_from_tree (wosz, 0); - FREELIST_DEBUG_bf_check (); - return result; - } -} - -static void bf_init_merge (void) -{ - mlsize_t i; - - CAML_EV_ALLOC_FLUSH(); - - caml_fl_merge = Val_NULL; - - for (i = 1; i <= BF_NUM_SMALL; i++){ - /* At the beginning of each small free list is a segment of remnants - that were pushed back to the list after splitting. These are white - and they are not in order. We need to remove them - from the list for coalescing to work. They - will be picked up by the sweeping code and inserted in the right - place in the list. 
- */ - value p = bf_small_fl[i].free; - while (1){ - if (p == Val_NULL){ - unset_map (i); - break; - } - if (Color_val (p) == Caml_blue) break; - CAMLassert (Color_val (p) == Caml_white); - caml_fl_cur_wsz -= Whsize_val (p); - p = Next_small (p); - } - bf_small_fl[i].free = p; - /* Set the merge pointer to its initial value */ - bf_small_fl[i].merge = &bf_small_fl[i].free; - } -} - -static void bf_init (void) -{ - mlsize_t i; - - for (i = 1; i <= BF_NUM_SMALL; i++){ - bf_small_fl[i].free = Val_NULL; - bf_small_fl[i].merge = &bf_small_fl[i].free; - } - bf_small_map = 0; - bf_large_tree = NULL; - bf_large_least = NULL; - caml_fl_cur_wsz = 0; -} - -/* Make sure all free blocks are blue and tear down the BF data structures. */ -static void bf_reset (void) -{ - mlsize_t i; - - for (i = 1; i <= BF_NUM_SMALL; i++){ - /* At the beginning of each small free list is a segment of remnants - that were pushed back to the list after splitting. These are white - and they are not in order. We must make them blue before we can - compact or change the allocator policy. - */ - value p = bf_small_fl[i].free; - while (1){ - if (p == Val_NULL || Color_val (p) == Caml_blue) break; - CAMLassert (Color_val (p) == Caml_white); - Hd_val (p) = Bluehd_hd (Hd_val (p)); - p = Next_small (p); - } - } - /* We have no malloced data structures, so we can just call [bf_init] to - clear all our pointers. */ - bf_init (); -} - -static header_t *bf_merge_block (value bp, char *limit) -{ - value start; - value cur; - mlsize_t wosz; - - CAMLassert (Color_val (bp) == Caml_white); - /* Find the starting point of the current run of free blocks. 
*/ - if (caml_fl_merge != Val_NULL && Next_in_mem (caml_fl_merge) == bp - && Color_val (caml_fl_merge) == Caml_blue){ - start = caml_fl_merge; - bf_remove (start); - }else{ - start = bp; - } - cur = bp; - while (1){ - /* This slightly convoluted loop is just going over the run of - white or blue blocks, doing the right thing for each color, and - stopping on a gray or black block or when limit is passed. - It is convoluted because we start knowing that the first block - is white. */ - white: - if (Tag_val (cur) == Custom_tag){ - void (*final_fun)(value) = Custom_ops_val(cur)->finalize; - if (final_fun != NULL) final_fun(cur); - } - caml_fl_cur_wsz += Whsize_val (cur); - next: - caml_prefetch(Hp_val(cur + 4096)); - cur = Next_in_mem (cur); - if (Hp_val (cur) >= (header_t *) limit){ - CAMLassert (Hp_val (cur) == (header_t *) limit); - goto end_of_run; - } - switch (Color_val (cur)){ - case Caml_white: goto white; - case Caml_blue: bf_remove (cur); goto next; - case Caml_black: - goto end_of_run; - } - } - end_of_run: - wosz = Wosize_whsize ((value *) cur - (value *) start); -#ifdef DEBUG - { - value *p; - for (p = (value *) start; p < (value *) Hp_val (cur); p++){ - *p = Debug_free_major; - } - } -#endif - while (wosz > Max_wosize){ - Hd_val (start) = Make_header (Max_wosize, 0, Caml_blue); - bf_insert_sweep (start); - start = Next_in_mem (start); - wosz -= Whsize_wosize (Max_wosize); - } - if (wosz > 0){ - Hd_val (start) = Make_header (wosz, 0, Caml_blue); - bf_insert_sweep (start); - }else{ - Hd_val (start) = Make_header (0, 0, Caml_white); - caml_fl_cur_wsz -= Whsize_wosize (0); - } - FREELIST_DEBUG_bf_check (); - return Hp_val (cur); -} - -static void bf_add_blocks (value bp) -{ - while (bp != Val_NULL){ - value next = Next_small (bp); - mlsize_t wosz = Wosize_val (bp); - - if (wosz > BF_NUM_SMALL){ - caml_fl_cur_wsz += Whsize_wosize (wosz); - bf_insert_block ((large_free_block *) bp); - }else{ - Hd_val (bp) = Make_header (wosz, Abstract_tag, Caml_white); - 
bf_insert_remnant_small (bp); - } - bp = next; - } -} - -static void bf_make_free_blocks (value *p, mlsize_t size, int do_merge, - int color) -{ - mlsize_t sz, wosz; - - while (size > 0){ - if (size > Whsize_wosize (Max_wosize)){ - sz = Whsize_wosize (Max_wosize); - }else{ - sz = size; - } - wosz = Wosize_whsize (sz); - if (do_merge){ - if (wosz <= BF_NUM_SMALL){ - color = Caml_white; - }else{ - color = Caml_blue; - } - *(header_t *)p = Make_header (wosz, 0, color); - bf_insert_remnant (Val_hp (p)); - }else{ - *(header_t *)p = Make_header (wosz, 0, color); - } - size -= sz; - p += sz; - } -} - -/********************* exported functions *****************************/ - -/* [caml_fl_allocate] does not set the header of the newly allocated block. - The calling function must do it before any GC function gets called. - [caml_fl_allocate] returns a head pointer, or NULL if no suitable block - is found in the free set. -*/ -header_t *(*caml_fl_p_allocate) (mlsize_t wo_sz) = NULL; - -/* Initialize the merge_block machinery (at start of sweeping). */ -void (*caml_fl_p_init_merge) (void) = NULL; - -/* These are called internally. */ -static void (*caml_fl_p_init) (void) = NULL; -static void (*caml_fl_p_reset) (void) = NULL; - -/* [caml_fl_merge_block] returns the head pointer of the next block after [bp], - because merging blocks may change the size of [bp]. */ -header_t *(*caml_fl_p_merge_block) (value bp, char *limit) = NULL; - -/* [bp] must point to a list of blocks of wosize >= 1 chained by their field 0, - terminated by Val_NULL, and field 1 of the first block must point to - the last block. - The blocks must be blue. -*/ -void (*caml_fl_p_add_blocks) (value bp) = NULL; - -/* Cut a block of memory into pieces of size [Max_wosize], give them headers, - and optionally merge them into the free list. 
- arguments: - p: pointer to the first word of the block - size: size of the block (in words) - do_merge: 1 -> do merge; 0 -> do not merge - color: which color to give to the pieces; if [do_merge] is 1, this - is overridden by the merge code, but we have historically used - [Caml_white]. -*/ -void (*caml_fl_p_make_free_blocks) - (value *p, mlsize_t size, int do_merge, int color) - = NULL; - -#ifdef DEBUG -void (*caml_fl_p_check) (void) = NULL; -#endif - -/* This variable and the above function pointers must be initialized with - a call to [caml_set_allocation_policy]. */ -uintnat caml_allocation_policy = 999; - -void caml_set_allocation_policy (uintnat p) -{ - switch (p){ - case caml_policy_next_fit: - caml_allocation_policy = p; - caml_fl_p_allocate = &nf_allocate; - caml_fl_p_init_merge = &nf_init_merge; - caml_fl_p_reset = &nf_reset; - caml_fl_p_init = &nf_init; - caml_fl_p_merge_block = &nf_merge_block; - caml_fl_p_add_blocks = &nf_add_blocks; - caml_fl_p_make_free_blocks = &nf_make_free_blocks; -#ifdef DEBUG - caml_fl_p_check = &nf_check; -#endif - break; - - case caml_policy_first_fit: - caml_allocation_policy = p; - caml_fl_p_allocate = &ff_allocate; - caml_fl_p_init_merge = &ff_init_merge; - caml_fl_p_reset = &ff_reset; - caml_fl_p_init = &ff_init; - caml_fl_p_merge_block = &ff_merge_block; - caml_fl_p_add_blocks = &ff_add_blocks; - caml_fl_p_make_free_blocks = &ff_make_free_blocks; -#ifdef DEBUG - caml_fl_p_check = &ff_check; -#endif - break; - - default: - case caml_policy_best_fit: - caml_allocation_policy = caml_policy_best_fit; - caml_fl_p_allocate = &bf_allocate; - caml_fl_p_init_merge = &bf_init_merge; - caml_fl_p_reset = &bf_reset; - caml_fl_p_init = &bf_init; - caml_fl_p_merge_block = &bf_merge_block; - caml_fl_p_add_blocks = &bf_add_blocks; - caml_fl_p_make_free_blocks = &bf_make_free_blocks; -#ifdef DEBUG - caml_fl_p_check = &bf_check; -#endif - break; - } -} - -/* This is called by caml_compact_heap. 
*/ -void caml_fl_reset_and_switch_policy (intnat new_allocation_policy) -{ - /* reset the fl data structures */ - (*caml_fl_p_reset) (); - if (new_allocation_policy != -1){ - caml_set_allocation_policy (new_allocation_policy); - (*caml_fl_p_init) (); /* initialize the new allocation policy */ - } -} diff --git a/runtime/gc_ctrl.c b/runtime/gc_ctrl.c index 250a6a27f77d..b8c38419785f 100644 --- a/runtime/gc_ctrl.c +++ b/runtime/gc_ctrl.c @@ -16,31 +16,30 @@ #define CAML_INTERNALS #include "caml/alloc.h" -#include "caml/backtrace.h" -#include "caml/compact.h" #include "caml/custom.h" -#include "caml/fail.h" #include "caml/finalise.h" -#include "caml/freelist.h" #include "caml/gc.h" #include "caml/gc_ctrl.h" #include "caml/major_gc.h" -#include "caml/memory.h" #include "caml/minor_gc.h" +#include "caml/shared_heap.h" #include "caml/misc.h" #include "caml/mlvalues.h" -#include "caml/signals.h" -#include "caml/eventlog.h" #ifdef NATIVE_CODE #include "caml/stack.h" -#else -#include "caml/stacks.h" +#include "caml/frame_descriptors.h" #endif -#include "caml/startup_aux.h" +#include "caml/domain.h" +#include "caml/fiber.h" +#include "caml/globroots.h" +#include "caml/signals.h" +#include "caml/startup.h" +#include "caml/domain.h" +#include "caml/eventlog.h" +#include "caml/fail.h" -#ifndef NATIVE_CODE -extern uintnat caml_max_stack_size; /* defined in stacks.c */ -#endif +uintnat caml_max_stack_size; +uintnat caml_fiber_wsz; extern uintnat caml_major_heap_increment; /* percent or words; see major_gc.c */ extern uintnat caml_percent_free; /* see major_gc.c */ @@ -50,279 +49,49 @@ extern uintnat caml_custom_major_ratio; /* see custom.c */ extern uintnat caml_custom_minor_ratio; /* see custom.c */ extern uintnat caml_custom_minor_max_bsz; /* see custom.c */ -#define Next(hp) ((header_t *)(hp) + Whsize_hp (hp)) - -#ifdef DEBUG - -/* Check that [v]'s header looks good. [v] must be a block in the heap. 
*/ -static void check_head (value v) -{ - CAMLassert (Is_block (v)); - CAMLassert (Is_in_heap (v)); - - CAMLassert (Wosize_val (v) != 0); - CAMLassert (Color_hd (Hd_val (v)) != Caml_blue); - CAMLassert (Is_in_heap (v)); - if (Tag_val (v) == Infix_tag){ - int offset = Wsize_bsize (Infix_offset_val (v)); - value trueval = Val_op (&Field (v, -offset)); - CAMLassert (Tag_val (trueval) == Closure_tag); - CAMLassert (Wosize_val (trueval) > offset); - CAMLassert (Is_in_heap (&Field (trueval, Wosize_val (trueval) - 1))); - }else{ - CAMLassert (Is_in_heap (&Field (v, Wosize_val (v) - 1))); - } - if (Tag_val (v) == Double_tag){ - CAMLassert (Wosize_val (v) == Double_wosize); - }else if (Tag_val (v) == Double_array_tag){ - CAMLassert (Wosize_val (v) % Double_wosize == 0); - } -} - -static void check_block (header_t *hp) -{ - mlsize_t i; - value v = Val_hp (hp); - value f; - - check_head (v); - switch (Tag_hp (hp)){ - case Abstract_tag: break; - case String_tag: - break; - case Double_tag: - CAMLassert (Wosize_val (v) == Double_wosize); - break; - case Double_array_tag: - CAMLassert (Wosize_val (v) % Double_wosize == 0); - break; - case Custom_tag: - CAMLassert (!Is_in_heap (Custom_ops_val (v))); - break; - - case Infix_tag: - CAMLassert (0); - break; - - default: - CAMLassert (Tag_hp (hp) < No_scan_tag); - for (i = 0; i < Wosize_hp (hp); i++){ - f = Field (v, i); - if (Is_block (f) && Is_in_heap (f)){ - check_head (f); - CAMLassert (Color_val (f) != Caml_blue); - } - } - } -} - -#endif /* DEBUG */ - -/* Check the heap structure (if compiled in debug mode) and - gather statistics; return the stats if [returnstats] is true, - otherwise return [Val_unit]. 
-*/ -static value heap_stats (int returnstats) -{ - CAMLparam0 (); - intnat live_words = 0, live_blocks = 0, - free_words = 0, free_blocks = 0, largest_free = 0, - fragments = 0, heap_chunks = 0; - char *chunk = caml_heap_start, *chunk_end; - header_t *cur_hp; -#ifdef DEBUG - header_t *prev_hp; -#endif - header_t cur_hd; - -#ifdef DEBUG - caml_gc_message (-1, "### OCaml runtime: heap check ###\n"); -#endif - - while (chunk != NULL){ - ++ heap_chunks; - chunk_end = chunk + Chunk_size (chunk); -#ifdef DEBUG - prev_hp = NULL; -#endif - cur_hp = (header_t *) chunk; - while (cur_hp < (header_t *) chunk_end){ - cur_hd = Hd_hp (cur_hp); - CAMLassert (Next (cur_hp) <= (header_t *) chunk_end); - switch (Color_hd (cur_hd)){ - case Caml_white: - if (Wosize_hd (cur_hd) == 0){ - ++ fragments; - CAMLassert (prev_hp == NULL - || Color_hp (prev_hp) != Caml_blue - || cur_hp == (header_t *) caml_gc_sweep_hp - || Wosize_hp (prev_hp) == Max_wosize); - }else{ - if (caml_gc_phase == Phase_sweep - && cur_hp >= (header_t *) caml_gc_sweep_hp){ - ++ free_blocks; - free_words += Whsize_hd (cur_hd); - if (Whsize_hd (cur_hd) > largest_free){ - largest_free = Whsize_hd (cur_hd); - } - }else{ - ++ live_blocks; - live_words += Whsize_hd (cur_hd); -#ifdef DEBUG - check_block (cur_hp); -#endif - } - } - break; - case Caml_black: - CAMLassert (Wosize_hd (cur_hd) > 0); - ++ live_blocks; - live_words += Whsize_hd (cur_hd); -#ifdef DEBUG - check_block (cur_hp); -#endif - break; - case Caml_blue: - CAMLassert (Wosize_hd (cur_hd) > 0); - ++ free_blocks; - free_words += Whsize_hd (cur_hd); - if (Whsize_hd (cur_hd) > largest_free){ - largest_free = Whsize_hd (cur_hd); - } - /* not true any more with big heap chunks - CAMLassert (prev_hp == NULL - || (Color_hp (prev_hp) != Caml_blue - && Wosize_hp (prev_hp) > 0) - || cur_hp == caml_gc_sweep_hp); - CAMLassert (Next (cur_hp) == chunk_end - || (Color_hp (Next (cur_hp)) != Caml_blue - && Wosize_hp (Next (cur_hp)) > 0) - || (Whsize_hd (cur_hd) + Wosize_hp (Next 
(cur_hp)) - > Max_wosize) - || Next (cur_hp) == caml_gc_sweep_hp); - */ - break; - } -#ifdef DEBUG - prev_hp = cur_hp; -#endif - cur_hp = Next (cur_hp); - } - CAMLassert (cur_hp == (header_t *) chunk_end); - chunk = Chunk_next (chunk); - } - -#ifdef DEBUG - caml_final_invariant_check(); - caml_fl_check (); -#endif - - CAMLassert (heap_chunks == Caml_state->stat_heap_chunks); - CAMLassert (live_words + free_words + fragments == Caml_state->stat_heap_wsz); - - if (returnstats){ - CAMLlocal1 (res); - - /* get a copy of these before allocating anything... */ - double minwords = - Caml_state->stat_minor_words - + (double) (Caml_state->young_alloc_end - Caml_state->young_ptr); - double prowords = Caml_state->stat_promoted_words; - double majwords = - Caml_state->stat_major_words + (double) caml_allocated_words; - intnat mincoll = Caml_state->stat_minor_collections; - intnat majcoll = Caml_state->stat_major_collections; - intnat heap_words = Caml_state->stat_heap_wsz; - intnat cpct = Caml_state->stat_compactions; - intnat forcmajcoll = Caml_state->stat_forced_major_collections; - intnat top_heap_words = Caml_state->stat_top_heap_wsz; - - res = caml_alloc_tuple (17); - Store_field (res, 0, caml_copy_double (minwords)); - Store_field (res, 1, caml_copy_double (prowords)); - Store_field (res, 2, caml_copy_double (majwords)); - Store_field (res, 3, Val_long (mincoll)); - Store_field (res, 4, Val_long (majcoll)); - Store_field (res, 5, Val_long (heap_words)); - Store_field (res, 6, Val_long (heap_chunks)); - Store_field (res, 7, Val_long (live_words)); - Store_field (res, 8, Val_long (live_blocks)); - Store_field (res, 9, Val_long (free_words)); - Store_field (res, 10, Val_long (free_blocks)); - Store_field (res, 11, Val_long (largest_free)); - Store_field (res, 12, Val_long (fragments)); - Store_field (res, 13, Val_long (cpct)); - Store_field (res, 14, Val_long (top_heap_words)); - Store_field (res, 15, Val_long (caml_stack_usage())); - Store_field (res, 16, Val_long 
(forcmajcoll)); - CAMLreturn (res); - }else{ - CAMLreturn (Val_unit); - } -} - -#ifdef DEBUG -void caml_heap_check (void) -{ - heap_stats (0); -} -#endif - -CAMLprim value caml_gc_stat(value v) -{ - value result; - CAML_EV_BEGIN(EV_EXPLICIT_GC_STAT); - CAMLassert (v == Val_unit); - result = heap_stats (1); - CAML_EV_END(EV_EXPLICIT_GC_STAT); - return result; -} - CAMLprim value caml_gc_quick_stat(value v) { CAMLparam0 (); CAMLlocal1 (res); /* get a copy of these before allocating anything... */ - double minwords = - Caml_state->stat_minor_words - + (double) (Caml_state->young_alloc_end - Caml_state->young_ptr); - double prowords = Caml_state->stat_promoted_words; - double majwords = - Caml_state->stat_major_words + (double) caml_allocated_words; - intnat mincoll = Caml_state->stat_minor_collections; - intnat majcoll = Caml_state->stat_major_collections; - intnat heap_words = Caml_state->stat_heap_wsz; - intnat top_heap_words = Caml_state->stat_top_heap_wsz; - intnat cpct = Caml_state->stat_compactions; - intnat forcmajcoll = Caml_state->stat_forced_major_collections; - intnat heap_chunks = Caml_state->stat_heap_chunks; + intnat majcoll; + struct gc_stats s; + caml_sample_gc_stats(&s); + majcoll = Caml_state->stat_major_collections; res = caml_alloc_tuple (17); - Store_field (res, 0, caml_copy_double (minwords)); - Store_field (res, 1, caml_copy_double (prowords)); - Store_field (res, 2, caml_copy_double (majwords)); - Store_field (res, 3, Val_long (mincoll)); + Store_field (res, 0, caml_copy_double ((double)s.minor_words)); + Store_field (res, 1, caml_copy_double ((double)s.promoted_words)); + Store_field (res, 2, caml_copy_double ((double)s.major_words)); + Store_field (res, 3, Val_long (s.minor_collections)); Store_field (res, 4, Val_long (majcoll)); - Store_field (res, 5, Val_long (heap_words)); - Store_field (res, 6, Val_long (heap_chunks)); - Store_field (res, 7, Val_long (0)); - Store_field (res, 8, Val_long (0)); - Store_field (res, 9, Val_long (0)); + 
Store_field (res, 5, Val_long ( + s.major_heap.pool_words + s.major_heap.large_words)); + Store_field (res, 6, Val_long (0)); + Store_field (res, 7, Val_long ( + s.major_heap.pool_live_words + s.major_heap.large_words)); + Store_field (res, 8, Val_long ( + s.major_heap.pool_live_blocks + s.major_heap.large_blocks)); + Store_field (res, 9, Val_long ( + s.major_heap.pool_words - s.major_heap.pool_live_words + - s.major_heap.pool_frag_words)); Store_field (res, 10, Val_long (0)); Store_field (res, 11, Val_long (0)); - Store_field (res, 12, Val_long (0)); - Store_field (res, 13, Val_long (cpct)); - Store_field (res, 14, Val_long (top_heap_words)); - Store_field (res, 15, Val_long (caml_stack_usage())); - Store_field (res, 16, Val_long (forcmajcoll)); + Store_field (res, 12, Val_long (s.major_heap.pool_frag_words)); + Store_field (res, 13, Val_long (0)); + Store_field (res, 14, Val_long ( + s.major_heap.pool_max_words + s.major_heap.large_max_words)); + Store_field (res, 15, Val_long (0)); + Store_field (res, 16, Val_long (s.forced_major_collections)); CAMLreturn (res); } -double caml_gc_minor_words_unboxed() +double caml_gc_minor_words_unboxed (void) { return (Caml_state->stat_minor_words - + (double) (Caml_state->young_alloc_end - Caml_state->young_ptr)); + + ((double) ((uintnat)Caml_state->young_end - + (uintnat)Caml_state->young_ptr)) / sizeof(value)); } CAMLprim value caml_gc_minor_words(value v) @@ -337,46 +106,33 @@ CAMLprim value caml_gc_counters(value v) CAMLlocal1 (res); /* get a copy of these before allocating anything... 
*/ - double minwords = - Caml_state->stat_minor_words - + (double) (Caml_state->young_alloc_end - Caml_state->young_ptr); + double minwords = Caml_state->stat_minor_words + + ((double) Wsize_bsize ((uintnat)Caml_state->young_end - + (uintnat) Caml_state->young_ptr)) / sizeof(value); double prowords = Caml_state->stat_promoted_words; - double majwords = - Caml_state->stat_major_words + (double) caml_allocated_words; + double majwords = Caml_state->stat_major_words + + (double) Caml_state->allocated_words; - res = caml_alloc_tuple (3); - Store_field (res, 0, caml_copy_double (minwords)); - Store_field (res, 1, caml_copy_double (prowords)); - Store_field (res, 2, caml_copy_double (majwords)); + res = caml_alloc_3(0, + caml_copy_double (minwords), + caml_copy_double (prowords), + caml_copy_double (majwords)); CAMLreturn (res); } -CAMLprim value caml_gc_huge_fallback_count (value v) -{ - return Val_long (caml_huge_fallback_count); -} - CAMLprim value caml_gc_get(value v) { CAMLparam0 (); /* v is ignored */ CAMLlocal1 (res); res = caml_alloc_tuple (11); - Store_field (res, 0, Val_long (Caml_state->minor_heap_wsz)); /* s */ - Store_field (res, 1, Val_long (caml_major_heap_increment)); /* i */ - Store_field (res, 2, Val_long (caml_percent_free)); /* o */ - Store_field (res, 3, Val_long (caml_verb_gc)); /* v */ - Store_field (res, 4, Val_long (caml_percent_max)); /* O */ -#ifndef NATIVE_CODE - Store_field (res, 5, Val_long (caml_max_stack_size)); /* l */ -#else - Store_field (res, 5, Val_long (0)); -#endif - Store_field (res, 6, Val_long (caml_allocation_policy)); /* a */ - Store_field (res, 7, Val_long (caml_major_window)); /* w */ - Store_field (res, 8, Val_long (caml_custom_major_ratio)); /* M */ - Store_field (res, 9, Val_long (caml_custom_minor_ratio)); /* m */ - Store_field (res, 10, Val_long (caml_custom_minor_max_bsz)); /* n */ + Store_field (res, 0, Val_long (Caml_state->minor_heap_wsz)); /* s */ + Store_field (res, 2, Val_long (caml_percent_free)); /* o */ + 
Store_field (res, 3, Val_long (caml_params->verb_gc)); /* v */ + Store_field (res, 5, Val_long (caml_max_stack_size)); /* l */ + Store_field (res, 8, Val_long (caml_custom_major_ratio)); /* M */ + Store_field (res, 9, Val_long (caml_custom_minor_ratio)); /* m */ + Store_field (res, 10, Val_long (caml_custom_minor_max_bsz)); /* n */ CAMLreturn (res); } @@ -387,30 +143,6 @@ static uintnat norm_pfree (uintnat p) return Max (p, 1); } -static uintnat norm_pmax (uintnat p) -{ - return p; -} - -static intnat norm_minsize (intnat s) -{ - intnat page_wsize = Wsize_bsize(Page_size); - if (s < Minor_heap_min) s = Minor_heap_min; - if (s > Minor_heap_max) s = Minor_heap_max; - /* PR#9128 : Make sure the minor heap occupies an integral number of - pages, so that no page contains both bytecode and OCaml - values. This would confuse, e.g., caml_hash. */ - s = (s + page_wsize - 1) / page_wsize * page_wsize; - return s; -} - -static uintnat norm_window (intnat w) -{ - if (w < 1) w = 1; - if (w > Max_major_window) w = Max_major_window; - return w; -} - static uintnat norm_custom_maj (uintnat p) { return Max (p, 1); @@ -423,18 +155,12 @@ static uintnat norm_custom_min (uintnat p) CAMLprim value caml_gc_set(value v) { - uintnat newpf, newpm; - asize_t newheapincr; - asize_t newminwsz; - uintnat newpolicy; + uintnat newpf; + uintnat newminwsz; uintnat new_custom_maj, new_custom_min, new_custom_sz; CAML_EV_BEGIN(EV_EXPLICIT_GC_SET); - caml_verb_gc = Long_val (Field (v, 3)); - -#ifndef NATIVE_CODE caml_change_max_stack_size (Long_val (Field (v, 5))); -#endif newpf = norm_pfree (Long_val (Field (v, 2))); if (newpf != caml_percent_free){ @@ -443,37 +169,6 @@ CAMLprim value caml_gc_set(value v) ARCH_INTNAT_PRINTF_FORMAT "u%%\n", caml_percent_free); } - newpm = norm_pmax (Long_val (Field (v, 4))); - if (newpm != caml_percent_max){ - caml_percent_max = newpm; - caml_gc_message (0x20, "New max overhead: %" - ARCH_INTNAT_PRINTF_FORMAT "u%%\n", caml_percent_max); - } - - newheapincr = Long_val 
(Field (v, 1)); - if (newheapincr != caml_major_heap_increment){ - caml_major_heap_increment = newheapincr; - if (newheapincr > 1000){ - caml_gc_message (0x20, "New heap increment size: %" - ARCH_INTNAT_PRINTF_FORMAT "uk words\n", - caml_major_heap_increment/1024); - }else{ - caml_gc_message (0x20, "New heap increment size: %" - ARCH_INTNAT_PRINTF_FORMAT "u%%\n", - caml_major_heap_increment); - } - } - - /* This field was added in 4.03.0. */ - if (Wosize_val (v) >= 8){ - int old_window = caml_major_window; - caml_set_major_window (norm_window (Long_val (Field (v, 7)))); - if (old_window != caml_major_window){ - caml_gc_message (0x20, "New smoothing window size: %d\n", - caml_major_window); - } - } - /* These fields were added in 4.08.0. */ if (Wosize_val (v) >= 11){ new_custom_maj = norm_custom_maj (Long_val (Field (v, 8))); @@ -499,212 +194,123 @@ CAMLprim value caml_gc_set(value v) } } - /* Save field 0 before [v] is invalidated. */ - newminwsz = norm_minsize (Long_val (Field (v, 0))); - - /* Switching allocation policies must trigger a compaction, so it - invalidates [v]. */ - newpolicy = Long_val (Field (v, 6)); - if (newpolicy != caml_allocation_policy){ - caml_empty_minor_heap (); - caml_gc_message (0x1, "Full major GC cycle (changing allocation policy)\n"); - caml_finish_major_cycle (); - caml_finish_major_cycle (); - ++ Caml_state->stat_forced_major_collections; - caml_compact_heap (newpolicy); - caml_gc_message (0x20, "New allocation policy: %" - ARCH_INTNAT_PRINTF_FORMAT "u\n", newpolicy); - } - - /* Minor heap size comes last because it can raise [Out_of_memory]. */ + /* Minor heap size comes last because it will trigger a minor collection + (thus invalidating [v]) and it can raise [Out_of_memory]. 
*/ + newminwsz = caml_norm_minor_heap_size (Long_val (Field (v, 0))); if (newminwsz != Caml_state->minor_heap_wsz){ caml_gc_message (0x20, "New minor heap size: %" ARCH_SIZET_PRINTF_FORMAT "uk words\n", newminwsz / 1024); - caml_set_minor_heap_size (Bsize_wsize (newminwsz)); + caml_set_minor_heap_size (newminwsz); } CAML_EV_END(EV_EXPLICIT_GC_SET); - /* The compaction may have triggered some finalizers that we need to call. */ - caml_process_pending_actions(); - return Val_unit; } CAMLprim value caml_gc_minor(value v) { - value exn; - CAML_EV_BEGIN(EV_EXPLICIT_GC_MINOR); CAMLassert (v == Val_unit); - caml_request_minor_gc (); - // call the gc and call finalisers - exn = caml_process_pending_actions_exn(); + caml_minor_collection (); CAML_EV_END(EV_EXPLICIT_GC_MINOR); - caml_raise_if_exception(exn); return Val_unit; } -static void test_and_compact (void) -{ - double fp; - - fp = 100.0 * caml_fl_cur_wsz / (Caml_state->stat_heap_wsz - caml_fl_cur_wsz); - if (fp > 999999.0) fp = 999999.0; - caml_gc_message (0x200, "Estimated overhead (lower bound) = %" - ARCH_INTNAT_PRINTF_FORMAT "u%%\n", - (uintnat) fp); - if (fp >= caml_percent_max){ - caml_gc_message (0x200, "Automatic compaction triggered.\n"); - caml_compact_heap (-1); - } -} - CAMLprim value caml_gc_major(value v) { - value exn; - CAML_EV_BEGIN(EV_EXPLICIT_GC_MAJOR); CAMLassert (v == Val_unit); - caml_gc_message (0x1, "Finishing major GC cycle (requested by user)\n"); - caml_empty_minor_heap (); - caml_finish_major_cycle (); - test_and_compact (); - // call finalisers - exn = caml_process_pending_actions_exn(); + caml_gc_log ("Major GC cycle requested"); + caml_empty_minor_heaps_once(); + caml_finish_major_cycle(); + caml_final_do_calls (); CAML_EV_END(EV_EXPLICIT_GC_MAJOR); - caml_raise_if_exception(exn); return Val_unit; } CAMLprim value caml_gc_full_major(value v) { - value exn; - + int i; CAML_EV_BEGIN(EV_EXPLICIT_GC_FULL_MAJOR); CAMLassert (v == Val_unit); - caml_gc_message (0x1, "Full major GC cycle 
(requested by user)\n"); - caml_empty_minor_heap (); - caml_finish_major_cycle (); - // call finalisers - exn = caml_process_pending_actions_exn(); - if (Is_exception_result(exn)) goto cleanup; - caml_empty_minor_heap (); - caml_finish_major_cycle (); + caml_gc_log ("Full Major GC requested"); + /* In general, it can require up to 3 GC cycles for a + currently-unreachable object to be collected. */ + for (i = 0; i < 3; i++) { + caml_empty_minor_heaps_once(); + caml_finish_major_cycle(); + caml_final_do_calls (); + } ++ Caml_state->stat_forced_major_collections; - test_and_compact (); - // call finalisers - exn = caml_process_pending_actions_exn(); - -cleanup: CAML_EV_END(EV_EXPLICIT_GC_FULL_MAJOR); - caml_raise_if_exception(exn); - return Val_unit; } CAMLprim value caml_gc_major_slice (value v) { - value exn = Val_unit; CAML_EV_BEGIN(EV_EXPLICIT_GC_MAJOR_SLICE); CAMLassert (Is_long (v)); - if (caml_gc_phase == Phase_idle){ - /* We need to start a new major GC cycle. Go through the pending_action - machinery. */ - caml_request_major_slice (); - exn = caml_process_pending_actions_exn (); - /* Calls the major GC without passing [v] but the initial slice - ignores this parameter anyway. 
*/ - }else{ - caml_major_collection_slice (Long_val (v)); - } + caml_major_collection_slice(Long_val(v)); CAML_EV_END(EV_EXPLICIT_GC_MAJOR_SLICE); - caml_raise_if_exception (exn); return Val_long (0); } CAMLprim value caml_gc_compaction(value v) { - value exn; - CAML_EV_BEGIN(EV_EXPLICIT_GC_COMPACT); CAMLassert (v == Val_unit); - caml_gc_message (0x10, "Heap compaction requested\n"); - caml_empty_minor_heap (); - caml_gc_message (0x1, "Full major GC cycle (compaction)\n"); - caml_finish_major_cycle (); - // call finalisers - exn = caml_process_pending_actions_exn(); - if (Is_exception_result(exn)) goto cleanup; - caml_empty_minor_heap (); - caml_finish_major_cycle (); + caml_gc_major(v); ++ Caml_state->stat_forced_major_collections; - caml_compact_heap (-1); - // call finalisers - exn = caml_process_pending_actions_exn(); - - cleanup: CAML_EV_END(EV_EXPLICIT_GC_COMPACT); - caml_raise_if_exception(exn); return Val_unit; } -CAMLprim value caml_get_minor_free (value v) -{ - return Val_int (Caml_state->young_ptr - Caml_state->young_alloc_start); -} -CAMLprim value caml_get_major_bucket (value v) +CAMLprim value caml_gc_stat(value v) { - long i = Long_val (v); - if (i < 0) caml_invalid_argument ("Gc.get_bucket"); - if (i < caml_major_window){ - i += caml_major_ring_index; - if (i >= caml_major_window) i -= caml_major_window; - CAMLassert (0 <= i && i < caml_major_window); - return Val_long ((long) (caml_major_ring[i] * 1e6)); - }else{ - return Val_long (0); - } + value result; + CAML_EV_BEGIN(EV_EXPLICIT_GC_STAT); + caml_gc_full_major(Val_unit); + result = caml_gc_quick_stat(Val_unit); + CAML_EV_END(EV_EXPLICIT_GC_STAT); + return result; } -CAMLprim value caml_get_major_credit (value v) +CAMLprim value caml_get_minor_free (value v) { - CAMLassert (v == Val_unit); - return Val_long ((long) (caml_major_work_credit * 1e6)); + return Val_int + ((uintnat)Caml_state->young_ptr - (uintnat)Caml_state->young_start); } -/* [minor_size] and [major_size] are numbers of words - 
[major_incr] is either a percentage or a number of words */ -void caml_init_gc (uintnat minor_size, uintnat major_size, - uintnat major_incr, uintnat percent_fr, - uintnat percent_m, uintnat window, - uintnat custom_maj, uintnat custom_min, - uintnat custom_bsz, uintnat policy) +void caml_init_gc (void) { - uintnat major_bsize; - if (major_size < Heap_chunk_min) major_size = Heap_chunk_min; - major_bsize = Bsize_wsize(major_size); - major_bsize = ((major_bsize + Page_size - 1) >> Page_log) << Page_log; + caml_max_stack_size = caml_params->init_max_stack_wsz; + caml_fiber_wsz = caml_params->init_fiber_wsz; + caml_percent_free = norm_pfree (caml_params->init_percent_free); + caml_gc_log ("Initial stack limit: %" + ARCH_INTNAT_PRINTF_FORMAT "uk bytes", + caml_max_stack_size / 1024 * sizeof (value)); - if (caml_page_table_initialize(Bsize_wsize(minor_size) + major_bsize)){ - caml_fatal_error ("cannot initialize page table"); - } - caml_set_minor_heap_size (Bsize_wsize (norm_minsize (minor_size))); + caml_custom_major_ratio = + norm_custom_maj (caml_params->init_custom_major_ratio); + caml_custom_minor_ratio = + norm_custom_min (caml_params->init_custom_minor_ratio); + caml_custom_minor_max_bsz = caml_params->init_custom_minor_max_bsz; + + caml_gc_phase = Phase_sweep_and_mark_main; + #ifdef NATIVE_CODE + caml_init_frame_descriptors(); + #endif + caml_init_domains(caml_params->init_minor_heap_wsz); +/* caml_major_heap_increment = major_incr; caml_percent_free = norm_pfree (percent_fr); caml_percent_max = norm_pmax (percent_m); - caml_set_allocation_policy (policy); - caml_init_major_heap (major_bsize); - caml_major_window = norm_window (window); - caml_custom_major_ratio = norm_custom_maj (custom_maj); - caml_custom_minor_ratio = norm_custom_min (custom_min); - caml_custom_minor_max_bsz = custom_bsz; - caml_gc_message (0x20, "Initial minor heap size: %" - ARCH_SIZET_PRINTF_FORMAT "uk words\n", - Caml_state->minor_heap_wsz / 1024); - caml_gc_message (0x20, "Initial major 
heap size: %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - major_bsize / 1024); + caml_init_major_heap (major_heap_size); + caml_gc_message (0x20, "Initial minor heap size: %luk bytes\n", + Caml_state->minor_heap_size / 1024); + caml_gc_message (0x20, "Initial major heap size: %luk bytes\n", + major_heap_size / 1024); caml_gc_message (0x20, "Initial space overhead: %" ARCH_INTNAT_PRINTF_FORMAT "u%%\n", caml_percent_free); caml_gc_message (0x20, "Initial max overhead: %" @@ -718,13 +324,11 @@ void caml_init_gc (uintnat minor_size, uintnat major_size, ARCH_INTNAT_PRINTF_FORMAT "u%%\n", caml_major_heap_increment); } - caml_gc_message (0x20, "Initial allocation policy: %" - ARCH_INTNAT_PRINTF_FORMAT "u\n", caml_allocation_policy); - caml_gc_message (0x20, "Initial smoothing window: %d\n", - caml_major_window); + caml_gc_message (0x20, "Initial allocation policy: %d\n", + caml_allocation_policy); +*/ } - /* FIXME After the startup_aux.c unification, move these functions there. */ CAMLprim value caml_runtime_variant (value unit) @@ -747,31 +351,8 @@ CAMLprim value caml_runtime_parameters (value unit) #define F_S ARCH_SIZET_PRINTF_FORMAT CAMLassert (unit == Val_unit); - return caml_alloc_sprintf - ("a=%d,b=%d,H=%"F_Z"u,i=%"F_Z"u,l=%"F_Z"u,o=%"F_Z"u,O=%"F_Z"u,p=%d," - "s=%"F_S"u,t=%"F_Z"u,v=%"F_Z"u,w=%d,W=%"F_Z"u", - /* a */ (int) caml_allocation_policy, - /* b */ (int) Caml_state->backtrace_active, - /* h */ /* missing */ /* FIXME add when changed to min_heap_size */ - /* H */ caml_use_huge_pages, - /* i */ caml_major_heap_increment, -#ifdef NATIVE_CODE - /* l */ (uintnat) 0, -#else - /* l */ caml_max_stack_size, -#endif - /* o */ caml_percent_free, - /* O */ caml_percent_max, - /* p */ caml_parser_trace, - /* R */ /* missing */ - /* s */ Caml_state->minor_heap_wsz, - /* t */ caml_trace_level, - /* v */ caml_verb_gc, - /* w */ caml_major_window, - /* W */ caml_runtime_warnings - ); -#undef F_Z -#undef F_S + /* TODO KC */ + return caml_alloc_sprintf 
("caml_runtime_parameters not implemented: %d", 0); } /* Control runtime warnings */ diff --git a/runtime/gen_primitives.sh b/runtime/gen_primitives.sh index a727d5c25cfc..41d8d8ff7517 100755 --- a/runtime/gen_primitives.sh +++ b/runtime/gen_primitives.sh @@ -24,7 +24,8 @@ export LC_ALL=C for prim in \ alloc array compare extern floats gc_ctrl hash intern interp ints io \ lexing md5 meta memprof obj parsing signals str sys callback weak \ - finalise stacks dynlink backtrace_byt backtrace afl \ + finalise domain platform fiber memory startup_aux sync \ + dynlink backtrace_byt backtrace afl \ bigarray eventlog do sed -n -e 's/^CAMLprim value \([a-z0-9_][a-z0-9_]*\).*/\1/p' "$prim.c" diff --git a/runtime/globroots.c b/runtime/globroots.c index 3025d09559c1..3292601da2ff 100644 --- a/runtime/globroots.c +++ b/runtime/globroots.c @@ -21,6 +21,9 @@ #include "caml/roots.h" #include "caml/globroots.h" #include "caml/skiplist.h" +#include "caml/stack.h" + +static caml_plat_mutex roots_mutex = CAML_PLAT_MUTEX_INITIALIZER; /* The three global root lists. Each is represented by a skip list with the key being the address @@ -40,30 +43,22 @@ struct skiplist caml_global_roots_old = SKIPLIST_STATIC_INITIALIZER; in [caml_global_roots_old] or in [caml_global_roots_young]; - Otherwise (the root contains a pointer outside of the heap or an integer), then neither [caml_global_roots_young] nor [caml_global_roots_old] contain - it. -*/ + it. 
*/ /* Insertion and deletion */ Caml_inline void caml_insert_global_root(struct skiplist * list, value * r) { + caml_plat_lock(&roots_mutex); caml_skiplist_insert(list, (uintnat) r, 0); + caml_plat_unlock(&roots_mutex); } Caml_inline void caml_delete_global_root(struct skiplist * list, value * r) { + caml_plat_lock(&roots_mutex); caml_skiplist_remove(list, (uintnat) r); -} - -/* Iterate a GC scanning action over a global root list */ - -static void caml_iterate_global_roots(scanning_action f, - struct skiplist * rootlist) -{ - FOREACH_SKIPLIST_ELEMENT(e, rootlist, { - value * r = (value *) (e->key); - f(*r, r); - }) + caml_plat_unlock(&roots_mutex); } /* Register a global C root of the mutable kind */ @@ -91,9 +86,6 @@ static enum gc_root_class classify_gc_root(value v) { if(!Is_block(v)) return UNTRACKED; if(Is_young(v)) return YOUNG; -#ifndef NO_NAKED_POINTERS - if(!Is_in_heap(v)) return UNTRACKED; -#endif return OLD; } @@ -162,26 +154,104 @@ CAMLexport void caml_modify_generational_global_root(value *r, value newval) *r = newval; } -/* Scan all global roots */ +#ifdef NATIVE_CODE + +/* Linked-list of natdynlink'd globals */ + +typedef struct link { + void *data; + struct link *next; +} link; + +static link *cons(void *data, link *tl) { + link *lnk = caml_stat_alloc(sizeof(link)); + lnk->data = data; + lnk->next = tl; + return lnk; +} + +#define iter_list(list,lnk) \ + for (lnk = list; lnk != NULL; lnk = lnk->next) + + +/* protected by roots_mutex */ +static link * caml_dyn_globals = NULL; + +void caml_register_dyn_global(void *v) { + caml_plat_lock(&roots_mutex); + caml_dyn_globals = cons((void*) v,caml_dyn_globals); + caml_plat_unlock(&roots_mutex); +} -void caml_scan_global_roots(scanning_action f) +static void scan_native_globals(scanning_action f, void* fdata) { - caml_iterate_global_roots(f, &caml_global_roots); - caml_iterate_global_roots(f, &caml_global_roots_young); - caml_iterate_global_roots(f, &caml_global_roots_old); + int i, j; + static link* 
dyn_globals; + value* glob; + link* lnk; + + caml_plat_lock(&roots_mutex); + dyn_globals = caml_dyn_globals; + caml_plat_unlock(&roots_mutex); + + /* The global roots */ + for (i = 0; i <= caml_globals_inited && caml_globals[i] != 0; i++) { + for(glob = caml_globals[i]; *glob != 0; glob++) { + for (j = 0; j < Wosize_val(*glob); j++){ + f(fdata, Field(*glob, j), &Field(*glob, j)); + } + } + } + + /* Dynamic (natdynlink) global roots */ + iter_list(dyn_globals, lnk) { + for(glob = (value *) lnk->data; *glob != 0; glob++) { + for (j = 0; j < Wosize_val(*glob); j++){ + f(fdata, Field(*glob, j), &Field(*glob, j)); + } + } + } } -/* Scan global roots for a minor collection */ +#endif -void caml_scan_global_young_roots(scanning_action f) +/* Iterate a GC scanning action over a global root list */ +Caml_inline void caml_iterate_global_roots(scanning_action f, + struct skiplist * rootlist, void* fdata) { + FOREACH_SKIPLIST_ELEMENT(e, rootlist, { + value * r = (value *) (e->key); + f(fdata, *r, r); + }) +} + +/* Scan all global roots */ +void caml_scan_global_roots(scanning_action f, void* fdata) { + caml_plat_lock(&roots_mutex); + caml_iterate_global_roots(f, &caml_global_roots, fdata); + caml_iterate_global_roots(f, &caml_global_roots_young, fdata); + caml_iterate_global_roots(f, &caml_global_roots_old, fdata); + caml_plat_unlock(&roots_mutex); + + #ifdef NATIVE_CODE + scan_native_globals(f, fdata); + #endif +} + +/* Scan global roots for a minor collection */ +void caml_scan_global_young_roots(scanning_action f, void* fdata) +{ + caml_plat_lock(&roots_mutex); + + caml_iterate_global_roots(f, &caml_global_roots, fdata); + caml_iterate_global_roots(f, &caml_global_roots_young, fdata); - caml_iterate_global_roots(f, &caml_global_roots); - caml_iterate_global_roots(f, &caml_global_roots_young); /* Move young roots to old roots */ FOREACH_SKIPLIST_ELEMENT(e, &caml_global_roots_young, { value * r = (value *) (e->key); - caml_insert_global_root(&caml_global_roots_old, r); + 
caml_skiplist_insert(&caml_global_roots_old, (uintnat) r, 0); }); caml_skiplist_empty(&caml_global_roots_young); + + caml_plat_unlock(&roots_mutex); } diff --git a/runtime/hash.c b/runtime/hash.c index f33634c22744..8fb0a991f34a 100644 --- a/runtime/hash.c +++ b/runtime/hash.c @@ -204,14 +204,7 @@ CAMLprim value caml_hash(value count, value limit, value seed, value obj) if (Is_long(v)) { h = caml_hash_mix_intnat(h, v); num--; - } - else if (!Is_in_value_area(v)) { - /* v is a pointer outside the heap, probably a code pointer. - Shall we count it? Let's say yes by compatibility with old code. */ - h = caml_hash_mix_intnat(h, v); - num--; - } - else { + } else { switch (Tag_val(v)) { case String_tag: h = caml_hash_mix_string(h, v); @@ -242,7 +235,7 @@ CAMLprim value caml_hash(value count, value limit, value seed, value obj) Forward_tag links being followed */ for (i = MAX_FORWARD_DEREFERENCE; i > 0; i--) { v = Forward_val(v); - if (Is_long(v) || !Is_in_value_area(v) || Tag_val(v) != Forward_tag) + if (Is_long(v) || Tag_val(v) != Forward_tag) goto again; } /* Give up on this object and move to the next */ @@ -260,7 +253,6 @@ CAMLprim value caml_hash(value count, value limit, value seed, value obj) num--; } break; -#ifdef NO_NAKED_POINTERS case Closure_tag: { mlsize_t startenv; len = Wosize_val(v); @@ -281,7 +273,11 @@ CAMLprim value caml_hash(value count, value limit, value seed, value obj) } break; } -#endif + case Cont_tag: + /* All continuations hash to the same value, + since we have no idea how to distinguish them. 
*/ + break; + default: /* Mix in the tag and size, but do not count this towards [num] */ h = caml_hash_mix_uint32(h, Whitehd_hd(Hd_val(v))); diff --git a/runtime/instrtrace.c b/runtime/instrtrace.c index 2760475ed5a0..216a86566933 100644 --- a/runtime/instrtrace.c +++ b/runtime/instrtrace.c @@ -23,27 +23,34 @@ #include #include +#include "caml/fiber.h" +#include "caml/domain.h" #include "caml/instrtrace.h" #include "caml/instruct.h" #include "caml/misc.h" #include "caml/mlvalues.h" #include "caml/opnames.h" #include "caml/prims.h" -#include "caml/stacks.h" -#include "caml/startup_aux.h" +#include "caml/startup.h" extern code_t caml_start_code; -intnat caml_icount = 0; +__thread intnat caml_icount = 0; -void caml_stop_here () {} +void caml_stop_here (void) +{ +} -void caml_disasm_instr(pc) - code_t pc; +char * caml_instr_string (code_t pc); + +void caml_disasm_instr(code_t pc) { + char buf[256]; + char opbuf[128]; int instr = *pc; - printf("%6ld %s", (long) (pc - caml_start_code), - instr < 0 || instr > STOP ? "???" : names_of_instructions[instr]); + snprintf(opbuf, sizeof(opbuf), "%6ld %s", (long) (pc - caml_start_code), + (instr < 0 || instr >= FIRST_UNIMPLEMENTED_OP) ? + "???" 
: names_of_instructions[instr]); pc++; switch(instr) { /* Instructions with one integer operand */ @@ -57,126 +64,38 @@ void caml_disasm_instr(pc) case BRANCH: case BRANCHIF: case BRANCHIFNOT: case PUSHTRAP: case CONSTINT: case PUSHCONSTINT: case OFFSETINT: case OFFSETREF: case OFFSETCLOSURE: case PUSHOFFSETCLOSURE: - printf(" %d\n", pc[0]); break; + case RESUMETERM: case REPERFORMTERM: + snprintf(buf, sizeof(buf), "%s %d\n", opbuf, pc[0]); break; /* Instructions with two operands */ case APPTERM: case CLOSURE: case CLOSUREREC: case PUSHGETGLOBALFIELD: case GETGLOBALFIELD: case MAKEBLOCK: case BEQ: case BNEQ: case BLTINT: case BLEINT: case BGTINT: case BGEINT: case BULTINT: case BUGEINT: - printf(" %d, %d\n", pc[0], pc[1]); break; + snprintf(buf, sizeof(buf), "%s %d, %d\n", opbuf, pc[0], pc[1]); break; /* Instructions with a C primitive as operand */ case C_CALLN: - printf(" %d,", pc[0]); pc++; + snprintf(buf, sizeof(buf), "%s %d,", opbuf, pc[0]); pc++; /* fallthrough */ case C_CALL1: case C_CALL2: case C_CALL3: case C_CALL4: case C_CALL5: if (pc[0] < 0 || pc[0] >= caml_prim_name_table.size) - printf(" unknown primitive %d\n", pc[0]); + snprintf(buf, sizeof(buf), "%s unknown primitive %d\n", opbuf, pc[0]); else - printf(" %s\n", (char *) caml_prim_name_table.contents[pc[0]]); - break; - default: - printf("\n"); - } - fflush (stdout); -} - -char * caml_instr_string (code_t pc) -{ - static char buf[256]; - char nambuf[128]; - int instr = *pc; - char *nam; - - nam = (instr < 0 || instr > STOP) - ? 
(snprintf (nambuf, sizeof(nambuf), "???%d", instr), nambuf) - : names_of_instructions[instr]; - pc++; - switch (instr) { - /* Instructions with one integer operand */ - case PUSHACC: - case ACC: - case POP: - case ASSIGN: - case PUSHENVACC: - case ENVACC: - case PUSH_RETADDR: - case APPLY: - case APPTERM1: - case APPTERM2: - case APPTERM3: - case RETURN: - case GRAB: - case PUSHGETGLOBAL: - case GETGLOBAL: - case SETGLOBAL: - case PUSHATOM: - case ATOM: - case MAKEBLOCK1: - case MAKEBLOCK2: - case MAKEBLOCK3: - case MAKEFLOATBLOCK: - case GETFIELD: - case SETFIELD: - case GETFLOATFIELD: - case SETFLOATFIELD: - case BRANCH: - case BRANCHIF: - case BRANCHIFNOT: - case PUSHTRAP: - case CONSTINT: - case PUSHCONSTINT: - case OFFSETINT: - case OFFSETREF: - case OFFSETCLOSURE: - case PUSHOFFSETCLOSURE: - snprintf(buf, sizeof(buf), "%s %d", nam, pc[0]); - break; - /* Instructions with two operands */ - case APPTERM: - case CLOSURE: - case CLOSUREREC: - case PUSHGETGLOBALFIELD: - case GETGLOBALFIELD: - case MAKEBLOCK: - case BEQ: - case BNEQ: - case BLTINT: - case BLEINT: - case BGTINT: - case BGEINT: - case BULTINT: - case BUGEINT: - snprintf(buf, sizeof(buf), "%s %d, %d", nam, pc[0], pc[1]); + snprintf(buf, sizeof(buf), "%s %s\n", opbuf, + (char *) caml_prim_name_table.contents[pc[0]]); break; case SWITCH: - snprintf(buf, sizeof(buf), "SWITCH sz%#lx=%ld::ntag%lu nint%lu", - (long) pc[0], (long) pc[0], (unsigned long) pc[0] >> 16, - (unsigned long) pc[0] & 0xffff); - break; - /* Instructions with a C primitive as operand */ - case C_CALLN: - snprintf(buf, sizeof(buf), "%s %d,", nam, pc[0]); - pc++; - /* fallthrough */ - case C_CALL1: - case C_CALL2: - case C_CALL3: - case C_CALL4: - case C_CALL5: - if (pc[0] < 0 || pc[0] >= caml_prim_name_table.size) - snprintf(buf, sizeof(buf), "%s unknown primitive %d", nam, pc[0]); - else - snprintf(buf, sizeof(buf), "%s %s", - nam, (char *) caml_prim_name_table.contents[pc[0]]); + snprintf(buf, sizeof(buf), "%s ntag=%lu nint=%lu\n", + 
opbuf, + (unsigned long) pc[0] >> 16, + (unsigned long) pc[0] & 0xffff); break; default: - snprintf(buf, sizeof(buf), "%s", nam); - break; - }; - return buf; + snprintf(buf, sizeof(buf), "%s\n", opbuf); + } + printf("[%02d] %s", Caml_state->id, buf); + fflush (stdout); } - void caml_trace_value_file (value v, code_t prog, asize_t proglen, FILE * f) { @@ -190,10 +109,10 @@ caml_trace_value_file (value v, code_t prog, asize_t proglen, FILE * f) fprintf (f, "=code@%ld", (long) ((code_t) v - prog)); else if (Is_long (v)) fprintf (f, "=long%" ARCH_INTNAT_PRINTF_FORMAT "d", Long_val (v)); - else if ((void*)v >= (void*)Caml_state->stack_low - && (void*)v < (void*)Caml_state->stack_high) + else if (Stack_base(Caml_state->current_stack) <= (value*)v && + (value*)v < Stack_high(Caml_state->current_stack)) fprintf (f, "=stack_%ld", - (long) ((intnat*)Caml_state->stack_high - (intnat*)v)); + (long)((intnat*)Stack_high(Caml_state->current_stack)-(intnat*)v)); else if (Is_block (v)) { int s = Wosize_val (v); int tg = Tag_val (v); @@ -257,11 +176,12 @@ caml_trace_accu_sp_file (value accu, value * sp, code_t prog, asize_t proglen, fprintf (f, "accu="); caml_trace_value_file (accu, prog, proglen, f); fprintf (f, "\n sp=%#" ARCH_INTNAT_PRINTF_FORMAT "x @%ld:", - (intnat) sp, (long) (Caml_state->stack_high - sp)); + (intnat) sp, (long) (Stack_high(Caml_state->current_stack) - sp)); for (p = sp, i = 0; - i < 12 + (1 << caml_trace_level) && p < Caml_state->stack_high; + i < 12 + (1 << caml_params->trace_level) && + p < Stack_high(Caml_state->current_stack); p++, i++) { - fprintf (f, "\n[%ld] ", (long) (Caml_state->stack_high - p)); + fprintf (f, "\n[%ld] ", (long) (Stack_high(Caml_state->current_stack) - p)); caml_trace_value_file (*p, prog, proglen, f); }; putc ('\n', f); diff --git a/runtime/intern.c b/runtime/intern.c index 1bb66adc44c1..3d8355c30cd8 100644 --- a/runtime/intern.c +++ b/runtime/intern.c @@ -35,38 +35,84 @@ #include "caml/mlvalues.h" #include "caml/misc.h" #include 
"caml/reverse.h" +#include "caml/shared_heap.h" #include "caml/signals.h" +/* Item on the stack with defined operation */ +struct intern_item { + value * dest; + intnat arg; + enum { + OReadItems, /* read arg items and store them in dest[0], dest[1], ... */ + OFreshOID, /* generate a fresh OID and store it in *dest */ + OShift /* offset *dest by arg */ + } op; +}; + +/* FIXME: This is duplicated in two other places, with the only difference of + the type of elements stored in the stack. Possible solution in C would + be to instantiate stack these function via. C preprocessor macro. + */ + +#define INTERN_STACK_INIT_SIZE 256 +#define INTERN_STACK_MAX_SIZE (1024*1024*100) -static unsigned char * intern_src; -/* Reading pointer in block holding input data. */ +struct caml_intern_state { -static unsigned char * intern_input = NULL; -/* Pointer to beginning of block holding input data, - if non-NULL this pointer will be freed by the cleanup function. */ + const unsigned char * intern_src; + /* Reading pointer in block holding input data. */ -static header_t * intern_dest; -/* Writing pointer in destination block */ + unsigned char * intern_input; + /* Pointer to beginning of block holding input data, + if non-NULL this pointer will be freed by the cleanup function. */ -static char * intern_extra_block = NULL; -/* If non-NULL, point to new heap chunk allocated with caml_alloc_for_heap. 
*/ + asize_t obj_counter; + /* Count how many objects seen so far */ -static asize_t obj_counter; -/* Count how many objects seen so far */ + value * intern_obj_table; + /* The pointers to objects already seen */ -static value * intern_obj_table = NULL; -/* The pointers to objects already seen */ + struct intern_item intern_stack_init[INTERN_STACK_INIT_SIZE]; + /* The initial intern stack */ -static color_t intern_color; -/* Color to assign to newly created headers */ + struct intern_item * intern_stack; + /* Initially points to [intern_stack_init] */ -static header_t intern_header; -/* Original header of the destination block. - Meaningful only if intern_extra_block is NULL. */ + struct intern_item * intern_stack_limit; -static value intern_block = 0; -/* Point to the heap block allocated as destination block. - Meaningful only if intern_extra_block is NULL. */ + header_t * intern_dest; + /* Writing pointer in destination block. Only used when the object fits in + the minor heap. */ +}; + +/* Allocates the domain local intern state if needed */ +static struct caml_intern_state* get_intern_state (void) +{ + struct caml_intern_state* s; + + if (Caml_state->intern_state != NULL) + return Caml_state->intern_state; + + s = caml_stat_alloc(sizeof(struct caml_intern_state)); + + s->intern_src = NULL; + s->intern_input = NULL; + s->obj_counter = 0; + s->intern_obj_table = NULL; + s->intern_stack = s->intern_stack_init; + s->intern_stack_limit = s->intern_stack + INTERN_STACK_INIT_SIZE; + s->intern_dest = NULL; + + Caml_state->intern_state = s; + return s; +} + +void caml_free_intern_state (void) +{ + if (Caml_state->intern_state != NULL) + caml_stat_free(Caml_state->intern_state); + Caml_state->intern_state = NULL; +} static char * intern_resolve_code_pointer(unsigned char digest[16], asize_t offset); @@ -75,110 +121,114 @@ CAMLnoreturn_start static void intern_bad_code_pointer(unsigned char digest[16]) CAMLnoreturn_end; -static void intern_free_stack(void); - 
-Caml_inline unsigned char read8u(void) -{ return *intern_src++; } +Caml_inline unsigned char read8u(struct caml_intern_state* s) +{ return *s->intern_src++; } -Caml_inline signed char read8s(void) -{ return *intern_src++; } +Caml_inline signed char read8s(struct caml_intern_state* s) +{ return *s->intern_src++; } -Caml_inline uint16_t read16u(void) +Caml_inline uint16_t read16u(struct caml_intern_state* s) { - uint16_t res = (intern_src[0] << 8) + intern_src[1]; - intern_src += 2; + uint16_t res = (s->intern_src[0] << 8) + s->intern_src[1]; + s->intern_src += 2; return res; } -Caml_inline int16_t read16s(void) +Caml_inline int16_t read16s(struct caml_intern_state* s) { - int16_t res = (intern_src[0] << 8) + intern_src[1]; - intern_src += 2; + int16_t res = (s->intern_src[0] << 8) + s->intern_src[1]; + s->intern_src += 2; return res; } -Caml_inline uint32_t read32u(void) +Caml_inline uint32_t read32u(struct caml_intern_state* s) { uint32_t res = - ((uint32_t)(intern_src[0]) << 24) + (intern_src[1] << 16) - + (intern_src[2] << 8) + intern_src[3]; - intern_src += 4; + ((uint32_t)(s->intern_src[0]) << 24) + (s->intern_src[1] << 16) + + (s->intern_src[2] << 8) + s->intern_src[3]; + s->intern_src += 4; return res; } -Caml_inline int32_t read32s(void) +Caml_inline int32_t read32s(struct caml_intern_state* s) { int32_t res = - ((uint32_t)(intern_src[0]) << 24) + (intern_src[1] << 16) - + (intern_src[2] << 8) + intern_src[3]; - intern_src += 4; + ((uint32_t)(s->intern_src[0]) << 24) + (s->intern_src[1] << 16) + + (s->intern_src[2] << 8) + s->intern_src[3]; + s->intern_src += 4; return res; } #ifdef ARCH_SIXTYFOUR -static uintnat read64u(void) +static uintnat read64u(struct caml_intern_state* s) { uintnat res = - ((uintnat) (intern_src[0]) << 56) - + ((uintnat) (intern_src[1]) << 48) - + ((uintnat) (intern_src[2]) << 40) - + ((uintnat) (intern_src[3]) << 32) - + ((uintnat) (intern_src[4]) << 24) - + ((uintnat) (intern_src[5]) << 16) - + ((uintnat) (intern_src[6]) << 8) - + 
(uintnat) (intern_src[7]); - intern_src += 8; + ((uintnat) (s->intern_src[0]) << 56) + + ((uintnat) (s->intern_src[1]) << 48) + + ((uintnat) (s->intern_src[2]) << 40) + + ((uintnat) (s->intern_src[3]) << 32) + + ((uintnat) (s->intern_src[4]) << 24) + + ((uintnat) (s->intern_src[5]) << 16) + + ((uintnat) (s->intern_src[6]) << 8) + + (uintnat) (s->intern_src[7]); + s->intern_src += 8; return res; } #endif -Caml_inline void readblock(void * dest, intnat len) +Caml_inline void readblock(struct caml_intern_state* s, + void * dest, intnat len) { - memcpy(dest, intern_src, len); - intern_src += len; + memcpy(dest, s->intern_src, len); + s->intern_src += len; } -static void intern_init(void * src, void * input) +static void intern_init(struct caml_intern_state* s, const void * src, + void * input) { + CAMLassert (s); /* This is asserted at the beginning of demarshaling primitives. If it fails, it probably means that an exception was raised without calling intern_cleanup() during the previous demarshaling. 
*/ - CAMLassert (intern_input == NULL && intern_obj_table == NULL \ - && intern_extra_block == NULL && intern_block == 0); - intern_src = src; - intern_input = input; + CAMLassert (s->intern_input == NULL && s->intern_obj_table == NULL); + s->intern_src = src; + s->intern_input = input; } -static void intern_cleanup(void) +/* Free the recursion stack if needed */ +static void intern_free_stack(struct caml_intern_state* s) { - if (intern_input != NULL) { - caml_stat_free(intern_input); - intern_input = NULL; + if (s->intern_stack != s->intern_stack_init) { + caml_stat_free(s->intern_stack); + /* Reinitialize the globals for next time around */ + s->intern_stack = s->intern_stack_init; + s->intern_stack_limit = s->intern_stack + INTERN_STACK_INIT_SIZE; } - if (intern_obj_table != NULL) { - caml_stat_free(intern_obj_table); - intern_obj_table = NULL; +} + +static void intern_cleanup(struct caml_intern_state* s) +{ + if (s->intern_input != NULL) { + caml_stat_free(s->intern_input); + s->intern_input = NULL; } - if (intern_extra_block != NULL) { - /* free newly allocated heap chunk */ - caml_free_for_heap(intern_extra_block); - intern_extra_block = NULL; - } else if (intern_block != 0) { - /* restore original header for heap block, otherwise GC is confused */ - Hd_val(intern_block) = intern_header; - intern_block = 0; + if (s->intern_obj_table != NULL) { + caml_stat_free(s->intern_obj_table); + s->intern_obj_table = NULL; } + s->intern_dest = NULL; /* free the recursion stack */ - intern_free_stack(); + intern_free_stack(s); } -static void readfloat(double * dest, unsigned int code) +static void readfloat(struct caml_intern_state* s, + double * dest, unsigned int code) { if (sizeof(double) != 8) { - intern_cleanup(); + intern_cleanup(s); caml_invalid_argument("input_value: non-standard floats"); } - readblock((char *) dest, 8); + readblock(s, (char *) dest, 8); /* Fix up endianness, if needed */ #if ARCH_FLOAT_ENDIANNESS == 0x76543210 /* Host is big-endian; fix up if 
data read is little-endian */ @@ -196,14 +246,15 @@ static void readfloat(double * dest, unsigned int code) } /* [len] is a number of floats */ -static void readfloats(double * dest, mlsize_t len, unsigned int code) +static void readfloats(struct caml_intern_state* s, + double * dest, mlsize_t len, unsigned int code) { mlsize_t i; if (sizeof(double) != 8) { - intern_cleanup(); + intern_cleanup(s); caml_invalid_argument("input_value: non-standard floats"); } - readblock((char *) dest, len * 8); + readblock(s, (char *) dest, len * 8); /* Fix up endianness, if needed */ #if ARCH_FLOAT_ENDIANNESS == 0x76543210 /* Host is big-endian; fix up if data read is little-endian */ @@ -230,94 +281,116 @@ static void readfloats(double * dest, mlsize_t len, unsigned int code) #endif } -/* Item on the stack with defined operation */ -struct intern_item { - value * dest; - intnat arg; - enum { - OReadItems, /* read arg items and store them in dest[0], dest[1], ... */ - OFreshOID, /* generate a fresh OID and store it in *dest */ - OShift /* offset *dest by arg */ - } op; -}; - -/* FIXME: This is duplicated in two other places, with the only difference of - the type of elements stored in the stack. Possible solution in C would - be to instantiate stack these function via. C preprocessor macro. 
- */ - -#define INTERN_STACK_INIT_SIZE 256 -#define INTERN_STACK_MAX_SIZE (1024*1024*100) - -static struct intern_item intern_stack_init[INTERN_STACK_INIT_SIZE]; - -static struct intern_item * intern_stack = intern_stack_init; -static struct intern_item * intern_stack_limit = intern_stack_init - + INTERN_STACK_INIT_SIZE; - -/* Free the recursion stack if needed */ -static void intern_free_stack(void) -{ - if (intern_stack != intern_stack_init) { - caml_stat_free(intern_stack); - /* Reinitialize the globals for next time around */ - intern_stack = intern_stack_init; - intern_stack_limit = intern_stack + INTERN_STACK_INIT_SIZE; - } -} - -/* Same, then raise Out_of_memory */ CAMLnoreturn_start -static void intern_stack_overflow(void) +static void intern_stack_overflow(struct caml_intern_state*) CAMLnoreturn_end; -static void intern_stack_overflow(void) +static void intern_stack_overflow(struct caml_intern_state* s) { caml_gc_message (0x04, "Stack overflow in un-marshaling value\n"); - intern_free_stack(); + intern_cleanup(s); caml_raise_out_of_memory(); } -static struct intern_item * intern_resize_stack(struct intern_item * sp) +static struct intern_item * intern_resize_stack(struct caml_intern_state* s, + struct intern_item * sp) { - asize_t newsize = 2 * (intern_stack_limit - intern_stack); - asize_t sp_offset = sp - intern_stack; + asize_t newsize = 2 * (s->intern_stack_limit - s->intern_stack); + asize_t sp_offset = sp - s->intern_stack; struct intern_item * newstack; - if (newsize >= INTERN_STACK_MAX_SIZE) intern_stack_overflow(); - if (intern_stack == intern_stack_init) { - newstack = caml_stat_alloc_noexc(sizeof(struct intern_item) * newsize); - if (newstack == NULL) intern_stack_overflow(); - memcpy(newstack, intern_stack_init, - sizeof(struct intern_item) * INTERN_STACK_INIT_SIZE); - } else { - newstack = caml_stat_resize_noexc(intern_stack, - sizeof(struct intern_item) * newsize); - if (newstack == NULL) intern_stack_overflow(); - } - intern_stack = 
newstack; - intern_stack_limit = newstack + newsize; + if (newsize >= INTERN_STACK_MAX_SIZE) intern_stack_overflow(s); + newstack = caml_stat_calloc_noexc(newsize, sizeof(struct intern_item)); + if (newstack == NULL) intern_stack_overflow(s); + + /* Copy items from the old stack to the new stack */ + memcpy(newstack, s->intern_stack, + sizeof(struct intern_item) * sp_offset); + + /* Free to old stack if it is not the initial stack */ + if (s->intern_stack != s->intern_stack_init) + caml_stat_free(s->intern_stack); + + s->intern_stack = newstack; + s->intern_stack_limit = newstack + newsize; return newstack + sp_offset; } /* Convenience macros for requesting operation on the stack */ -#define PushItem() \ +#define PushItem(s) \ do { \ sp++; \ - if (sp >= intern_stack_limit) sp = intern_resize_stack(sp); \ + if (sp >= s->intern_stack_limit) sp = intern_resize_stack(s, sp); \ } while(0) -#define ReadItems(_dest,_n) \ +#define ReadItems(s,_dest,_n) \ do { \ if (_n > 0) { \ - PushItem(); \ + PushItem(s); \ sp->op = OReadItems; \ sp->dest = _dest; \ sp->arg = _n; \ } \ } while(0) -static void intern_rec(value *dest) +static void intern_alloc_storage(struct caml_intern_state* s, mlsize_t whsize, + mlsize_t num_objects) +{ + mlsize_t wosize; + value v; + + if (whsize == 0) { + CAMLassert (s->intern_obj_table == NULL); + return; + } + wosize = Wosize_whsize(whsize); + + if (wosize <= Max_young_wosize && wosize != 0) { + v = caml_alloc_small (wosize, String_tag); + s->intern_dest = (header_t *) Hp_val(v); + } else { + CAMLassert (s->intern_dest == NULL); + } + s->obj_counter = 0; + if (num_objects > 0) { + s->intern_obj_table = + (value *) caml_stat_alloc_noexc(num_objects * sizeof(value)); + if (s->intern_obj_table == NULL) { + intern_cleanup(s); + caml_raise_out_of_memory(); + } + } else { + CAMLassert(s->intern_obj_table == NULL); + } + + return; +} + +static value intern_alloc_obj(struct caml_intern_state* s, caml_domain_state* d, + mlsize_t wosize, tag_t tag) +{ + void* 
p; + + if (s->intern_dest) { + CAMLassert ((value*)s->intern_dest >= d->young_start && + (value*)s->intern_dest < d->young_end); + p = s->intern_dest; + *s->intern_dest = Make_header (wosize, tag, 0); + s->intern_dest += 1 + wosize; + } else { + p = caml_shared_try_alloc(d->shared_heap, wosize, tag, 0 /* not pinned */); + d->allocated_words += Whsize_wosize(wosize); + if (p == NULL) { + intern_cleanup (s); + caml_raise_out_of_memory(); + } + Hd_hp(p) = Make_header (wosize, tag, caml_global_heap_state.MARKED); + } + return Val_hp(p); +} + +static void intern_rec(struct caml_intern_state* s, + value *dest) { unsigned int code; tag_t tag; @@ -329,14 +402,15 @@ static void intern_rec(value *dest) struct custom_operations * ops; char * codeptr; struct intern_item * sp; + caml_domain_state * d = Caml_state; - sp = intern_stack; + sp = s->intern_stack; /* Initially let's try to read the first object from the stream */ - ReadItems(dest, 1); + ReadItems(s, dest, 1); /* The un-marshaler loop, the recursion is unrolled */ - while(sp != intern_stack) { + while(sp != s->intern_stack) { /* Interpret next item on the stack */ dest = sp->dest; @@ -360,7 +434,7 @@ static void intern_rec(value *dest) sp->dest++; if (--(sp->arg) == 0) sp--; /* Read a value and set v to this value */ - code = read8u(); + code = read8u(s); if (code >= PREFIX_SMALL_INT) { if (code >= PREFIX_SMALL_BLOCK) { /* Small block */ @@ -370,25 +444,24 @@ static void intern_rec(value *dest) if (size == 0) { v = Atom(tag); } else { - v = Val_hp(intern_dest); - if (intern_obj_table != NULL) intern_obj_table[obj_counter++] = v; - *intern_dest = Make_header(size, tag, intern_color); - intern_dest += 1 + size; + v = intern_alloc_obj (s, d, size, tag); + if (s->intern_obj_table != NULL) + s->intern_obj_table[s->obj_counter++] = v; /* For objects, we need to freshen the oid */ if (tag == Object_tag) { CAMLassert(size >= 2); /* Request to read rest of the elements of the block */ - ReadItems(&Field(v, 2), size - 2); + 
ReadItems(s, &Field(v, 2), size - 2); /* Request freshing OID */ - PushItem(); + PushItem(s); sp->op = OFreshOID; sp->dest = (value*) v; sp->arg = 1; /* Finally read first two block elements: method table and old OID */ - ReadItems(&Field(v, 0), 2); + ReadItems(s, &Field(v, 0), 2); } else /* If it's not an object then read the contents of the block */ - ReadItems(&Field(v, 0), size); + ReadItems(s, &Field(v, 0), size); } } else { /* Small integer */ @@ -400,110 +473,105 @@ static void intern_rec(value *dest) len = (code & 0x1F); read_string: size = (len + sizeof(value)) / sizeof(value); - v = Val_hp(intern_dest); - if (intern_obj_table != NULL) intern_obj_table[obj_counter++] = v; - *intern_dest = Make_header(size, String_tag, intern_color); - intern_dest += 1 + size; + v = intern_alloc_obj (s, d, size, String_tag); + if (s->intern_obj_table != NULL) + s->intern_obj_table[s->obj_counter++] = v; Field(v, size - 1) = 0; ofs_ind = Bsize_wsize(size) - 1; Byte(v, ofs_ind) = ofs_ind - len; - readblock((char *)String_val(v), len); + readblock(s, (char *)String_val(v), len); } else { switch(code) { case CODE_INT8: - v = Val_long(read8s()); + v = Val_long(read8s(s)); break; case CODE_INT16: - v = Val_long(read16s()); + v = Val_long(read16s(s)); break; case CODE_INT32: - v = Val_long(read32s()); + v = Val_long(read32s(s)); break; case CODE_INT64: #ifdef ARCH_SIXTYFOUR - v = Val_long((intnat) (read64u())); + v = Val_long((intnat) (read64u(s))); break; #else - intern_cleanup(); + intern_cleanup(s); caml_failwith("input_value: integer too large"); break; #endif case CODE_SHARED8: - ofs = read8u(); + ofs = read8u(s); read_shared: CAMLassert (ofs > 0); - CAMLassert (ofs <= obj_counter); - CAMLassert (intern_obj_table != NULL); - v = intern_obj_table[obj_counter - ofs]; + CAMLassert (ofs <= s->obj_counter); + CAMLassert (s->intern_obj_table != NULL); + v = s->intern_obj_table[s->obj_counter - ofs]; break; case CODE_SHARED16: - ofs = read16u(); + ofs = read16u(s); goto read_shared; 
case CODE_SHARED32: - ofs = read32u(); + ofs = read32u(s); goto read_shared; #ifdef ARCH_SIXTYFOUR case CODE_SHARED64: - ofs = read64u(); + ofs = read64u(s); goto read_shared; #endif case CODE_BLOCK32: - header = (header_t) read32u(); + header = (header_t) read32u(s); tag = Tag_hd(header); size = Wosize_hd(header); goto read_block; #ifdef ARCH_SIXTYFOUR case CODE_BLOCK64: - header = (header_t) read64u(); + header = (header_t) read64u(s); tag = Tag_hd(header); size = Wosize_hd(header); goto read_block; #endif case CODE_STRING8: - len = read8u(); + len = read8u(s); goto read_string; case CODE_STRING32: - len = read32u(); + len = read32u(s); goto read_string; #ifdef ARCH_SIXTYFOUR case CODE_STRING64: - len = read64u(); + len = read64u(s); goto read_string; #endif case CODE_DOUBLE_LITTLE: case CODE_DOUBLE_BIG: - v = Val_hp(intern_dest); - if (intern_obj_table != NULL) intern_obj_table[obj_counter++] = v; - *intern_dest = Make_header(Double_wosize, Double_tag, - intern_color); - intern_dest += 1 + Double_wosize; - readfloat((double *) v, code); + v = intern_alloc_obj (s, d, Double_wosize, Double_tag); + if (s->intern_obj_table != NULL) + s->intern_obj_table[s->obj_counter++] = v; + readfloat(s, (double *) v, code); break; case CODE_DOUBLE_ARRAY8_LITTLE: case CODE_DOUBLE_ARRAY8_BIG: - len = read8u(); + len = read8u(s); read_double_array: size = len * Double_wosize; - v = Val_hp(intern_dest); - if (intern_obj_table != NULL) intern_obj_table[obj_counter++] = v; - *intern_dest = Make_header(size, Double_array_tag, - intern_color); - intern_dest += 1 + size; - readfloats((double *) v, len, code); + v = intern_alloc_obj (s, d, size, Double_array_tag); + if (s->intern_obj_table != NULL) + s->intern_obj_table[s->obj_counter++] = v; + readfloats(s, (double *) v, len, code); break; case CODE_DOUBLE_ARRAY32_LITTLE: case CODE_DOUBLE_ARRAY32_BIG: - len = read32u(); + len = read32u(s); goto read_double_array; #ifdef ARCH_SIXTYFOUR case CODE_DOUBLE_ARRAY64_LITTLE: case 
CODE_DOUBLE_ARRAY64_BIG: - len = read64u(); + len = read64u(s); goto read_double_array; #endif case CODE_CODEPOINTER: - ofs = read32u(); - readblock(digest, 16); + ofs = read32u(s); + readblock(s, digest, 16); codeptr = intern_resolve_code_pointer(digest, ofs); if (codeptr != NULL) { v = (value) codeptr; @@ -514,82 +582,83 @@ static void intern_rec(value *dest) /* Use the code pointer from the "placeholder" function */ v = (value) Code_val(*function_placeholder); } else { - intern_cleanup(); + intern_cleanup(s); intern_bad_code_pointer(digest); } } break; case CODE_INFIXPOINTER: - ofs = read32u(); + ofs = read32u(s); /* Read a value to *dest, then offset *dest by ofs */ - PushItem(); + PushItem(s); sp->dest = dest; sp->op = OShift; sp->arg = ofs; - ReadItems(dest, 1); + ReadItems(s, dest, 1); continue; /* with next iteration of main loop, skipping *dest = v */ - case CODE_CUSTOM: case CODE_CUSTOM_LEN: case CODE_CUSTOM_FIXED: { - ops = caml_find_custom_operations((char *) intern_src); + uintnat expected_size, temp_size; + ops = caml_find_custom_operations((char *) s->intern_src); if (ops == NULL) { - intern_cleanup(); + intern_cleanup(s); caml_failwith("input_value: unknown custom block identifier"); } if (code == CODE_CUSTOM_FIXED && ops->fixed_length == NULL) { - intern_cleanup(); + intern_cleanup(s); caml_failwith("input_value: expected a fixed-size custom block"); } - while (*intern_src++ != 0) /*nothing*/; /*skip identifier*/ - if (code == CODE_CUSTOM) { - /* deprecated */ - size = ops->deserialize((void *) (intern_dest + 2)); - } else { - uintnat expected_size; + while (*s->intern_src++ != 0) /*nothing*/; /*skip identifier*/ #ifdef ARCH_SIXTYFOUR - if (code == CODE_CUSTOM_FIXED) { - expected_size = ops->fixed_length->bsize_64; - } else { - intern_src += 4; - expected_size = read64u(); - } + if (code == CODE_CUSTOM_FIXED) { + expected_size = ops->fixed_length->bsize_64; + } else { + s->intern_src += 4; + expected_size = read64u(s); + } #else - if (code == 
CODE_CUSTOM_FIXED) { - expected_size = ops->fixed_length->bsize_32; - } else { - expected_size = read32u(); - intern_src += 8; - } -#endif - size = ops->deserialize((void *) (intern_dest + 2)); - if (size != expected_size) { - intern_cleanup(); - caml_failwith( - "input_value: incorrect length of serialized custom block"); - } + if (code == CODE_CUSTOM_FIXED) { + expected_size = ops->fixed_length->bsize_32; + } else { + expected_size = read32u(s); + s->intern_src += 8; } - size = 1 + (size + sizeof(value) - 1) / sizeof(value); - v = Val_hp(intern_dest); - if (intern_obj_table != NULL) intern_obj_table[obj_counter++] = v; - *intern_dest = Make_header(size, Custom_tag, - intern_color); +#endif + temp_size = 1 + (expected_size + sizeof(value) - 1) / sizeof(value); + v = intern_alloc_obj(s, d, temp_size, Custom_tag); Custom_ops_val(v) = ops; - + size = ops->deserialize(Data_custom_val(v)); + if (size != expected_size) { + intern_cleanup(s); + caml_failwith( + "input_value: incorrect length of serialized custom block"); + } + if (s->intern_obj_table != NULL) + s->intern_obj_table[s->obj_counter++] = v; if (ops->finalize != NULL && Is_young(v)) { /* Remember that the block has a finalizer. */ - add_to_custom_table (Caml_state->custom_table, v, 0, 1); + add_to_custom_table (&d->minor_tables->custom, v, 0, 1); } - - intern_dest += 1 + size; break; } default: - intern_cleanup(); + intern_cleanup(s); caml_failwith("input_value: ill-formed message"); } } } /* end of case OReadItems */ + /* The following direct-assignment to [*dest] rather than [caml_modify] is + safe since either it is the case that + + 1. [dest] points within the minor heap of the current domain or + 2. [dest] is a freshly-allocated major heap block, but not yet visible + to the GC, and if [v] is a block, then it is also in the major heap. + So no major to minor heap references are created. 
+ + Moreover, since [*dest] is uninitialised, using `caml_modify` is + incorrect; the deletion barrier will mark the old uninitialised value and + may crash. */ *dest = v; break; default: @@ -597,116 +666,17 @@ static void intern_rec(value *dest) } } /* We are done. Cleanup the stack and leave the function */ - intern_free_stack(); + intern_free_stack(s); } -static void intern_alloc(mlsize_t whsize, mlsize_t num_objects) -{ - mlsize_t wosize; - - if (whsize == 0) { - CAMLassert (intern_extra_block == NULL && intern_block == 0 - && intern_obj_table == NULL); - return; - } - wosize = Wosize_whsize(whsize); - if (wosize > Max_wosize) { - /* Round desired size up to next page */ - asize_t request = - ((Bsize_wsize(whsize) + Page_size - 1) >> Page_log) << Page_log; - intern_extra_block = caml_alloc_for_heap(request); - if (intern_extra_block == NULL) { - intern_cleanup(); - caml_raise_out_of_memory(); - } - intern_color = caml_allocation_color(intern_extra_block); - intern_dest = (header_t *) intern_extra_block; - CAMLassert (intern_block == 0); - } else { - /* this is a specialised version of caml_alloc from alloc.c */ - if (wosize <= Max_young_wosize){ - if (wosize == 0){ - intern_block = Atom (String_tag); - }else{ -#define Setup_for_gc -#define Restore_after_gc - Alloc_small_no_track(intern_block, wosize, String_tag); -#undef Setup_for_gc -#undef Restore_after_gc - } - }else{ - intern_block = caml_alloc_shr_no_track_noexc (wosize, String_tag); - /* do not do the urgent_gc check here because it might darken - intern_block into gray and break the intern_color assertion below */ - if (intern_block == 0) { - intern_cleanup(); - caml_raise_out_of_memory(); - } - } - intern_header = Hd_val(intern_block); - intern_color = Color_hd(intern_header); - CAMLassert (intern_color == Caml_white || intern_color == Caml_black); - intern_dest = (header_t *) Hp_val(intern_block); - CAMLassert (intern_extra_block == NULL); - } - obj_counter = 0; - if (num_objects > 0) { - 
intern_obj_table = - (value *) caml_stat_alloc_noexc(num_objects * sizeof(value)); - if (intern_obj_table == NULL) { - intern_cleanup(); - caml_raise_out_of_memory(); - } - } else - CAMLassert(intern_obj_table == NULL); -} - -static header_t* intern_add_to_heap(mlsize_t whsize) -{ - header_t* res = NULL; - /* Add new heap chunk to heap if needed */ - if (intern_extra_block != NULL) { - /* If heap chunk not filled totally, build free block at end */ - asize_t request = Chunk_size (intern_extra_block); - header_t * end_extra_block = - (header_t *) intern_extra_block + Wsize_bsize(request); - CAMLassert(intern_block == 0); - CAMLassert(intern_dest <= end_extra_block); - if (intern_dest < end_extra_block){ - caml_make_free_blocks ((value *) intern_dest, - end_extra_block - intern_dest, 0, Caml_white); - } - caml_allocated_words += - Wsize_bsize ((char *) intern_dest - intern_extra_block); - if(caml_add_to_heap(intern_extra_block) != 0) { - intern_cleanup(); - caml_raise_out_of_memory(); - } - res = (header_t*)intern_extra_block; - intern_extra_block = NULL; // To prevent intern_cleanup freeing it - } else if(intern_block != 0) { /* [intern_block = 0] when [whsize = 0] */ - res = Hp_val(intern_block); - intern_block = 0; // To prevent intern_cleanup rewriting its header - } - return res; -} - -static value intern_end(value res, mlsize_t whsize) +static value intern_end(struct caml_intern_state* s, value res) { CAMLparam1(res); - header_t *block = intern_add_to_heap(whsize); - header_t *blockend = intern_dest; - /* Free everything */ - intern_cleanup(); + intern_cleanup(s); - /* Memprof tracking has to be done here, because unmarshalling can - still fail until now. 
*/ - if(block != NULL) - caml_memprof_track_interned(block, blockend); - - // Give gc a chance to run, and run memprof callbacks - caml_process_pending_actions(); + /* Give gc a chance to run, and run memprof callbacks */ + res = caml_check_urgent_gc(res); CAMLreturn(res); } @@ -721,32 +691,33 @@ struct marshal_header { uintnat whsize; }; -static void caml_parse_header(char * fun_name, +static void caml_parse_header(struct caml_intern_state* s, + char * fun_name, /*out*/ struct marshal_header * h) { char errmsg[100]; - h->magic = read32u(); + h->magic = read32u(s); switch(h->magic) { case Intext_magic_number_small: h->header_len = 20; - h->data_len = read32u(); - h->num_objects = read32u(); + h->data_len = read32u(s); + h->num_objects = read32u(s); #ifdef ARCH_SIXTYFOUR - read32u(); - h->whsize = read32u(); + read32u(s); + h->whsize = read32u(s); #else - h->whsize = read32u(); - read32u(); + h->whsize = read32u(s); + read32u(s); #endif break; case Intext_magic_number_big: #ifdef ARCH_SIXTYFOUR h->header_len = 32; - read32u(); - h->data_len = read64u(); - h->num_objects = read64u(); - h->whsize = read64u(); + read32u(s); + h->data_len = read64u(s); + h->num_objects = read64u(s); + h->whsize = read64u(s); #else errmsg[sizeof(errmsg) - 1] = 0; snprintf(errmsg, sizeof(errmsg) - 1, @@ -773,6 +744,7 @@ value caml_input_val(struct channel *chan) struct marshal_header h; char * block; value res; + struct caml_intern_state* s = get_intern_state (); if (! 
caml_channel_binary_mode(chan)) caml_failwith("input_value: not a binary channel"); @@ -782,14 +754,14 @@ value caml_input_val(struct channel *chan) caml_raise_end_of_file(); else if (r < 20) caml_failwith("input_value: truncated object"); - intern_src = (unsigned char *) header; - if (read32u() == Intext_magic_number_big) { + s->intern_src = (unsigned char *) header; + if (read32u(s) == Intext_magic_number_big) { /* Finish reading the header */ if (caml_really_getblock(chan, header + 20, 32 - 20) < 32 - 20) caml_failwith("input_value: truncated object"); } - intern_src = (unsigned char *) header; - caml_parse_header("input_value", &h); + s->intern_src = (unsigned char *) header; + caml_parse_header(s, "input_value", &h); /* Read block from channel */ block = caml_stat_alloc(h.data_len); /* During [caml_really_getblock], concurrent [caml_input_val] operations @@ -801,11 +773,11 @@ value caml_input_val(struct channel *chan) caml_failwith("input_value: truncated object"); } /* Initialize global state */ - intern_init(block, block); - intern_alloc(h.whsize, h.num_objects); + intern_init(s, block, block); + intern_alloc_storage(s, h.whsize, h.num_objects); /* Fill it in */ - intern_rec(&res); - return intern_end(res, h.whsize); + intern_rec(s, &res); + return intern_end(s, res); } CAMLprim value caml_input_value(value vchan) @@ -822,23 +794,30 @@ CAMLprim value caml_input_value(value vchan) /* Reading from memory-resident blocks */ +/* XXX KC: Unused primitive. Remove with boostrap. 
*/ +CAMLprim value caml_input_value_to_outside_heap(value vchan) +{ + return caml_input_value(vchan); +} + CAMLexport value caml_input_val_from_bytes(value str, intnat ofs) { CAMLparam1 (str); CAMLlocal1 (obj); struct marshal_header h; + struct caml_intern_state* s = get_intern_state (); /* Initialize global state */ - intern_init(&Byte_u(str, ofs), NULL); - caml_parse_header("input_val_from_string", &h); + intern_init(s, &Byte_u(str, ofs), NULL); + caml_parse_header(s, "input_val_from_string", &h); if (ofs + h.header_len + h.data_len > caml_string_length(str)) caml_failwith("input_val_from_string: bad length"); /* Allocate result */ - intern_alloc(h.whsize, h.num_objects); - intern_src = &Byte_u(str, ofs + h.header_len); /* If a GC occurred */ + intern_alloc_storage(s, h.whsize, h.num_objects); + s->intern_src = &Byte_u(str, ofs + h.header_len); /* If a GC occurred */ /* Fill it in */ - intern_rec(&obj); - CAMLreturn (intern_end(obj, h.whsize)); + intern_rec(s, &obj); + CAMLreturn (intern_end(s, obj)); } CAMLprim value caml_input_value_from_bytes(value str, value ofs) @@ -846,38 +825,41 @@ CAMLprim value caml_input_value_from_bytes(value str, value ofs) return caml_input_val_from_bytes(str, Long_val(ofs)); } -static value input_val_from_block(struct marshal_header * h) +static value input_val_from_block(struct caml_intern_state* s, + struct marshal_header * h) { value obj; /* Allocate result */ - intern_alloc(h->whsize, h->num_objects); + intern_alloc_storage(s, h->whsize, h->num_objects); /* Fill it in */ - intern_rec(&obj); - return (intern_end(obj, h->whsize)); + intern_rec(s, &obj); + return (intern_end(s, obj)); } CAMLexport value caml_input_value_from_malloc(char * data, intnat ofs) { struct marshal_header h; + struct caml_intern_state* s = get_intern_state (); - intern_init(data + ofs, data); + intern_init(s, data + ofs, data); - caml_parse_header("input_value_from_malloc", &h); + caml_parse_header(s, "input_value_from_malloc", &h); - return 
input_val_from_block(&h); + return input_val_from_block(s, &h); } /* [len] is a number of bytes */ -CAMLexport value caml_input_value_from_block(char * data, intnat len) +CAMLexport value caml_input_value_from_block(const char * data, intnat len) { struct marshal_header h; + struct caml_intern_state* s = get_intern_state (); /* Initialize global state */ - intern_init(data, NULL); - caml_parse_header("input_value_from_block", &h); + intern_init(s, data, NULL); + caml_parse_header(s, "input_value_from_block", &h); if (h.header_len + h.data_len > len) caml_failwith("input_val_from_block: bad length"); - return input_val_from_block(&h); + return input_val_from_block(s, &h); } /* [ofs] is a [value] that represents a number of bytes @@ -892,19 +874,20 @@ CAMLprim value caml_marshal_data_size(value buff, value ofs) uint32_t magic; int header_len; uintnat data_len; + struct caml_intern_state *s = get_intern_state (); - intern_src = &Byte_u(buff, Long_val(ofs)); - magic = read32u(); + s->intern_src = &Byte_u(buff, Long_val(ofs)); + magic = read32u(s); switch(magic) { case Intext_magic_number_small: header_len = 20; - data_len = read32u(); + data_len = read32u(s); break; case Intext_magic_number_big: #ifdef ARCH_SIXTYFOUR header_len = 32; - read32u(); - data_len = read64u(); + read32u(s); + data_len = read64u(s); #else caml_failwith("Marshal.data_size: " "object too large to be read back on a 32-bit platform"); @@ -946,32 +929,38 @@ static void intern_bad_code_pointer(unsigned char digest[16]) CAMLexport int caml_deserialize_uint_1(void) { - return read8u(); + struct caml_intern_state* s = get_intern_state (); + return read8u(s); } CAMLexport int caml_deserialize_sint_1(void) { - return read8s(); + struct caml_intern_state* s = get_intern_state (); + return read8s(s); } CAMLexport int caml_deserialize_uint_2(void) { - return read16u(); + struct caml_intern_state* s = get_intern_state (); + return read16u(s); } CAMLexport int caml_deserialize_sint_2(void) { - return 
read16s(); + struct caml_intern_state* s = get_intern_state (); + return read16s(s); } CAMLexport uint32_t caml_deserialize_uint_4(void) { - return read32u(); + struct caml_intern_state* s = get_intern_state (); + return read32u(s); } CAMLexport int32_t caml_deserialize_sint_4(void) { - return read32s(); + struct caml_intern_state* s = get_intern_state (); + return read32s(s); } CAMLexport uint64_t caml_deserialize_uint_8(void) @@ -1004,69 +993,75 @@ CAMLexport double caml_deserialize_float_8(void) CAMLexport void caml_deserialize_block_1(void * data, intnat len) { - memcpy(data, intern_src, len); - intern_src += len; + struct caml_intern_state* s = get_intern_state (); + memcpy(data, s->intern_src, len); + s->intern_src += len; } CAMLexport void caml_deserialize_block_2(void * data, intnat len) { + struct caml_intern_state* s = get_intern_state (); #ifndef ARCH_BIG_ENDIAN - unsigned char * p, * q; - for (p = intern_src, q = data; len > 0; len--, p += 2, q += 2) + const unsigned char * p, * q; + for (p = s->intern_src, q = data; len > 0; len--, p += 2, q += 2) Reverse_16(q, p); - intern_src = p; + s->intern_src = p; #else - memcpy(data, intern_src, len * 2); - intern_src += len * 2; + memcpy(data, s->intern_src, len * 2); + s->intern_src += len * 2; #endif } CAMLexport void caml_deserialize_block_4(void * data, intnat len) { + struct caml_intern_state* s = get_intern_state (); #ifndef ARCH_BIG_ENDIAN - unsigned char * p, * q; - for (p = intern_src, q = data; len > 0; len--, p += 4, q += 4) + const unsigned char * p, * q; + for (p = s->intern_src, q = data; len > 0; len--, p += 4, q += 4) Reverse_32(q, p); - intern_src = p; + s->intern_src = p; #else - memcpy(data, intern_src, len * 4); - intern_src += len * 4; + memcpy(data, s->intern_src, len * 4); + s->intern_src += len * 4; #endif } CAMLexport void caml_deserialize_block_8(void * data, intnat len) { + struct caml_intern_state* s = get_intern_state (); #ifndef ARCH_BIG_ENDIAN - unsigned char * p, * q; - for (p = 
intern_src, q = data; len > 0; len--, p += 8, q += 8) + const unsigned char * p, * q; + for (p = s->intern_src, q = data; len > 0; len--, p += 8, q += 8) Reverse_64(q, p); - intern_src = p; + s->intern_src = p; #else - memcpy(data, intern_src, len * 8); - intern_src += len * 8; + memcpy(data, s->intern_src, len * 8); + s->intern_src += len * 8; #endif } CAMLexport void caml_deserialize_block_float_8(void * data, intnat len) { + struct caml_intern_state* s = get_intern_state (); #if ARCH_FLOAT_ENDIANNESS == 0x01234567 - memcpy(data, intern_src, len * 8); - intern_src += len * 8; + memcpy(data, s->intern_src, len * 8); + s->intern_src += len * 8; #elif ARCH_FLOAT_ENDIANNESS == 0x76543210 - unsigned char * p, * q; - for (p = intern_src, q = data; len > 0; len--, p += 8, q += 8) + const unsigned char * p, * q; + for (p = s->intern_src, q = data; len > 0; len--, p += 8, q += 8) Reverse_64(q, p); - intern_src = p; + s->intern_src = p; #else - unsigned char * p, * q; - for (p = intern_src, q = data; len > 0; len--, p += 8, q += 8) + const unsigned char * p, * q; + for (p = s->intern_src, q = data; len > 0; len--, p += 8, q += 8) Permute_64(q, ARCH_FLOAT_ENDIANNESS, p, 0x01234567); - intern_src = p; + s->intern_src = p; #endif } CAMLexport void caml_deserialize_error(char * msg) { - intern_cleanup(); + struct caml_intern_state* s = get_intern_state (); + intern_cleanup(s); caml_failwith(msg); } diff --git a/runtime/interp.c b/runtime/interp.c index a59811c87d86..40089087f165 100644 --- a/runtime/interp.c +++ b/runtime/interp.c @@ -20,6 +20,7 @@ #include "caml/alloc.h" #include "caml/backtrace.h" #include "caml/callback.h" +#include "caml/codefrag.h" #include "caml/debugger.h" #include "caml/fail.h" #include "caml/fix_code.h" @@ -32,7 +33,10 @@ #include "caml/mlvalues.h" #include "caml/prims.h" #include "caml/signals.h" -#include "caml/stacks.h" +#include "caml/fiber.h" +#include "caml/domain.h" +#include "caml/globroots.h" +#include "caml/startup.h" #include 
"caml/startup_aux.h" /* Registers for the abstract machine: @@ -40,7 +44,7 @@ sp the stack pointer (grows downward) accu the accumulator env heap-allocated environment - Caml_state->trapsp pointer to the current trap frame + Caml_state->trap_sp_off offset to the current trap frame extra_args number of extra arguments provided by the caller sp is a local copy of the global variable Caml_state->extern_sp. */ @@ -72,17 +76,22 @@ sp is a local copy of the global variable Caml_state->extern_sp. */ #define Alloc_small_origin CAML_FROM_CAML #define Setup_for_gc \ { sp -= 3; sp[0] = accu; sp[1] = env; sp[2] = (value)pc; \ - Caml_state->extern_sp = sp; } + domain_state->current_stack->sp = sp; } #define Restore_after_gc \ - { sp = Caml_state->extern_sp; accu = sp[0]; env = sp[1]; sp += 3; } + { sp = domain_state->current_stack->sp; accu = sp[0]; env = sp[1]; sp += 3; } +#define Enter_gc \ + { Setup_for_gc; \ + caml_process_pending_actions(); \ + Restore_after_gc; } /* We store [pc+1] in the stack so that, in case of an exception, the first backtrace slot points to the event following the C call instruction. */ #define Setup_for_c_call \ - { sp -= 2; sp[0] = env; sp[1] = (value)(pc + 1); Caml_state->extern_sp = sp; } + { sp -= 2; sp[0] = env; sp[1] = (value)(pc + 1); \ + domain_state->current_stack->sp = sp; } #define Restore_after_c_call \ - { sp = Caml_state->extern_sp; env = *sp; sp += 2; } + { sp = domain_state->current_stack->sp; env = *sp; sp += 2; } /* For VM threads purposes, an event frame must look like accu + a C_CALL frame + a RETURN 1 frame. @@ -93,12 +102,12 @@ sp is a local copy of the global variable Caml_state->extern_sp. 
*/ sp[0] = accu; /* accu */ \ sp[1] = Val_unit; /* C_CALL frame: dummy environment */ \ sp[2] = Val_unit; /* RETURN frame: dummy local 0 */ \ - sp[3] = (value) pc; /* RETURN frame: saved return address */ \ + sp[3] = (value) pc; /* RETURN frame: saved return address */ \ sp[4] = env; /* RETURN frame: saved environment */ \ sp[5] = Val_long(extra_args); /* RETURN frame: saved extra args */ \ - Caml_state->extern_sp = sp; } + domain_state->current_stack->sp = sp; } #define Restore_after_event \ - { sp = Caml_state->extern_sp; accu = sp[0]; \ + { sp = domain_state->current_stack->sp; accu = sp[0]; \ pc = (code_t) sp[3]; env = sp[4]; extra_args = Long_val(sp[5]); \ sp += 6; } @@ -108,9 +117,9 @@ sp is a local copy of the global variable Caml_state->extern_sp. */ { sp -= 4; \ sp[0] = accu; sp[1] = (value)(pc - 1); \ sp[2] = env; sp[3] = Val_long(extra_args); \ - Caml_state->extern_sp = sp; } + domain_state->current_stack->sp = sp; } #define Restore_after_debugger \ - { CAMLassert(sp == Caml_state->extern_sp); \ + { CAMLassert(sp == domain_state->current_stack->sp); \ CAMLassert(sp[0] == accu); \ CAMLassert(sp[2] == env); \ sp += 4; } @@ -125,7 +134,7 @@ sp is a local copy of the global variable Caml_state->extern_sp. */ #endif #define Check_trap_barrier \ - if (Caml_state->trapsp >= Caml_state->trap_barrier) \ + if (domain_state->trap_sp_off >= domain_state->trap_barrier_off) \ caml_debugger(TRAP_BARRIER, Val_unit) /* Register optimization. @@ -206,9 +215,11 @@ sp is a local copy of the global variable Caml_state->extern_sp. 
*/ #endif #ifdef DEBUG -static intnat caml_bcodcount; +static __thread intnat caml_bcodcount; #endif +static value raise_unhandled; + /* The interpreter itself */ value caml_interprete(code_t prog, asize_t prog_size) @@ -231,12 +242,15 @@ value caml_interprete(code_t prog, asize_t prog_size) #endif value env; intnat extra_args; - struct longjmp_buffer * initial_external_raise; - intnat initial_sp_offset; - /* volatile ensures that initial_local_roots - will keep correct value across longjmp */ - struct caml__roots_block * volatile initial_local_roots; + struct caml_exception_context * initial_external_raise; + int initial_stack_words; + intnat initial_trap_sp_off; + volatile value raise_exn_bucket = Val_unit; struct longjmp_buffer raise_buf; + value resume_fn, resume_arg; + caml_domain_state* domain_state = Caml_state; + struct caml_exception_context exception_ctx = + { &raise_buf, domain_state->local_roots, &raise_exn_bucket}; #ifndef THREADED_CODE opcode_t curr_instr; #endif @@ -248,39 +262,57 @@ value caml_interprete(code_t prog, asize_t prog_size) #endif if (prog == NULL) { /* Interpreter is initializing */ + static opcode_t raise_unhandled_code[] = { ACC, 0, RAISE }; + value raise_unhandled_closure; + + caml_register_code_fragment( + (char *) raise_unhandled_code, + (char *) raise_unhandled_code + sizeof(raise_unhandled_code), + DIGEST_IGNORE, NULL); #ifdef THREADED_CODE caml_instr_table = (char **) jumptable; caml_instr_base = Jumptbl_base; + caml_thread_code(raise_unhandled_code, + sizeof(raise_unhandled_code)); #endif + raise_unhandled_closure = caml_alloc_small (2, Closure_tag); + Code_val(raise_unhandled_closure) = (code_t)raise_unhandled_code; + Closinfo_val(raise_unhandled_closure) = Make_closinfo(0, 2); + raise_unhandled = raise_unhandled_closure; + caml_register_generational_global_root(&raise_unhandled); + caml_global_data = Val_unit; + caml_register_generational_global_root(&caml_global_data); + caml_init_callbacks(); return Val_unit; } #if 
defined(THREADED_CODE) && defined(ARCH_SIXTYFOUR) && !defined(ARCH_CODE32) jumptbl_base = Jumptbl_base; #endif - initial_local_roots = Caml_state->local_roots; - initial_sp_offset = - (char *) Caml_state->stack_high - (char *) Caml_state->extern_sp; - initial_external_raise = Caml_state->external_raise; - caml_callback_depth++; + initial_trap_sp_off = domain_state->trap_sp_off; + initial_stack_words = + Stack_high(domain_state->current_stack) - domain_state->current_stack->sp; + initial_external_raise = domain_state->external_raise; if (sigsetjmp(raise_buf.buf, 0)) { - Caml_state->local_roots = initial_local_roots; - sp = Caml_state->extern_sp; - accu = Caml_state->exn_bucket; + /* no non-volatile local variables read here */ + sp = domain_state->current_stack->sp; + accu = raise_exn_bucket; Check_trap_barrier; - if (Caml_state->backtrace_active) { - /* pc has already been pushed on the stack when calling the C + if (domain_state->backtrace_active) { + /* pc has already been pushed on the stack when calling the C function that raised the exception. No need to push it again here. 
*/ caml_stash_backtrace(accu, sp, 0); } goto raise_notrace; } - Caml_state->external_raise = &raise_buf; + domain_state->external_raise = &exception_ctx; + + domain_state->trap_sp_off = 1; - sp = Caml_state->extern_sp; + sp = domain_state->current_stack->sp; pc = prog; extra_args = 0; env = Atom(0); @@ -290,8 +322,8 @@ value caml_interprete(code_t prog, asize_t prog_size) #ifdef DEBUG next_instr: if (caml_icount-- == 0) caml_stop_here (); - CAMLassert(sp >= Caml_state->stack_low); - CAMLassert(sp <= Caml_state->stack_high); + CAMLassert(Stack_base(domain_state->current_stack) <= sp); + CAMLassert(sp <= Stack_high(domain_state->current_stack)); #endif goto *(void *)(jumptbl_base + *pc++); /* Jump to the first instruction */ #else @@ -299,18 +331,19 @@ value caml_interprete(code_t prog, asize_t prog_size) #ifdef DEBUG caml_bcodcount++; if (caml_icount-- == 0) caml_stop_here (); - if (caml_trace_level>1) printf("\n##%" ARCH_INTNAT_PRINTF_FORMAT "d\n", - caml_bcodcount); - if (caml_trace_level>0) caml_disasm_instr(pc); - if (caml_trace_level>1) { + if (caml_params->trace_level>1) + printf("\n##%" ARCH_INTNAT_PRINTF_FORMAT "d\n", caml_bcodcount); + if (caml_params->trace_level>0) caml_disasm_instr(pc); + if (caml_params->trace_level>1) { printf("env="); caml_trace_value_file(env,prog,prog_size,stdout); putchar('\n'); caml_trace_accu_sp_file(accu,sp,prog,prog_size,stdout); fflush(stdout); }; - CAMLassert(sp >= Caml_state->stack_low); - CAMLassert(sp <= Caml_state->stack_high); + CAMLassert(Stack_base(domain_state->current_stack) <= sp); + CAMLassert(sp <= Stack_high(domain_state->current_stack)); + #endif curr_instr = *pc++; @@ -509,14 +542,41 @@ value caml_interprete(code_t prog, asize_t prog_size) extra_args--; pc = Code_val(accu); env = accu; + Next; } else { + goto do_return; + } + } + + do_return: + if (sp == Stack_high(domain_state->current_stack)) { + /* return to parent stack */ + struct stack_info* old_stack = domain_state->current_stack; + struct stack_info* 
parent_stack = Stack_parent(old_stack); + value hval = Stack_handle_value(old_stack); + CAMLassert(parent_stack != NULL); + + domain_state->current_stack = parent_stack; + sp = domain_state->current_stack->sp; + caml_free_stack(old_stack); + + domain_state->trap_sp_off = Long_val(sp[0]); + extra_args = Long_val(sp[1]); + sp++; + sp[0] = accu; + + accu = hval; + pc = Code_val(accu); + env = accu; + goto check_stacks; + } else { + /* return to callee, no stack switching */ pc = (code_t)(sp[0]); env = sp[1]; extra_args = Long_val(sp[2]); sp += 3; } Next; - } Instruct(RESTART): { int num_args = Wosize_val(env) - 3; @@ -532,21 +592,18 @@ value caml_interprete(code_t prog, asize_t prog_size) int required = *pc++; if (extra_args >= required) { extra_args -= required; + Next; } else { mlsize_t num_args, i; num_args = 1 + extra_args; /* arg1 + extra args */ - Alloc_small(accu, num_args + 3, Closure_tag); + Alloc_small(accu, num_args + 3, Closure_tag, Enter_gc); Field(accu, 2) = env; for (i = 0; i < num_args; i++) Field(accu, i + 3) = sp[i]; Code_val(accu) = pc - 3; /* Point to the preceding RESTART instr. 
*/ Closinfo_val(accu) = Make_closinfo(0, 2); sp += num_args; - pc = (code_t)(sp[0]); - env = sp[1]; - extra_args = Long_val(sp[2]); - sp += 3; + goto do_return; } - Next; } Instruct(CLOSURE): { @@ -555,7 +612,7 @@ value caml_interprete(code_t prog, asize_t prog_size) if (nvars > 0) *--sp = accu; if (nvars <= Max_young_wosize - 2) { /* nvars + 2 <= Max_young_wosize, can allocate in minor heap */ - Alloc_small(accu, 2 + nvars, Closure_tag); + Alloc_small(accu, 2 + nvars, Closure_tag, Enter_gc); for (i = 0; i < nvars; i++) Field(accu, i + 2) = sp[i]; } else { /* PR#6385: must allocate in major heap */ @@ -582,7 +639,7 @@ value caml_interprete(code_t prog, asize_t prog_size) value * p; if (nvars > 0) *--sp = accu; if (blksize <= Max_young_wosize) { - Alloc_small(accu, blksize, Closure_tag); + Alloc_small(accu, blksize, Closure_tag, Enter_gc); p = &Field(accu, envofs); for (i = 0; i < nvars; i++, p++) *p = sp[i]; } else { @@ -601,7 +658,7 @@ value caml_interprete(code_t prog, asize_t prog_size) *p++ = (value) (pc + pc[0]); *p++ = Make_closinfo(0, envofs); for (i = 1; i < nfuncs; i++) { - *p++ = Make_header(i * 3, Infix_tag, Caml_white); /* color irrelevant */ + *p++ = Make_header(i * 3, Infix_tag, 0); /* color irrelevant */ *--sp = (value) p; *p++ = (value) (pc + pc[i]); envofs -= 3; @@ -651,11 +708,12 @@ value caml_interprete(code_t prog, asize_t prog_size) Next; } - Instruct(SETGLOBAL): + Instruct(SETGLOBAL): { caml_modify(&Field(caml_global_data, *pc), accu); accu = Val_unit; pc++; Next; + } /* Allocation of blocks */ @@ -677,7 +735,7 @@ value caml_interprete(code_t prog, asize_t prog_size) mlsize_t i; value block; if (wosize <= Max_young_wosize) { - Alloc_small(block, wosize, tag); + Alloc_small(block, wosize, tag, Enter_gc); Field(block, 0) = accu; for (i = 1; i < wosize; i++) Field(block, i) = *sp++; } else { @@ -691,7 +749,7 @@ value caml_interprete(code_t prog, asize_t prog_size) Instruct(MAKEBLOCK1): { tag_t tag = *pc++; value block; - Alloc_small(block, 1, 
tag); + Alloc_small(block, 1, tag, Enter_gc); Field(block, 0) = accu; accu = block; Next; @@ -699,7 +757,7 @@ value caml_interprete(code_t prog, asize_t prog_size) Instruct(MAKEBLOCK2): { tag_t tag = *pc++; value block; - Alloc_small(block, 2, tag); + Alloc_small(block, 2, tag, Enter_gc); Field(block, 0) = accu; Field(block, 1) = sp[0]; sp += 1; @@ -709,7 +767,7 @@ value caml_interprete(code_t prog, asize_t prog_size) Instruct(MAKEBLOCK3): { tag_t tag = *pc++; value block; - Alloc_small(block, 3, tag); + Alloc_small(block, 3, tag, Enter_gc); Field(block, 0) = accu; Field(block, 1) = sp[0]; Field(block, 2) = sp[1]; @@ -722,7 +780,7 @@ value caml_interprete(code_t prog, asize_t prog_size) mlsize_t i; value block; if (size <= Max_young_wosize / Double_wosize) { - Alloc_small(block, size * Double_wosize, Double_array_tag); + Alloc_small(block, size * Double_wosize, Double_array_tag, Enter_gc); } else { block = caml_alloc_shr(size * Double_wosize, Double_array_tag); } @@ -749,7 +807,7 @@ value caml_interprete(code_t prog, asize_t prog_size) accu = Field(accu, *pc); pc++; Next; Instruct(GETFLOATFIELD): { double d = Double_flat_field(accu, *pc++); - Alloc_small(accu, Double_wosize, Double_tag); + Alloc_small(accu, Double_wosize, Double_tag, Enter_gc); Store_double_val(accu, d); Next; } @@ -848,22 +906,23 @@ value caml_interprete(code_t prog, asize_t prog_size) Instruct(PUSHTRAP): sp -= 4; Trap_pc(sp) = pc + *pc; - Trap_link_offset(sp) = Val_long(Caml_state->trapsp - sp); + Trap_link(sp) = Val_long(domain_state->trap_sp_off); sp[2] = env; sp[3] = Val_long(extra_args); - Caml_state->trapsp = sp; + domain_state->trap_sp_off = sp - Stack_high(domain_state->current_stack); pc++; Next; Instruct(POPTRAP): - if (caml_something_to_do) { + if (Caml_check_gc_interrupt(domain_state) || + caml_check_for_pending_signals()) { /* We must check here so that if a signal is pending and its handler triggers an exception, the exception is trapped by the current try...with, not the enclosing 
one. */ pc--; /* restart the POPTRAP after processing the signal */ - goto process_actions; + goto process_signal; } - Caml_state->trapsp = sp + Long_val(Trap_link_offset(sp)); + domain_state->trap_sp_off = Long_val(Trap_link(sp)); sp += 4; Next; @@ -873,52 +932,78 @@ value caml_interprete(code_t prog, asize_t prog_size) Instruct(RERAISE): Check_trap_barrier; - if (Caml_state->backtrace_active) { + if (domain_state->backtrace_active) { *--sp = (value)(pc - 1); caml_stash_backtrace(accu, sp, 1); } goto raise_notrace; Instruct(RAISE): + raise_exception: Check_trap_barrier; - if (Caml_state->backtrace_active) { + if (domain_state->backtrace_active) { *--sp = (value)(pc - 1); caml_stash_backtrace(accu, sp, 0); } raise_notrace: - if ((char *) Caml_state->trapsp - >= (char *) Caml_state->stack_high - initial_sp_offset) { - Caml_state->external_raise = initial_external_raise; - Caml_state->extern_sp = (value *) ((char *) Caml_state->stack_high - - initial_sp_offset); - caml_callback_depth--; - return Make_exception_result(accu); + if (domain_state->trap_sp_off > 0) { + if (Stack_parent(domain_state->current_stack) == NULL) { + domain_state->external_raise = initial_external_raise; + domain_state->trap_sp_off = initial_trap_sp_off; + domain_state->current_stack->sp = + Stack_high(domain_state->current_stack) - initial_stack_words ; + return Make_exception_result(accu); + } else { + struct stack_info* old_stack = domain_state->current_stack; + struct stack_info* parent_stack = Stack_parent(old_stack); + value hexn = Stack_handle_exception(old_stack); + old_stack->sp = sp; + domain_state->current_stack = parent_stack; + sp = domain_state->current_stack->sp; + caml_free_stack(old_stack); + + domain_state->trap_sp_off = Long_val(sp[0]); + extra_args = Long_val(sp[1]); + sp++; + sp[0] = accu; + + accu = hexn; + pc = Code_val(accu); + env = accu; + goto check_stacks; + } + } else { + sp = + Stack_high(domain_state->current_stack) + domain_state->trap_sp_off; + pc = Trap_pc(sp); 
+ domain_state->trap_sp_off = Long_val(Trap_link(sp)); + env = sp[2]; + extra_args = Long_val(sp[3]); + sp += 4; } - sp = Caml_state->trapsp; - pc = Trap_pc(sp); - Caml_state->trapsp = sp + Long_val(Trap_link_offset(sp)); - env = sp[2]; - extra_args = Long_val(sp[3]); - sp += 4; Next; /* Stack checks */ check_stacks: - if (sp < Caml_state->stack_threshold) { - Caml_state->extern_sp = sp; - caml_realloc_stack(Stack_threshold / sizeof(value)); - sp = Caml_state->extern_sp; + if (sp < Stack_threshold_ptr(domain_state->current_stack)) { + domain_state->current_stack->sp = sp; + if (!caml_try_realloc_stack(Stack_threshold / sizeof(value))) { + Setup_for_c_call; caml_raise_stack_overflow(); + } + sp = domain_state->current_stack->sp; } /* Fall through CHECK_SIGNALS */ /* Signal handling */ Instruct(CHECK_SIGNALS): /* accu not preserved */ - if (caml_something_to_do) goto process_actions; + if (Caml_check_gc_interrupt(domain_state) || + caml_check_for_pending_signals()) + goto process_signal; Next; - process_actions: + process_signal: Setup_for_event; caml_process_pending_actions(); Restore_after_event; @@ -1146,9 +1231,9 @@ value caml_interprete(code_t prog, asize_t prog_size) /* Debugging and machine control */ Instruct(STOP): - Caml_state->external_raise = initial_external_raise; - Caml_state->extern_sp = sp; - caml_callback_depth--; + domain_state->external_raise = initial_external_raise; + domain_state->trap_sp_off = initial_trap_sp_off; + domain_state->current_stack->sp = sp; return accu; Instruct(EVENT): @@ -1165,6 +1250,126 @@ value caml_interprete(code_t prog, asize_t prog_size) Restore_after_debugger; Restart_curr_instr; +/* Context switching */ + + Instruct(RESUME): + resume_fn = sp[0]; + resume_arg = sp[1]; + sp -= 3; + sp[0] = Val_long(domain_state->trap_sp_off); + sp[1] = Val_long(0); + sp[2] = (value)pc; + sp[3] = env; + sp[4] = Val_long(extra_args); + goto do_resume; + +do_resume: { + struct stack_info* stk = Ptr_val(accu); + if (stk == NULL) { + accu = 
Field(caml_global_data, CONTINUATION_ALREADY_TAKEN_EXN); + goto raise_exception; + } + while (Stack_parent(stk) != NULL) stk = Stack_parent(stk); + Stack_parent(stk) = Caml_state->current_stack; + + domain_state->current_stack->sp = sp; + domain_state->current_stack = Ptr_val(accu); + sp = domain_state->current_stack->sp; + + domain_state->trap_sp_off = Long_val(sp[0]); + sp[0] = resume_arg; + accu = resume_fn; + pc = Code_val(accu); + env = accu; + extra_args = 0; + goto check_stacks; + } + + Instruct(RESUMETERM): + resume_fn = sp[0]; + resume_arg = sp[1]; + sp = sp + *pc - 2; + sp[0] = Val_long(domain_state->trap_sp_off); + sp[1] = Val_long(extra_args); + goto do_resume; + + + Instruct(PERFORM): { + value cont; + struct stack_info* old_stack = domain_state->current_stack; + struct stack_info* parent_stack = Stack_parent(old_stack); + + if (parent_stack == NULL) { + accu = Field(caml_global_data, UNHANDLED_EXN); + goto raise_exception; + } + + Alloc_small(cont, 1, Cont_tag, Enter_gc); + + sp -= 4; + sp[0] = Val_long(domain_state->trap_sp_off); + sp[1] = (value)pc; + sp[2] = env; + sp[3] = Val_long(extra_args); + + old_stack->sp = sp; + domain_state->current_stack = parent_stack; + sp = parent_stack->sp; + Stack_parent(old_stack) = NULL; + Field(cont, 0) = Val_ptr(old_stack); + + domain_state->trap_sp_off = Long_val(sp[0]); + extra_args = Long_val(sp[1]); + sp--; + sp[0] = accu; + sp[1] = cont; + sp[2] = Val_ptr(old_stack); + accu = Stack_handle_effect(old_stack); + pc = Code_val(accu); + env = accu; + extra_args += 2; + goto check_stacks; + } + + Instruct(REPERFORMTERM): { + value eff = accu; + value cont = sp[0]; + struct stack_info* cont_tail = Ptr_val(sp[1]); + struct stack_info* self = domain_state->current_stack; + struct stack_info* parent = Stack_parent(domain_state->current_stack); + + sp = sp + *pc - 2; + sp[0] = Val_long(domain_state->trap_sp_off); + sp[1] = Val_long(extra_args); + + if (parent == NULL) { + accu = caml_continuation_use(cont); + resume_fn 
= raise_unhandled; + resume_arg = Field(caml_global_data, UNHANDLED_EXN); + goto do_resume; + } + + self->sp = sp; + domain_state->current_stack = parent; + sp = parent->sp; + + CAMLassert(Stack_parent(cont_tail) == NULL); + Stack_parent(self) = NULL; + Stack_parent(cont_tail) = self; + + domain_state->trap_sp_off = Long_val(sp[0]); + extra_args = Long_val(sp[1]); + sp--; + sp[0] = eff; + sp[1] = cont; + sp[2] = Val_ptr(self); + accu = Stack_handle_effect(self); + pc = Code_val(accu); + env = accu; + extra_args += 2; + goto check_stacks; + } + #ifndef THREADED_CODE default: #if _MSC_VER >= 1200 diff --git a/runtime/ints.c b/runtime/ints.c index c9584e4aba2c..eb539b94f446 100644 --- a/runtime/ints.c +++ b/runtime/ints.c @@ -214,7 +214,7 @@ static uintnat int32_deserialize(void * dst) static const struct custom_fixed_length int32_length = { 4, 4 }; -CAMLexport struct custom_operations caml_int32_ops = { +CAMLexport const struct custom_operations caml_int32_ops = { "_i", custom_finalize_default, int32_cmp, @@ -413,7 +413,7 @@ static uintnat int64_deserialize(void * dst) static const struct custom_fixed_length int64_length = { 8, 8 }; -CAMLexport struct custom_operations caml_int64_ops = { +CAMLexport const struct custom_operations caml_int64_ops = { "_j", custom_finalize_default, int64_cmp, @@ -710,7 +710,7 @@ static uintnat nativeint_deserialize(void * dst) } static const struct custom_fixed_length nativeint_length = { 4, 8 }; -CAMLexport struct custom_operations caml_nativeint_ops = { +CAMLexport const struct custom_operations caml_nativeint_ops = { "_n", custom_finalize_default, nativeint_cmp, diff --git a/runtime/io.c b/runtime/io.c index e40968ac7e35..bd8107557dcd 100644 --- a/runtime/io.c +++ b/runtime/io.c @@ -31,6 +31,7 @@ #include #endif #include "caml/alloc.h" +#include "caml/camlatomic.h" #include "caml/custom.h" #include "caml/fail.h" #include "caml/io.h" @@ -38,6 +39,7 @@ #include "caml/misc.h" #include "caml/mlvalues.h" #include "caml/osdeps.h" +#include 
"caml/platform.h" #include "caml/signals.h" #include "caml/sys.h" @@ -52,13 +54,54 @@ #define lseek _lseeki64 #endif +/* List of opened channels and its mutex */ +CAMLexport caml_plat_mutex + caml_all_opened_channels_mutex = CAML_PLAT_MUTEX_INITIALIZER; /* Hooks for locking channels */ -CAMLexport void (*caml_channel_mutex_free) (struct channel *) = NULL; -CAMLexport void (*caml_channel_mutex_lock) (struct channel *) = NULL; -CAMLexport void (*caml_channel_mutex_unlock) (struct channel *) = NULL; -CAMLexport void (*caml_channel_mutex_unlock_exn) (void) = NULL; +static __thread struct channel* last_channel_locked = NULL; + +static void channel_mutex_free_default(struct channel *chan) +{ + caml_plat_mutex_free(&chan->mutex); +} + +static void channel_mutex_lock_default(struct channel *chan) +{ + if( caml_plat_try_lock(&chan->mutex) ) { + last_channel_locked = chan; + return; + } + + /* If unsuccessful, block on mutex */ + caml_enter_blocking_section(); + caml_plat_lock(&chan->mutex); + last_channel_locked = chan; + caml_leave_blocking_section(); +} + +static void channel_mutex_unlock_default(struct channel *chan) +{ + caml_plat_unlock(&chan->mutex); + last_channel_locked = NULL; +} + +static void channel_mutex_unlock_exn_default(void) +{ + struct channel * chan = last_channel_locked; + if (chan != NULL && caml_channel_mutex_unlock != NULL) + caml_channel_mutex_unlock(chan); +} + +CAMLexport void (*caml_channel_mutex_free) (struct channel *) + = channel_mutex_free_default; +CAMLexport void (*caml_channel_mutex_lock) (struct channel *) + = channel_mutex_lock_default; +CAMLexport void (*caml_channel_mutex_unlock) (struct channel *) + = channel_mutex_unlock_default; +CAMLexport void (*caml_channel_mutex_unlock_exn) (void) + = channel_mutex_unlock_exn_default; /* List of opened channels */ CAMLexport struct channel * caml_all_opened_channels = NULL; @@ -91,6 +134,34 @@ Caml_inline int descriptor_is_in_binary_mode(int fd) #endif } +static void link_channel (struct channel* 
channel) +{ + caml_plat_lock (&caml_all_opened_channels_mutex); + channel->next = caml_all_opened_channels; + CAMLassert(channel->prev == NULL); + if (caml_all_opened_channels != NULL) + caml_all_opened_channels->prev = channel; + caml_all_opened_channels = channel; + caml_plat_unlock (&caml_all_opened_channels_mutex); +} + +static void unlink_channel(struct channel *channel) +{ + caml_plat_lock (&caml_all_opened_channels_mutex); + if (channel->prev == NULL) { + CAMLassert (channel == caml_all_opened_channels); + caml_all_opened_channels = caml_all_opened_channels->next; + if (caml_all_opened_channels != NULL) + caml_all_opened_channels->prev = NULL; + } else { + channel->prev->next = channel->next; + if (channel->next != NULL) channel->next->prev = channel->prev; + } + channel->next = NULL; + channel->prev = NULL; + caml_plat_unlock (&caml_all_opened_channels_mutex); +} + CAMLexport struct channel * caml_open_descriptor_in(int fd) { struct channel * channel; @@ -102,15 +173,14 @@ CAMLexport struct channel * caml_open_descriptor_in(int fd) caml_leave_blocking_section(); channel->curr = channel->max = channel->buff; channel->end = channel->buff + IO_BUFFER_SIZE; - channel->mutex = NULL; - channel->refcount = 0; - channel->flags = descriptor_is_in_binary_mode(fd) ? 0 : CHANNEL_TEXT_MODE; - channel->next = caml_all_opened_channels; + caml_plat_mutex_init(&channel->mutex); + atomic_store_rel(&channel->refcount, 0); channel->prev = NULL; channel->name = NULL; - if (caml_all_opened_channels != NULL) - caml_all_opened_channels->prev = channel; - caml_all_opened_channels = channel; + channel->flags = descriptor_is_in_binary_mode(fd) ? 
0 : CHANNEL_TEXT_MODE; + + link_channel (channel); + return channel; } @@ -123,24 +193,17 @@ CAMLexport struct channel * caml_open_descriptor_out(int fd) return channel; } -static void unlink_channel(struct channel *channel) -{ - if (channel->prev == NULL) { - CAMLassert (channel == caml_all_opened_channels); - caml_all_opened_channels = caml_all_opened_channels->next; - if (caml_all_opened_channels != NULL) - caml_all_opened_channels->prev = NULL; - } else { - channel->prev->next = channel->next; - if (channel->next != NULL) channel->next->prev = channel->prev; - } -} - CAMLexport void caml_close_channel(struct channel *channel) { close(channel->fd); - if (caml_channel_mutex_free != NULL) (*caml_channel_mutex_free)(channel); + unlink_channel(channel); + if (atomic_load_acq(&channel->refcount) > 0) { + /* [caml_ml_out_channels_list] may have a reference to this channel. */ + link_channel (channel); + return; + } + if (caml_channel_mutex_free != NULL) (*caml_channel_mutex_free)(channel); caml_stat_free(channel->name); caml_stat_free(channel); } @@ -446,7 +509,12 @@ void caml_finalize_channel(value vchan) { struct channel * chan = Channel(vchan); if ((chan->flags & CHANNEL_FLAG_MANAGED_BY_GC) == 0) return; - if (--chan->refcount > 0) return; + unlink_channel(chan); + if (atomic_fetch_add (&chan->refcount, -1) > 1) { + /* [caml_ml_out_channels_list] may have a reference to this channel. */ + link_channel (chan); + return; + } if (caml_channel_mutex_free != NULL) (*caml_channel_mutex_free)(chan); if (chan->fd != -1 && chan->name && caml_runtime_warnings_active()) @@ -463,7 +531,6 @@ void caml_finalize_channel(value vchan) channel now, but (i) flushing can raise exceptions, and (ii) it is potentially a blocking operation. Both are forbidden in a finalization function. 
- Refs: http://caml.inria.fr/mantis/view.php?id=6902 https://github.com/ocaml/ocaml/pull/210 @@ -472,8 +539,9 @@ void caml_finalize_channel(value vchan) fprintf(stderr, "[ocaml] (moreover, it has unflushed data)\n" ); - } else { - unlink_channel(chan); + } + else + { caml_stat_free(chan->name); caml_stat_free(chan); } @@ -505,7 +573,7 @@ static struct custom_operations channel_operations = { CAMLexport value caml_alloc_channel(struct channel *chan) { value res; - chan->refcount++; + atomic_fetch_add (&chan->refcount, 1); res = caml_alloc_custom_mem(&channel_operations, sizeof(struct channel *), sizeof(struct channel)); Channel(res) = chan; @@ -537,27 +605,56 @@ CAMLprim value caml_ml_set_channel_name(value vchannel, value vname) return Val_unit; } +#define Pair_tag 0 + +struct channel_list { + struct channel* channel; + struct channel_list* next; +}; + CAMLprim value caml_ml_out_channels_list (value unit) { CAMLparam0 (); CAMLlocal3 (res, tail, chan); struct channel * channel; + struct channel_list *channel_list = NULL, *cl_tmp; + mlsize_t i, num_channels = 0; - res = Val_emptylist; + caml_plat_lock (&caml_all_opened_channels_mutex); for (channel = caml_all_opened_channels; channel != NULL; - channel = channel->next) - /* Include only output channels opened from OCaml and not closed yet. - Testing channel->fd >= 0 looks unnecessary, as + channel = channel->next) { + /* Testing channel->fd >= 0 looks unnecessary, as caml_ml_close_channel changes max when setting fd to -1. 
*/ - if (channel->max == NULL - && channel->flags & CHANNEL_FLAG_MANAGED_BY_GC) { - chan = caml_alloc_channel (channel); - tail = res; - res = caml_alloc_small (2, Tag_cons); - Field (res, 0) = chan; - Field (res, 1) = tail; + if (channel->max == NULL) { + /* refcount is incremented here to keep the channel alive */ + atomic_fetch_add (&channel->refcount, 1); + num_channels++; + cl_tmp = caml_stat_alloc_noexc (sizeof(struct channel_list)); + if (cl_tmp == NULL) + caml_fatal_error ("caml_ml_out_channels_list: out of memory"); + cl_tmp->channel = channel; + cl_tmp->next = channel_list; + channel_list = cl_tmp; } + } + caml_plat_unlock (&caml_all_opened_channels_mutex); + + res = Val_emptylist; + cl_tmp = NULL; + for (i = 0; i < num_channels; i++) { + chan = caml_alloc_channel (channel_list->channel); + /* refcount would have been incremented by caml_alloc_channel. Decrement + * our earlier increment */ + atomic_fetch_add (&channel_list->channel->refcount, -1); + tail = res; + res = caml_alloc_small (2, Pair_tag); + Field (res, 0) = chan; + Field (res, 1) = tail; + cl_tmp = channel_list; + channel_list = channel_list->next; + caml_stat_free (cl_tmp); + } CAMLreturn (res); } diff --git a/runtime/lexing.c b/runtime/lexing.c index b10499042528..f553ce8f1f30 100644 --- a/runtime/lexing.c +++ b/runtime/lexing.c @@ -19,7 +19,7 @@ #include "caml/fail.h" #include "caml/mlvalues.h" -#include "caml/stacks.h" +#include "caml/fiber.h" struct lexer_buffer { value refill_buff; diff --git a/runtime/lf_skiplist.c b/runtime/lf_skiplist.c new file mode 100644 index 000000000000..7d7166efa24e --- /dev/null +++ b/runtime/lf_skiplist.c @@ -0,0 +1,516 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Sadiq Jaffer, OCaml Labs Consultancy Ltd */ +/* Xavier Leroy, projet Cambium, INRIA Paris */ +/* */ +/* Copyright 2021 OCaml Labs Consultancy Ltd */ +/* Copyright 2020 Institut National de Recherche en Informatique et */ +/* en 
Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#define CAML_INTERNALS + +/* A concurrent dictionary data structure implemented as skip lists. This + implementation is based on the sequential skip list implemetation in + the runtime by Xavier Leroy but extends it to be safe under concurrent + modification. It has the property that insert/remove are lock-free and + contains is further wait-free. It is literally a textbook implementation + and can be found in Herlihy et al's "The Art of Multiprocessor + Programming" 2nd Edition, section 14.4. It only differs from the + textbook implementation to fix errors in the pseudocode in [contains], + to add a [search_level] optimisation to the data structure, replacing Java's + volatile with atomics and to keep a list of removed nodes in order to do a + deferred free. + + You _must_ call [caml_lf_skiplist_free_garbage] "every so often" in order + for the data structure to free removed nodes. This must be done by only + one thread at a time when no other thread can be accessing the structure. + + It is roughly half the speed of the sequential skip list so only use + where concurrent access is necessary. For use-cases where there is + only infrequent contention and where acquiring a lock during find is + allowed then a sequential skip list guarded by a mutex may perform + better. + + A sequential implementation of skip lists is in file skiplist.c and is based + on the paper by William Pugh, "Skip lists: a probabilistic alternative to + balanced binary trees", Comm. ACM 33(6), 1990). 
*/ + +#include "caml/lf_skiplist.h" +#include "caml/config.h" +#include "caml/memory.h" +#include "caml/misc.h" +#include + +/* Size of struct lf_skipcell, in bytes, without the forward array */ +#if (__STDC_VERSION__ >= 199901L) +#define SIZEOF_LF_SKIPCELL sizeof(struct lf_skipcell) +#else +#define SIZEOF_LF_SKIPCELL \ + (sizeof(struct lf_skipcell) - sizeof(struct lf_skipcell *)) +#endif + +/* Generate a random level for a new node: 0 with probability 3/4, + 1 with probability 3/16, 2 with probability 3/64, etc. + We use a simple linear congruential PRNG (see Knuth vol 2) instead + of random(), because we need exactly 32 bits of pseudo-random data + (i.e. 2 * (NUM_LEVELS - 1)). Moreover, the congruential PRNG + is faster and guaranteed to be deterministic (to reproduce bugs). */ + +static uint32_t _Atomic random_seed = 0; + +static int random_level(void) { + uint32_t r; + int level = 0; + + /* Linear congruence with modulus = 2^32, multiplier = 69069 + (Knuth vol 2 p. 106, line 15 of table 1), additive = 25173. */ + + while( 1 ) { + uint32_t curr = + atomic_load_explicit(&random_seed, memory_order_relaxed); + + r = curr * 69069 + 25173; + + if( atomic_compare_exchange_strong(&random_seed, &curr, r) ) { + break; + } + } + /* Knuth (vol 2 p. 13) shows that the least significant bits are + "less random" than the most significant bits with a modulus of 2^m, + so consume most significant bits first */ + while ((r & 0xC0000000U) == 0xC0000000U) { + level++; + r = r << 2; + } + CAMLassert(level < NUM_LEVELS); + return level; +} + +/* Initialize a skip list */ + +void caml_lf_skiplist_init(struct lf_skiplist *sk) { + atomic_store_explicit(&sk->search_level, 0, memory_order_relaxed); + + /* This concurrent skip list has two sentinel nodes, the first [head] is + less than any possible key in the data structure and the second [tail] is + greater than any key. 
*/ + sk->head = caml_stat_alloc(SIZEOF_LF_SKIPCELL + + NUM_LEVELS * sizeof(struct lf_skipcell *)); + sk->head->key = 0; + sk->head->data = 0; + sk->head->garbage_next = NULL; + sk->head->top_level = NUM_LEVELS - 1; + + sk->tail = caml_stat_alloc(SIZEOF_LF_SKIPCELL + + NUM_LEVELS * sizeof(struct lf_skipcell *)); + sk->tail->key = UINTNAT_MAX; + sk->tail->data = 0; + sk->tail->garbage_next = NULL; + sk->tail->top_level = NUM_LEVELS - 1; + + /* We do this so that later in find when we try to CAS a cell's + `garbage_next` in `skiplist_find` we can disambiguate between a cell with + an uninitialised `garbage_next` (that we may take ownership of) and one + that is already in the garbage list. If we instead used NULL then this + would not be possible. */ + sk->garbage_head = sk->head; + + /* each level in the skip list starts of being just head pointing to tail */ + for (int j = 0; j < NUM_LEVELS; j++) { + atomic_store_explicit + (&sk->head->forward[j], sk->tail, memory_order_release); + + atomic_store_explicit + (&sk->tail->forward[j], NULL, memory_order_release); + } +} + +/* [skiplist_find] is used for insert/remove and attempts to find a node in the + skiplist. It populates the [preds] and [succs] arrays at each level. These + arrays are later used for inserting or removing the node (by either CASing + the new link or marking it). Additional [skiplist_find] will snip out nodes + that have been marked for deletion if it finds during the search. The + function is lock-free. */ +static int skiplist_find(struct lf_skiplist *sk, uintnat key, + struct lf_skipcell **preds, + struct lf_skipcell **succs) { + /* [pred] is a node that precedes the node we are looking for */ + struct lf_skipcell *pred = NULL; + /* [curr] is the current node we are examining. 
If it is less + than our key */ + struct lf_skipcell *curr = NULL; + /* [succ] is the next node to examine at our current level */ + struct lf_skipcell *succ = NULL; + +retry: + while (1) { + /* start at the the head of the skiplist. This node has a key less than any + key we could be searching for */ + pred = sk->head; + /* + The algorithm itself is fairly simple, we start at the highest level (i.e + the top, the level with the fewest nodes) of the skiplist and keep walking + nodes along the level until [curr] is greater than the key we are looking + for. When that happens we drop down to the next level and start the whole + thing again from [pred]. If we could visualise searching for an element near + the end of the list it would look something like a staircase with wide steps + at the beginning and shorter ones as we descend down. + + The only complexity is that we need to make sure that we don't examine any + nodes that are 'marked', that is the lowest bit of their forward pointer to + the next node is set to 1. When we encounter one of those it means [curr] + has been deleted and we need to snip it out. We might need to retry this + several times if there's concention with other threads and we fail the + compare-and-swap. + */ + for (int level = NUM_LEVELS - 1; level >= 0; level--) { + curr = LF_SK_UNMARK( + atomic_load_explicit(&pred->forward[level], memory_order_acquire)); + while (1) { + int is_marked; + + LF_SK_EXTRACT(curr->forward[level], is_marked, succ); + while (is_marked) { + struct lf_skipcell *null_cell = NULL; + int snip = atomic_compare_exchange_strong(&pred->forward[level], + &curr, succ); + if (!snip) { + goto retry; + } + + /* + If we are at this point then we have successfully snipped out a + removed node. What we need to try to do now is add the node to the + skiplist's garbage list. + + There's a bit of complexity here. 
While we use a compare-and-swap to + snip the node out of skiplist, it's possible that it can be removed by + two threads at the same time from different levels of the skiplist. To + avoid this we reuse the garbage_next field and make sure only one + thread can ever add the node to the garbage list. This is what the + compare-and-swap below ensures by swapping garbage_next to a value + of 1. We don't need to worry about anyone accidentally following this + bogus pointer, it is only deferenced in the cleanup function and this + is called when no thread can be concurrently modifying the skiplist. + */ + if (atomic_compare_exchange_strong(&curr->garbage_next, &null_cell, + (struct lf_skipcell *)1)) { + /* Despite now having exclusivity of the current node's + garbage_next, having won the CAS, we might be racing another + thread to add a different node to the skiplist's garbage_head. + This is why we need to a retry loop and yet another CAS. */ + while (1) { + struct lf_skipcell *_Atomic current_garbage_head = + atomic_load_explicit(&sk->garbage_head, memory_order_acquire); + + atomic_store_explicit(&curr->garbage_next, current_garbage_head, + memory_order_release); + + if (atomic_compare_exchange_strong( + &sk->garbage_head, + (struct lf_skipcell **)¤t_garbage_head, curr)) { + break; + } + } + } + + /* Now try to load the current node again. We need to check it too + hasn't been marked. If it has we repeat the process */ + curr = LF_SK_UNMARK(atomic_load_explicit(&pred->forward[level], + memory_order_acquire)); + LF_SK_EXTRACT(curr->forward[level], is_marked, succ); + } + + if (curr->key < key) { + pred = curr; + curr = succ; + } else { + break; + } + } + + preds[level] = pred; + succs[level] = curr; + } + + return curr->key == key; + } +} + +/* [lf_skiplist_lookup] will return a skipcell or node that is greater than or + equal to the key provided, along with the node that directly proceeds it. 
It + is a much simplified version of [lf_skiplist_find] as it simply ignores + marked nodes and does not snip them out. As a consequence, it is wait-free. + + This implementation differs from of the 'contains' in "The Art of + Multiprocessor Programming" to fix the erronous swap of pred and curr inside + the while(marked) loop. It also uses [search_level] to avoid scanning the + sentinels unnecessarily. + */ +static struct lf_skipcell *lf_skiplist_lookup(struct lf_skiplist *sk, + uintnat key, + struct lf_skipcell **pred_out) { + struct lf_skipcell *pred = sk->head; + struct lf_skipcell *curr = NULL; + struct lf_skipcell *succ = NULL; + int marked = 0; + + /* We start our search from the search_level of the skiplist - this is in + contrast to the find function above where we start at NUM_LEVELS. This is + intentional. Since every search has to eventually end up at the bottom-most + level (even those of an empty list), if we accidentally start at the wrong + level then our only cost is an increased number of nodes searched. If we + did the same thing in the find function above then we'd also fail to snip + out marked nodes. If we did that for long enough we might leak memory. 
*/ + for (int level = + atomic_load_explicit(&sk->search_level, memory_order_relaxed); + level >= 0; level--) { + curr = LF_SK_UNMARK( + atomic_load_explicit(&pred->forward[level], memory_order_acquire)); + while (1) { + LF_SK_EXTRACT(curr->forward[level], marked, succ); + while (marked) { + curr = succ; + LF_SK_EXTRACT(curr->forward[level], marked, succ); + } + if (curr->key < key) { + pred = curr; + curr = succ; + } else { + break; + } + } + } + + if (pred_out) { + *pred_out = pred; + } + + return curr; +} + +/* Search a skip list */ + +int caml_lf_skiplist_find(struct lf_skiplist *sk, uintnat key, uintnat *data) { + struct lf_skipcell *found_cell = lf_skiplist_lookup(sk, key, NULL); + + if (found_cell->key == key) { + if (data) { + *data = found_cell->data; + } + return 1; + } else { + return 0; + } +} + +int caml_lf_skiplist_find_below(struct lf_skiplist *sk, uintnat k, uintnat *key, + uintnat *data) { + struct lf_skipcell *pred; + struct lf_skipcell *curr = lf_skiplist_lookup(sk, k, &pred); + struct lf_skipcell *found_cell; + + if (curr->key == k) { + found_cell = curr; + } else if (pred != sk->head) { + found_cell = pred; + } else { + return 0; + } + + if (data) { + *data = found_cell->data; + } + if (key) { + *key = found_cell->key; + } + return 1; +} + +/* Insertion in a skip list */ + +int caml_lf_skiplist_insert(struct lf_skiplist *sk, uintnat key, uintnat data) { + struct lf_skipcell *preds[NUM_LEVELS]; + struct lf_skipcell *succs[NUM_LEVELS]; + + CAMLassert(key > 0 && key < UINTNAT_MAX); + + while (1) { + /* We first try to find a node with [key] in the skip list. If it exists + then we don't need to add it. The [skiplist_find] method will also + populate the predecessors and successors arrays, which gives us the nodes + between which we could add the new node. 
*/ + int found = skiplist_find(sk, key, preds, succs); + struct lf_skipcell *pred; + struct lf_skipcell *succ; + + if (found) { + /* node already exists, we don't need to do anything */ + return 0; + } else { + /* node does not exist. We need to generate a random top_level and + * construct a new node. The new node's forward array (which contains the + * next node in increasing order of key, at each level) starts at + * [top_level] and goes to 0. Each entry will point to the successors in + the [succ] array for that level. */ + int top_level = random_level(); + /* attentive readers will have noticed that we assume memory is aligned to + * atleast even addresses. This is certainly the case on glibc amd64 and + * Visual C++ on Windows though I can find no guarantees for other + platorms. */ + struct lf_skipcell *new_cell = caml_stat_alloc( + SIZEOF_LF_SKIPCELL + (top_level + 1) * sizeof(struct lf_skipcell *)); + new_cell->top_level = top_level; + new_cell->key = key; + new_cell->data = data; + atomic_store_explicit(&new_cell->garbage_next,NULL,memory_order_relaxed); + + for (int level = 0; level <= top_level; level++) { + atomic_store_explicit(&new_cell->forward[level], succs[level], + memory_order_release); + } + + /* Now we need to actually slip the node in. We start at the bottom-most + level (i.e the linked list of all nodes). This is because all searches + must end up at this level and so as long as the node is present, it + will be found - regardless of whether it has been added to the level + above. Consider the staircasing referred to in [skiplist_find] earlier, + the final step in finding a node is following the reference from it's + predecessor at the bottom level. */ + pred = preds[0]; + succ = succs[0]; + + /* We could be racing another insertion here and if we are then restart + the whole insertion process. We can't just retry the CAS because the + new node's predecessor and successors could have changed. 
There's also + a possibility that the predecessor's forward pointer could have been + marked and we would fail the CAS for that reason too. In that case the + [skiplist_find] earlier on will take care of snipping the node before + we get back to this point. */ + if (!atomic_compare_exchange_strong(&pred->forward[0], &succ, new_cell)) { + caml_stat_free(new_cell); + continue; + } + + for (int level = 1; level <= top_level; level++) { + while (1) { + pred = preds[level]; + succ = succs[level]; + + /* If we were able to insert the node then we proceed to the next + level */ + if (atomic_compare_exchange_strong(&pred->forward[level], &succ, + new_cell)) { + break; + } + + /* On the other hand if we failed it might be because the pointer was + marked or because a new node was added between pred and succ nodes + at level. In both cases we can fix things by calling + [skiplist_find] and repopulating preds and succs */ + skiplist_find(sk, key, preds, succs); + } + } + + /* If we put the new node at a higher level than the current + [search_level] then to speed up searches we need to bump it. We don't + care too much if this fails though. */ + if (top_level > + atomic_load_explicit(&sk->search_level, memory_order_relaxed)) { + atomic_store_explicit(&sk->search_level, top_level, + memory_order_relaxed); + } + + return 1; + } + } +} + +/* Deletion in a skip list */ + +int caml_lf_skiplist_remove(struct lf_skiplist *sk, uintnat key) { + struct lf_skipcell *preds[NUM_LEVELS]; + struct lf_skipcell *succs[NUM_LEVELS]; + struct lf_skipcell *succ; + int marked; + + while (1) { + /* As with insert. If the node doesn't exist, we don't need to do anything. + While we're checking for it we populate the predecessor nodes and + successor nodes at each level. */ + int found = skiplist_find(sk, key, preds, succs); + + if (!found) { + return 0; + } else { + /* When the node exists in the skiplist, then succs[0] must point to it. + Note: this isn't the case for levels > 0. 
*/ + struct lf_skipcell *to_remove = succs[0]; + for (int level = to_remove->top_level; level >= 1; level--) { + /* We mark each of the forward pointers at every level the node is + present at. We may be raced by another thread deleting the same node + and by threads inserting new nodes directly after the node we are + removing, so we need to retry the CAS in a loop to deal with the + latter. */ + LF_SK_EXTRACT(to_remove->forward[level], marked, succ); + + while (!marked) { + atomic_compare_exchange_strong(&to_remove->forward[level], &succ, + LF_SK_MARKED(succ)); + LF_SK_EXTRACT(to_remove->forward[level], marked, succ); + } + } + + /* The bottom layer is what ultimately determines whether the node is + present in the skiplist or not. We try to remove it and if we succeed + then indicate so to the caller. If not then another thread raced us an + won. */ + LF_SK_EXTRACT(to_remove->forward[0], marked, succ); + while (1) { + int mark_success = atomic_compare_exchange_strong( + &to_remove->forward[0], &succ, LF_SK_MARKED(succ)); + + LF_SK_EXTRACT(to_remove->forward[0], marked, succ); + + if (mark_success) { + skiplist_find(sk, key, preds, succs); /* This will fix up the mark */ + return 1; + } else if (marked) { + return 0; /* Someone else beat us to removing it */ + } + + /* If we end up here then we lost to a thread inserting a node directly + after the node we were removing. That's why we move on one sucessor. + */ + } + } + } +} + +/* Collects freed nodes from the skiplist. 
This must be called periodically from + a single thread at a time when there can be no concurrent access to this + skiplist */ + +void caml_lf_skiplist_free_garbage(struct lf_skiplist *sk) { + struct lf_skipcell *curr = + atomic_load_explicit(&sk->garbage_head, memory_order_acquire); + + struct lf_skipcell *head = sk->head; + while (curr != head) { + struct lf_skipcell *next = atomic_load_explicit + (&curr->garbage_next, memory_order_relaxed); + // acquire not useful, if executed in STW + caml_stat_free(curr); + curr = next; + } + + atomic_store_explicit(&sk->garbage_head, sk->head, memory_order_release); +} diff --git a/runtime/major_gc.c b/runtime/major_gc.c index 089d2d856e85..9a9f9f982407 100644 --- a/runtime/major_gc.c +++ b/runtime/major_gc.c @@ -15,34 +15,37 @@ #define CAML_INTERNALS -#include +#include +#include #include -#include "caml/compact.h" -#include "caml/custom.h" #include "caml/config.h" +#include "caml/codefrag.h" +#include "caml/domain.h" +#include "caml/eventlog.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/finalise.h" -#include "caml/freelist.h" -#include "caml/gc.h" -#include "caml/gc_ctrl.h" -#include "caml/major_gc.h" -#include "caml/misc.h" +#include "caml/globroots.h" +#include "caml/memory.h" #include "caml/mlvalues.h" +#include "caml/platform.h" #include "caml/roots.h" -#include "caml/skiplist.h" #include "caml/signals.h" +#include "caml/shared_heap.h" +#include "caml/startup_aux.h" #include "caml/weak.h" -#include "caml/memprof.h" -#include "caml/eventlog.h" +#include "caml/skiplist.h" -#ifdef _MSC_VER -Caml_inline double fmin(double a, double b) { - return (a < b) ? 
a : b; -} -#endif +/* NB the MARK_STACK_INIT_SIZE must be larger than the number of objects + that can be in a pool, see POOL_WSIZE */ +#define MARK_STACK_INIT_SIZE (1 << 12) +#define INITIAL_POOLS_TO_RESCAN_LEN 4 -#define MARK_STACK_INIT_SIZE 2048 +typedef struct { + value block; + uintnat offset; +} mark_entry; struct mark_stack { mark_entry* stack; @@ -50,204 +53,557 @@ struct mark_stack { uintnat size; }; -uintnat caml_percent_free; -static uintnat marked_words, heap_wsz_at_cycle_start; -uintnat caml_major_heap_increment; -CAMLexport char *caml_heap_start; -char *caml_gc_sweep_hp; -int caml_gc_phase; /* always Phase_mark, Pase_clean, - Phase_sweep, or Phase_idle */ -uintnat caml_allocated_words; -uintnat caml_dependent_size, caml_dependent_allocated; -double caml_extra_heap_resources; -uintnat caml_fl_wsz_at_phase_change = 0; - -extern value caml_fl_merge; /* Defined in freelist.c. */ - -/* redarken_first_chunk is the first chunk needing redarkening, if NULL no - redarkening required */ -static char *redarken_first_chunk = NULL; - -static char *sweep_chunk; -static double p_backlog = 0.0; /* backlog for the gc speedup parameter */ - -int caml_gc_subphase; /* Subphase_{mark_roots,mark_main,mark_final} */ - -/** - Ephemerons: - During mark phase the list caml_ephe_list_head of ephemerons - is iterated by different pointers that follow the invariants: - caml_ephe_list_head ->* ephes_checked_if_pure ->* ephes_to_check ->* null - | | | - (1) (2) (3) - - At the start of mark phase, (1) and (2) are empty. - - In mark phase: - - An ephemeron in (1) have a data alive (grey/black if in the heap) - or none (nb: new ephemerons are added in this part by weak.c) - - An ephemeron in (2): - - is in any state if caml_ephe_list_pure is false - - otherwise has at least a white key or is white or its data is - black or none. 
- The third case can happen only using a set_* of weak.c - - the ephemerons in (3) are in an unknown state and must be checked - - At the end of mark phase, (3) is empty and caml_ephe_list_pure is true. - The ephemeron in (1) and (2) will be cleaned (white keys and data - replaced by none or the ephemeron is removed from the list if it is white) - in clean phase. - - In clean phase: - caml_ephe_list_head ->* ephes_to_check ->* null - | | - (1) (3) - - In clean phase, (2) is not used, ephes_to_check is initialized at - caml_ephe_list_head: - - the ephemerons in (1) are clean. - - the ephemerons in (3) should be cleaned or removed if white. - - */ -int caml_ephe_list_pure; -/** The ephemerons is pure if since the start of its iteration - no value have been darkened. */ -static value *ephes_checked_if_pure; -static value *ephes_to_check; - -int caml_major_window = 1; -double caml_major_ring[Max_major_window] = { 0. }; -int caml_major_ring_index = 0; -double caml_major_work_credit = 0.0; -double caml_gc_clock = 0.0; +uintnat caml_percent_free = Percent_free_def; + +/* This variable is only written with the world stopped, + so it need not be atomic */ +uintnat caml_major_cycles_completed = 0; + +static atomic_uintnat num_domains_to_sweep; +static atomic_uintnat num_domains_to_mark; +static atomic_uintnat num_domains_to_ephe_sweep; +static atomic_uintnat num_domains_to_final_update_first; +static atomic_uintnat num_domains_to_final_update_last; + +static atomic_uintnat terminated_domains_allocated_words; + +enum global_roots_status{ + WORK_UNSTARTED, + WORK_STARTED +}; +static atomic_uintnat domain_global_roots_started; + +gc_phase_t caml_gc_phase; + +extern value caml_ephe_none; /* See weak.c */ + +static struct ephe_cycle_info_t { + atomic_uintnat num_domains_todo; + /* Number of domains that need to scan their ephemerons in the current major + * GC cycle. This field is decremented when ephe_info->todo list at a domain + * becomes empty. 
*/ + atomic_uintnat ephe_cycle; + /* Ephemeron cycle count */ + atomic_uintnat num_domains_done; + /* Number of domains that have marked their ephemerons in the current + * ephemeron cycle. */ +} ephe_cycle_info; + /* In the first major cycle, there is no ephemeron marking to be done. */ + +/* ephe_cycle_info is always updated with the critical section protected by + * ephe_lock or in the global barrier. However, the fields may be read without + * the lock. */ +static caml_plat_mutex ephe_lock = CAML_PLAT_MUTEX_INITIALIZER; + +static void ephe_next_cycle () +{ + caml_plat_lock(&ephe_lock); + atomic_fetch_add(&ephe_cycle_info.ephe_cycle, +1); + CAMLassert(atomic_load_acq(&ephe_cycle_info.num_domains_done) <= + atomic_load_acq(&ephe_cycle_info.num_domains_todo)); + atomic_store(&ephe_cycle_info.num_domains_done, 0); + caml_plat_unlock(&ephe_lock); +} + +void caml_ephe_todo_list_emptied (void) +{ + caml_plat_lock(&ephe_lock); + + /* Force next ephemeron marking cycle in order to avoid reasoning about + * whether the domain has already incremented + * [ephe_cycle_info.num_domains_done] counter. */ + atomic_store(&ephe_cycle_info.num_domains_done, 0); + atomic_fetch_add(&ephe_cycle_info.ephe_cycle, +1); + atomic_fetch_add(&ephe_cycle_info.num_domains_todo, -1); + atomic_fetch_add_verify_ge0(&num_domains_to_ephe_sweep, -1); + CAMLassert(atomic_load_acq(&ephe_cycle_info.num_domains_done) <= + atomic_load_acq(&ephe_cycle_info.num_domains_todo)); + + caml_plat_unlock(&ephe_lock); +} + +/* Record that ephemeron marking was done for the given ephemeron cycle. 
*/ +static void record_ephe_marking_done (uintnat ephe_cycle) +{ + CAMLassert (ephe_cycle <= atomic_load_acq(&ephe_cycle_info.ephe_cycle)); + CAMLassert (Caml_state->marking_done); + + if (ephe_cycle < atomic_load_acq(&ephe_cycle_info.ephe_cycle)) + return; + + caml_plat_lock(&ephe_lock); + if (ephe_cycle == atomic_load(&ephe_cycle_info.ephe_cycle)) { + Caml_state->ephe_info->cycle = ephe_cycle; + atomic_fetch_add(&ephe_cycle_info.num_domains_done, +1); + CAMLassert(atomic_load_acq(&ephe_cycle_info.num_domains_done) <= + atomic_load_acq(&ephe_cycle_info.num_domains_todo)); + } + caml_plat_unlock(&ephe_lock); +} + +/* These are biased data structures left over from terminating domains. */ +static struct { + value ephe_list_live; + struct caml_final_info *final_info; +} orph_structs = {0, 0}; + +static caml_plat_mutex orphaned_lock = CAML_PLAT_MUTEX_INITIALIZER; + +void caml_add_orphaned_finalisers (struct caml_final_info* f) +{ + CAMLassert (caml_gc_phase == Phase_sweep_and_mark_main); + CAMLassert (!f->updated_first); + CAMLassert (!f->updated_last); + + caml_plat_lock(&orphaned_lock); + f->next = orph_structs.final_info; + orph_structs.final_info = f; + caml_plat_unlock(&orphaned_lock); + +} + +/* Called by terminating domain from handover_finalisers */ +void caml_final_domain_terminate (caml_domain_state *domain_state) +{ + struct caml_final_info *f = domain_state->final_info; + if(!f->updated_first) { + atomic_fetch_add_verify_ge0(&num_domains_to_final_update_first, -1); + f->updated_first = 1; + } + if(!f->updated_last) { + atomic_fetch_add_verify_ge0(&num_domains_to_final_update_last, -1); + f->updated_last = 1; + } +} + +static int no_orphaned_work (void) +{ + return + orph_structs.ephe_list_live == 0 && + orph_structs.final_info == NULL; +} + +void caml_orphan_allocated_words (void) +{ + atomic_fetch_add(&terminated_domains_allocated_words, + Caml_state->allocated_words); +} + +Caml_inline value ephe_list_tail(value e) +{ + value last = 0; + while (e != 0) { 
+ CAMLassert (Tag_val(e) == Abstract_tag); + last = e; + e = Ephe_link(e); + } + return last; +} #ifdef DEBUG -static unsigned long major_gc_counter = 0; -#endif +static void orph_ephe_list_verify_status (int status) +{ + value v; + + caml_plat_lock(&orphaned_lock); + + v = orph_structs.ephe_list_live; + while (v) { + CAMLassert (Tag_val(v) == Abstract_tag); + CAMLassert (Has_status_hd (Hd_val(v), status)); + v = Ephe_link(v); + } -#ifdef NAKED_POINTERS_CHECKER -int caml_naked_pointers_detected = 0; + caml_plat_unlock(&orphaned_lock); +} #endif -void (*caml_major_gc_hook)(void) = NULL; +#define EPHE_MARK_DEFAULT 0 +#define EPHE_MARK_FORCE_ALIVE 1 -/* This function prunes the mark stack if it's about to overflow. It does so - by building a skiplist of major heap chunks and then iterating through the - mark stack and setting redarken_start/redarken_end on each chunk to indicate - the range that requires redarkening. */ -static void mark_stack_prune (struct mark_stack* stk) +static intnat ephe_mark (intnat budget, uintnat for_cycle, int force_alive); + +void caml_add_to_orphaned_ephe_list(struct caml_ephe_info* ephe_info) { - int entry; - uintnat mark_stack_count = stk->count; - mark_entry* mark_stack = stk->stack; + caml_plat_lock(&orphaned_lock); - char* heap_chunk = caml_heap_start; - struct skiplist chunk_sklist = SKIPLIST_STATIC_INITIALIZER; + /* Force all ephemerons and their data on todo list to be alive */ + if (ephe_info->todo) { + while (ephe_info->todo) { + ephe_mark (100000, 0, EPHE_MARK_FORCE_ALIVE); + } + caml_ephe_todo_list_emptied (); + } + CAMLassert (ephe_info->todo == 0); + + if (ephe_info->live) { + value live_tail = ephe_list_tail(ephe_info->live); + CAMLassert(Ephe_link(live_tail) == 0); + Ephe_link(live_tail) = orph_structs.ephe_list_live; + orph_structs.ephe_list_live = ephe_info->live; + ephe_info->live = 0; + } + + caml_plat_unlock(&orphaned_lock); +} + +void caml_adopt_orphaned_work (void) +{ + caml_domain_state* domain_state = Caml_state; + 
value last; + struct caml_final_info *f, *myf, *temp; + + if (no_orphaned_work() || caml_domain_is_terminating()) + return; + + caml_plat_lock(&orphaned_lock); - do { - caml_skiplist_insert(&chunk_sklist, (uintnat)heap_chunk, - (uintnat)(heap_chunk+Chunk_size(heap_chunk))); - heap_chunk = Chunk_next(heap_chunk); - } while( heap_chunk != NULL ); - - for( entry = 0; entry < mark_stack_count ; entry++ ) { - mark_entry me = mark_stack[entry]; - uintnat chunk_addr = 0, chunk_addr_below = 0; - - if( caml_skiplist_find_below(&chunk_sklist, (uintnat)me.start, - &chunk_addr, &chunk_addr_below) - && (uintnat)me.start < chunk_addr_below ) { - heap_chunk_head* ch = Chunk_head(chunk_addr); - if (ch->redarken_first.start > me.start) - ch->redarken_first = me; - - if (ch->redarken_end < me.end) - ch->redarken_end = me.end; - - if( redarken_first_chunk == NULL - || redarken_first_chunk > (char*)chunk_addr ) { - redarken_first_chunk = (char*)chunk_addr; + if (orph_structs.ephe_list_live) { + last = ephe_list_tail(orph_structs.ephe_list_live); + CAMLassert(Ephe_link(last) == 0); + Ephe_link(last) = domain_state->ephe_info->live; + domain_state->ephe_info->live = orph_structs.ephe_list_live; + orph_structs.ephe_list_live = 0; + } + + f = orph_structs.final_info; + myf = domain_state->final_info; + while (f != NULL) { + CAMLassert (!f->updated_first); + CAMLassert (!f->updated_last); + CAMLassert (!myf->updated_first); + CAMLassert (!myf->updated_last); + CAMLassert (caml_gc_phase == Phase_sweep_and_mark_main); + if (f->todo_head) { + if (myf->todo_tail == NULL) { + CAMLassert(myf->todo_head == NULL); + myf->todo_head = f->todo_head; + myf->todo_tail = f->todo_tail; + } else { + myf->todo_tail->next = f->todo_head; + myf->todo_tail = f->todo_tail; } } + if (f->first.young > 0) { + caml_final_merge_finalisable (&f->first, &myf->first); + } + if (f->last.young > 0) { + caml_final_merge_finalisable (&f->last, &myf->last); + } + temp = f; + f = f->next; + caml_stat_free (temp); } + 
orph_structs.final_info = NULL; + caml_plat_unlock(&orphaned_lock); +} - caml_skiplist_empty(&chunk_sklist); +#define BUFFER_SIZE 64 - caml_gc_message(0x08, "Mark stack overflow.\n"); +struct buf_list_t { + double buffer[BUFFER_SIZE]; + struct buf_list_t *next; +}; - stk->count = 0; +static struct { + intnat heap_words_last_cycle; + intnat not_garbage_words_last_cycle; + int index; + struct buf_list_t *l; + } caml_stat_space_overhead = {0, 0, 0, NULL}; + +double caml_mean_space_overhead (void) +{ + int index = caml_stat_space_overhead.index; + struct buf_list_t *t, *l = caml_stat_space_overhead.l; + /* Use Welford's online algorithm for calculating running variance to remove + * outliers from mean calculation. */ + double mean = 0.0, m2 = 0.0, stddev = 0.0, v; + double delta, delta2; + intnat count = 0; + + while (l) { + while (index > 0) { + v = l->buffer[--index]; + if (count > 5 && (v < mean - 3 * stddev || v > mean + 3 * stddev)) { + continue; + } + count++; + delta = v - mean; + mean = mean + delta / count; + delta2 = v - mean; + m2 = m2 + delta * delta2; + stddev = sqrt (m2 / count); + } + t = l; + l = l->next; + caml_stat_free(t); + index = BUFFER_SIZE; + } + return mean; } +static void update_major_slice_work() { + double p, dp, heap_words; + intnat computed_work; + caml_domain_state *dom_st = Caml_state; + uintnat heap_size, heap_sweep_words, saved_terminated_words; + /* + Free memory at the start of the GC cycle (garbage + free list) (assumed): + FM = heap_words * caml_percent_free + / (100 + caml_percent_free) + + Assuming steady state and enforcing a constant allocation rate, then + FM is divided in 2/3 for garbage and 1/3 for free list. + G = 2 * FM / 3 + G is also the amount of memory that will be used during this cycle + (still assuming steady state). 
+ + Proportion of G consumed since the previous slice: + PH = dom_st->allocated_words / G + = dom_st->allocated_words * 3 * (100 + caml_percent_free) + / (2 * heap_words * caml_percent_free) + Proportion of extra-heap resources consumed since the previous slice: + PE = dom_st->extra_heap_resources + Proportion of total work to do in this slice: + P = max (PH, PE) + Amount of marking work for the GC cycle: + MW = heap_words * 100 / (100 + caml_percent_free) + Amount of sweeping work for the GC cycle: + SW = heap_sweep_words + Amount of total work for the GC cycle: + TW = MW + SW + = heap_words * 100 / (100 + caml_percent_free) + heap_sweep_words + + Amount of time to spend on this slice: + T = P * TT + + Since we must do TW amount of work in TT time, the amount of work done + for this slice is: + S = P * TW + */ + heap_size = caml_heap_size(dom_st->shared_heap); + heap_words = (double)Wsize_bsize(heap_size); + heap_sweep_words = heap_words; + + saved_terminated_words = terminated_domains_allocated_words; + if( saved_terminated_words > 0 ) { + while(!atomic_compare_exchange_strong + (&terminated_domains_allocated_words, &saved_terminated_words, 0)); + } + + if (heap_words > 0) { + p = (double) (saved_terminated_words + + dom_st->allocated_words) * 3.0 * (100 + caml_percent_free) + / heap_words / caml_percent_free / 2.0; + } else { + p = 0.0; + } + + if (dom_st->dependent_size > 0) { + dp = (double) dom_st->dependent_allocated * (100 + caml_percent_free) + / dom_st->dependent_size / caml_percent_free; + }else{ + dp = 0.0; + } + if (p < dp) p = dp; + + if (p < dom_st->extra_heap_resources) p = dom_st->extra_heap_resources; + + computed_work = (intnat) (p * (heap_sweep_words + + (heap_words * 100 / (100 + caml_percent_free)))); + + /* accumulate work */ + dom_st->major_work_computed += computed_work; + dom_st->major_work_todo += computed_work; + + caml_gc_message (0x40, "heap_words = %" + ARCH_INTNAT_PRINTF_FORMAT "u\n", + (uintnat)heap_words); + caml_gc_message (0x40, 
"allocated_words = %" + ARCH_INTNAT_PRINTF_FORMAT "u\n", + dom_st->allocated_words); + caml_gc_message (0x40, "raw work-to-do = %" + ARCH_INTNAT_PRINTF_FORMAT "uu\n", + (uintnat) (p * 1000000)); + caml_gc_message (0x40, "extra_heap_resources = %" + ARCH_INTNAT_PRINTF_FORMAT "uu\n", + (uintnat) (dom_st->extra_heap_resources * 1000000)); + caml_gc_message (0x40, "computed work = %" + ARCH_INTNAT_PRINTF_FORMAT "d words\n", + computed_work); + + caml_gc_log("Updated major work: [%c] " + " %"ARCH_INTNAT_PRINTF_FORMAT "u heap_words, " + " %"ARCH_INTNAT_PRINTF_FORMAT "u allocated, " + " %"ARCH_INTNAT_PRINTF_FORMAT "d computed_work, " + " %"ARCH_INTNAT_PRINTF_FORMAT "d work_computed, " + " %"ARCH_INTNAT_PRINTF_FORMAT "d work_todo, " + " %"ARCH_INTNAT_PRINTF_FORMAT "u gc_clock", + caml_gc_phase_char(caml_gc_phase), + (uintnat)heap_words, dom_st->allocated_words, + computed_work, + dom_st->major_work_computed, + dom_st->major_work_todo, + (intnat)(dom_st->major_gc_clock*1000000)); + + dom_st->stat_major_words += dom_st->allocated_words; + dom_st->allocated_words = 0; + dom_st->dependent_allocated = 0; + dom_st->extra_heap_resources = 0.0; +} + +static intnat get_major_slice_work(intnat howmuch) { + caml_domain_state *dom_st = Caml_state; + intnat computed_work; + + /* calculate how much work to do now */ + if (howmuch == AUTO_TRIGGERED_MAJOR_SLICE || + howmuch == GC_CALCULATE_MAJOR_SLICE) { + computed_work = (dom_st->major_work_todo > 0) + ? 
dom_st->major_work_todo + : 0; + + /* cap computed_work to 0.3 */ + { + uintnat heap_size = caml_heap_size(dom_st->shared_heap); + uintnat heap_words = (double)Wsize_bsize(heap_size); + uintnat heap_sweep_words = heap_words; + intnat limit = (intnat)(0.3 * (heap_sweep_words + + (heap_words * 100 / (100 + caml_percent_free)))); + + if (computed_work > limit) + { + computed_work = limit; + } + } + } else { + /* forced or opportunistic GC slice with explicit quantity */ + computed_work = howmuch; + } + + /* TODO: do we want to do anything more complex or simplify the above? */ + + return computed_work; +} + +static void commit_major_slice_work(intnat words_done) { + caml_domain_state *dom_st = Caml_state; + intnat limit; + + dom_st->major_work_todo -= words_done; + + /* cap how far work todo can be in credit */ + limit = -2*Wsize_bsize(caml_heap_size(dom_st->shared_heap)); + if (dom_st->major_work_todo < limit) + { + dom_st->major_work_todo = limit; + } + + /* check clock to close a cycle if need be */ + if (dom_st->major_work_todo <= 0 + && dom_st->major_gc_clock >= 1.0) + { + caml_gc_log("Major GC slice complete: " + " %"ARCH_INTNAT_PRINTF_FORMAT "d words_done, " + " %"ARCH_INTNAT_PRINTF_FORMAT "d todo, " + " %"ARCH_INTNAT_PRINTF_FORMAT "d computed, " + " %"ARCH_INTNAT_PRINTF_FORMAT "u clock", + words_done, + dom_st->major_work_todo, + dom_st->major_work_computed, + (uintnat)(dom_st->major_gc_clock * 1000000) + ); + + /* we have caught up */ + while( dom_st->major_gc_clock >= 1.0 ) { + dom_st->major_gc_clock -= 1.; + } + + /* limit amount of work credit that can go into next cycle */ + limit = -2*dom_st->major_work_computed; + dom_st->major_work_todo = dom_st->major_work_todo < limit + ? 
limit + : dom_st->major_work_todo; + dom_st->major_work_computed = 0; + } +} + +static void mark_stack_prune(struct mark_stack* stk); +static struct pool* find_pool_to_rescan(void); + + +#ifdef DEBUG +#define Is_markable(v) \ + (CAMLassert (v != Debug_free_major), \ + Is_block(v) && !Is_young(v)) +#else +#define Is_markable(v) (Is_block(v) && !Is_young(v)) +#endif + static void realloc_mark_stack (struct mark_stack* stk) { mark_entry* new; uintnat mark_stack_bsize = stk->size * sizeof(mark_entry); - if ( Wsize_bsize(mark_stack_bsize) < Caml_state->stat_heap_wsz / 64 ) { - caml_gc_message (0x08, "Growing mark stack to %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - (intnat) mark_stack_bsize * 2 / 1024); - - new = (mark_entry*) caml_stat_resize_noexc ((char*) stk->stack, - 2 * mark_stack_bsize); - if (new != NULL) { - stk->stack = new; - stk->size *= 2; - return; - } + caml_gc_log ("Growing mark stack to %"ARCH_INTNAT_PRINTF_FORMAT"uk bytes\n", + (intnat) mark_stack_bsize * 2 / 1024); + + new = (mark_entry*) caml_stat_resize_noexc ((char*) stk->stack, + 2 * mark_stack_bsize); + if (new != NULL) { + stk->stack = new; + stk->size *= 2; + return; } - caml_gc_message (0x08, "No room for growing mark stack. Pruning..\n"); - mark_stack_prune(stk); + caml_fatal_error("No room for growing mark stack.\n"); + /* TODO: re-enable mark stack prune when safe to remark a pool + from a foreign domain which is also allocating from that pool + */ + if (0) { + caml_gc_log ("Mark stack size is %"ARCH_INTNAT_PRINTF_FORMAT"u" + "bytes (> 32 * major heap size of this domain %" + ARCH_INTNAT_PRINTF_FORMAT"u bytes. Pruning..\n", + mark_stack_bsize, + caml_heap_size(Caml_state->shared_heap)); + mark_stack_prune(stk); + } } -/* This function pushes the provided mark_entry [me] onto the current mark - stack [stk]. It first checks, if the block is small enough, whether there - are any fields we would actually do mark work on. If so then it enqueues - the entry. 
*/ -Caml_inline void mark_stack_push(struct mark_stack* stk, value block, - uintnat offset, intnat* work) +static void mark_stack_push(struct mark_stack* stk, value block, + uintnat offset, intnat* work) { value v; int i, block_wsz = Wosize_val(block), end; mark_entry* me; - CAMLassert(Is_block(block) && Is_in_heap (block) - && Is_black_val(block)); - CAMLassert(Tag_val(block) != Infix_tag); - CAMLassert(Tag_val(block) < No_scan_tag); - -#if defined(NO_NAKED_POINTERS) || defined(NAKED_POINTERS_CHECKER) - if (Tag_val(block) == Closure_tag) { + if (offset == 0 && Tag_val(block) == Closure_tag) { /* Skip the code pointers and integers at beginning of closure; - start scanning at the first word of the environment part. */ - /* It might be the case that [mark_stack_push] has been called - while we are traversing a closure block but have not enough - budget to finish the block. In that specific case, we should not - update [m.offset] */ - if (offset == 0) - offset = Start_env_closinfo(Closinfo_val(block)); - - CAMLassert(offset <= Wosize_val(block) - && offset >= Start_env_closinfo(Closinfo_val(block))); + start scanning at the first word of the environment part. */ + offset = Start_env_closinfo(Closinfo_val(block)); } -#endif - end = (block_wsz < 8 ? block_wsz : 8); + CAMLassert(Is_block(block) && !Is_young(block)); + CAMLassert(Tag_val(block) != Infix_tag); + CAMLassert(Tag_val(block) < No_scan_tag); + CAMLassert(Tag_val(block) != Cont_tag); + CAMLassert(offset <= block_wsz); /* Optimisation to avoid pushing small, unmarkable objects such as [Some 42] * into the mark stack. */ + end = (block_wsz < 8 ? 
block_wsz : 8); + for (i = offset; i < end; i++) { v = Field(block, i); - if (Is_block(v) && !Is_young(v)) - /* found something to mark */ + if (Is_markable(v)) break; } - if (i == block_wsz) { - /* nothing left to mark */ - if( work != NULL ) { + if (i == block_wsz){ + /* nothing left to mark and credit header */ + if(work != NULL){ /* we should take credit for it though */ *work -= Whsize_wosize(block_wsz - offset); } @@ -266,61 +622,26 @@ Caml_inline void mark_stack_push(struct mark_stack* stk, value block, realloc_mark_stack(stk); me = &stk->stack[stk->count++]; - - me->start = Op_val(block) + offset; - me->end = Op_val(block) + Wosize_val(block); + me->block = block; + me->offset = offset; } -#if defined(NAKED_POINTERS_CHECKER) && defined(NATIVE_CODE) -static void is_naked_pointer_safe (value v, value *p); -#endif - -void caml_darken (value v, value *p) +/* to fit scanning_action */ +static void mark_stack_push_act(void* state, value v, value* ignored) { -#ifdef NO_NAKED_POINTERS - if (Is_block(v) && !Is_young (v)) { -#else - if (Is_block(v) && Is_in_heap (v)) { -#endif - header_t h = Hd_val (v); - tag_t t = Tag_hd (h); - if (t == Infix_tag){ - v -= Infix_offset_val(v); - h = Hd_val (v); - t = Tag_hd (h); - } -#ifdef NO_NAKED_POINTERS - /* We insist that naked pointers to outside the heap point to things that - look like values with headers coloured black. This is always - strictly necessary because the compactor relies on it. 
*/ - CAMLassert (Is_in_heap (v) || Is_black_hd (h)); -#endif - CAMLassert (!Is_blue_hd (h)); - if (Is_white_hd (h)){ - caml_ephe_list_pure = 0; - Hd_val (v) = Blackhd_hd (h); - marked_words += Whsize_hd (h); - if (t < No_scan_tag){ - mark_stack_push(Caml_state->mark_stack, v, 0, NULL); - } - } - } -#if defined(NAKED_POINTERS_CHECKER) && defined(NATIVE_CODE) - else if (Is_block(v) && !Is_young(v)) { - is_naked_pointer_safe(v, p); - } -#endif + if (Tag_val(v) < No_scan_tag && Tag_val(v) != Cont_tag) + mark_stack_push(Caml_state->mark_stack, v, 0, NULL); } /* This function shrinks the mark stack back to the MARK_STACK_INIT_SIZE size - and is called at the end of a GC compaction to avoid a mark stack greater - than 1/32th of the heap. */ -void caml_shrink_mark_stack () { + and is called at domain termination via caml_finish_marking. */ +void caml_shrink_mark_stack (void) +{ struct mark_stack* stk = Caml_state->mark_stack; intnat init_stack_bsize = MARK_STACK_INIT_SIZE * sizeof(mark_entry); mark_entry* shrunk_stack; - caml_gc_message (0x08, "Shrinking mark stack to %" + caml_gc_log ("Shrinking mark stack to %" ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", init_stack_bsize / 1024); @@ -330,1083 +651,987 @@ void caml_shrink_mark_stack () { stk->stack = shrunk_stack; stk->size = MARK_STACK_INIT_SIZE; }else{ - caml_gc_message (0x08, "Mark stack shrinking failed"); + caml_gc_log ("Mark stack shrinking failed"); } } -/* This function adds blocks in the passed heap chunk [heap_chunk] to - the mark stack. It returns 1 when the supplied chunk has no more - range to redarken. It returns 0 if there are still blocks in the - chunk that need redarkening because pushing them onto the stack - would make it grow more than a quarter full. This is to lower the - chance of triggering another overflow, which would be - wasteful. Subsequent calls will continue progress. 
- */ -static int redarken_chunk(char* heap_chunk, struct mark_stack* stk) { - heap_chunk_head* chunk = Chunk_head(heap_chunk); - mark_entry me = chunk->redarken_first; - header_t* end = (header_t*)chunk->redarken_end; - if (chunk->redarken_end <= me.start) return 1; - - while (1) { - header_t* hp; - /* Skip a prefix of fields that need no marking */ - CAMLassert(me.start <= me.end && (header_t*)me.end <= end); - while (me.start < me.end && - (!Is_block(*me.start) || Is_young(*me.start))) { - me.start++; - } +void caml_darken_cont(value cont); - /* Push to the mark stack (if anything's left) */ - if (me.start < me.end) { - if (stk->count < stk->size/4) { - stk->stack[stk->count++] = me; +static void mark_slice_darken(struct mark_stack* stk, value v, mlsize_t i, + intnat* work) +{ + value child; + header_t chd; + + child = Field(v, i); + + if (Is_markable(child)){ + chd = Hd_val(child); + if (Tag_hd(chd) == Infix_tag) { + child -= Infix_offset_hd(chd); + chd = Hd_val(child); + } + CAMLassert(!Has_status_hd(chd, caml_global_heap_state.GARBAGE)); + if (Has_status_hd(chd, caml_global_heap_state.UNMARKED)){ + Caml_state->stat_blocks_marked++; + if (Tag_hd(chd) == Cont_tag){ + caml_darken_cont(child); + *work -= Wosize_hd(chd); } else { - /* Only fill up a quarter of the mark stack, we can resume later - for more if we need to */ - chunk->redarken_first = me; - return 0; + again: + if (Tag_hd(chd) == Lazy_tag || Tag_hd(chd) == Forcing_tag){ + if(!atomic_compare_exchange_strong(Hp_atomic_val(child), &chd, + With_status_hd(chd, caml_global_heap_state.MARKED))){ + chd = Hd_val(child); + goto again; + } + } else { + atomic_store_explicit( + Hp_atomic_val(child), + With_status_hd(chd, caml_global_heap_state.MARKED), + memory_order_relaxed); + } + if(Tag_hd(chd) < No_scan_tag){ + mark_stack_push(stk, child, 0, work); + } else { + *work -= Wosize_hd(chd); + } } } + } +} - /* Find the next block that needs to be re-marked */ - hp = (header_t*)me.end; - CAMLassert(hp <= end); - 
while (hp < end) { - value v = Val_hp(hp); - if (Tag_val(v) < No_scan_tag && Is_black_val(v)) - break; - hp = (header_t*)(Op_val(v) + Wosize_val(v)); - } - if (hp == end) - break; - - /* Found a block */ - me.start = Op_hp(hp); - me.end = me.start + Wosize_hp(hp); - if (Tag_hp(hp) == Closure_tag) { - me.start += Start_env_closinfo(Closinfo_val(Val_hp(hp))); +static intnat do_some_marking(intnat budget) { + struct mark_stack* stk = Caml_state->mark_stack; + while (stk->count > 0) { + mark_entry me = stk->stack[--stk->count]; + intnat me_end = Wosize_val(me.block); + while (me.offset != me_end) { + if (budget <= 0) { + mark_stack_push(stk, me.block, me.offset, NULL); + return budget; + } + budget--; + CAMLassert(Is_markable(me.block) && + Has_status_hd(Hd_val(me.block), + caml_global_heap_state.MARKED) && + Tag_val(me.block) < No_scan_tag && + Tag_val(me.block) != Cont_tag); + mark_slice_darken(stk, me.block, me.offset++, &budget); } + budget--; /* credit for header */ } - - chunk->redarken_first.start = - (value*)(heap_chunk + Chunk_size(heap_chunk)); - chunk->redarken_first.end = chunk->redarken_first.start; - chunk->redarken_end = (value*)heap_chunk; - - return 1; + return budget; } -static void start_cycle (void) -{ - CAMLassert (caml_gc_phase == Phase_idle); - CAMLassert (Caml_state->mark_stack->count == 0); - CAMLassert (redarken_first_chunk == NULL); - caml_gc_message (0x01, "Starting new major GC cycle\n"); - marked_words = 0; - caml_darken_all_roots_start (); - caml_gc_phase = Phase_mark; - heap_wsz_at_cycle_start = Caml_state->stat_heap_wsz; - caml_gc_subphase = Subphase_mark_roots; - caml_ephe_list_pure = 1; - ephes_checked_if_pure = &caml_ephe_list_head; - ephes_to_check = &caml_ephe_list_head; -#ifdef DEBUG - ++ major_gc_counter; - caml_heap_check (); -#endif +/* mark until the budget runs out or marking is done */ +static intnat mark(intnat budget) { + while (budget > 0 && !Caml_state->marking_done) { + budget = do_some_marking(budget); + if (budget > 
0) { + struct pool* p = find_pool_to_rescan(); + if (p) { + caml_redarken_pool(p, &mark_stack_push_act, 0); + } else { + ephe_next_cycle (); + Caml_state->marking_done = 1; + atomic_fetch_add_verify_ge0(&num_domains_to_mark, -1); + } + } + } + return budget; } -static void init_sweep_phase(void) +void caml_darken_cont(value cont) { - /* Phase_clean is done. */ - /* Initialise the sweep phase. */ - caml_gc_sweep_hp = caml_heap_start; - caml_fl_init_merge (); - caml_gc_phase = Phase_sweep; - sweep_chunk = caml_heap_start; - caml_gc_sweep_hp = sweep_chunk; - caml_fl_wsz_at_phase_change = caml_fl_cur_wsz; - if (caml_major_gc_hook) (*caml_major_gc_hook)(); + CAMLassert(Is_block(cont) && !Is_young(cont) && Tag_val(cont) == Cont_tag); + { + SPIN_WAIT { + header_t hd + = atomic_load_explicit(Hp_atomic_val(cont), memory_order_relaxed); + CAMLassert(!Has_status_hd(hd, caml_global_heap_state.GARBAGE)); + if (Has_status_hd(hd, caml_global_heap_state.MARKED)) + break; + if (Has_status_hd(hd, caml_global_heap_state.UNMARKED) && + atomic_compare_exchange_strong( + Hp_atomic_val(cont), &hd, + With_status_hd(hd, NOT_MARKABLE))) { + value stk = Field(cont, 0); + if (Ptr_val(stk) != NULL) + caml_scan_stack(&caml_darken, 0, Ptr_val(stk), 0); + atomic_store_explicit( + Hp_atomic_val(cont), + With_status_hd(hd, caml_global_heap_state.MARKED), + memory_order_release); + } + } + } } -/* auxiliary function of mark_ephe_aux */ -Caml_inline void mark_ephe_darken(struct mark_stack* stk, value v, mlsize_t i, - int in_ephemeron, int *slice_pointers, - intnat *work) -{ - value child; - header_t chd; - - child = Field (v, i); +void caml_darken(void* state, value v, value* ignored) { + header_t hd; + if (!Is_markable (v)) return; /* foreign stack, at least */ -#ifdef NO_NAKED_POINTERS - if (Is_block (child) && ! 
Is_young (child)) { -#else - if (Is_block (child) && Is_in_heap (child)) { -#endif - CAML_EVENTLOG_DO (++ *slice_pointers); - chd = Hd_val (child); - if (Tag_hd (chd) == Forward_tag){ - value f = Forward_val (child); - if ((in_ephemeron && Is_long(f)) || - (Is_block (f) - && (!Is_in_value_area(f) || Tag_val (f) == Forward_tag - || Tag_val (f) == Lazy_tag -#ifdef FLAT_FLOAT_ARRAY - || Tag_val (f) == Double_tag -#endif - ))){ - /* Do not short-circuit the pointer. */ - }else{ - /* The variable child is not changed because it must be mark alive */ - Field (v, i) = f; - if (Is_block (f) && Is_young (f) && !Is_young (child)){ - if(in_ephemeron) { - add_to_ephe_ref_table (Caml_state->ephe_ref_table, v, i); - } else { - add_to_ref_table (Caml_state->ref_table, &Field (v, i)); - } - } - } - } - else if (Tag_hd(chd) == Infix_tag) { - child -= Infix_offset_val(child); - chd = Hd_val(child); + hd = Hd_val(v); + if (Tag_hd(hd) == Infix_tag) { + v -= Infix_offset_hd(hd); + hd = Hd_val(v); + } + if (Has_status_hd(hd, caml_global_heap_state.UNMARKED)) { + if (Caml_state->marking_done) { + atomic_fetch_add(&num_domains_to_mark, 1); + Caml_state->marking_done = 0; } -#ifdef NO_NAKED_POINTERS - /* See [caml_darken] for a description of this assertion. */ - CAMLassert (Is_in_heap (child) || Is_black_hd (chd)); -#endif - if (Is_white_hd (chd)){ - caml_ephe_list_pure = 0; - Hd_val (child) = Blackhd_hd (chd); - if( Tag_hd(chd) < No_scan_tag ) { - mark_stack_push(stk, child, 0, work); - } else { - *work -= Whsize_hd (chd); + if (Tag_hd(hd) == Cont_tag) { + caml_darken_cont(v); + } else { + atomic_store_explicit( + Hp_atomic_val(v), + With_status_hd(hd, caml_global_heap_state.MARKED), + memory_order_relaxed); + if (Tag_hd(hd) < No_scan_tag) { + mark_stack_push(Caml_state->mark_stack, v, 0, NULL); } } } -#if defined(NAKED_POINTERS_CHECKER) && defined(NATIVE_CODE) - else if (Is_block(child) && ! 
Is_young(child)) { - is_naked_pointer_safe(child, &Field (v, i)); - } -#endif } -static void mark_ephe_aux (struct mark_stack *stk, intnat *work, - int *slice_pointers) +static intnat ephe_mark (intnat budget, uintnat for_cycle, + /* Forces ephemerons and their data to be alive */ + int force_alive) { - value v, data, key; + value v, data, key, f, todo; + value* prev_linkp; header_t hd; mlsize_t size, i; - - v = *ephes_to_check; - hd = Hd_val(v); - CAMLassert(Tag_val (v) == Abstract_tag); - data = Field(v,CAML_EPHE_DATA_OFFSET); - if ( data != caml_ephe_none && - Is_block (data) && -#ifdef NO_NAKED_POINTERS - !Is_young(data) && -#else - Is_in_heap (data) && -#endif - Is_white_val (data)){ - - int alive_data = 1; - - /* The liveness of the ephemeron is one of the condition */ - if (Is_white_hd (hd)) alive_data = 0; - - /* The liveness of the keys not caml_ephe_none is the other condition */ - size = Wosize_hd (hd); - for (i = CAML_EPHE_FIRST_KEY; alive_data && i < size; i++){ - key = Field (v, i); + caml_domain_state* domain_state = Caml_state; + int alive_data; + intnat marked = 0, made_live = 0; + + if (domain_state->ephe_info->cursor.cycle == for_cycle && + !force_alive) { + prev_linkp = domain_state->ephe_info->cursor.todop; + todo = *prev_linkp; + } else { + todo = domain_state->ephe_info->todo; + prev_linkp = &domain_state->ephe_info->todo; + } + while (todo != 0 && budget > 0) { + v = todo; + todo = Ephe_link(v); + CAMLassert (Tag_val(v) == Abstract_tag); + hd = Hd_val(v); + data = Ephe_data(v); + alive_data = 1; + + if (force_alive) + caml_darken (0, v, 0); + + /* If ephemeron is unmarked, data is dead */ + if (is_unmarked(v)) alive_data = 0; + + size = Wosize_hd(hd); + for (i = CAML_EPHE_FIRST_KEY; alive_data && i < size; i++) { + key = Field(v, i); ephemeron_again: - if (key != caml_ephe_none && - Is_block (key) && -#ifdef NO_NAKED_POINTERS - !Is_young(key) -#else - Is_in_heap(key) -#endif - ){ - if (Tag_val (key) == Forward_tag){ - value f = Forward_val 
(key); - if (Is_long (f) || - (Is_block (f) && - (!Is_in_value_area(f) || Tag_val (f) == Forward_tag - || Tag_val (f) == Lazy_tag -#ifdef FLAT_FLOAT_ARRAY - || Tag_val (f) == Double_tag -#endif - ))){ - /* Do not short-circuit the pointer. */ - }else{ - Field (v, i) = key = f; - goto ephemeron_again; + if (key != caml_ephe_none && Is_block(key)) { + if (Tag_val(key) == Forward_tag) { + f = Forward_val(key); + if (Is_block(f)) { + if (Tag_val(f) == Forward_tag || Tag_val(f) == Lazy_tag || + Tag_val(f) == Forcing_tag || Tag_val(f) == Double_tag) { + /* Do not short-circuit the pointer */ + } else { + Field(v, i) = key = f; + goto ephemeron_again; + } } } - if (Is_white_val (key)){ - alive_data = 0; + else { + if (Tag_val (key) == Infix_tag) key -= Infix_offset_val (key); + if (is_unmarked (key)) + alive_data = 0; } } } - *work -= Whsize_wosize(i); - - if (alive_data){ - mark_ephe_darken(stk, v, CAML_EPHE_DATA_OFFSET, /*in_ephemeron=*/1, - slice_pointers, work); - } else { /* not triggered move to the next one */ - ephes_to_check = &Field(v,CAML_EPHE_LINK_OFFSET); - return; + budget -= Whsize_wosize(i); + + if (force_alive || alive_data) { + if (data != caml_ephe_none && Is_block(data)) { + caml_darken (0, data, 0); + } + Ephe_link(v) = domain_state->ephe_info->live; + domain_state->ephe_info->live = v; + *prev_linkp = todo; + made_live++; + } else { + /* Leave this ephemeron on the todo list */ + prev_linkp = &Ephe_link(v); } - } else { /* a similarly weak pointer or an already alive data */ - *work -= 1; + marked++; } - /* all keys black or data none or black - move the ephemerons from (3) to the end of (1) */ - if ( ephes_checked_if_pure == ephes_to_check ) { - /* corner case and optim */ - ephes_checked_if_pure = &Field(v,CAML_EPHE_LINK_OFFSET); - ephes_to_check = ephes_checked_if_pure; - } else { - /* - remove v from the list (3) */ - *ephes_to_check = Field(v,CAML_EPHE_LINK_OFFSET); - /* - insert it at the end of (1) */ - Field(v,CAML_EPHE_LINK_OFFSET) = 
*ephes_checked_if_pure; - *ephes_checked_if_pure = v; - ephes_checked_if_pure = &Field(v,CAML_EPHE_LINK_OFFSET); + caml_gc_log + ("Mark Ephemeron: %s. for ephemeron cycle=%"ARCH_INTNAT_PRINTF_FORMAT"d " + "marked=%"ARCH_INTNAT_PRINTF_FORMAT"d made_live=%"ARCH_INTNAT_PRINTF_FORMAT"d" + , domain_state->ephe_info->cursor.cycle == for_cycle + ? "continued from cursor" : "discarded cursor", + for_cycle, marked, made_live); + + domain_state->ephe_info->cursor.cycle = for_cycle; + domain_state->ephe_info->cursor.todop = prev_linkp; + + return budget; +} + +static intnat ephe_sweep (caml_domain_state* domain_state, intnat budget) +{ + value v; + CAMLassert (caml_gc_phase == Phase_sweep_ephe); + + while (domain_state->ephe_info->todo != 0 && budget > 0) { + v = domain_state->ephe_info->todo; + domain_state->ephe_info->todo = Ephe_link(v); + CAMLassert (Tag_val(v) == Abstract_tag); + + if (is_unmarked(v)) { + /* The whole array is dead, drop this ephemeron */ + budget -= 1; + } else { + caml_ephe_clean(v); + Ephe_link(v) = domain_state->ephe_info->live; + domain_state->ephe_info->live = v; + budget -= Whsize_val(v); + } } + return budget; } +static struct gc_stats sampled_gc_stats[Max_domains]; -#define Pb_size (1 << 8) -#define Pb_min 64 -#define Pb_mask (Pb_size - 1) +void caml_accum_heap_stats(struct heap_stats* acc, const struct heap_stats* h) +{ + acc->pool_words += h->pool_words; + if (acc->pool_max_words < h->pool_max_words) + acc->pool_max_words = h->pool_max_words; + acc->pool_live_words += h->pool_live_words; + acc->pool_live_blocks += h->pool_live_blocks; + acc->pool_frag_words += h->pool_frag_words; + acc->large_words += h->large_words; + if (acc->large_max_words < h->large_max_words) + acc->large_max_words = h->large_max_words; + acc->large_blocks += h->large_blocks; +} -Caml_inline void prefetch_block(value v) +void caml_remove_heap_stats(struct heap_stats* acc, const struct heap_stats* h) { - /* Prefetch a block so that scanning it later avoids cache misses. 
- We will access at least the header, but we don't yet know how - many of the fields we will access - the block might be already - marked, not scannable, or very short. The compromise here is to - prefetch the header and the first few fields. - - We issue two prefetches, with the second being a few words ahead - of the first. Most of the time, these will land in the same - cacheline, be coalesced by hardware, and so not cost any more - than a single prefetch. Two memory operations are issued only - when the two prefetches land in different cachelines. - - In the case where the block is not already in cache, and yet is - already marked, not markable, or extremely short, then we waste - somewhere between 1/8-1/2 of a prefetch operation (in expectation, - depending on alignment, word size, and cache line size), which is - cheap enough to make this worthwhile. */ - caml_prefetch(Hp_val(v)); - caml_prefetch(&Field(v, 3)); + acc->pool_words -= h->pool_words; + acc->pool_live_words -= h->pool_live_words; + acc->pool_live_blocks -= h->pool_live_blocks; + acc->pool_frag_words -= h->pool_frag_words; + acc->large_words -= h->large_words; + acc->large_blocks -= h->large_blocks; } -Caml_inline uintnat rotate1(uintnat x) + +void caml_sample_gc_stats(struct gc_stats* buf) { - return (x << ((sizeof x)*8 - 1)) | (x >> 1); + int i; + intnat pool_max = 0, large_max = 0; + int my_id = Caml_state->id; + memset(buf, 0, sizeof(*buf)); + + for (i=0; imajor_heap; + if (i != my_id) { + buf->minor_words += s->minor_words; + buf->promoted_words += s->promoted_words; + buf->major_words += s->major_words; + buf->minor_collections += s->minor_collections; + buf->forced_major_collections += s->forced_major_collections; + } + else { + buf->minor_words += Caml_state->stat_minor_words; + buf->promoted_words += Caml_state->stat_promoted_words; + buf->major_words += Caml_state->stat_major_words; + buf->minor_collections += Caml_state->stat_minor_collections; + buf->forced_major_collections += 
s->forced_major_collections; + //FIXME handle the case for major heap stats [h] + } + /* The instantaneous maximum heap size cannot be computed + from per-domain statistics, and would be very expensive + to maintain directly. Here, we just sum the per-domain + maxima, which is statistically dubious. + + FIXME: maybe maintain coarse global maxima? */ + pool_max += h->pool_max_words; + large_max += h->large_max_words; + caml_accum_heap_stats(&buf->major_heap, h); + } + buf->major_heap.pool_max_words = pool_max; + buf->major_heap.large_max_words = large_max; } -Caml_noinline static intnat do_some_marking -#ifndef CAML_INSTR - (intnat work) -#else - (intnat work, int* pslice_fields, int* pslice_pointers) -#endif +/* update GC stats for this given domain */ +inline void caml_sample_gc_collect(caml_domain_state* domain) { - uintnat pb_enqueued = 0, pb_dequeued = 0; - int darkened_anything = 0; - value pb[Pb_size]; - uintnat min_pb = Pb_min; /* keep pb at least this full */ - /* These global values are cached in locals, - so that they can be stored in registers */ - struct mark_stack stk = *Caml_state->mark_stack; - uintnat young_start = (uintnat)Val_hp(Caml_state->young_start); - uintnat half_young_len = - ((uintnat)Caml_state->young_end - (uintnat)Caml_state->young_start) >> 1; -#define Is_block_and_not_young(v) \ - (((intnat)rotate1((uintnat)v - young_start)) >= (intnat)half_young_len) -#ifdef NO_NAKED_POINTERS - #define Is_major_block(v) Is_block_and_not_young(v) -#else - #define Is_major_block(v) (Is_block_and_not_young(v) && Is_in_heap(v)) -#endif + struct gc_stats* stats = &sampled_gc_stats[domain->id]; + + stats->minor_words = domain->stat_minor_words; + stats->promoted_words = domain->stat_promoted_words; + stats->major_words = domain->stat_major_words; + stats->minor_collections = domain->stat_minor_collections; + stats->forced_major_collections = domain->stat_forced_major_collections; + caml_sample_heap_stats(domain->shared_heap, &stats->major_heap); +} -#ifdef 
CAML_INSTR - int slice_fields = 0, slice_pointers = 0; -#endif +static void cycle_all_domains_callback(caml_domain_state* domain, void* unused, + int participating_count, + caml_domain_state** participating) +{ + uintnat num_domains_in_stw; + + CAML_EV_BEGIN(EV_MAJOR_GC_CYCLE_DOMAINS); + + CAMLassert(domain == Caml_state); + CAMLassert(atomic_load_acq(&ephe_cycle_info.num_domains_todo) == + atomic_load_acq(&ephe_cycle_info.num_domains_done)); + CAMLassert(atomic_load(&num_domains_to_mark) == 0); + CAMLassert(atomic_load(&num_domains_to_sweep) == 0); + CAMLassert(atomic_load(&num_domains_to_ephe_sweep) == 0); + + caml_empty_minor_heap_no_major_slice_from_stw + (domain, (void*)0, participating_count, participating); + + CAML_EV_BEGIN(EV_MAJOR_GC_STW); + + { + /* Cycle major heap */ + // FIXME: delete caml_cycle_heap_stw and have per-domain copies of the data? + barrier_status b = caml_global_barrier_begin(); + if (caml_global_barrier_is_final(b)) { + caml_cycle_heap_stw(); + caml_gc_log("GC cycle %lu completed (heap cycled)", + (long unsigned int)caml_major_cycles_completed); + + caml_major_cycles_completed++; + caml_gc_message(0x40, "Starting major GC cycle\n"); + + if (caml_params->verb_gc & 0x400) { + struct gc_stats s; + intnat heap_words, not_garbage_words, swept_words; + + caml_sample_gc_stats(&s); + heap_words = s.major_heap.pool_words + s.major_heap.large_words; + not_garbage_words = s.major_heap.pool_live_words + + s.major_heap.large_words; + swept_words = domain->swept_words; + caml_gc_log ("heap_words: %"ARCH_INTNAT_PRINTF_FORMAT"d " + "not_garbage_words %"ARCH_INTNAT_PRINTF_FORMAT"d " + "swept_words %"ARCH_INTNAT_PRINTF_FORMAT"d", + heap_words, not_garbage_words, swept_words); + + if (caml_stat_space_overhead.heap_words_last_cycle != 0) { + /* At the end of a major cycle, no object has colour MARKED. + + [not_garbage_words] counts all objects which are UNMARKED. 
+ Importantly, this includes both live objects and objects which are + unreachable in the current cycle (i.e, garbage). But we don't get + to know which objects are garbage until the end of the next cycle. + + live_words@N = not_garbage_words@N - swept_words@N+1 + + space_overhead@N = + 100.0 * (heap_words@N - live_words@N) / live_words@N + */ + double live_words_last_cycle = + caml_stat_space_overhead.not_garbage_words_last_cycle - swept_words; + double space_overhead = + 100.0 * (double)(caml_stat_space_overhead.heap_words_last_cycle + - live_words_last_cycle) / live_words_last_cycle; + + if (caml_stat_space_overhead.l == NULL || + caml_stat_space_overhead.index == BUFFER_SIZE) { + struct buf_list_t *l = + (struct buf_list_t*) + caml_stat_alloc_noexc(sizeof(struct buf_list_t)); + l->next = caml_stat_space_overhead.l; + caml_stat_space_overhead.l = l; + caml_stat_space_overhead.index = 0; + } + caml_stat_space_overhead.l->buffer[caml_stat_space_overhead.index++] = + space_overhead; + caml_gc_log("Previous cycle's space_overhead: %lf", space_overhead); + } + caml_stat_space_overhead.heap_words_last_cycle = heap_words; - while (1) { - value *scan, *obj_end, *scan_end; - intnat scan_len; + caml_stat_space_overhead.not_garbage_words_last_cycle + = not_garbage_words; + } + domain->swept_words = 0; - if (pb_enqueued > pb_dequeued + min_pb) { - /* Dequeue from prefetch buffer */ - value block = pb[(pb_dequeued++) & Pb_mask]; - header_t hd = Hd_val(block); + num_domains_in_stw = (uintnat)caml_global_barrier_num_domains(); + atomic_store_rel(&num_domains_to_sweep, num_domains_in_stw); + atomic_store_rel(&num_domains_to_mark, num_domains_in_stw); - if (Tag_hd(hd) == Infix_tag) { - block -= Infix_offset_val(block); - hd = Hd_val(block); - } + caml_gc_phase = Phase_sweep_and_mark_main; + atomic_store(&ephe_cycle_info.num_domains_todo, num_domains_in_stw); + atomic_store(&ephe_cycle_info.ephe_cycle, 1); + atomic_store(&ephe_cycle_info.num_domains_done, 0); + 
atomic_store_rel(&num_domains_to_ephe_sweep, num_domains_in_stw); + atomic_store_rel(&num_domains_to_final_update_first, num_domains_in_stw); + atomic_store_rel(&num_domains_to_final_update_last, num_domains_in_stw); -#ifdef NO_NAKED_POINTERS - /* See [caml_darken] for a description of this assertion. */ - CAMLassert (Is_in_heap (block) || Is_black_hd (hd)); -#endif - CAMLassert(Is_white_hd(hd) || Is_black_hd(hd)); - if (!Is_white_hd (hd)) { - /* Already black, nothing to do */ - continue; - } - hd = Blackhd_hd (hd); - Hd_val (block) = hd; - darkened_anything = 1; - work--; /* header word */ - if (Tag_hd (hd) >= No_scan_tag) { - /* Nothing to scan here */ - work -= Wosize_hd (hd); - continue; - } - scan = Op_val(block); - obj_end = scan + Wosize_hd(hd); + atomic_store(&domain_global_roots_started, WORK_UNSTARTED); - if (Tag_hd (hd) == Closure_tag) { - uintnat env_offset = Start_env_closinfo(Closinfo_val(block)); - work -= env_offset; - scan += env_offset; - } - } else if (work <= 0 || stk.count == 0) { - if (min_pb > 0) { - /* Dequeue from pb even when close to empty, because - we have nothing else to do */ - min_pb = 0; - continue; - } else { - /* Couldn't find work with min_pb == 0, so there's nothing to do */ - break; - } - } else { - mark_entry m = stk.stack[--stk.count]; - scan = m.start; - obj_end = m.end; + /* Cleanups for various data structures that must be done in a STW by + only a single domain */ + caml_code_fragment_cleanup(); } + // should interrupts be processed here or not? + // depends on whether marking above may need interrupts + caml_global_barrier_end(b); + } - scan_len = obj_end - scan; - if (work < scan_len) { - scan_len = work; - if (scan_len < 0) scan_len = 0; - } - work -= scan_len; - scan_end = scan + scan_len; + /* If the heap is to be verified, do it before the domains continue + running OCaml code. 
*/ + if (caml_params->verify_heap) { + struct heap_verify_state* ver = caml_verify_begin(); + caml_do_roots (&caml_verify_root, ver, domain, 1); + caml_scan_global_roots(&caml_verify_root, ver); + caml_verify_heap(ver); + caml_gc_log("Heap verified"); + caml_global_barrier(); + } - for (; scan < scan_end; scan++) { - value v = *scan; -#ifdef CAML_INSTR - slice_fields ++; -#endif - if (Is_major_block(v)) { -#ifdef CAML_INSTR - slice_pointers ++; -#endif - if (pb_enqueued == pb_dequeued + Pb_size) { - /* Prefetch buffer is full */ - work += scan_end - scan; /* scanning work not done */ - break; - } - prefetch_block(v); - pb[(pb_enqueued++) & Pb_mask] = v; - } -#if defined(NAKED_POINTERS_CHECKER) && defined(NATIVE_CODE) - else if (Is_block_and_not_young (v) && !Is_in_heap (v)){ - is_naked_pointer_safe (v, scan); - } -#endif + domain->stat_major_collections++; + caml_cycle_heap(domain->shared_heap); + domain->sweeping_done = 0; + + /* Mark roots for new cycle */ + domain->marking_done = 0; + domain->major_work_computed = 0; + domain->major_work_todo = 0; + domain->major_gc_clock = 0.0; + + CAML_EV_BEGIN(EV_MAJOR_MARK_ROOTS); + caml_do_roots (&caml_darken, NULL, domain, 0); + { + uintnat work_unstarted = WORK_UNSTARTED; + if(atomic_compare_exchange_strong(&domain_global_roots_started, + &work_unstarted, + WORK_STARTED)){ + caml_scan_global_roots(&caml_darken, NULL); } + } + CAML_EV_END(EV_MAJOR_MARK_ROOTS); - if (scan < obj_end) { - /* Didn't finish scanning this object, either because work <= 0, - or the prefetch buffer filled up. Leave the rest on the stack. 
*/ - mark_entry m = { scan, obj_end }; - caml_prefetch(scan+1); - if (stk.count == stk.size) { - *Caml_state->mark_stack = stk; - realloc_mark_stack(Caml_state->mark_stack); - stk = *Caml_state->mark_stack; - } - CAML_EVENTLOG_DO({ - if (work <= 0 && pb_enqueued == pb_dequeued) { - CAML_EV_COUNTER(EV_C_MAJOR_MARK_SLICE_REMAIN, obj_end - scan); - } - }); - stk.stack[stk.count++] = m; - /* We may have just discovered more work when we were about to run out. - Reset min_pb so that we try to refill the buffer again. */ - min_pb = Pb_min; - } + if (domain->mark_stack->count == 0) { + atomic_fetch_add_verify_ge0(&num_domains_to_mark, -1); + domain->marking_done = 1; } - CAMLassert(pb_enqueued == pb_dequeued); - *Caml_state->mark_stack = stk; - if (darkened_anything) - caml_ephe_list_pure = 0; -#ifdef CAML_INSTR - *pslice_fields += slice_fields; - *pslice_pointers += slice_pointers; + + /* Ephemerons */ + // Adopt orphaned work from domains that were spawned and terminated in + // the previous cycle. +#ifdef DEBUG + orph_ephe_list_verify_status (caml_global_heap_state.UNMARKED); #endif - return work; + caml_adopt_orphaned_work (); + CAMLassert(domain->ephe_info->todo == (value) NULL); + domain->ephe_info->todo = domain->ephe_info->live; + domain->ephe_info->live = (value) NULL; + domain->ephe_info->cycle = 0; + domain->ephe_info->cursor.todop = NULL; + domain->ephe_info->cursor.cycle = 0; + if (domain->ephe_info->todo == (value) NULL) + caml_ephe_todo_list_emptied(); + + /* Finalisers */ + domain->final_info->updated_first = 0; + domain->final_info->updated_last = 0; + + /* To ensure a mutator doesn't resume while global roots are being marked. 
+ Mutators can alter the set of global roots, to preserve its correctness, + they should not run while global roots are being marked.*/ + caml_global_barrier(); + + CAML_EV_END(EV_MAJOR_GC_STW); + CAML_EV_END(EV_MAJOR_GC_CYCLE_DOMAINS); } -static void mark_slice (intnat work) +static int is_complete_phase_sweep_and_mark_main () { -#ifdef CAML_INSTR - int slice_fields = 0; /** eventlog counters */ -#endif /*CAML_INSTR*/ - int slice_pointers = 0; - struct mark_stack* stk = Caml_state->mark_stack; - - caml_gc_message (0x40, "Marking %"ARCH_INTNAT_PRINTF_FORMAT"d words\n", work); - caml_gc_message (0x40, "Subphase = %d\n", caml_gc_subphase); - - marked_words += work; - while (1){ -#ifndef CAML_INSTR - work = do_some_marking(work); -#else - work = do_some_marking(work, &slice_fields, &slice_pointers); -#endif - - if (work <= 0) - break; - - CAMLassert (stk->count == 0); + return + caml_gc_phase == Phase_sweep_and_mark_main && + atomic_load_acq (&num_domains_to_sweep) == 0 && + atomic_load_acq (&num_domains_to_mark) == 0 && + /* Marking is done */ + atomic_load_acq(&ephe_cycle_info.num_domains_todo) == + atomic_load_acq(&ephe_cycle_info.num_domains_done) && + /* Ephemeron marking is done */ + no_orphaned_work(); + /* All orphaned ephemerons have been adopted */ +} - if( redarken_first_chunk != NULL ) { - /* There are chunks that need to be redarkened because we - overflowed our mark stack */ - if( redarken_chunk(redarken_first_chunk, stk) ) { - redarken_first_chunk = Chunk_next(redarken_first_chunk); - } - } else if (caml_gc_subphase == Subphase_mark_roots) { - CAML_EV_BEGIN(EV_MAJOR_MARK_ROOTS); - marked_words -= work; - work = caml_darken_all_roots_slice (work); - marked_words += work; - CAML_EV_END(EV_MAJOR_MARK_ROOTS); - if (work > 0){ - caml_gc_subphase = Subphase_mark_main; - } - } else if (*ephes_to_check != (value) NULL) { - /* Continue to scan the list of ephe */ - mark_ephe_aux(stk,&work,&slice_pointers); - } else if (!caml_ephe_list_pure){ - /* We must scan 
again the list because some value have been darken */ - caml_ephe_list_pure = 1; - ephes_to_check = ephes_checked_if_pure; - }else{ - switch (caml_gc_subphase){ - case Subphase_mark_main: { - /* Subphase_mark_main is done. - Mark finalised values. */ - CAML_EV_BEGIN(EV_MAJOR_MARK_MAIN); - caml_final_update_mark_phase (); - /* Complete the marking */ - ephes_to_check = ephes_checked_if_pure; - CAML_EV_END(EV_MAJOR_MARK_MAIN); - caml_gc_subphase = Subphase_mark_final; - } - break; - case Subphase_mark_final: { - /** The set of unreachable value will not change anymore for - this cycle. Start clean phase. */ - CAML_EV_BEGIN(EV_MAJOR_MARK_FINAL); - caml_gc_phase = Phase_clean; - caml_final_update_clean_phase (); - caml_memprof_update_clean_phase (); - if (caml_ephe_list_head != (value) NULL){ - /* Initialise the clean phase. */ - ephes_to_check = &caml_ephe_list_head; - } else { - /* Initialise the sweep phase. */ - init_sweep_phase(); - } - marked_words -= work; - work = 0; - CAML_EV_END(EV_MAJOR_MARK_FINAL); - } - break; - default: CAMLassert (0); - } - } - } - marked_words -= work; /* work may be negative */ - CAML_EV_COUNTER(EV_C_MAJOR_MARK_SLICE_FIELDS, slice_fields); - CAML_EV_COUNTER(EV_C_MAJOR_MARK_SLICE_POINTERS, slice_pointers); +static int is_complete_phase_mark_final () +{ + return + caml_gc_phase == Phase_mark_final && + atomic_load_acq (&num_domains_to_final_update_first) == 0 && + /* updated finalise first values */ + atomic_load_acq (&num_domains_to_mark) == 0 && + /* Marking is done */ + atomic_load_acq(&ephe_cycle_info.num_domains_todo) == + atomic_load_acq(&ephe_cycle_info.num_domains_done) && + /* Ephemeron marking is done */ + no_orphaned_work(); + /* All orphaned ephemerons have been adopted */ } -/* Clean ephemerons */ -static void clean_slice (intnat work) +static int is_complete_phase_sweep_ephe () { - value v; + return + caml_gc_phase == Phase_sweep_ephe && + atomic_load_acq (&num_domains_to_ephe_sweep) == 0 && + /* All domains have swept 
their ephemerons */ + atomic_load_acq (&num_domains_to_final_update_last) == 0 && + /* All domains have updated finalise last values */ + no_orphaned_work(); + /* All orphaned structures have been adopted */ +} - caml_gc_message (0x40, "Cleaning %" - ARCH_INTNAT_PRINTF_FORMAT "d words\n", work); - while (work > 0){ - v = *ephes_to_check; - if (v != (value) NULL){ - if (Is_white_val (v)){ - /* The whole array is dead, remove it from the list. */ - *ephes_to_check = Field (v, CAML_EPHE_LINK_OFFSET); - work -= 1; - }else{ - caml_ephe_clean(v); - ephes_to_check = &Field (v, CAML_EPHE_LINK_OFFSET); - work -= Whsize_val (v); - } - }else{ /* End of list reached */ - /* Phase_clean is done. */ - /* Initialise the sweep phase. */ - init_sweep_phase(); - work = 0; +static void try_complete_gc_phase (caml_domain_state* domain, void* unused, + int participating_count, + caml_domain_state** participating) +{ + barrier_status b; + CAML_EV_BEGIN(EV_MAJOR_GC_PHASE_CHANGE); + + b = caml_global_barrier_begin (); + if (caml_global_barrier_is_final(b)) { + if (is_complete_phase_sweep_and_mark_main()) { + caml_gc_phase = Phase_mark_final; + } else if (is_complete_phase_mark_final()) { + caml_gc_phase = Phase_sweep_ephe; } } + caml_global_barrier_end(b); + CAML_EV_END(EV_MAJOR_GC_PHASE_CHANGE); } -static void sweep_slice (intnat work) +intnat caml_opportunistic_major_work_available (void) { - char *hp, *sweep_hp, *limit; - header_t hd; + caml_domain_state* domain_state = Caml_state; + return !domain_state->sweeping_done || !domain_state->marking_done; +} - caml_gc_message (0x40, "Sweeping %" - ARCH_INTNAT_PRINTF_FORMAT "d words\n", work); - sweep_hp = caml_gc_sweep_hp; - limit = sweep_chunk + Chunk_size(sweep_chunk); - while (work > 0){ - if (sweep_hp < limit){ - caml_prefetch(sweep_hp + 4000); - hp = sweep_hp; - hd = Hd_hp (hp); - work -= Whsize_hd (hd); - sweep_hp += Bhsize_hd (hd); - switch (Color_hd (hd)){ - case Caml_white: - caml_gc_sweep_hp = sweep_hp; - sweep_hp = (char *) 
caml_fl_merge_block (Val_hp (hp), limit); - break; - case Caml_blue: - /* Only the blocks of the free-list are blue. See [freelist.c]. */ - caml_fl_merge = (value) Bp_hp (hp); - break; - default: /* gray or black */ - CAMLassert (Color_hd (hd) == Caml_black); - Hd_hp (hp) = Whitehd_hd (hd); - break; - } - CAMLassert (sweep_hp <= limit); - }else{ - sweep_chunk = Chunk_next (sweep_chunk); - if (sweep_chunk == NULL){ - /* Sweeping is done. */ - caml_gc_sweep_hp = sweep_hp; - ++ Caml_state->stat_major_collections; - work = 0; - caml_gc_phase = Phase_idle; - caml_request_minor_gc (); - }else{ - sweep_hp = sweep_chunk; - limit = sweep_chunk + Chunk_size (sweep_chunk); - } - } +typedef enum { + Slice_uninterruptible, + Slice_interruptible, + Slice_opportunistic +} collection_slice_mode; + +static char collection_slice_mode_char(collection_slice_mode mode) +{ + switch(mode) { + case Slice_uninterruptible: + return 'u'; + case Slice_interruptible: + return 'i'; + case Slice_opportunistic: + return 'o'; + default: + return ' '; } - caml_gc_sweep_hp = sweep_hp; } -/* The main entry point for the major GC. Called about once for each - minor GC. [howmuch] is the amount of work to do: - -1 if the GC is triggered automatically - 0 to let the GC compute the amount of work - [n] to make the GC do enough work to (on average) free [n] words - */ -void caml_major_collection_slice (intnat howmuch) -{ - double p, dp, filt_p, spend; - intnat computed_work; - int i; - /* - Free memory at the start of the GC cycle (garbage + free list) (assumed): - FM = Caml_state->stat_heap_wsz * caml_percent_free - / (100 + caml_percent_free) +#define Chunk_size 0x400 - Assuming steady state and enforcing a constant allocation rate, then - FM is divided in 2/3 for garbage and 1/3 for free list. - G = 2 * FM / 3 - G is also the amount of memory that will be used during this cycle - (still assuming steady state). 
+static intnat major_collection_slice(intnat howmuch, + int participant_count, + caml_domain_state** barrier_participants, + collection_slice_mode mode) +{ + caml_domain_state* domain_state = Caml_state; + intnat sweep_work = 0, mark_work = 0; + intnat available, left; + uintnat blocks_marked_before = domain_state->stat_blocks_marked; + int was_marking = 0; + uintnat saved_ephe_cycle; + uintnat saved_major_cycle = caml_major_cycles_completed; + int log_events = mode != Slice_opportunistic || (caml_params->verb_gc & 0x40); + intnat computed_work, budget, interrupted_budget = 0; + + update_major_slice_work(); + computed_work = get_major_slice_work(howmuch); + budget = computed_work; + + /* shortcut out if there is no opportunistic work to be done + * NB: needed particularly to avoid caml_ev spam when polling */ + if (mode == Slice_opportunistic && + !caml_opportunistic_major_work_available()) { + return budget; + } - Proportion of G consumed since the previous slice: - PH = caml_allocated_words / G - = caml_allocated_words * 3 * (100 + caml_percent_free) - / (2 * Caml_state->stat_heap_wsz * caml_percent_free) - Proportion of extra-heap resources consumed since the previous slice: - PE = caml_extra_heap_resources - Proportion of total work to do in this slice: - P = max (PH, PE) + if (log_events) CAML_EV_BEGIN(EV_MAJOR_SLICE); - Here, we insert a time-based filter on the P variable to avoid large - latency spikes in the GC, so the P below is a smoothed-out version of - the P above. + if (!domain_state->sweeping_done) { + if (log_events) CAML_EV_BEGIN(EV_MAJOR_SWEEP); - Amount of marking work for the GC cycle: - MW = Caml_state->stat_heap_wsz * 100 / (100 + caml_percent_free) - + caml_incremental_roots_count - Amount of sweeping work for the GC cycle: - SW = Caml_state->stat_heap_wsz + do { + available = budget > Chunk_size ? 
Chunk_size : budget; + left = caml_sweep(domain_state->shared_heap, available); + budget -= available - left; + sweep_work += available - left; - In order to finish marking with a non-empty free list, we will - use 40% of the time for marking, and 60% for sweeping. + if (budget > 0 && available == left) { + domain_state->sweeping_done = 1; + atomic_fetch_add_verify_ge0(&num_domains_to_sweep, -1); + } - Let MT be the time spent marking, ST the time spent sweeping, and TT - the total time for this cycle. We have: - MT = 40/100 * TT - ST = 60/100 * TT + if (mode == Slice_interruptible && caml_incoming_interrupts_queued()) + { + interrupted_budget = budget; + budget = 0; + } + } while (budget > 0 && available != left); - Amount of time to spend on this slice: - T = P * TT = P * MT / (40/100) = P * ST / (60/100) - - Since we must do MW work in MT time or SW work in ST time, the amount - of work for this slice is: - MS = P * MW / (40/100) if marking - SS = P * SW / (60/100) if sweeping - - Amount of marking work for a marking slice: - MS = P * MW / (40/100) - MS = P * (Caml_state->stat_heap_wsz * 250 - / (100 + caml_percent_free) - + 2.5 * caml_incremental_roots_count) - Amount of sweeping work for a sweeping slice: - SS = P * SW / (60/100) - SS = P * Caml_state->stat_heap_wsz * 5 / 3 - - This slice will either mark MS words or sweep SS words. - */ + if (log_events) CAML_EV_END(EV_MAJOR_SWEEP); + } - if (caml_major_slice_begin_hook != NULL) (*caml_major_slice_begin_hook) (); +mark_again: + while (budget > 0) { + if (!domain_state->marking_done) { + if (!was_marking) { + if (log_events) CAML_EV_BEGIN(EV_MAJOR_MARK); + was_marking = 1; + } + available = budget > Chunk_size ? 
Chunk_size : budget; + left = mark(available); + budget -= available - left; + mark_work += available - left; + } else { + break; + } - p = (double) caml_allocated_words * 3.0 * (100 + caml_percent_free) - / Caml_state->stat_heap_wsz / caml_percent_free / 2.0; - if (caml_dependent_size > 0){ - dp = (double) caml_dependent_allocated * (100 + caml_percent_free) - / caml_dependent_size / caml_percent_free; - }else{ - dp = 0.0; + if (mode == Slice_interruptible && caml_incoming_interrupts_queued()) { + interrupted_budget = budget; + budget = 0; + } } - if (p < dp) p = dp; - if (p < caml_extra_heap_resources) p = caml_extra_heap_resources; - p += p_backlog; - p_backlog = 0.0; - if (p > 0.3){ - p_backlog = p - 0.3; - p = 0.3; + if (was_marking) { + if (log_events) CAML_EV_END(EV_MAJOR_MARK); + was_marking = 0; } - CAML_EV_COUNTER (EV_C_MAJOR_WORK_EXTRA, - (uintnat) (caml_extra_heap_resources * 1000000)); + if (mode != Slice_opportunistic) { + /* Finalisers */ + if (caml_gc_phase == Phase_mark_final && + caml_final_update_first(domain_state)) { + /* This domain has updated finalise first values */ + atomic_fetch_add_verify_ge0(&num_domains_to_final_update_first, -1); + if (budget > 0 && !domain_state->marking_done) + goto mark_again; + } - caml_gc_message (0x40, "ordered work = %" - ARCH_INTNAT_PRINTF_FORMAT "d words\n", howmuch); - caml_gc_message (0x40, "allocated_words = %" - ARCH_INTNAT_PRINTF_FORMAT "u\n", - caml_allocated_words); - caml_gc_message (0x40, "extra_heap_resources = %" - ARCH_INTNAT_PRINTF_FORMAT "uu\n", - (uintnat) (caml_extra_heap_resources * 1000000)); - caml_gc_message (0x40, "raw work-to-do = %" - ARCH_INTNAT_PRINTF_FORMAT "du\n", - (intnat) (p * 1000000)); - caml_gc_message (0x40, "work backlog = %" - ARCH_INTNAT_PRINTF_FORMAT "du\n", - (intnat) (p_backlog * 1000000)); - - for (i = 0; i < caml_major_window; i++){ - caml_major_ring[i] += p / caml_major_window; - } + if (caml_gc_phase == Phase_sweep_ephe && + caml_final_update_last(domain_state)) { + 
/* This domain has updated finalise last values */ + atomic_fetch_add_verify_ge0(&num_domains_to_final_update_last, -1); + /* Nothing has been marked while updating last */ + } - if (caml_gc_clock >= 1.0){ - caml_gc_clock -= 1.0; - ++caml_major_ring_index; - if (caml_major_ring_index >= caml_major_window){ - caml_major_ring_index = 0; +#ifdef DEBUG + orph_ephe_list_verify_status (caml_global_heap_state.MARKED); +#endif + caml_adopt_orphaned_work(); + + /* Ephemerons */ + saved_ephe_cycle = atomic_load_acq(&ephe_cycle_info.ephe_cycle); + if (domain_state->ephe_info->todo != (value) NULL && + saved_ephe_cycle > domain_state->ephe_info->cycle) { + CAML_EV_BEGIN(EV_MAJOR_EPHE_MARK); + budget = ephe_mark(budget, saved_ephe_cycle, EPHE_MARK_DEFAULT); + CAML_EV_END(EV_MAJOR_EPHE_MARK); + if (domain_state->ephe_info->todo == (value) NULL) + caml_ephe_todo_list_emptied (); + else if (budget > 0 && domain_state->marking_done) + record_ephe_marking_done(saved_ephe_cycle); + else if (budget > 0) goto mark_again; } - } - if (howmuch == -1){ - /* auto-triggered GC slice: spend work credit on the current bucket, - then do the remaining work, if any */ - /* Note that the minor GC guarantees that the major slice is called in - automatic mode (with [howmuch] = -1) at least once per clock tick. - This means we never leave a non-empty bucket behind. 
*/ - spend = fmin (caml_major_work_credit, - caml_major_ring[caml_major_ring_index]); - caml_major_work_credit -= spend; - filt_p = caml_major_ring[caml_major_ring_index] - spend; - caml_major_ring[caml_major_ring_index] = 0.0; - }else{ - /* forced GC slice: do work and add it to the credit */ - if (howmuch == 0){ - /* automatic setting: size of next bucket - we do not use the current bucket, as it may be empty */ - int i = caml_major_ring_index + 1; - if (i >= caml_major_window) i = 0; - filt_p = caml_major_ring[i]; - }else{ - /* manual setting */ - filt_p = (double) howmuch * 3.0 * (100 + caml_percent_free) - / Caml_state->stat_heap_wsz / caml_percent_free / 2.0; + + if (caml_gc_phase == Phase_sweep_ephe && + domain_state->ephe_info->todo != 0) { + CAML_EV_BEGIN(EV_MAJOR_EPHE_SWEEP); + budget = ephe_sweep (domain_state, budget); + CAML_EV_END(EV_MAJOR_EPHE_SWEEP); + if (domain_state->ephe_info->todo == 0) { + atomic_fetch_add_verify_ge0(&num_domains_to_ephe_sweep, -1); + } + } + + /* Complete GC phase */ + if (is_complete_phase_sweep_and_mark_main() || + is_complete_phase_mark_final ()) { + if (barrier_participants) { + try_complete_gc_phase (domain_state, + (void*)0, + participant_count, + barrier_participants); + } else { + caml_try_run_on_all_domains (&try_complete_gc_phase, 0, 0); + } + if (budget > 0) goto mark_again; } - caml_major_work_credit += filt_p; - /* Limit work credit to 1.0 */ - caml_major_work_credit = fmin(caml_major_work_credit, 1.0); } - p = filt_p; + if (log_events) CAML_EV_END(EV_MAJOR_SLICE); - caml_gc_message (0x40, "filtered work-to-do = %" - ARCH_INTNAT_PRINTF_FORMAT "du\n", - (intnat) (p * 1000000)); + caml_gc_log + ("Major slice [%c%c%c]: %ld work, %ld sweep, %ld mark (%lu blocks)", + collection_slice_mode_char(mode), + interrupted_budget == 0 ? '.' 
: '*', + caml_gc_phase_char(caml_gc_phase), + (long)computed_work, (long)sweep_work, (long)mark_work, + (unsigned long)(domain_state->stat_blocks_marked + - blocks_marked_before)); - if (caml_gc_phase == Phase_idle){ - if (Caml_state->young_ptr == Caml_state->young_alloc_end){ - /* We can only start a major GC cycle if the minor allocation arena - is empty, otherwise we'd have to treat it as a set of roots. */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS); - start_cycle (); - CAML_EV_END(EV_MAJOR_ROOTS); - } - p = 0; - goto finished; - } + /* we did: work we were asked - interrupted_budget + any overwork */ + commit_major_slice_work + (computed_work - interrupted_budget + (budget < 0 ? -budget : 0)); - if (p < 0){ - p = 0; - goto finished; - } + if (mode != Slice_opportunistic && is_complete_phase_sweep_ephe()) { + saved_major_cycle = caml_major_cycles_completed; + /* To handle the case where multiple domains try to finish the major + cycle simultaneously, we loop until the current cycle has ended, + ignoring whether caml_try_run_on_all_domains succeeds. 
*/ - if (caml_gc_phase == Phase_mark || caml_gc_phase == Phase_clean){ - computed_work = (intnat) (p * ((double) Caml_state->stat_heap_wsz * 250 - / (100 + caml_percent_free) - + caml_incremental_roots_count)); - }else{ - computed_work = (intnat) (p * Caml_state->stat_heap_wsz * 5 / 3); - } - caml_gc_message (0x40, "computed work = %" - ARCH_INTNAT_PRINTF_FORMAT "d words\n", computed_work); - if (caml_gc_phase == Phase_mark){ - CAML_EV_COUNTER (EV_C_MAJOR_WORK_MARK, computed_work); - CAML_EV_BEGIN(EV_MAJOR_MARK); - mark_slice (computed_work); - CAML_EV_END(EV_MAJOR_MARK); - caml_gc_message (0x02, "!"); - }else if (caml_gc_phase == Phase_clean){ - clean_slice (computed_work); - caml_gc_message (0x02, "%%"); - }else{ - CAMLassert (caml_gc_phase == Phase_sweep); - CAML_EV_COUNTER (EV_C_MAJOR_WORK_SWEEP, computed_work); - CAML_EV_BEGIN(EV_MAJOR_SWEEP); - sweep_slice (computed_work); - CAML_EV_END(EV_MAJOR_SWEEP); - caml_gc_message (0x02, "$"); - } - if (caml_gc_phase == Phase_idle){ - double previous_overhead; // overhead at the end of the previous cycle - - CAML_EV_BEGIN(EV_MAJOR_CHECK_AND_COMPACT); - caml_gc_message (0x200, "marked words = %" - ARCH_INTNAT_PRINTF_FORMAT "u words\n", - marked_words); - caml_gc_message (0x200, "heap size at start of cycle = %" - ARCH_INTNAT_PRINTF_FORMAT "u words\n", - heap_wsz_at_cycle_start); - if (marked_words == 0){ - previous_overhead = 1000000.; - caml_gc_message (0x200, "overhead at start of cycle = +inf\n"); - }else{ - previous_overhead = - 100.0 * (heap_wsz_at_cycle_start - marked_words) / marked_words; - caml_gc_message (0x200, "overhead at start of cycle = %.0f%%\n", - previous_overhead); + while (saved_major_cycle == caml_major_cycles_completed) { + if (barrier_participants) { + cycle_all_domains_callback + (domain_state, (void*)0, participant_count, barrier_participants); + } else { + caml_try_run_on_all_domains(&cycle_all_domains_callback, 0, 0); + } } - caml_compact_heap_maybe (previous_overhead); - 
CAML_EV_END(EV_MAJOR_CHECK_AND_COMPACT); } - finished: - caml_gc_message (0x40, "work-done = %" - ARCH_INTNAT_PRINTF_FORMAT "du\n", - (intnat) (p * 1000000)); - - /* if some of the work was not done, take it back from the credit - or spread it over the buckets. */ - p = filt_p - p; - spend = fmin (p, caml_major_work_credit); - caml_major_work_credit -= spend; - if (p > spend){ - p -= spend; - p /= caml_major_window; - for (i = 0; i < caml_major_window; i++) caml_major_ring[i] += p; - } + return interrupted_budget; +} - Caml_state->stat_major_words += caml_allocated_words; - caml_allocated_words = 0; - caml_dependent_allocated = 0; - caml_extra_heap_resources = 0.0; - if (caml_major_slice_end_hook != NULL) (*caml_major_slice_end_hook) (); +void caml_opportunistic_major_collection_slice(intnat howmuch) +{ + major_collection_slice(howmuch, 0, 0, Slice_opportunistic); } -/* This does not call [caml_compact_heap_maybe] because the estimates of - free and live memory are only valid for a cycle done incrementally. - Besides, this function itself is called by [caml_compact_heap_maybe]. 
-*/ -void caml_finish_major_cycle (void) +void caml_major_collection_slice(intnat howmuch) { - if (caml_gc_phase == Phase_idle){ - p_backlog = 0.0; /* full major GC cycle, the backlog becomes irrelevant */ - start_cycle (); + + /* if this is an auto-triggered GC slice, make it interruptible */ + if (howmuch == AUTO_TRIGGERED_MAJOR_SLICE) { + intnat interrupted_work = major_collection_slice( + AUTO_TRIGGERED_MAJOR_SLICE, + 0, + 0, + Slice_interruptible + ); + if (interrupted_work > 0) { + caml_gc_log("Major slice interrupted, rescheduling major slice"); + caml_request_major_slice(); + } + } else { + /* TODO: could make forced API slices interruptible, but would need to do + accounting or pass up interrupt */ + major_collection_slice(howmuch, 0, 0, Slice_uninterruptible); } - while (caml_gc_phase == Phase_mark) mark_slice (LONG_MAX); - while (caml_gc_phase == Phase_clean) clean_slice (LONG_MAX); - CAMLassert (caml_gc_phase == Phase_sweep); - CAMLassert (redarken_first_chunk == NULL); - while (caml_gc_phase == Phase_sweep) sweep_slice (LONG_MAX); - CAMLassert (caml_gc_phase == Phase_idle); - Caml_state->stat_major_words += caml_allocated_words; - caml_allocated_words = 0; } -/* Call this function to make sure [bsz] is greater than or equal - to both [Heap_chunk_min] and the current heap increment. -*/ -asize_t caml_clip_heap_chunk_wsz (asize_t wsz) +static void finish_major_cycle_callback (caml_domain_state* domain, void* arg, + int participating_count, + caml_domain_state** participating) { - asize_t result = wsz; - uintnat incr; + uintnat saved_major_cycles = (uintnat)arg; + CAMLassert (domain == Caml_state); - /* Compute the heap increment as a word size. 
*/ - if (caml_major_heap_increment > 1000){ - incr = caml_major_heap_increment; - }else{ - incr = Caml_state->stat_heap_wsz / 100 * caml_major_heap_increment; - } + caml_empty_minor_heap_no_major_slice_from_stw + (domain, (void*)0, participating_count, participating); - if (result < incr){ - result = incr; - } - if (result < Heap_chunk_min){ - result = Heap_chunk_min; + while (saved_major_cycles == caml_major_cycles_completed) { + major_collection_slice(10000000, participating_count, participating, + Slice_uninterruptible); } - return result; } -/* [heap_size] is a number of bytes */ -void caml_init_major_heap (asize_t heap_size) +void caml_finish_major_cycle (void) { - int i; + uintnat saved_major_cycles = caml_major_cycles_completed; - Caml_state->stat_heap_wsz = - caml_clip_heap_chunk_wsz (Wsize_bsize (heap_size)); - Caml_state->stat_top_heap_wsz = Caml_state->stat_heap_wsz; - CAMLassert (Bsize_wsize (Caml_state->stat_heap_wsz) % Page_size == 0); - caml_heap_start = - (char *) caml_alloc_for_heap (Bsize_wsize (Caml_state->stat_heap_wsz)); - if (caml_heap_start == NULL) - caml_fatal_error ("cannot allocate initial major heap"); - Chunk_next (caml_heap_start) = NULL; - Caml_state->stat_heap_wsz = Wsize_bsize (Chunk_size (caml_heap_start)); - Caml_state->stat_heap_chunks = 1; - Caml_state->stat_top_heap_wsz = Caml_state->stat_heap_wsz; - - if (caml_page_table_add(In_heap, caml_heap_start, - caml_heap_start + Bsize_wsize (Caml_state->stat_heap_wsz)) - != 0) { - caml_fatal_error ("cannot allocate initial page table"); + while( saved_major_cycles == caml_major_cycles_completed ) { + caml_try_run_on_all_domains + (&finish_major_cycle_callback, (void*)caml_major_cycles_completed, 0); } - - caml_fl_init_merge (); - caml_make_free_blocks ((value *) caml_heap_start, - Caml_state->stat_heap_wsz, 1, Caml_white); - caml_gc_phase = Phase_idle; - - Caml_state->mark_stack = caml_stat_alloc_noexc(sizeof(struct mark_stack)); - if (Caml_state->mark_stack == NULL) - caml_fatal_error 
("not enough memory for the mark stack"); - - Caml_state->mark_stack->stack = - caml_stat_alloc_noexc(MARK_STACK_INIT_SIZE * sizeof(mark_entry)); - - if(Caml_state->mark_stack->stack == NULL) - caml_fatal_error("not enough memory for the mark stack"); - - Caml_state->mark_stack->count = 0; - Caml_state->mark_stack->size = MARK_STACK_INIT_SIZE; - - caml_allocated_words = 0; - caml_extra_heap_resources = 0.0; - for (i = 0; i < Max_major_window; i++) caml_major_ring[i] = 0.0; } -void caml_set_major_window (int w){ - uintnat total = 0; - int i; - if (w == caml_major_window) return; - CAMLassert (w <= Max_major_window); - /* Collect the current work-to-do from the buckets. */ - for (i = 0; i < caml_major_window; i++){ - total += caml_major_ring[i]; - } - /* Redistribute to the new buckets. */ - for (i = 0; i < w; i++){ - caml_major_ring[i] = total / w; +void caml_empty_mark_stack (void) +{ + while (!Caml_state->marking_done){ + mark(1000); + caml_handle_incoming_interrupts(); } - caml_major_window = w; + + if (Caml_state->stat_blocks_marked) + caml_gc_log("Finished marking major heap. 
Marked %u blocks", + (unsigned)Caml_state->stat_blocks_marked); + Caml_state->stat_blocks_marked = 0; } -void caml_finalise_heap (void) +void caml_finish_marking (void) { - /* Finishing major cycle (all values become white) */ - caml_empty_minor_heap (); - caml_gc_message (0x1, "Finishing major GC cycle (finalising heap)\n"); - caml_finish_major_cycle (); - CAMLassert (caml_gc_phase == Phase_idle); - - /* Finalising all values (by means of forced sweeping) */ - caml_fl_init_merge (); - caml_gc_phase = Phase_sweep; - sweep_chunk = caml_heap_start; - caml_gc_sweep_hp = sweep_chunk; - while (caml_gc_phase == Phase_sweep) - sweep_slice (LONG_MAX); + if (!Caml_state->marking_done) { + CAML_EV_BEGIN(EV_MAJOR_FINISH_MARKING); + caml_empty_mark_stack(); + caml_shrink_mark_stack(); + Caml_state->stat_major_words += Caml_state->allocated_words; + Caml_state->allocated_words = 0; + CAML_EV_END(EV_MAJOR_FINISH_MARKING); + } } -#if defined(NAKED_POINTERS_CHECKER) && defined(NATIVE_CODE) - -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#include - -Caml_inline int safe_load(volatile header_t * p, header_t * result) +void caml_finish_sweeping (void) { - header_t v; - __try { - v = *p; - } - __except(GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION ? 
- EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) { - *result = 0xdeadbeef; - return 0; + if (Caml_state->sweeping_done) return; + CAML_EV_BEGIN(EV_MAJOR_FINISH_SWEEPING); + while (!Caml_state->sweeping_done) { + if (caml_sweep(Caml_state->shared_heap, 10) > 0) { + /* just finished sweeping */ + CAMLassert(Caml_state->sweeping_done == 0); + Caml_state->sweeping_done = 1; + atomic_fetch_add_verify_ge0(&num_domains_to_sweep, -1); + break; + } + caml_handle_incoming_interrupts(); } - *result = v; - return 1; + CAML_EV_END(EV_MAJOR_FINISH_SWEEPING); } -#elif defined(TARGET_amd64) - -Caml_inline int safe_load (header_t * addr, /*out*/ header_t * contents) +static struct pool* find_pool_to_rescan(void) { - int ok; - header_t h; - intnat tmp; - - asm volatile( - "leaq 1f(%%rip), %[tmp] \n\t" - "movq %[tmp], 0(%[handler]) \n\t" - "xorl %[ok], %[ok] \n\t" - "movq 0(%[addr]), %[h] \n\t" - "movl $1, %[ok] \n\t" - "1: \n\t" - "xorq %[tmp], %[tmp] \n\t" - "movq %[tmp], 0(%[handler])" - : [tmp] "=&r" (tmp), [ok] "=&r" (ok), [h] "=&r" (h) - : [addr] "r" (addr), - [handler] "r" (&(Caml_state->checking_pointer_pc))); - *contents = h; - return ok; -} + struct pool* p; + + if (Caml_state->pools_to_rescan_count > 0) { + p = Caml_state->pools_to_rescan[--Caml_state->pools_to_rescan_count]; + caml_gc_log + ("Redarkening pool %p (%d others left)", + p, + Caml_state->pools_to_rescan_count); + } else { + p = 0; + } -#elif defined(TARGET_arm64) + return p; +} -Caml_inline int safe_load (header_t * addr, /*out*/ header_t * contents) +static void mark_stack_prune (struct mark_stack* stk) { - int ok; - header_t h; - intnat tmp; - - asm volatile( - "adr %[tmp], 1f \n\t" - "str %[tmp], [%[handler]] \n\t" - "mov %w[ok], #0 \n\t" - "ldr %[h], [%[addr]] \n\t" - "mov %w[ok], #1 \n\t" - "1: \n\t" - "mov %[tmp], #0 \n\t" - "str %[tmp], [%[handler]]" - : [tmp] "=&r" (tmp), [ok] "=&r" (ok), [h] "=&r" (h) - : [addr] "r" (addr), - [handler] "r" (&(Caml_state->checking_pointer_pc))); - *contents = 
h; - return ok; -} + int entry_idx, large_idx = 0; + mark_entry* mark_stack = stk->stack; -#else -#error "NAKED_POINTERS_CHECKER not supported on this platform" -#endif + struct skiplist chunk_sklist = SKIPLIST_STATIC_INITIALIZER; + /* Insert used pools into skiplist */ + for(entry_idx = 0; entry_idx < stk->count; entry_idx++){ + mark_entry me = mark_stack[entry_idx]; + struct pool* pool = caml_pool_of_shared_block(me.block); + if (!pool) { + // This could be a large allocation - which is off-heap. Hold on to it. + mark_stack[large_idx++] = me; + continue; + } + caml_skiplist_insert(&chunk_sklist, (uintnat)pool, + (uintnat)pool + sizeof(pool)); + } -static void is_naked_pointer_safe (value v, value *p) -{ - header_t h; - tag_t t; + /* Traverse through entire skiplist and put it into pools to rescan */ + FOREACH_SKIPLIST_ELEMENT(e, &chunk_sklist, { + if(Caml_state->pools_to_rescan_len == Caml_state->pools_to_rescan_count){ + Caml_state->pools_to_rescan_len = + Caml_state->pools_to_rescan_len * 2 + 128; - /* The following conditions were checked by the caller */ - CAMLassert(Is_block(v) && !Is_young(v) && !Is_in_heap(v)); + Caml_state->pools_to_rescan = + caml_stat_resize( + Caml_state->pools_to_rescan, + Caml_state->pools_to_rescan_len * sizeof(struct pool *)); + } + Caml_state->pools_to_rescan[Caml_state->pools_to_rescan_count++] + = (struct pool*) (e->key);; + }); + + caml_gc_log( + "Mark stack overflow. Postponing %d pools", + Caml_state->pools_to_rescan_count); - if (! safe_load(&Hd_val(v), &h)) goto on_segfault; + stk->count = large_idx; +} - t = Tag_hd(h); - if (t == Infix_tag) { - v -= Infix_offset_hd(h); - if (! 
safe_load(&Hd_val(v), &h)) goto on_segfault; - t = Tag_hd(h); +int caml_init_major_gc(caml_domain_state* d) { + Caml_state->mark_stack = caml_stat_alloc_noexc(sizeof(struct mark_stack)); + if(Caml_state->mark_stack == NULL) { + return -1; + } + Caml_state->mark_stack->stack = + caml_stat_alloc_noexc(MARK_STACK_INIT_SIZE * sizeof(mark_entry)); + if(Caml_state->mark_stack->stack == NULL) { + caml_stat_free(Caml_state->mark_stack); + Caml_state->mark_stack = NULL; + return -1; } + Caml_state->mark_stack->count = 0; + Caml_state->mark_stack->size = MARK_STACK_INIT_SIZE; + /* Fresh domains do not need to performing marking or sweeping. */ + d->sweeping_done = 1; + d->marking_done = 1; + d->major_work_computed = 0; + d->major_work_todo = 0; + d->major_gc_clock = 0.0; + /* Finalisers. Fresh domains participate in updating finalisers. */ + d->final_info = caml_alloc_final_info (); + if(d->final_info == NULL) { + caml_stat_free(Caml_state->mark_stack->stack); + caml_stat_free(Caml_state->mark_stack); + return -1; + } + d->ephe_info = caml_alloc_ephe_info(); + if(d->ephe_info == NULL) { + caml_stat_free(d->final_info); + caml_stat_free(Caml_state->mark_stack->stack); + caml_stat_free(Caml_state->mark_stack); + d->final_info = NULL; + Caml_state->mark_stack = NULL; + return -1; + } + atomic_fetch_add(&num_domains_to_final_update_first, 1); + atomic_fetch_add(&num_domains_to_final_update_last, 1); - /* For the out-of-heap pointer to be considered safe, - * it should have a black header and its size should be < 2 ** 40 - * words (128 GB). If not, we report a warning. 
*/ - if (Is_black_hd(h) && Wosize_hd(h) < (INT64_LITERAL(1) << 40)) - return; + Caml_state->pools_to_rescan = + caml_stat_alloc_noexc(INITIAL_POOLS_TO_RESCAN_LEN * sizeof(struct pool*)); + Caml_state->pools_to_rescan_len = INITIAL_POOLS_TO_RESCAN_LEN; + Caml_state->pools_to_rescan_count = 0; - caml_naked_pointers_detected = 1; - if (!Is_black_hd(h)) { - fprintf (stderr, "Out-of-heap pointer at %p of value %p has " - "non-black head (tag=%d)\n", p, (void*)v, t); - } else { - fprintf (stderr, - "Out-of-heap pointer at %p of value %p has " - "suspiciously large size: %" ARCH_INT64_PRINTF_FORMAT "u words\n", - p, (void*)v, Wosize_hd(h)); - } - return; + return 0; +} - on_segfault: - caml_naked_pointers_detected = 1; - fprintf (stderr, "Out-of-heap pointer at %p of value %p. " - "Cannot read head.\n", p, (void*)v); +void caml_teardown_major_gc(void) { + CAMLassert(Caml_state->mark_stack->count == 0); + caml_stat_free(Caml_state->mark_stack->stack); + caml_stat_free(Caml_state->mark_stack); + if( Caml_state->pools_to_rescan_len > 0 ) + caml_stat_free(Caml_state->pools_to_rescan); + Caml_state->mark_stack = NULL; } -#endif +void caml_finalise_heap (void) +{ + return; +} diff --git a/runtime/memory.c b/runtime/memory.c index 66d1c50ccb2d..35fea5bcb8a5 100644 --- a/runtime/memory.c +++ b/runtime/memory.c @@ -15,648 +15,347 @@ #define CAML_INTERNALS -#include #include +#include +#include #include #include -#include "caml/address_class.h" #include "caml/config.h" +#include "caml/misc.h" #include "caml/fail.h" -#include "caml/freelist.h" -#include "caml/gc.h" -#include "caml/gc_ctrl.h" -#include "caml/major_gc.h" #include "caml/memory.h" #include "caml/major_gc.h" -#include "caml/minor_gc.h" -#include "caml/misc.h" -#include "caml/mlvalues.h" #include "caml/signals.h" -#include "caml/memprof.h" +#include "caml/shared_heap.h" +#include "caml/domain.h" +#include "caml/roots.h" +#include "caml/alloc.h" +#include "caml/fiber.h" +#include "caml/platform.h" #include 
"caml/eventlog.h" -int caml_huge_fallback_count = 0; -/* Number of times that mmapping big pages fails and we fell back to small - pages. This counter is available to the program through - [Gc.huge_fallback_count]. -*/ - -uintnat caml_use_huge_pages = 0; -/* True iff the program allocates heap chunks by mmapping huge pages. - This is set when parsing [OCAMLRUNPARAM] and must stay constant - after that. +/* Note [MM]: Enforcing the memory model. + + Multicore OCaml implements the memory consistency model defined in + + Bounding Data Races in Space and Time (PLDI '18) + Stephen Dolan, KC Sivaramakrishnan, Anil Madhavapeddy. + + Unlike the C++ (also used in C11) memory model, this model gives + well-defined behaviour to data races, ensuring that they do not + affect unrelated computations. In C++, plain (non-atomic) accesses + have undefined semantics if they race, so it is necessary to use at + least relaxed atomics to implement all accesses. + + However, simply using C++ relaxed atomics for non-atomic accesses + and C++ SC atomics for atomic ones is not enough, since the OCaml + memory model is stronger. The prototypical example where C++ + exhibits a behaviour not allowed by OCaml is below. Assume that the + reference b and the atomic reference a are initially 0: + + Thread 1 Thread 2 + Atomic.set a 1; let x = !b in + b := 1 let y = Atomic.get a in + ... + Outcome: x = 1, y = 0 + + This outcome is not permitted by the OCaml memory model, as can be + seen from the operational model: if !b sees the write b := 1, then + the Atomic.set must have executed before the Atomic.get, and since + it is atomic the most recent set must be returned by the get, + yielding y = 1. In the equivalent axiomatic model, this would be a + violation of Causality. + + If this example is naively translated to C++ (using atomic_{load, + store} for atomics, and atomic_{load, store}_explicit(..., + memory_order_relaxed) for nonatomics), then this outcome becomes + possible. 
The C++ model specifies that there is a total order on SC + accesses, but this total order is surprisingly weak. In this + example, we can have: + + x = !b ... + [happens-before] + y = Atomic.get a + [SC-before] + Atomic.set a 1 + [happens-before] + b := 1 + + Sadly, the composition of happens-before and SC-before does not add + up to anything useful, and the C++ model permits the read 'x = !b' + to read from the write 'b := 1' in this example, allowing the + outcome above. + + To remedy this, we need to strengthen the relaxed accesses used for + non-atomic loads and stores. The most straightforward way to do + this is to use acquire loads and release stores instead of relaxed + for non-atomic accesses, which ensures that all reads-from edges + appear in the C++ synchronises-with relation, outlawing the outcome + above. + + Using release stores for all writes also ensures publication safety + for newly-allocated objects, and isn't necessary for initialising + writes. The cost is free on x86, but requires a fence in + caml_modify on weakly-ordered architectures (ARM, Power). + + However, instead of using acquire loads for all reads, an + optimisation is possible. (Optimising reads is more important than + optimising writes because reads are vastly more common). The OCaml + memory model does not require ordering between non-atomic reads, + which acquire loads provide. The acquire semantics are only + necessary between a non-atomic read and an atomic access or a + write, so we delay the acquire fence until one of those operations + occurs. + + So, our non-atomic reads are implemented as standard relaxed loads, + but non-atomic writes and atomic operations (in this file, below) + contain an odd-looking line: + + atomic_thread_fence(memory_order_acquire) + + which serves to upgrade previous relaxed loads to acquire loads. + This encodes the OCaml memory model in the primitives provided by + the C++ model. 
+ + On x86, all loads and all stores have acquire/release semantics by + default anyway, so all of these fences compile away to nothing + (They're still useful, though: they serve to inhibit an overeager C + compiler's optimisations). On ARMv8, actual hardware fences are + generated. */ -extern uintnat caml_percent_free; /* major_gc.c */ - -/* Page table management */ - -#define Page(p) ((uintnat) (p) >> Page_log) -#define Page_mask ((~(uintnat)0) << Page_log) - -#ifdef ARCH_SIXTYFOUR - -/* 64-bit implementation: - The page table is represented sparsely as a hash table - with linear probing */ - -struct page_table { - mlsize_t size; /* size == 1 << (wordsize - shift) */ - int shift; - mlsize_t mask; /* mask == size - 1 */ - mlsize_t occupancy; - uintnat * entries; /* [size] */ -}; - -static struct page_table caml_page_table; - -/* Page table entries are the logical 'or' of - - the key: address of a page (low Page_log bits = 0) - - the data: a 8-bit integer */ - -#define Page_entry_matches(entry,addr) \ - ((((entry) ^ (addr)) & Page_mask) == 0) - -/* Multiplicative Fibonacci hashing - (Knuth, TAOCP vol 3, section 6.4, page 518). - HASH_FACTOR is (sqrt(5) - 1) / 2 * 2^wordsize. 
*/ -#ifdef ARCH_SIXTYFOUR -#define HASH_FACTOR 11400714819323198486UL -#else -#define HASH_FACTOR 2654435769UL -#endif -#define Hash(v) (((v) * HASH_FACTOR) >> caml_page_table.shift) - -int caml_page_table_lookup(void * addr) -{ - uintnat h, e; - - h = Hash(Page(addr)); - /* The first hit is almost always successful, so optimize for this case */ - e = caml_page_table.entries[h]; - if (Page_entry_matches(e, (uintnat)addr)) return e & 0xFF; - while(1) { - if (e == 0) return 0; - h = (h + 1) & caml_page_table.mask; - e = caml_page_table.entries[h]; - if (Page_entry_matches(e, (uintnat)addr)) return e & 0xFF; - } -} - -int caml_page_table_initialize(mlsize_t bytesize) -{ - uintnat pagesize = Page(bytesize); - - caml_page_table.size = 1; - caml_page_table.shift = 8 * sizeof(uintnat); - /* Aim for initial load factor between 1/4 and 1/2 */ - while (caml_page_table.size < 2 * pagesize) { - caml_page_table.size <<= 1; - caml_page_table.shift -= 1; - } - caml_page_table.mask = caml_page_table.size - 1; - caml_page_table.occupancy = 0; - caml_page_table.entries = - caml_stat_calloc_noexc(caml_page_table.size, sizeof(uintnat)); - if (caml_page_table.entries == NULL) - return -1; - else - return 0; -} - -static int caml_page_table_resize(void) -{ - struct page_table old = caml_page_table; - uintnat * new_entries; - uintnat i, h; - - caml_gc_message (0x08, "Growing page table to %" - ARCH_INTNAT_PRINTF_FORMAT "u entries\n", - caml_page_table.size); - - new_entries = caml_stat_calloc_noexc(2 * old.size, sizeof(uintnat)); - if (new_entries == NULL) { - caml_gc_message (0x08, "No room for growing page table\n"); - return -1; - } - - caml_page_table.size = 2 * old.size; - caml_page_table.shift = old.shift - 1; - caml_page_table.mask = caml_page_table.size - 1; - caml_page_table.occupancy = old.occupancy; - caml_page_table.entries = new_entries; - - for (i = 0; i < old.size; i++) { - uintnat e = old.entries[i]; - if (e == 0) continue; - h = Hash(Page(e)); - while 
(caml_page_table.entries[h] != 0) - h = (h + 1) & caml_page_table.mask; - caml_page_table.entries[h] = e; - } - - caml_stat_free(old.entries); - return 0; -} - -static int caml_page_table_modify(uintnat page, int toclear, int toset) -{ - uintnat h; - - CAMLassert ((page & ~Page_mask) == 0); - - /* Resize to keep load factor below 1/2 */ - if (caml_page_table.occupancy * 2 >= caml_page_table.size) { - if (caml_page_table_resize() != 0) return -1; - } - h = Hash(Page(page)); - while (1) { - if (caml_page_table.entries[h] == 0) { - caml_page_table.entries[h] = page | toset; - caml_page_table.occupancy++; - break; - } - if (Page_entry_matches(caml_page_table.entries[h], page)) { - caml_page_table.entries[h] = - (caml_page_table.entries[h] & ~toclear) | toset; - break; - } - h = (h + 1) & caml_page_table.mask; - } - return 0; -} - -#else - -/* 32-bit implementation: - The page table is represented as a 2-level array of unsigned char */ - -CAMLexport unsigned char * caml_page_table[Pagetable1_size]; -static unsigned char caml_page_table_empty[Pagetable2_size] = { 0, }; - -int caml_page_table_initialize(mlsize_t bytesize) -{ - int i; - for (i = 0; i < Pagetable1_size; i++) - caml_page_table[i] = caml_page_table_empty; - return 0; -} - -static int caml_page_table_modify(uintnat page, int toclear, int toset) +Caml_inline void write_barrier( + value obj, intnat field, value old_val, value new_val) { - uintnat i = Pagetable_index1(page); - uintnat j = Pagetable_index2(page); - - if (caml_page_table[i] == caml_page_table_empty) { - unsigned char * new_tbl = caml_stat_calloc_noexc(Pagetable2_size, 1); - if (new_tbl == 0) return -1; - caml_page_table[i] = new_tbl; - } - caml_page_table[i][j] = (caml_page_table[i][j] & ~toclear) | toset; - return 0; -} + /* HACK: can't assert when get old C-api style pointers + CAMLassert (Is_block(obj)); */ -#endif - -int caml_page_table_add(int kind, void * start, void * end) -{ - uintnat pstart = (uintnat) start & Page_mask; - uintnat pend = 
((uintnat) end - 1) & Page_mask; - uintnat p; + if (!Is_young(obj)) { - for (p = pstart; p <= pend; p += Page_size) - if (caml_page_table_modify(p, 0, kind) != 0) return -1; - return 0; + if (Is_block(old_val)) { + /* if old is in the minor heap, + then this is in a remembered set already */ + if (Is_young(old_val)) return; + /* old is a block and in the major heap */ + caml_darken(0, old_val, 0); + } + /* this update is creating a new link from major to minor, remember it */ + if (Is_block_and_young(new_val)) { + Ref_table_add(&Caml_state->minor_tables->major_ref, Op_val(obj) + field); + } + } } -int caml_page_table_remove(int kind, void * start, void * end) +CAMLexport CAMLweakdef void caml_modify (value *fp, value val) { - uintnat pstart = (uintnat) start & Page_mask; - uintnat pend = ((uintnat) end - 1) & Page_mask; - uintnat p; + write_barrier((value)fp, 0, *fp, val); - for (p = pstart; p <= pend; p += Page_size) - if (caml_page_table_modify(p, kind, 0) != 0) return -1; - return 0; + /* See Note [MM] above */ + atomic_thread_fence(memory_order_acquire); + atomic_store_explicit(&Op_atomic_val((value)fp)[0], val, + memory_order_release); } -/* Allocate a block of the requested size, to be passed to - [caml_add_to_heap] later. - [request] will be rounded up to some implementation-dependent size. - The caller must use [Chunk_size] on the result to recover the actual - size. - Return NULL if the request cannot be satisfied. The returned pointer - is a hp, but the header (and the contents) must be initialized by the - caller. +/* Dependent memory is all memory blocks allocated out of the heap + that depend on the GC (and finalizers) for deallocation. + For the GC to take dependent memory into account when computing + its automatic speed setting, + you must call [caml_alloc_dependent_memory] when you allocate some + dependent memory, and [caml_free_dependent_memory] when you + free it. 
In both cases, you pass as argument the size (in bytes) + of the block being allocated or freed. */ -char *caml_alloc_for_heap (asize_t request) +CAMLexport void caml_alloc_dependent_memory (mlsize_t nbytes) { - char *mem; - if (caml_use_huge_pages){ -#ifndef HAS_HUGE_PAGES - return NULL; -#else - uintnat size = Round_mmap_size (sizeof (heap_chunk_head) + request); - void *block; - block = mmap (NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); - if (block == MAP_FAILED) return NULL; - mem = (char *) block + sizeof (heap_chunk_head); - Chunk_size (mem) = size - sizeof (heap_chunk_head); - Chunk_block (mem) = block; -#endif - }else{ - void *block; - - request = ((request + Page_size - 1) >> Page_log) << Page_log; - mem = caml_stat_alloc_aligned_noexc (request + sizeof (heap_chunk_head), - sizeof (heap_chunk_head), &block); - if (mem == NULL) return NULL; - mem += sizeof (heap_chunk_head); - Chunk_size (mem) = request; - Chunk_block (mem) = block; - } - Chunk_head (mem)->redarken_first.start = (value*)(mem + Chunk_size(mem)); - Chunk_head (mem)->redarken_first.end = (value*)(mem + Chunk_size(mem)); - Chunk_head (mem)->redarken_end = (value*)mem; - return mem; + Caml_state->dependent_size += nbytes / sizeof (value); + Caml_state->dependent_allocated += nbytes / sizeof (value); } -/* Use this function to free a block allocated with [caml_alloc_for_heap] - if you don't add it with [caml_add_to_heap]. 
-*/ -void caml_free_for_heap (char *mem) +CAMLexport void caml_free_dependent_memory (mlsize_t nbytes) { - if (caml_use_huge_pages){ -#ifdef HAS_HUGE_PAGES - munmap (Chunk_block (mem), Chunk_size (mem) + sizeof (heap_chunk_head)); -#else - CAMLassert (0); -#endif + if (Caml_state->dependent_size < nbytes / sizeof (value)){ + Caml_state->dependent_size = 0; }else{ - caml_stat_free (Chunk_block (mem)); + Caml_state->dependent_size -= nbytes / sizeof (value); } } -/* Take a chunk of memory as argument, which must be the result of a - call to [caml_alloc_for_heap], and insert it into the heap chaining. - The contents of the chunk must be a sequence of valid blocks and - fragments: no space between blocks and no trailing garbage. If - some blocks are blue, they must be added to the free list by the - caller. All other blocks must have the color [caml_allocation_color(m)]. - The caller must update [caml_allocated_words] if applicable. - Return value: 0 if no error; -1 in case of error. - - See also: caml_compact_heap, which duplicates most of this function. +/* Use this function to tell the major GC to speed up when you use + finalized blocks to automatically deallocate resources (other + than memory). The GC will do at least one cycle every [max] + allocated resources; [res] is the number of resources allocated + this time. + Note that only [res/max] is relevant. The units (and kind of + resource) can change between calls to [caml_adjust_gc_speed]. */ -int caml_add_to_heap (char *m) +CAMLexport void caml_adjust_gc_speed (mlsize_t res, mlsize_t max) { -#ifdef DEBUG - /* Should check the contents of the block. */ -#endif /* DEBUG */ - - caml_gc_message (0x04, "Growing heap to %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - (Bsize_wsize (Caml_state->stat_heap_wsz) + Chunk_size (m)) / 1024); - - /* Register block in page table */ - if (caml_page_table_add(In_heap, m, m + Chunk_size(m)) != 0) - return -1; - - /* Chain this heap chunk. 
*/ - { - char **last = &caml_heap_start; - char *cur = *last; - - while (cur != NULL && cur < m){ - last = &(Chunk_next (cur)); - cur = *last; - } - Chunk_next (m) = cur; - *last = m; - - ++ Caml_state->stat_heap_chunks; - } - - Caml_state->stat_heap_wsz += Wsize_bsize (Chunk_size (m)); - if (Caml_state->stat_heap_wsz > Caml_state->stat_top_heap_wsz){ - Caml_state->stat_top_heap_wsz = Caml_state->stat_heap_wsz; + if (max == 0) max = 1; + if (res > max) res = max; + Caml_state->extra_heap_resources += (double) res / (double) max; + if (Caml_state->extra_heap_resources > 1.0){ + Caml_state->extra_heap_resources = 1.0; + caml_request_major_slice (); } - return 0; } -/* Allocate more memory from malloc for the heap. - Return a blue block of at least the requested size. - The blue block is chained to a sequence of blue blocks (through their - field 0); the last block of the chain is pointed by field 1 of the - first. There may be a fragment after the last block. - The caller must insert the blocks into the free list. - [request] is a number of words and must be less than or equal - to [Max_wosize]. - Return NULL when out of memory. -*/ -static value *expand_heap (mlsize_t request) +/* You must use [caml_intialize] to store the initial value in a field of a + block, unless you are sure the value is not a young block, in which case a + plain assignment would do. + + [caml_initialize] never calls the GC, so you may call it while a block is + unfinished (i.e. just after a call to [caml_alloc_shr].) */ +CAMLexport CAMLweakdef void caml_initialize (value *fp, value val) { - /* these point to headers, but we do arithmetic on them, hence [value *]. 
*/ - value *mem, *hp, *prev; - asize_t over_request, malloc_request, remain; - - CAMLassert (request <= Max_wosize); - over_request = request + request / 100 * caml_percent_free; - malloc_request = caml_clip_heap_chunk_wsz (over_request); - mem = (value *) caml_alloc_for_heap (Bsize_wsize (malloc_request)); - if (mem == NULL){ - caml_gc_message (0x04, "No room for growing heap\n"); - return NULL; - } - remain = Wsize_bsize (Chunk_size (mem)); - prev = hp = mem; - /* FIXME find a way to do this with a call to caml_make_free_blocks */ - while (Wosize_whsize (remain) > Max_wosize){ - Hd_hp (hp) = Make_header (Max_wosize, 0, Caml_blue); -#ifdef DEBUG - caml_set_fields (Val_hp (hp), 0, Debug_free_major); -#endif - hp += Whsize_wosize (Max_wosize); - remain -= Whsize_wosize (Max_wosize); - Field (Val_hp (mem), 1) = Field (Val_hp (prev), 0) = Val_hp (hp); - prev = hp; - } - if (remain > 1){ - Hd_hp (hp) = Make_header (Wosize_whsize (remain), 0, Caml_blue); #ifdef DEBUG - caml_set_fields (Val_hp (hp), 0, Debug_free_major); + if (Is_young((value)fp)) + CAMLassert(*fp == Debug_uninit_minor || *fp == Val_unit); + else + CAMLassert(*fp == Debug_uninit_major || *fp == Val_unit); #endif - Field (Val_hp (mem), 1) = Field (Val_hp (prev), 0) = Val_hp (hp); - Field (Val_hp (hp), 0) = (value) NULL; - }else{ - Field (Val_hp (prev), 0) = (value) NULL; - if (remain == 1) { - Hd_hp (hp) = Make_header (0, 0, Caml_white); + *fp = val; + if (!Is_young((value)fp) && Is_block_and_young (val)) + Ref_table_add(&Caml_state->minor_tables->major_ref, fp); +} + +CAMLexport int caml_atomic_cas_field ( + value obj, intnat field, value oldval, value newval) +{ + if (caml_domain_alone()) { + /* non-atomic CAS since only this thread can access the object */ + value* p = &Field(obj, field); + if (*p == oldval) { + *p = newval; + write_barrier(obj, field, oldval, newval); + return 1; + } else { + return 0; + } + } else { + /* need a real CAS */ + atomic_value* p = &Op_atomic_val(obj)[field]; + if 
(atomic_compare_exchange_strong(p, &oldval, newval)) { + write_barrier(obj, field, oldval, newval); + return 1; + } else { + return 0; } } - CAMLassert (Wosize_hp (mem) >= request); - if (caml_add_to_heap ((char *) mem) != 0){ - caml_free_for_heap ((char *) mem); - return NULL; - } - return Op_hp (mem); } -/* Remove the heap chunk [chunk] from the heap and give the memory back - to [free]. -*/ -void caml_shrink_heap (char *chunk) -{ - char **cp; - - /* Never deallocate the first chunk, because caml_heap_start is both the - first block and the base address for page numbers, and we don't - want to shift the page table, it's too messy (see above). - It will never happen anyway, because of the way compaction works. - (see compact.c) - XXX FIXME this has become false with the fix to PR#5389 (see compact.c) - */ - if (chunk == caml_heap_start) return; - - Caml_state->stat_heap_wsz -= Wsize_bsize (Chunk_size (chunk)); - caml_gc_message (0x04, "Shrinking heap to %" - ARCH_INTNAT_PRINTF_FORMAT "dk words\n", - Caml_state->stat_heap_wsz / 1024); -#ifdef DEBUG - { - mlsize_t i; - for (i = 0; i < Wsize_bsize (Chunk_size (chunk)); i++){ - ((value *) chunk) [i] = Debug_free_shrink; - } +CAMLprim value caml_atomic_load (value ref) +{ + if (caml_domain_alone()) { + return Field(ref, 0); + } else { + value v; + /* See Note [MM] above */ + atomic_thread_fence(memory_order_acquire); + v = atomic_load(Op_atomic_val(ref)); + return v; } -#endif - - -- Caml_state->stat_heap_chunks; - - /* Remove [chunk] from the list of chunks. */ - cp = &caml_heap_start; - while (*cp != chunk) cp = &(Chunk_next (*cp)); - *cp = Chunk_next (chunk); - - /* Remove the pages of [chunk] from the page table. */ - caml_page_table_remove(In_heap, chunk, chunk + Chunk_size (chunk)); - - /* Free the [malloc] block that contains [chunk]. 
*/ - caml_free_for_heap (chunk); } -CAMLexport color_t caml_allocation_color (void *hp) +/* stores are implemented as exchanges */ +CAMLprim value caml_atomic_exchange (value ref, value v) { - if (caml_gc_phase == Phase_mark || caml_gc_phase == Phase_clean || - (caml_gc_phase == Phase_sweep && (char *)hp >= (char *)caml_gc_sweep_hp)){ - return Caml_black; - }else{ - CAMLassert (caml_gc_phase == Phase_idle - || (caml_gc_phase == Phase_sweep - && (char *)hp < (char *)caml_gc_sweep_hp)); - return Caml_white; + value ret; + if (caml_domain_alone()) { + ret = Field(ref, 0); + Field(ref, 0) = v; + } else { + /* See Note [MM] above */ + atomic_thread_fence(memory_order_acquire); + ret = atomic_exchange(Op_atomic_val(ref), v); } + write_barrier(ref, 0, ret, v); + return ret; } -Caml_inline value caml_alloc_shr_aux (mlsize_t wosize, tag_t tag, int track, - uintnat profinfo) +CAMLprim value caml_atomic_cas (value ref, value oldv, value newv) { - header_t *hp; - value *new_block; - - if (wosize > Max_wosize) return 0; - CAML_EV_ALLOC(wosize); - hp = caml_fl_allocate (wosize); - if (hp == NULL){ - new_block = expand_heap (wosize); - if (new_block == NULL) return 0; - caml_fl_add_blocks ((value) new_block); - hp = caml_fl_allocate (wosize); - } - - CAMLassert (Is_in_heap (Val_hp (hp))); - - /* Inline expansion of caml_allocation_color. 
*/ - if (caml_gc_phase == Phase_mark || caml_gc_phase == Phase_clean || - (caml_gc_phase == Phase_sweep && (char *)hp >= (char *)caml_gc_sweep_hp)){ - Hd_hp (hp) = Make_header_with_profinfo (wosize, tag, Caml_black, profinfo); - }else{ - CAMLassert (caml_gc_phase == Phase_idle - || (caml_gc_phase == Phase_sweep - && (char *)hp < (char *)caml_gc_sweep_hp)); - Hd_hp (hp) = Make_header_with_profinfo (wosize, tag, Caml_white, profinfo); - } - CAMLassert (Hd_hp (hp) - == Make_header_with_profinfo (wosize, tag, caml_allocation_color (hp), - profinfo)); - caml_allocated_words += Whsize_wosize (wosize); - if (caml_allocated_words > Caml_state->minor_heap_wsz){ - CAML_EV_COUNTER (EV_C_REQUEST_MAJOR_ALLOC_SHR, 1); - caml_request_major_slice (); - } -#ifdef DEBUG - { - uintnat i; - for (i = 0; i < wosize; i++){ - Field (Val_hp (hp), i) = Debug_uninit_major; + if (caml_domain_alone()) { + value* p = Op_val(ref); + if (*p == oldv) { + *p = newv; + write_barrier(ref, 0, oldv, newv); + return Val_int(1); + } else { + return Val_int(0); + } + } else { + atomic_value* p = &Op_atomic_val(ref)[0]; + if (atomic_compare_exchange_strong(p, &oldv, newv)) { + write_barrier(ref, 0, oldv, newv); + return Val_int(1); + } else { + return Val_int(0); } } -#endif - if(track) - caml_memprof_track_alloc_shr(Val_hp (hp)); - return Val_hp (hp); } -Caml_inline value check_oom(value v) +CAMLprim value caml_atomic_fetch_add (value ref, value incr) { - if (v == 0) { - if (Caml_state->in_minor_collection) - caml_fatal_error ("out of memory"); - else - caml_raise_out_of_memory (); + value ret; + if (caml_domain_alone()) { + value* p = Op_val(ref); + CAMLassert(Is_long(*p)); + ret = *p; + *p = Val_long(Long_val(ret) + Long_val(incr)); + /* no write barrier needed, integer write */ + } else { + atomic_value *p = &Op_atomic_val(ref)[0]; + ret = atomic_fetch_add(p, 2*Long_val(incr)); } - return v; -} - -CAMLexport value caml_alloc_shr_with_profinfo (mlsize_t wosize, tag_t tag, - intnat profinfo) -{ - return 
check_oom(caml_alloc_shr_aux(wosize, tag, 1, profinfo)); -} - -CAMLexport value caml_alloc_shr_for_minor_gc (mlsize_t wosize, - tag_t tag, header_t old_hd) -{ - return check_oom(caml_alloc_shr_aux(wosize, tag, 0, Profinfo_hd(old_hd))); -} - -CAMLexport value caml_alloc_shr (mlsize_t wosize, tag_t tag) -{ - return caml_alloc_shr_with_profinfo(wosize, tag, NO_PROFINFO); -} - -CAMLexport value caml_alloc_shr_no_track_noexc (mlsize_t wosize, tag_t tag) -{ - return caml_alloc_shr_aux(wosize, tag, 0, NO_PROFINFO); + return ret; } -/* Dependent memory is all memory blocks allocated out of the heap - that depend on the GC (and finalizers) for deallocation. - For the GC to take dependent memory into account when computing - its automatic speed setting, - you must call [caml_alloc_dependent_memory] when you allocate some - dependent memory, and [caml_free_dependent_memory] when you - free it. In both cases, you pass as argument the size (in bytes) - of the block being allocated or freed. -*/ -CAMLexport void caml_alloc_dependent_memory (mlsize_t nbytes) +CAMLexport void caml_set_fields (value obj, value v) { - caml_dependent_size += nbytes / sizeof (value); - caml_dependent_allocated += nbytes / sizeof (value); -} + int i; + CAMLassert (Is_block(obj)); -CAMLexport void caml_free_dependent_memory (mlsize_t nbytes) -{ - if (caml_dependent_size < nbytes / sizeof (value)){ - caml_dependent_size = 0; - }else{ - caml_dependent_size -= nbytes / sizeof (value); + for (i = 0; i < Wosize_val(obj); i++) { + caml_modify(&Field(obj, i), v); } } -/* Use this function to tell the major GC to speed up when you use - finalized blocks to automatically deallocate resources (other - than memory). The GC will do at least one cycle every [max] - allocated resources; [res] is the number of resources allocated - this time. - Note that only [res/max] is relevant. The units (and kind of - resource) can change between calls to [caml_adjust_gc_speed]. 
-*/ -CAMLexport void caml_adjust_gc_speed (mlsize_t res, mlsize_t max) +Caml_inline value alloc_shr(mlsize_t wosize, tag_t tag, int noexc) { - if (max == 0) max = 1; - if (res > max) res = max; - caml_extra_heap_resources += (double) res / (double) max; - if (caml_extra_heap_resources > 1.0){ - CAML_EV_COUNTER (EV_C_REQUEST_MAJOR_ADJUST_GC_SPEED, 1); - caml_extra_heap_resources = 1.0; - caml_request_major_slice (); + caml_domain_state *dom_st = Caml_state; + value *v = caml_shared_try_alloc(dom_st->shared_heap, wosize, tag, 0); + if (v == NULL) { + if (!noexc) + caml_raise_out_of_memory(); + else + return (value)NULL; + } + CAML_EV_ALLOC(wosize); + dom_st->allocated_words += Whsize_wosize(wosize); + if (dom_st->allocated_words > dom_st->minor_heap_wsz) { + CAML_EV_COUNTER (EV_C_REQUEST_MAJOR_ALLOC_SHR, 1); + caml_request_major_slice(); } -} -/* You must use [caml_initialize] to store the initial value in a field of - a shared block, unless you are sure the value is not a young block. - A block value [v] is a shared block if and only if [Is_in_heap (v)] - is true. -*/ -/* [caml_initialize] never calls the GC, so you may call it while a block is - unfinished (i.e. just after a call to [caml_alloc_shr].) */ -/* PR#6084 workaround: define it as a weak symbol */ -CAMLexport CAMLweakdef void caml_initialize (value *fp, value val) -{ - CAMLassert(Is_in_heap_or_young(fp)); - *fp = val; - if (!Is_young((value)fp) && Is_block (val) && Is_young (val)) { - add_to_ref_table (Caml_state->ref_table, fp); +#ifdef DEBUG + if (tag < No_scan_tag) { + mlsize_t i; + for (i = 0; i < wosize; i++) + Op_hp(v)[i] = Debug_uninit_major; } +#endif + return Val_hp(v); } -/* You must use [caml_modify] to change a field of an existing shared block, - unless you are sure the value being overwritten is not a shared block and - the value being written is not a young block. */ -/* [caml_modify] never calls the GC. 
*/ -/* [caml_modify] can also be used to do assignment on data structures that are - in the minor heap instead of in the major heap. In this case, it - is a bit slower than simple assignment. - In particular, you can use [caml_modify] when you don't know whether the - block being changed is in the minor heap or the major heap. */ -/* PR#6084 workaround: define it as a weak symbol */ - -CAMLexport CAMLweakdef void caml_modify (value *fp, value val) +CAMLexport value caml_alloc_shr(mlsize_t wosize, tag_t tag) { - /* The write barrier implemented by [caml_modify] checks for the - following two conditions and takes appropriate action: - 1- a pointer from the major heap to the minor heap is created - --> add [fp] to the remembered set - 2- a pointer from the major heap to the major heap is overwritten, - while the GC is in the marking phase - --> call [caml_darken] on the overwritten pointer so that the - major GC treats it as an additional root. - - The logic implemented below is duplicated in caml_array_fill to - avoid repeated calls to caml_modify and repeated tests on the - values. Don't forget to update caml_array_fill if the logic - below changes! - */ - value old; - - if (Is_young((value)fp)) { - /* The modified object resides in the minor heap. - Conditions 1 and 2 cannot occur. */ - *fp = val; - } else { - /* The modified object resides in the major heap. */ - CAMLassert(Is_in_heap(fp)); - old = *fp; - *fp = val; - if (Is_block(old)) { - /* If [old] is a pointer within the minor heap, we already - have a major->minor pointer and [fp] is already in the - remembered set. Conditions 1 and 2 cannot occur. */ - if (Is_young(old)) return; - /* Here, [old] can be a pointer within the major heap. - Check for condition 2. */ - if (caml_gc_phase == Phase_mark) caml_darken(old, NULL); - } - /* Check for condition 1. 
*/ - if (Is_block(val) && Is_young(val)) { - add_to_ref_table (Caml_state->ref_table, fp); - } - } + return alloc_shr(wosize, tag, 0); } +CAMLexport value caml_alloc_shr_noexc(mlsize_t wosize, tag_t tag) { + return alloc_shr(wosize, tag, 1); +} /* Global memory pool. @@ -708,7 +407,7 @@ struct pool_block { #endif static struct pool_block *pool = NULL; - +static caml_plat_mutex pool_mutex = CAML_PLAT_MUTEX_INITIALIZER; /* Returns a pointer to the block header, given a pointer to "data" */ static struct pool_block* get_pool_block(caml_stat_block b) @@ -731,7 +430,7 @@ CAMLexport void caml_stat_create_pool(void) if (pool == NULL) { pool = malloc(SIZEOF_POOL_BLOCK); if (pool == NULL) - caml_fatal_error("out of memory"); + caml_fatal_error("Fatal error: out of memory.\n"); #ifdef DEBUG pool->magic = Debug_pool_magic; #endif @@ -742,6 +441,7 @@ CAMLexport void caml_stat_create_pool(void) CAMLexport void caml_stat_destroy_pool(void) { + caml_plat_lock(&pool_mutex); if (pool != NULL) { pool->prev->next = NULL; while (pool != NULL) { @@ -751,6 +451,33 @@ CAMLexport void caml_stat_destroy_pool(void) } pool = NULL; } + caml_plat_unlock(&pool_mutex); +} + +/* [sz] is a number of bytes */ +CAMLexport caml_stat_block caml_stat_alloc_noexc(asize_t sz) +{ + /* Backward compatibility mode */ + if (pool == NULL) + return malloc(sz); + else { + struct pool_block *pb = malloc(sz + SIZEOF_POOL_BLOCK); + if (pb == NULL) return NULL; +#ifdef DEBUG + memset(&(pb->data), Debug_uninit_stat, sz); + pb->magic = Debug_pool_magic; +#endif + + /* Linking the block into the ring */ + caml_plat_lock(&pool_mutex); + pb->next = pool->next; + pb->prev = pool; + pool->next->prev = pb; + pool->next = pb; + caml_plat_unlock(&pool_mutex); + + return &(pb->data); + } } /* [sz] and [modulo] are numbers of bytes */ @@ -791,30 +518,6 @@ CAMLexport void* caml_stat_alloc_aligned(asize_t sz, int modulo, return result; } -/* [sz] is a number of bytes */ -CAMLexport caml_stat_block caml_stat_alloc_noexc(asize_t 
sz) -{ - /* Backward compatibility mode */ - if (pool == NULL) - return malloc(sz); - else { - struct pool_block *pb = malloc(sz + SIZEOF_POOL_BLOCK); - if (pb == NULL) return NULL; -#ifdef DEBUG - memset(&(pb->data), Debug_uninit_stat, sz); - pb->magic = Debug_pool_magic; -#endif - - /* Linking the block into the ring */ - pb->next = pool->next; - pb->prev = pool; - pool->next->prev = pb; - pool->next = pb; - - return &(pb->data); - } -} - /* [sz] is a number of bytes */ CAMLexport caml_stat_block caml_stat_alloc(asize_t sz) { @@ -835,8 +538,10 @@ CAMLexport void caml_stat_free(caml_stat_block b) if (pb == NULL) return; /* Unlinking the block from the ring */ + caml_plat_lock(&pool_mutex); pb->prev->next = pb->next; pb->next->prev = pb->prev; + caml_plat_unlock(&pool_mutex); free(pb); } @@ -856,8 +561,10 @@ CAMLexport caml_stat_block caml_stat_resize_noexc(caml_stat_block b, asize_t sz) if (pb_new == NULL) return NULL; /* Relinking the new block into the ring in place of the old one */ + caml_plat_lock(&pool_mutex); pb_new->prev->next = pb_new; pb_new->next->prev = pb_new; + caml_plat_unlock(&pool_mutex); return &(pb_new->data); } diff --git a/runtime/memprof.c b/runtime/memprof.c index b381db2e0c23..ed115ec8d5d9 100644 --- a/runtime/memprof.c +++ b/runtime/memprof.c @@ -15,6 +15,21 @@ #define CAML_INTERNALS +#include "caml/fail.h" + +CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param) +{ + caml_failwith("Gc.memprof.start: not implemented in multicore"); +} + +CAMLprim value caml_memprof_stop(value unit) +{ + caml_failwith("Gc.memprof.stop: not implemented in multicore"); +} + +/* FIXME: integrate memprof with multicore */ +#if 0 + #include #include "caml/memprof.h" #include "caml/fail.h" @@ -1134,3 +1149,5 @@ CAMLexport void caml_memprof_enter_thread(struct caml_memprof_th_ctx* ctx) local = ctx; caml_memprof_set_suspended(ctx->suspended); } + +#endif diff --git a/runtime/meta.c b/runtime/meta.c index cbacc9a03d87..5b002b8ce216 100644 --- 
a/runtime/meta.c +++ b/runtime/meta.c @@ -24,6 +24,7 @@ #include "caml/config.h" #include "caml/debugger.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/fix_code.h" #include "caml/interp.h" #include "caml/intext.h" @@ -33,8 +34,7 @@ #include "caml/misc.h" #include "caml/mlvalues.h" #include "caml/prims.h" -#include "caml/signals.h" -#include "caml/stacks.h" +#include "caml/startup_aux.h" #ifndef NATIVE_CODE @@ -43,14 +43,11 @@ CAMLprim value caml_get_global_data(value unit) return caml_global_data; } -char * caml_section_table = NULL; -asize_t caml_section_table_size; - CAMLprim value caml_get_section_table(value unit) { - if (caml_section_table == NULL) caml_raise_not_found(); - return caml_input_value_from_block(caml_section_table, - caml_section_table_size); + if (caml_params->section_table == NULL) caml_raise_not_found(); + return caml_input_value_from_block(caml_params->section_table, + caml_params->section_table_size); } struct bytecode { @@ -121,8 +118,12 @@ CAMLprim value caml_reify_bytecode(value ls_prog, caml_thread_code((code_t) prog, len); #endif +#if 0 + /* TODO: support dynlink debugger: PR8654 */ /* Notify debugger after fragment gets added and reified. 
*/ caml_debugger(CODE_LOADED, Val_long(fragnum)); +#endif + (void)fragnum; /* clobber warning */ clos = caml_alloc_small (2, Closure_tag); Code_val(clos) = (code_t) prog; @@ -161,11 +162,13 @@ CAMLprim value caml_static_release_bytecode(value bc) CAMLprim value caml_realloc_global(value size) { + CAMLparam1(size); + CAMLlocal2(old_global_data, new_global_data); mlsize_t requested_size, actual_size, i; - value new_global_data; + old_global_data = caml_global_data; requested_size = Long_val(size); - actual_size = Wosize_val(caml_global_data); + actual_size = Wosize_val(old_global_data); if (requested_size >= actual_size) { requested_size = (requested_size + 0x100) & 0xFFFFFF00; caml_gc_message (0x08, "Growing global data to %" @@ -173,20 +176,18 @@ CAMLprim value caml_realloc_global(value size) requested_size); new_global_data = caml_alloc_shr(requested_size, 0); for (i = 0; i < actual_size; i++) - caml_initialize(&Field(new_global_data, i), Field(caml_global_data, i)); + caml_initialize(&Field(new_global_data, i), Field(old_global_data, i)); for (i = actual_size; i < requested_size; i++){ Field (new_global_data, i) = Val_long (0); } - // Give gc a chance to run, and run memprof callbacks - caml_global_data = new_global_data; - caml_process_pending_actions(); + caml_modify_generational_global_root(&caml_global_data, new_global_data); } - return Val_unit; + CAMLreturn (Val_unit); } CAMLprim value caml_get_current_environment(value unit) { - return *Caml_state->extern_sp; + return *Caml_state->current_stack->sp; } CAMLprim value caml_invoke_traced_function(value codeptr, value env, value arg) @@ -219,9 +220,9 @@ CAMLprim value caml_invoke_traced_function(value codeptr, value env, value arg) value * osp, * nsp; int i; - osp = Caml_state->extern_sp; - Caml_state->extern_sp -= 4; - nsp = Caml_state->extern_sp; + osp = Caml_state->current_stack->sp; + Caml_state->current_stack->sp -= 4; + nsp = Caml_state->current_stack->sp; for (i = 0; i < 7; i++) nsp[i] = osp[i]; nsp[7] 
= (value) Nativeint_val(codeptr); nsp[8] = env; @@ -270,4 +271,6 @@ value caml_static_release_bytecode(value prog, value len) return Val_unit; /* not reached */ } +void (* volatile caml_async_action_hook)(void); + #endif diff --git a/runtime/minor_gc.c b/runtime/minor_gc.c index 9155a6fd6321..e823214830bf 100644 --- a/runtime/minor_gc.c +++ b/runtime/minor_gc.c @@ -16,73 +16,36 @@ #define CAML_INTERNALS #include -#include "caml/custom.h" +#include + #include "caml/config.h" +#include "caml/custom.h" +#include "caml/domain.h" +#include "caml/eventlog.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/finalise.h" #include "caml/gc.h" #include "caml/gc_ctrl.h" +#include "caml/globroots.h" #include "caml/major_gc.h" #include "caml/memory.h" #include "caml/minor_gc.h" #include "caml/misc.h" #include "caml/mlvalues.h" +#include "caml/platform.h" #include "caml/roots.h" +#include "caml/shared_heap.h" #include "caml/signals.h" +#include "caml/startup_aux.h" #include "caml/weak.h" -#include "caml/memprof.h" -#include "caml/eventlog.h" - -/* Pointers into the minor heap. - [Caml_state->young_base] - The [malloc] block that contains the heap. - [Caml_state->young_start] ... [Caml_state->young_end] - The whole range of the minor heap: all young blocks are inside - this interval. - [Caml_state->young_alloc_start]...[Caml_state->young_alloc_end] - The allocation arena: newly-allocated blocks are carved from - this interval, starting at [Caml_state->young_alloc_end]. - [Caml_state->young_alloc_mid] is the mid-point of this interval. - [Caml_state->young_ptr], [Caml_state->young_trigger], - [Caml_state->young_limit] - These pointers are all inside the allocation arena. - - [Caml_state->young_ptr] is where the next allocation will take place. - - [Caml_state->young_trigger] is how far we can allocate before - triggering [caml_gc_dispatch]. Currently, it is either - [Caml_state->young_alloc_start] or the mid-point of the allocation - arena. 
- - [Caml_state->young_limit] is the pointer that is compared to - [Caml_state->young_ptr] for allocation. It is either: - + [Caml_state->young_alloc_end] if a signal handler or - finaliser or memprof callback is pending, or if a major - or minor collection has been requested, or an - asynchronous callback has just raised an exception, - + [caml_memprof_young_trigger] if a memprof sample is planned, - + or [Caml_state->young_trigger]. -*/ +extern value caml_ephe_none; /* See weak.c */ struct generic_table CAML_TABLE_STRUCT(char); -void caml_alloc_minor_tables () -{ - Caml_state->ref_table = - caml_stat_alloc_noexc(sizeof(struct caml_ref_table)); - if (Caml_state->ref_table == NULL) - caml_fatal_error ("cannot initialize minor heap"); - memset(Caml_state->ref_table, 0, sizeof(struct caml_ref_table)); +static atomic_intnat domains_finished_minor_gc; - Caml_state->ephe_ref_table = - caml_stat_alloc_noexc(sizeof(struct caml_ephe_ref_table)); - if (Caml_state->ephe_ref_table == NULL) - caml_fatal_error ("cannot initialize minor heap"); - memset(Caml_state->ephe_ref_table, 0, sizeof(struct caml_ephe_ref_table)); - - Caml_state->custom_table = - caml_stat_alloc_noexc(sizeof(struct caml_custom_table)); - if (Caml_state->custom_table == NULL) - caml_fatal_error ("cannot initialize minor heap"); - memset(Caml_state->custom_table, 0, sizeof(struct caml_custom_table)); -} +static atomic_uintnat caml_minor_cycles_started = 0; /* [sz] and [rsv] are numbers of entries */ static void alloc_generic_table (struct generic_table *tbl, asize_t sz, @@ -108,20 +71,6 @@ void caml_alloc_table (struct caml_ref_table *tbl, asize_t sz, asize_t rsv) alloc_generic_table ((struct generic_table *) tbl, sz, rsv, sizeof (value *)); } -void caml_alloc_ephe_table (struct caml_ephe_ref_table *tbl, asize_t sz, - asize_t rsv) -{ - alloc_generic_table ((struct generic_table *) tbl, sz, rsv, - sizeof (struct caml_ephe_ref_elt)); -} - -void caml_alloc_custom_table (struct caml_custom_table *tbl, asize_t 
sz, - asize_t rsv) -{ - alloc_generic_table ((struct generic_table *) tbl, sz, rsv, - sizeof (struct caml_custom_elt)); -} - static void reset_table (struct generic_table *tbl) { tbl->size = 0; @@ -136,210 +85,344 @@ static void clear_table (struct generic_table *tbl) tbl->limit = tbl->threshold; } -void caml_set_minor_heap_size (asize_t bsz) +struct caml_minor_tables* caml_alloc_minor_tables(void) +{ + struct caml_minor_tables *r = + caml_stat_alloc_noexc(sizeof(struct caml_minor_tables)); + if(r != NULL) + memset(r, 0, sizeof(*r)); + return r; +} + +static void reset_minor_tables(struct caml_minor_tables* r) +{ + reset_table((struct generic_table *)&r->major_ref); + reset_table((struct generic_table *)&r->ephe_ref); + reset_table((struct generic_table *)&r->custom); +} + +void caml_free_minor_tables(struct caml_minor_tables* r) { - char *new_heap; - void *new_heap_base; - - CAMLassert (bsz >= Bsize_wsize(Minor_heap_min)); - CAMLassert (bsz <= Bsize_wsize(Minor_heap_max)); - CAMLassert (bsz % Page_size == 0); - CAMLassert (bsz % sizeof (value) == 0); - if (Caml_state->young_ptr != Caml_state->young_alloc_end){ - CAML_EV_COUNTER (EV_C_FORCE_MINOR_SET_MINOR_HEAP_SIZE, 1); - Caml_state->requested_minor_gc = 0; - Caml_state->young_trigger = Caml_state->young_alloc_mid; - caml_update_young_limit(); - caml_empty_minor_heap (); + CAMLassert(r->major_ref.ptr == r->major_ref.base); + + reset_minor_tables(r); + caml_stat_free(r); +} + +#ifdef DEBUG +extern int caml_debug_is_minor(value val) { + return Is_young(val); +} + +extern int caml_debug_is_major(value val) { + return Is_block(val) && !Is_young(val); +} +#endif + +void caml_set_minor_heap_size (asize_t wsize) +{ + caml_domain_state* domain_state = Caml_state; + struct caml_minor_tables *r = domain_state->minor_tables; + + if (domain_state->young_ptr != domain_state->young_end) + caml_minor_collection(); + + if(caml_reallocate_minor_heap(wsize) < 0) { + caml_fatal_error("Fatal error: No memory for minor heap"); + } + + 
reset_minor_tables(r); +} + +/*****************************************************************************/ + +struct oldify_state { + value todo_list; + uintnat live_bytes; + caml_domain_state* domain; +}; + +static value alloc_shared(caml_domain_state* d, mlsize_t wosize, tag_t tag) +{ + void* mem = caml_shared_try_alloc(d->shared_heap, wosize, tag, + 0 /* not pinned */); + d->allocated_words += Whsize_wosize(wosize); + if (mem == NULL) { + caml_fatal_error("allocation failure during minor GC"); + } + return Val_hp(mem); +} + +/* in progress updates are zeros except for the lowest color bit set to 1 + that is a header with: wosize == 0 && color == 1 && tag == 0 */ +#define In_progress_update_val ((header_t)0x100) +#define Is_update_in_progress(hd) ((hd) == In_progress_update_val) + +static void spin_on_header(value v) { + SPIN_WAIT { + if (atomic_load(Hp_atomic_val(v)) == 0) + return; } - CAMLassert (Caml_state->young_ptr == Caml_state->young_alloc_end); - new_heap = caml_stat_alloc_aligned_noexc(bsz, 0, &new_heap_base); - if (new_heap == NULL) caml_raise_out_of_memory(); - if (caml_page_table_add(In_young, new_heap, new_heap + bsz) != 0) - caml_raise_out_of_memory(); - - if (Caml_state->young_start != NULL){ - caml_page_table_remove(In_young, Caml_state->young_start, - Caml_state->young_end); - caml_stat_free (Caml_state->young_base); +} + +Caml_inline header_t get_header_val(value v) { + header_t hd = atomic_load_explicit(Hp_atomic_val(v), memory_order_acquire); + if (!Is_update_in_progress(hd)) + return hd; + + spin_on_header(v); + return 0; +} + +header_t caml_get_header_val(value v) { + return get_header_val(v); +} + + +static int try_update_object_header(value v, value *p, value result, + mlsize_t infix_offset) { + int success = 0; + + if( caml_domain_alone() ) { + *Hp_val (v) = 0; + Field(v, 0) = result; + success = 1; + } else { + header_t hd = atomic_load(Hp_atomic_val(v)); + if( hd == 0 ) { + /* in this case this has been updated by another domain, 
throw away result + and return the one in the object */ + result = Field(v, 0); + } else if( Is_update_in_progress(hd) ) { + /* here we've caught a domain in the process of moving a minor heap object + we need to wait for it to finish */ + spin_on_header(v); + /* Also throw away result and use the one from the other domain */ + result = Field(v, 0); + } else { + /* Here the header is neither zero nor an in-progress update */ + header_t desired_hd = In_progress_update_val; + if( atomic_compare_exchange_strong(Hp_atomic_val(v), &hd, desired_hd) ) { + /* Success. Now we can write the forwarding pointer. */ + atomic_store_explicit(Op_atomic_val(v), result, memory_order_relaxed); + /* And update header ('release' ensures after update of fwd pointer) */ + atomic_store_explicit(Hp_atomic_val(v), 0, memory_order_release); + /* Let the caller know we were responsible for the update */ + success = 1; + } else { + /* Updated by another domain. Spin for that update to complete and + then throw away the result and use the one from the other domain. 
*/ + spin_on_header(v); + result = Field(v, 0); + } + } } - Caml_state->young_base = new_heap_base; - Caml_state->young_start = (value *) new_heap; - Caml_state->young_end = (value *) (new_heap + bsz); - Caml_state->young_alloc_start = Caml_state->young_start; - Caml_state->young_alloc_mid = - Caml_state->young_alloc_start + Wsize_bsize (bsz) / 2; - Caml_state->young_alloc_end = Caml_state->young_end; - /* caml_update_young_limit called by caml_memprof_renew_minor_sample */ - Caml_state->young_trigger = Caml_state->young_alloc_start; - Caml_state->young_ptr = Caml_state->young_alloc_end; - Caml_state->minor_heap_wsz = Wsize_bsize (bsz); - caml_memprof_renew_minor_sample(); - - reset_table ((struct generic_table *) Caml_state->ref_table); - reset_table ((struct generic_table *) Caml_state->ephe_ref_table); - reset_table ((struct generic_table *) Caml_state->custom_table); -} - -static value oldify_todo_list = 0; + + *p = result + infix_offset; + return success; +} /* Note that the tests on the tag depend on the fact that Infix_tag, Forward_tag, and No_scan_tag are contiguous. */ - -void caml_oldify_one (value v, value *p) +static void oldify_one (void* st_v, value v, value *p) { + struct oldify_state* st = st_v; value result; header_t hd; mlsize_t sz, i; + mlsize_t infix_offset; tag_t tag; - tail_call: - if (Is_block (v) && Is_young (v)){ - CAMLassert ((value *) Hp_val (v) >= Caml_state->young_ptr); - hd = Hd_val (v); - if (hd == 0){ /* If already forwarded */ - *p = Field (v, 0); /* then forward pointer is first field. */ - }else{ - CAMLassert_young_header(hd); - tag = Tag_hd (hd); - if (tag < Infix_tag){ - value field0; - - sz = Wosize_hd (hd); - result = caml_alloc_shr_for_minor_gc (sz, tag, hd); - *p = result; - field0 = Field (v, 0); - Hd_val (v) = 0; /* Set forward flag */ - Field (v, 0) = result; /* and forward pointer. 
*/ - if (sz > 1){ - Field (result, 0) = field0; - Field (result, 1) = oldify_todo_list; /* Add this block */ - oldify_todo_list = v; /* to the "to do" list. */ - }else{ - CAMLassert (sz == 1); - p = &Field (result, 0); - v = field0; - goto tail_call; - } - }else if (tag >= No_scan_tag){ - sz = Wosize_hd (hd); - result = caml_alloc_shr_for_minor_gc (sz, tag, hd); - for (i = 0; i < sz; i++) Field (result, i) = Field (v, i); - Hd_val (v) = 0; /* Set forward flag */ - Field (v, 0) = result; /* and forward pointer. */ - *p = result; - }else if (tag == Infix_tag){ - mlsize_t offset = Infix_offset_hd (hd); - caml_oldify_one (v - offset, p); /* Cannot recurse deeper than 1. */ - *p += offset; - }else{ - value f = Forward_val (v); - tag_t ft = 0; - int vv = 1; - - CAMLassert (tag == Forward_tag); - if (Is_block (f)){ - if (Is_young (f)){ - vv = 1; - ft = Tag_val (Hd_val (f) == 0 ? Field (f, 0) : f); - }else{ - vv = Is_in_value_area(f); - if (vv){ - ft = Tag_val (f); - } - } - } - if (!vv || ft == Forward_tag || ft == Lazy_tag -#ifdef FLAT_FLOAT_ARRAY - || ft == Double_tag -#endif - ){ - /* Do not short-circuit the pointer. Copy as a normal block. */ - CAMLassert (Wosize_hd (hd) == 1); - result = caml_alloc_shr_for_minor_gc (1, Forward_tag, hd); - *p = result; - Hd_val (v) = 0; /* Set (GC) forward flag */ - Field (v, 0) = result; /* and forward pointer. */ - p = &Field (result, 0); - v = f; - goto tail_call; - }else{ - v = f; /* Follow the forwarding */ - goto tail_call; /* then oldify. */ + tail_call: + if (!(Is_block(v) && Is_young(v))) { + /* not a minor block */ + *p = v; + return; + } + + infix_offset = 0; + do { + hd = get_header_val(v); + if (hd == 0) { + /* already forwarded, another domain is likely working on this. 
*/ + *p = Field(v, 0) + infix_offset; + return; + } + tag = Tag_hd (hd); + if (tag == Infix_tag) { + /* Infix header, retry with the real block */ + CAMLassert (infix_offset == 0); + infix_offset = Infix_offset_hd (hd); + CAMLassert(infix_offset > 0); + v -= infix_offset; + } + } while (tag == Infix_tag); + + if (tag == Cont_tag) { + value stack_value = Field(v, 0); + CAMLassert(Wosize_hd(hd) == 1 && infix_offset == 0); + result = alloc_shared(st->domain, 1, Cont_tag); + if( try_update_object_header(v, p, result, 0) ) { + struct stack_info* stk = Ptr_val(stack_value); + Field(result, 0) = Val_ptr(stk); + if (stk != NULL) { + caml_scan_stack(&oldify_one, st, stk, 0); + } + } + else + { + /* Conflict - fix up what we allocated on the major heap */ + *Hp_val(result) = Make_header(1, No_scan_tag, + caml_global_heap_state.MARKED); + #ifdef DEBUG + Field(result, 0) = Val_long(1); + #endif + } + } else if (tag < Infix_tag) { + value field0; + sz = Wosize_hd (hd); + st->live_bytes += Bhsize_hd(hd); + result = alloc_shared(st->domain, sz, tag); + field0 = Field(v, 0); + if( try_update_object_header(v, p, result, infix_offset) ) { + if (sz > 1){ + Field(result, 0) = field0; + Field(result, 1) = st->todo_list; + st->todo_list = v; + } else { + CAMLassert (sz == 1); + p = Op_val(result); + v = field0; + goto tail_call; + } + } else { + /* Conflict - fix up what we allocated on the major heap */ + *Hp_val(result) = Make_header(sz, No_scan_tag, + caml_global_heap_state.MARKED); + #ifdef DEBUG + { + int c; + for( c = 0; c < sz ; c++ ) { + Field(result, c) = Val_long(1); } } + #endif } - }else{ - *p = v; - } -} -/* Test if the ephemeron is alive, everything outside minor heap is alive */ -Caml_inline int ephe_check_alive_data(struct caml_ephe_ref_elt *re){ - mlsize_t i; - value child; - for (i = CAML_EPHE_FIRST_KEY; i < Wosize_val(re->ephe); i++){ - child = Field (re->ephe, i); - if(child != caml_ephe_none - && Is_block (child) && Is_young (child)) { - if(Tag_val(child) == 
Infix_tag) child -= Infix_offset_val(child); - if(Hd_val (child) != 0) return 0; /* Value not copied to major heap */ + } else if (tag >= No_scan_tag) { + sz = Wosize_hd (hd); + st->live_bytes += Bhsize_hd(hd); + result = alloc_shared(st->domain, sz, tag); + for (i = 0; i < sz; i++) { + Field(result, i) = Field(v, i); + } + CAMLassert (infix_offset == 0); + if( !try_update_object_header(v, p, result, 0) ) { + /* Conflict */ + *Hp_val(result) = Make_header(sz, No_scan_tag, + caml_global_heap_state.MARKED); + #ifdef DEBUG + for( i = 0; i < sz ; i++ ) { + Field(result, i) = Val_long(1); + } + #endif + } + } else { + value f; + tag_t ft; + CAMLassert (tag == Forward_tag); + CAMLassert (infix_offset == 0); + + f = Forward_val (v); + ft = 0; + + if (Is_block (f)) { + ft = Tag_val (get_header_val(f) == 0 ? Field(f, 0) : f); + } + + if (ft == Forward_tag || ft == Lazy_tag || + ft == Forcing_tag || ft == Double_tag) { + /* Do not short-circuit the pointer. Copy as a normal block. */ + CAMLassert (Wosize_hd (hd) == 1); + st->live_bytes += Bhsize_hd(hd); + result = alloc_shared(st->domain, 1, Forward_tag); + if( try_update_object_header(v, p, result, 0) ) { + p = Op_val (result); + v = f; + goto tail_call; + } else { + *Hp_val(result) = Make_header(1, No_scan_tag, + caml_global_heap_state.MARKED); + #ifdef DEBUG + Field(result, 0) = Val_long(1); + #endif + } + } else { + v = f; /* Follow the forwarding */ + goto tail_call; /* then oldify. */ } } - return 1; } -/* Finish the work that was put off by [caml_oldify_one]. - Note that [caml_oldify_one] itself is called by oldify_mopup, so we +/* Finish the work that was put off by [oldify_one]. + Note that [oldify_one] itself is called by oldify_mopup, so we have to be careful to remove the first entry from the list before oldifying its fields. 
*/ -void caml_oldify_mopup (void) +static void oldify_mopup (struct oldify_state* st, int do_ephemerons) { value v, new_v, f; mlsize_t i; + caml_domain_state* domain_state = st->domain; + struct caml_ephe_ref_table ephe_ref_table = + domain_state->minor_tables->ephe_ref; struct caml_ephe_ref_elt *re; int redo; - again: +again: redo = 0; - while (oldify_todo_list != 0){ - v = oldify_todo_list; /* Get the head. */ - CAMLassert (Hd_val (v) == 0); /* It must be forwarded. */ - new_v = Field (v, 0); /* Follow forward pointer. */ - oldify_todo_list = Field (new_v, 1); /* Remove from list. */ + while (st->todo_list != 0) { + v = st->todo_list; /* Get the head. */ + CAMLassert (get_header_val(v) == 0); /* It must be forwarded. */ + new_v = Field(v, 0); /* Follow forward pointer. */ + st->todo_list = Field (new_v, 1); /* Remove from list. */ - f = Field (new_v, 0); - if (Is_block (f) && Is_young (f)){ - caml_oldify_one (f, &Field (new_v, 0)); + f = Field(new_v, 0); + CAMLassert (!Is_debug_tag(f)); + if (Is_block (f) && Is_young(f)) { + oldify_one (st, f, Op_val (new_v)); } for (i = 1; i < Wosize_val (new_v); i++){ - f = Field (v, i); - if (Is_block (f) && Is_young (f)){ - caml_oldify_one (f, &Field (new_v, i)); - }else{ - Field (new_v, i) = f; + f = Field(v, i); + CAMLassert (!Is_debug_tag(f)); + if (Is_block (f) && Is_young(f)) { + oldify_one (st, f, Op_val (new_v) + i); + } else { + Field(new_v, i) = f; } } + CAMLassert (Wosize_val(new_v)); } - /* Oldify the data in the minor heap of alive ephemeron - During minor collection keys outside the minor heap are considered alive */ - for (re = Caml_state->ephe_ref_table->base; - re < Caml_state->ephe_ref_table->ptr; re++){ - /* look only at ephemeron with data in the minor heap */ - if (re->offset == 1){ - value *data = &Field(re->ephe,1), v = *data; - if (v != caml_ephe_none && Is_block (v) && Is_young (v)){ + /* Oldify the key and data in the minor heap of all ephemerons touched in this + cycle. 
We are doing this to avoid introducing a barrier for the end of all + domains promoting reachable objects and having to handle the complexity + of determining which ephemerons are dead when they link across domains */ + if( do_ephemerons ) { + for (re = ephe_ref_table.base; + re < ephe_ref_table.ptr; re++) { + value *data = re->offset == CAML_EPHE_DATA_OFFSET + ? &Ephe_data(re->ephe) + : &Field(re->ephe, re->offset); + value v = *data; + if (v != caml_ephe_none && Is_block(v) && Is_young(v) ) { mlsize_t offs = Tag_val(v) == Infix_tag ? Infix_offset_val(v) : 0; v -= offs; - if (Hd_val (v) == 0){ /* Value copied to major heap */ - *data = Field (v, 0) + offs; + if (get_header_val(v) == 0) { /* Value copied to major heap */ + *data = Field(v, 0) + offs; } else { - if (ephe_check_alive_data(re)){ - caml_oldify_one(*data,data); - redo = 1; /* oldify_todo_list can still be 0 */ - } + oldify_one(st, *data, data); + redo = 1; /* oldify_todo_list can still be 0 */ } } } @@ -348,231 +431,369 @@ void caml_oldify_mopup (void) if (redo) goto again; } -/* Make sure the minor heap is empty by performing a minor collection - if needed. 
-*/ -void caml_empty_minor_heap (void) +void caml_empty_minor_heap_domain_clear(caml_domain_state* domain, void* unused) { - value **r; + struct caml_minor_tables *minor_tables = domain->minor_tables; + + caml_final_empty_young(domain); + + clear_table ((struct generic_table *)&minor_tables->major_ref); + clear_table ((struct generic_table *)&minor_tables->ephe_ref); + clear_table ((struct generic_table *)&minor_tables->custom); + + domain->extra_heap_resources_minor = 0.0; + +#ifdef DEBUG + { + uintnat* p = (uintnat*)domain->young_start; + for (; p < (uintnat*)domain->young_end; p++) + *p = Debug_uninit_align; + } +#endif +} + +void caml_empty_minor_heap_promote(caml_domain_state* domain, + int participating_count, + caml_domain_state** participating) +{ + struct caml_minor_tables *self_minor_tables = domain->minor_tables; struct caml_custom_elt *elt; + value* young_ptr = domain->young_ptr; + value* young_end = domain->young_end; + uintnat minor_allocated_bytes = (uintnat)young_end - (uintnat)young_ptr; uintnat prev_alloc_words; - struct caml_ephe_ref_elt *re; + struct oldify_state st = {0}; + value **r; + intnat c, curr_idx; + int remembered_roots = 0; - if (Caml_state->young_ptr != Caml_state->young_alloc_end){ - CAMLassert_young_header(*(header_t*)Caml_state->young_ptr); - if (caml_minor_gc_begin_hook != NULL) (*caml_minor_gc_begin_hook) (); - prev_alloc_words = caml_allocated_words; - Caml_state->in_minor_collection = 1; - caml_gc_message (0x02, "<"); - CAML_EV_BEGIN(EV_MINOR_LOCAL_ROOTS); - caml_oldify_local_roots(); - CAML_EV_END(EV_MINOR_LOCAL_ROOTS); - CAML_EV_BEGIN(EV_MINOR_REF_TABLES); - for (r = Caml_state->ref_table->base; - r < Caml_state->ref_table->ptr; r++) { - caml_oldify_one (**r, *r); - } - CAML_EV_END(EV_MINOR_REF_TABLES); - CAML_EV_BEGIN(EV_MINOR_COPY); - caml_oldify_mopup (); - CAML_EV_END(EV_MINOR_COPY); - /* Update the ephemerons */ - for (re = Caml_state->ephe_ref_table->base; - re < Caml_state->ephe_ref_table->ptr; re++){ - if(re->offset 
< Wosize_val(re->ephe)){ - /* If it is not the case, the ephemeron has been truncated */ - value *key = &Field(re->ephe,re->offset), v = *key; - if (v != caml_ephe_none && Is_block (v) && Is_young (v)){ - mlsize_t offs = Tag_val (v) == Infix_tag ? Infix_offset_val (v) : 0; - v -= offs; - if (Hd_val (v) == 0){ /* Value copied to major heap */ - *key = Field (v, 0) + offs; - }else{ /* Value not copied so it's dead */ - CAMLassert(!ephe_check_alive_data(re)); - *key = caml_ephe_none; - Field(re->ephe,1) = caml_ephe_none; - } - } + st.domain = domain; + + prev_alloc_words = domain->allocated_words; + + caml_gc_log ("Minor collection of domain %d starting", domain->id); + CAML_EV_BEGIN(EV_MINOR); + + if( participating[0] == Caml_state ) { + CAML_EV_BEGIN(EV_MINOR_GLOBAL_ROOTS); + caml_scan_global_young_roots(oldify_one, &st); + CAML_EV_END(EV_MINOR_GLOBAL_ROOTS); + } + + CAML_EV_BEGIN(EV_MINOR_REMEMBERED_SET); + + if( participating_count > 1 ) { + int participating_idx = -1; + CAMLassert(domain == Caml_state); + + for( int i = 0; i < participating_count ; i++ ) { + if( participating[i] == domain ) { + participating_idx = i; + break; } } - /* Update the OCaml finalise_last values */ - CAML_EV_BEGIN(EV_MINOR_UPDATE_WEAK); - caml_final_update_minor_roots(); - /* Trigger memprofs callbacks for blocks in the minor heap. */ - caml_memprof_minor_update(); - /* Run custom block finalisation of dead minor values */ - for (elt = Caml_state->custom_table->base; - elt < Caml_state->custom_table->ptr; elt++){ - value v = elt->block; - if (Hd_val (v) == 0){ - /* Block was copied to the major heap: adjust GC speed numbers. */ - caml_adjust_gc_speed(elt->mem, elt->max); - }else{ - /* Block will be freed: call finalization function, if any. 
*/ - void (*final_fun)(value) = Custom_ops_val(v)->finalize; - if (final_fun != NULL) final_fun(v); + + CAMLassert(participating_idx != -1); + + /* We use this rather odd scheme because it better smoothes the remainder */ + for( curr_idx = 0, c = participating_idx; + curr_idx < participating_count; curr_idx++) { + caml_domain_state* foreign_domain = participating[c]; + + struct caml_minor_tables* foreign_minor_tables = + foreign_domain->minor_tables; + + struct caml_ref_table* foreign_major_ref = + &foreign_minor_tables->major_ref; + + /* calculate the size of the remembered set */ + intnat major_ref_size = foreign_major_ref->ptr - foreign_major_ref->base; + + /* number of remembered set entries each domain takes here */ + intnat refs_per_domain = (major_ref_size / participating_count); + + /* where to start in the remembered set */ + value** ref_start = foreign_major_ref->base + + (curr_idx * refs_per_domain); + + /* where to end in the remembered set */ + value** ref_end = foreign_major_ref->base + + ((curr_idx+1) * refs_per_domain); + + /* if we're the last domain this time, cover all the remaining refs */ + if( curr_idx == participating_count-1 ) { + caml_gc_log("taking remainder"); + ref_end = foreign_major_ref->ptr; } + + caml_gc_log("idx: %d, foreign_domain: %d, ref_size: %" + ARCH_INTNAT_PRINTF_FORMAT"d, refs_per_domain: %" + ARCH_INTNAT_PRINTF_FORMAT"d, ref_base: %p, ref_ptr: %p, ref_start: %p" + ", ref_end: %p", + participating_idx, foreign_domain->id, major_ref_size, refs_per_domain, + foreign_major_ref->base, foreign_major_ref->ptr, ref_start, ref_end); + + for( r = ref_start ; r < foreign_major_ref->ptr && r < ref_end ; r++ ) + { + oldify_one (&st, **r, *r); + remembered_roots++; + } + + c = (c+1) % participating_count; } - CAML_EV_END(EV_MINOR_UPDATE_WEAK); - CAML_EV_BEGIN(EV_MINOR_FINALIZED); - Caml_state->stat_minor_words += - Caml_state->young_alloc_end - Caml_state->young_ptr; - caml_gc_clock += - (double) (Caml_state->young_alloc_end - 
Caml_state->young_ptr) - / Caml_state->minor_heap_wsz; - Caml_state->young_ptr = Caml_state->young_alloc_end; - clear_table ((struct generic_table *) Caml_state->ref_table); - clear_table ((struct generic_table *) Caml_state->ephe_ref_table); - clear_table ((struct generic_table *) Caml_state->custom_table); - Caml_state->extra_heap_resources_minor = 0; - caml_gc_message (0x02, ">"); - Caml_state->in_minor_collection = 0; - caml_final_empty_young (); - CAML_EV_END(EV_MINOR_FINALIZED); - Caml_state->stat_promoted_words += caml_allocated_words - prev_alloc_words; - CAML_EV_COUNTER (EV_C_MINOR_PROMOTED, - caml_allocated_words - prev_alloc_words); - ++ Caml_state->stat_minor_collections; - caml_memprof_renew_minor_sample(); - if (caml_minor_gc_end_hook != NULL) (*caml_minor_gc_end_hook) (); - }else{ - /* The minor heap is empty nothing to do. */ - caml_final_empty_young (); } -#ifdef DEBUG + else { - value *p; - for (p = Caml_state->young_alloc_start; p < Caml_state->young_alloc_end; - ++p) { - *p = Debug_free_minor; + /* If we're alone, we just do our own remembered set */ + for( r = self_minor_tables->major_ref.base ; + r < self_minor_tables->major_ref.ptr ; r++ ) + { + oldify_one (&st, **r, *r); + remembered_roots++; } } + + #ifdef DEBUG + caml_global_barrier(); + /* At this point all domains should have gone through all remembered set + entries. 
We need to verify that all our remembered set entries are now in + the major heap or promoted */ + for( r = self_minor_tables->major_ref.base ; + r < self_minor_tables->major_ref.ptr ; r++ ) { + /* Everything should be promoted */ + CAMLassert(!(Is_block(**r)) || !(Is_young(**r))); + } + #endif + + /* unconditionally promote custom blocks so accounting is correct */ + for (elt = self_minor_tables->custom.base; + elt < self_minor_tables->custom.ptr; elt++) { + value *v = &elt->block; + if (Is_block(*v) && Is_young(*v)) { + caml_adjust_gc_speed(elt->mem, elt->max); + if (get_header_val(*v) == 0) { /* value copied to major heap */ + *v = Field(*v, 0); + } else { + oldify_one(&st, *v, v); + } + } + } + + CAML_EV_BEGIN(EV_MINOR_FINALIZERS_OLDIFY); + /* promote the finalizers unconditionally as we want to avoid barriers */ + caml_final_do_young_roots (&oldify_one, &st, domain, 0); + CAML_EV_END(EV_MINOR_FINALIZERS_OLDIFY); + + CAML_EV_BEGIN(EV_MINOR_REMEMBERED_SET_PROMOTE); + oldify_mopup (&st, 1); /* ephemerons promoted here */ + CAML_EV_END(EV_MINOR_REMEMBERED_SET_PROMOTE); + CAML_EV_END(EV_MINOR_REMEMBERED_SET); + caml_gc_log("promoted %d roots, %" ARCH_INTNAT_PRINTF_FORMAT "u bytes", + remembered_roots, st.live_bytes); + + CAML_EV_BEGIN(EV_MINOR_FINALIZERS_ADMIN); + caml_gc_log("running finalizer data structure book-keeping"); + /* do the finalizer data structure book-keeping */ + caml_final_update_last_minor(domain); + CAML_EV_END(EV_MINOR_FINALIZERS_ADMIN); + +#ifdef DEBUG + caml_global_barrier(); + caml_gc_log("ref_base: %p, ref_ptr: %p", + self_minor_tables->major_ref.base, self_minor_tables->major_ref.ptr); + for (r = self_minor_tables->major_ref.base; + r < self_minor_tables->major_ref.ptr; r++) { + value vnew = **r; + CAMLassert (!Is_block(vnew) + || (get_header_val(vnew) != 0 && !Is_young(vnew))); + } + + for (elt = self_minor_tables->custom.base; + elt < self_minor_tables->custom.ptr; elt++) { + value vnew = elt->block; + CAMLassert (!Is_block(vnew) + || 
(get_header_val(vnew) != 0 && !Is_young(vnew))); + } #endif + + CAML_EV_BEGIN(EV_MINOR_LOCAL_ROOTS); + caml_do_local_roots(&oldify_one, &st, domain->local_roots, + domain->current_stack, domain->gc_regs); + + if (caml_scan_roots_hook != NULL) + (*caml_scan_roots_hook)(&oldify_one, &st, domain); + + CAML_EV_BEGIN(EV_MINOR_LOCAL_ROOTS_PROMOTE); + oldify_mopup (&st, 0); + CAML_EV_END(EV_MINOR_LOCAL_ROOTS_PROMOTE); + CAML_EV_END(EV_MINOR_LOCAL_ROOTS); + + /* we reset these pointers before allowing any mutators to be + released to avoid races where another domain signals an interrupt + and we clobber it */ + atomic_store_rel + ((atomic_uintnat*)&domain->young_limit, (uintnat)domain->young_start); + + atomic_store_rel + ((atomic_uintnat*)&domain->young_ptr, (uintnat)domain->young_end); + + if( participating_count > 1 ) { + atomic_fetch_add_explicit + (&domains_finished_minor_gc, 1, memory_order_release); + } + + domain->stat_minor_words += Wsize_bsize (minor_allocated_bytes); + domain->stat_minor_collections++; + domain->stat_promoted_words += domain->allocated_words - prev_alloc_words; + + CAML_EV_END(EV_MINOR); + caml_gc_log ("Minor collection of domain %d completed: %2.0f%% of %u KB live", + domain->id, + 100.0 * (double)st.live_bytes / (double)minor_allocated_bytes, + (unsigned)(minor_allocated_bytes + 512)/1024); } -#ifdef CAML_INSTR -extern uintnat caml_instr_alloc_jump; -#endif /*CAML_INSTR*/ +void caml_do_opportunistic_major_slice + (caml_domain_state* domain_unused, void* unused) +{ + /* NB: need to put guard around the ev logs to prevent + spam when we poll */ + if (caml_opportunistic_major_work_available()) { + int log_events = caml_params->verb_gc & 0x40; + if (log_events) CAML_EV_BEGIN(EV_MAJOR_MARK_OPPORTUNISTIC); + caml_opportunistic_major_collection_slice(0x200); + if (log_events) CAML_EV_END(EV_MAJOR_MARK_OPPORTUNISTIC); + } +} -/* Do a minor collection or a slice of major collection, etc. 
- Leave enough room in the minor heap to allocate at least one object. - Guaranteed not to call any OCaml callback. +/* Make sure the minor heap is empty by performing a minor collection + if needed. */ -void caml_gc_dispatch (void) +void caml_empty_minor_heap_setup(caml_domain_state* domain_unused) { + atomic_store_explicit(&domains_finished_minor_gc, 0, memory_order_release); +} + +/* must be called within a STW section */ +static void caml_stw_empty_minor_heap_no_major_slice(caml_domain_state* domain, + void* unused, + int participating_count, + caml_domain_state** participating) { - CAML_EVENTLOG_DO({ - CAML_EV_COUNTER(EV_C_ALLOC_JUMP, caml_instr_alloc_jump); - caml_instr_alloc_jump = 0; - }); - - if (Caml_state->young_trigger == Caml_state->young_alloc_start){ - /* The minor heap is full, we must do a minor collection. */ - Caml_state->requested_minor_gc = 1; - }else{ - /* The minor heap is half-full, do a major GC slice. */ - Caml_state->requested_major_slice = 1; - } - if (caml_gc_phase == Phase_idle){ - /* The major GC needs an empty minor heap in order to start a new cycle. - If a major slice was requested, we need to do a minor collection - before we can do the major slice that starts a new major GC cycle. - If a minor collection was requested, we take the opportunity to start - a new major GC cycle. - In either case, we have to do a minor cycle followed by a major slice. 
- */ - Caml_state->requested_minor_gc = 1; - Caml_state->requested_major_slice = 1; - } - if (Caml_state->requested_minor_gc) { - /* reset the pointers first because the end hooks might allocate */ - CAML_EV_BEGIN(EV_MINOR); - Caml_state->requested_minor_gc = 0; - Caml_state->young_trigger = Caml_state->young_alloc_mid; - caml_update_young_limit(); - caml_empty_minor_heap (); - CAML_EV_END(EV_MINOR); + #ifdef DEBUG + CAMLassert(caml_domain_is_in_stw()); + #endif + + if( participating[0] == Caml_state ) { + atomic_fetch_add(&caml_minor_cycles_started, 1); } - if (Caml_state->requested_major_slice) { - Caml_state->requested_major_slice = 0; - Caml_state->young_trigger = Caml_state->young_alloc_start; - caml_update_young_limit(); - CAML_EV_BEGIN(EV_MAJOR); - caml_major_collection_slice (-1); - CAML_EV_END(EV_MAJOR); + + caml_gc_log("running stw empty_minor_heap_promote"); + caml_empty_minor_heap_promote(domain, participating_count, participating); + + /* collect gc stats before leaving the barrier */ + caml_sample_gc_collect(domain); + + if( participating_count > 1 ) { + CAML_EV_BEGIN(EV_MINOR_LEAVE_BARRIER); + { + SPIN_WAIT { + if( atomic_load_explicit + (&domains_finished_minor_gc, memory_order_acquire) + == + participating_count ) { + break; + } + + caml_do_opportunistic_major_slice(domain, 0); + } + } + CAML_EV_END(EV_MINOR_LEAVE_BARRIER); } + + CAML_EV_BEGIN(EV_MINOR_CLEAR); + caml_gc_log("running stw empty_minor_heap_domain_clear"); + caml_empty_minor_heap_domain_clear(domain, 0); + CAML_EV_END(EV_MINOR_CLEAR); + caml_gc_log("finished stw empty_minor_heap"); } -/* Called by young allocations when [Caml_state->young_ptr] reaches - [Caml_state->young_limit]. We may have to either call memprof or - the gc. 
*/ -void caml_alloc_small_dispatch (intnat wosize, int flags, - int nallocs, unsigned char* encoded_alloc_lens) +static void caml_stw_empty_minor_heap (caml_domain_state* domain, void* unused, + int participating_count, + caml_domain_state** participating) { - intnat whsize = Whsize_wosize (wosize); - - /* First, we un-do the allocation performed in [Alloc_small] */ - Caml_state->young_ptr += whsize; - - while(1) { - /* We might be here because of an async callback / urgent GC - request. Take the opportunity to do what has been requested. */ - if (flags & CAML_FROM_CAML) - /* In the case of allocations performed from OCaml, execute - asynchronous callbacks. */ - caml_raise_if_exception(caml_do_pending_actions_exn ()); - else { - caml_check_urgent_gc (Val_unit); - /* In the case of long-running C code that regularly polls with - caml_process_pending_actions, force a query of all callbacks - at every minor collection or major slice. */ - caml_something_to_do = 1; - } + caml_stw_empty_minor_heap_no_major_slice(domain, unused, + participating_count, participating); - /* Now, there might be enough room in the minor heap to do our - allocation. */ - if (Caml_state->young_ptr - whsize >= Caml_state->young_trigger) - break; + /* schedule a major collection slice for this domain */ + caml_request_major_slice(); - /* If not, then empty the minor heap, and check again for async - callbacks. */ - CAML_EV_COUNTER (EV_C_FORCE_MINOR_ALLOC_SMALL, 1); - caml_gc_dispatch (); - } + /* can change how we account clock in future, here just do raw count */ + domain->major_gc_clock += 1.0; +} - /* Re-do the allocation: we now have enough space in the minor heap. */ - Caml_state->young_ptr -= whsize; - - /* Check if the allocated block has been sampled by memprof. 
*/ - if(Caml_state->young_ptr < caml_memprof_young_trigger){ - if(flags & CAML_DO_TRACK) { - caml_memprof_track_young(wosize, flags & CAML_FROM_CAML, - nallocs, encoded_alloc_lens); - /* Until the allocation actually takes place, the heap is in an invalid - state (see comments in [caml_memprof_track_young]). Hence, very little - heap operations are allowed before the actual allocation. - - Moreover, [Caml_state->young_ptr] should not be modified before the - allocation, because its value has been used as the pointer to - the sampled block. - */ - } else caml_memprof_renew_minor_sample(); +/* must be called within a STW section */ +void caml_empty_minor_heap_no_major_slice_from_stw(caml_domain_state* domain, + void* unused, + int participating_count, + caml_domain_state** participating) +{ + barrier_status b = caml_global_barrier_begin(); + if( caml_global_barrier_is_final(b) ) { + caml_empty_minor_heap_setup(domain); } + caml_global_barrier_end(b); + + /* if we are entering from within a major GC STW section then + we do not schedule another major collection slice */ + caml_stw_empty_minor_heap_no_major_slice(domain, (void*)0, + participating_count, participating); +} + +/* must be called outside a STW section */ +int caml_try_stw_empty_minor_heap_on_all_domains (void) +{ + #ifdef DEBUG + CAMLassert(!caml_domain_is_in_stw()); + #endif + + caml_gc_log("requesting stw empty_minor_heap"); + return caml_try_run_on_all_domains_with_spin_work( + &caml_stw_empty_minor_heap, 0, /* stw handler */ + &caml_empty_minor_heap_setup, /* leader setup */ + &caml_do_opportunistic_major_slice, 0 /* enter spin work */); + /* leaves when done by default*/ +} + +/* must be called outside a STW section, will retry until we have emptied our + minor heap */ +void caml_empty_minor_heaps_once (void) +{ + uintnat saved_minor_cycle = atomic_load(&caml_minor_cycles_started); + + #ifdef DEBUG + CAMLassert(!caml_domain_is_in_stw()); + #endif + + /* To handle the case where multiple domains try to 
execute a minor gc + STW section */ + do { + caml_try_stw_empty_minor_heap_on_all_domains(); + } while (saved_minor_cycle == atomic_load(&caml_minor_cycles_started)); } -/* Exported for backward compatibility with Lablgtk: do a minor - collection to ensure that the minor heap is empty. +/* Request a minor collection and enter as if it were an interrupt. */ CAMLexport void caml_minor_collection (void) { - Caml_state->requested_minor_gc = 1; - caml_gc_dispatch (); + caml_request_minor_gc(); + caml_handle_gc_interrupt(); } CAMLexport value caml_check_urgent_gc (value extra_root) { - if (Caml_state->requested_major_slice || Caml_state->requested_minor_gc){ - CAMLparam1 (extra_root); - caml_gc_dispatch(); + if (Caml_check_gc_interrupt(Caml_state)) { + CAMLparam1(extra_root); + caml_handle_gc_interrupt(); CAMLdrop; } return extra_root; @@ -580,8 +801,7 @@ CAMLexport value caml_check_urgent_gc (value extra_root) static void realloc_generic_table (struct generic_table *tbl, asize_t element_size, - ev_gc_counter ev_counter_name, - char *msg_threshold, char *msg_growing, char *msg_error) + char * msg_intr_int, char *msg_threshold, char *msg_growing, char *msg_error) { CAMLassert (tbl->ptr == tbl->limit); CAMLassert (tbl->limit <= tbl->end); @@ -591,14 +811,12 @@ static void realloc_generic_table alloc_generic_table (tbl, Caml_state->minor_heap_wsz / 8, 256, element_size); }else if (tbl->limit == tbl->threshold){ - CAML_EV_COUNTER (ev_counter_name, 1); caml_gc_message (0x08, msg_threshold, 0); tbl->limit = tbl->end; caml_request_minor_gc (); }else{ asize_t sz; asize_t cur_ptr = tbl->ptr - tbl->base; - CAMLassert (Caml_state->requested_minor_gc); tbl->size *= 2; sz = (tbl->size + tbl->reserve) * element_size; @@ -618,7 +836,7 @@ void caml_realloc_ref_table (struct caml_ref_table *tbl) { realloc_generic_table ((struct generic_table *) tbl, sizeof (value *), - EV_C_REQUEST_MINOR_REALLOC_REF_TABLE, + "request_minor/realloc_ref_table@", "ref_table threshold crossed\n", "Growing 
ref_table to %" ARCH_INTNAT_PRINTF_FORMAT "dk bytes\n", "ref_table overflow"); @@ -628,7 +846,7 @@ void caml_realloc_ephe_ref_table (struct caml_ephe_ref_table *tbl) { realloc_generic_table ((struct generic_table *) tbl, sizeof (struct caml_ephe_ref_elt), - EV_C_REQUEST_MINOR_REALLOC_EPHE_REF_TABLE, + "request_minor/realloc_ephe_ref_table@", "ephe_ref_table threshold crossed\n", "Growing ephe_ref_table to %" ARCH_INTNAT_PRINTF_FORMAT "dk bytes\n", "ephe_ref_table overflow"); @@ -638,7 +856,7 @@ void caml_realloc_custom_table (struct caml_custom_table *tbl) { realloc_generic_table ((struct generic_table *) tbl, sizeof (struct caml_custom_elt), - EV_C_REQUEST_MINOR_REALLOC_CUSTOM_TABLE, + "request_minor/realloc_custom_table@", "custom_table threshold crossed\n", "Growing custom_table to %" ARCH_INTNAT_PRINTF_FORMAT "dk bytes\n", "custom_table overflow"); diff --git a/runtime/misc.c b/runtime/misc.c index e817a6cc483a..182f53c06f9e 100644 --- a/runtime/misc.c +++ b/runtime/misc.c @@ -28,11 +28,14 @@ __declspec(noreturn) void __cdecl abort(void); #include #include #include +#include #include "caml/config.h" #include "caml/misc.h" #include "caml/memory.h" #include "caml/osdeps.h" -#include "caml/version.h" +#include "caml/domain.h" +#include "caml/startup.h" +#include "caml/startup_aux.h" caml_timing_hook caml_major_slice_begin_hook = NULL; caml_timing_hook caml_major_slice_end_hook = NULL; @@ -46,28 +49,47 @@ caml_timing_hook caml_finalise_end_hook = NULL; void caml_failed_assert (char * expr, char_os * file_os, int line) { char* file = caml_stat_strdup_of_os(file_os); - fprintf (stderr, "file %s; line %d ### Assertion failed: %s\n", - file, line, expr); - fflush (stderr); + fprintf(stderr, "[%02d] file %s; line %d ### Assertion failed: %s\n", + Caml_state ? 
Caml_state->id : -1, file, line, expr); + fflush(stderr); caml_stat_free(file); abort(); } +#endif -void caml_set_fields (value v, uintnat start, uintnat filler) +#if defined(DEBUG) +static __thread int noalloc_level = 0; +int caml_noalloc_begin(void) { - mlsize_t i; - for (i = start; i < Wosize_val (v); i++){ - Field (v, i) = (value) filler; - } + return noalloc_level++; +} +void caml_noalloc_end(int* noalloc) +{ + int curr = --noalloc_level; + CAMLassert(*noalloc == curr); +} +void caml_alloc_point_here(void) +{ + CAMLassert(noalloc_level == 0); } - #endif /* DEBUG */ -uintnat caml_verb_gc = 0; +void caml_gc_log (char *msg, ...) +{ + if ((caml_params->verb_gc & 0x800) != 0) { + char fmtbuf[512]; + va_list args; + va_start (args, msg); + sprintf(fmtbuf, "[%02d] %s\n", Caml_state ? Caml_state->id : -1, msg); + vfprintf(stderr, fmtbuf, args); + va_end (args); + fflush(stderr); + } +} void caml_gc_message (int level, char *msg, ...) { - if ((caml_verb_gc & level) != 0){ + if ((caml_params->verb_gc & level) != 0){ va_list ap; va_start(ap, msg); vfprintf (stderr, msg, ap); @@ -93,6 +115,20 @@ CAMLexport void caml_fatal_error (char *msg, ...) 
abort(); } +CAMLexport void caml_fatal_error_arg (const char *fmt, const char *arg) +{ + fprintf (stderr, fmt, arg); + exit(2); +} + +CAMLexport void caml_fatal_error_arg2 (const char *fmt1, const char *arg1, + const char *fmt2, const char *arg2) +{ + fprintf (stderr, fmt1, arg1); + fprintf (stderr, fmt2, arg2); + exit(2); +} + void caml_ext_table_init(struct ext_table * tbl, int init_capa) { tbl->size = 0; diff --git a/runtime/obj.c b/runtime/obj.c index 4530415505af..1215c142a1c5 100644 --- a/runtime/obj.c +++ b/runtime/obj.c @@ -18,6 +18,7 @@ /* Operations on objects */ #include +#include "caml/camlatomic.h" #include "caml/alloc.h" #include "caml/fail.h" #include "caml/gc.h" @@ -27,22 +28,31 @@ #include "caml/minor_gc.h" #include "caml/misc.h" #include "caml/mlvalues.h" +#include "caml/platform.h" #include "caml/prims.h" -#include "caml/signals.h" -CAMLprim value caml_obj_tag(value arg) +static int obj_tag (value arg) { - if (Is_long (arg)){ - return Val_int (1000); /* int_tag */ - }else if ((long) arg & (sizeof (value) - 1)){ - return Val_int (1002); /* unaligned_tag */ - }else if (Is_in_value_area (arg)){ - return Val_int(Tag_val(arg)); - }else{ - return Val_int (1001); /* out_of_heap_tag */ + header_t hd; + + if (Is_long (arg)) { + return 1000; /* int_tag */ + } else if ((long) arg & (sizeof (value) - 1)) { + return 1002; /* unaligned_tag */ + } else { + /* The acquire load ensures that reading the field of a Forward_tag + block in stdlib/camlinternalLazy.ml:force_gen has the necessary + synchronization. 
*/ + hd = (header_t)atomic_load_acq(Hp_atomic_val(arg)); + return Tag_hd(hd); } } +CAMLprim value caml_obj_tag(value arg) +{ + return Val_int (obj_tag(arg)); +} + CAMLprim value caml_obj_set_tag (value arg, value new_tag) { Tag_val (arg) = Int_val (new_tag); @@ -61,13 +71,6 @@ CAMLprim value caml_obj_set_raw_field(value arg, value pos, value bits) return Val_unit; } -CAMLprim value caml_obj_make_forward (value blk, value fwd) -{ - caml_modify(&Field(blk, 0), fwd); - Tag_val (blk) = Forward_tag; - return Val_unit; -} - /* [size] is a value encoding a number of blocks */ CAMLprim value caml_obj_block(value tag, value size) { @@ -154,9 +157,8 @@ CAMLprim value caml_obj_with_tag(value new_tag_v, value arg) and some of the "values" being copied are actually code pointers. That's because the new "value" does not point to the minor heap. */ for (i = 0; i < sz; i++) caml_initialize(&Field(res, i), Field(arg, i)); - /* Give gc a chance to run, and run memprof callbacks */ - caml_process_pending_actions(); } + CAMLreturn (res); } @@ -183,39 +185,7 @@ CAMLprim value caml_obj_dup(value arg) */ CAMLprim value caml_obj_truncate (value v, value newsize) { - mlsize_t new_wosize = Long_val (newsize); - header_t hd = Hd_val (v); - tag_t tag = Tag_hd (hd); - color_t color = Color_hd (hd); - color_t frag_color = Is_young(v) ? 0 : Caml_black; - mlsize_t wosize = Wosize_hd (hd); - mlsize_t i; - - if (tag == Double_array_tag) new_wosize *= Double_wosize; /* PR#2520 */ - - if (new_wosize <= 0 || new_wosize > wosize){ - caml_invalid_argument ("Obj.truncate"); - } - if (new_wosize == wosize) return Val_unit; - /* PR#2400: since we're about to lose our references to the elements - beyond new_wosize in v, erase them explicitly so that the GC - can darken them as appropriate. 
*/ - if (tag < No_scan_tag) { - for (i = new_wosize; i < wosize; i++){ - caml_modify(&Field(v, i), Val_unit); -#ifdef DEBUG - Field (v, i) = Debug_free_truncate; -#endif - } - } - /* We must use an odd tag for the header of the leftovers so it does not - look like a pointer because there may be some references to it in - ref_table. */ - Field (v, new_wosize) = - Make_header (Wosize_whsize (wosize-new_wosize), Abstract_tag, frag_color); - Hd_val (v) = - Make_header_with_profinfo (new_wosize, tag, color, Profinfo_val(v)); - return Val_unit; + caml_failwith("Obj.truncate not supported"); } CAMLprim value caml_obj_add_offset (value v, value offset) @@ -223,11 +193,22 @@ CAMLprim value caml_obj_add_offset (value v, value offset) return v + (unsigned long) Int32_val (offset); } -/* The following function is used in stdlib/lazy.ml. - It is not written in OCaml because it must be atomic with respect - to the GC. - */ +CAMLprim value caml_obj_compare_and_swap (value v, value f, + value oldv, value newv) +{ + int res = caml_atomic_cas_field(v, Int_val(f), oldv, newv); + caml_check_urgent_gc(Val_unit); + return Val_int(res); +} + +CAMLprim value caml_obj_is_shared (value obj) +{ + return Val_int(Is_long(obj) || !Is_young(obj)); +} +/* The following functions are used to support lazy values. They are not + * written in OCaml in order to ensure atomicity guarantees with respect to the + * GC. 
*/ CAMLprim value caml_lazy_make_forward (value v) { CAMLparam1 (v); @@ -238,6 +219,61 @@ CAMLprim value caml_lazy_make_forward (value v) CAMLreturn (res); } +static int obj_update_tag (value blk, int old_tag, int new_tag) +{ + header_t hd; + tag_t tag; + + SPIN_WAIT { + hd = Hd_val(blk); + tag = Tag_hd(hd); + + if (tag != old_tag) return 0; + if (caml_domain_alone()) { + Tag_val (blk) = new_tag; + return 1; + } + + if (atomic_compare_exchange_strong(Hp_atomic_val(blk), &hd, + (hd & ~0xFF) | new_tag)) + return 1; + } +} + +CAMLprim value caml_lazy_reset_to_lazy (value v) +{ + CAMLassert (Tag_val(v) == Forcing_tag); + + obj_update_tag (v, Forcing_tag, Lazy_tag); + return Val_unit; +} + +CAMLprim value caml_lazy_update_to_forward (value v) +{ + CAMLassert (Tag_val(v) == Forcing_tag); + + obj_update_tag (v, Forcing_tag, Forward_tag); + return Val_unit; +} + +CAMLprim value caml_lazy_read_result (value v) +{ + if (obj_tag(v) == Forward_tag) + return Field(v,0); + return v; +} + +CAMLprim value caml_lazy_update_to_forcing (value v) +{ + if (Is_block(v) && /* Needed to ensure that we don't attempt to update the + header of a integer value */ + obj_update_tag (v, Lazy_tag, Forcing_tag)) { + return Val_int(0); + } else { + return Val_int(1); + } +} + /* For mlvalues.h and camlinternalOO.ml See also GETPUBMET in interp.c */ @@ -255,20 +291,26 @@ CAMLprim value caml_get_public_method (value obj, value tag) return (tag == Field(meths,li) ? 
Field (meths, li-1) : 0); } -static value oo_last_id = Val_int(0); +/* Allocate OO ids in chunks, to avoid contention */ +#define Id_chunk 1024 -CAMLprim value caml_set_oo_id (value obj) { - Field(obj, 1) = oo_last_id; - oo_last_id += 2; - return obj; -} +static atomic_uintnat oo_next_id; CAMLprim value caml_fresh_oo_id (value v) { - v = oo_last_id; - oo_last_id += 2; + if (Caml_state->oo_next_id_local % Id_chunk == 0) { + Caml_state->oo_next_id_local = + atomic_fetch_add(&oo_next_id, Id_chunk); + } + v = Val_long(Caml_state->oo_next_id_local++); return v; } +CAMLprim value caml_set_oo_id (value obj) { + value v = Val_unit; + Field(obj, 1) = caml_fresh_oo_id(v); + return obj; +} + CAMLprim value caml_int_as_pointer (value n) { return n - 1; } diff --git a/runtime/parsing.c b/runtime/parsing.c index 990eb1f6fc1b..d6fe750270db 100644 --- a/runtime/parsing.c +++ b/runtime/parsing.c @@ -23,6 +23,7 @@ #include "caml/mlvalues.h" #include "caml/memory.h" #include "caml/alloc.h" +#include "caml/startup.h" #define ERRCODE 256 @@ -72,8 +73,6 @@ struct parser_env { /* Mirrors parser_env in ../stdlib/parsing.ml */ #define Short(tbl,n) (((short *)(tbl))[n]) #endif -int caml_parser_trace = 0; - /* Input codes */ /* Mirrors parser_input in ../stdlib/parsing.ml */ #define START 0 @@ -138,6 +137,11 @@ static void print_token(struct parser_tables *tables, int state, value tok) } } +static int trace() +{ + return caml_params->parser_trace || Caml_state->parser_trace; +} + /* The pushdown automata */ CAMLprim value caml_parse_engine(struct parser_tables *tables, @@ -172,7 +176,7 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, env->curr_char = Field(tables->transl_const, Int_val(arg)); caml_modify(&env->lval, Val_long(0)); } - if (caml_parser_trace) print_token(tables, state, arg); + if (trace()) print_token(tables, state, arg); testshift: n1 = Short(tables->sindex, state); @@ -201,15 +205,15 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, n2 = n1 + 
ERRCODE; if (n1 != 0 && n2 >= 0 && n2 <= Int_val(tables->tablesize) && Short(tables->check, n2) == ERRCODE) { - if (caml_parser_trace) + if (trace()) fprintf(stderr, "Recovering in state %d\n", state1); goto shift_recover; } else { - if (caml_parser_trace){ + if (trace()){ fprintf(stderr, "Discarding state %d\n", state1); } if (sp <= Int_val(env->stackbase)) { - if (caml_parser_trace){ + if (trace()){ fprintf(stderr, "No more states to discard\n"); } return RAISE_PARSE_ERROR; /* The ML code raises Parse_error */ @@ -220,7 +224,7 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, } else { if (Int_val(env->curr_char) == 0) return RAISE_PARSE_ERROR; /* The ML code raises Parse_error */ - if (caml_parser_trace) fprintf(stderr, "Discarding last token read\n"); + if (trace()) fprintf(stderr, "Discarding last token read\n"); env->curr_char = Val_int(-1); goto loop; } @@ -229,7 +233,7 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, env->curr_char = Val_int(-1); if (errflag > 0) errflag--; shift_recover: - if (caml_parser_trace) + if (trace()) fprintf(stderr, "State %d: shift to state %d\n", state, Short(tables->table, n2)); state = Short(tables->table, n2); @@ -248,7 +252,7 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, goto loop; reduce: - if (caml_parser_trace) + if (trace()) fprintf(stderr, "State %d: reduce by rule %d\n", state, n); m = Short(tables->len, n); env->asp = Val_int(sp); @@ -298,7 +302,7 @@ CAMLprim value caml_parse_engine(struct parser_tables *tables, CAMLprim value caml_set_parser_trace(value flag) { - value oldflag = Val_bool(caml_parser_trace); - caml_parser_trace = Bool_val(flag); + value oldflag = Val_bool(trace()); + Caml_state->parser_trace = Bool_val(flag); return oldflag; } diff --git a/runtime/platform.c b/runtime/platform.c new file mode 100644 index 000000000000..268eab587ef0 --- /dev/null +++ b/runtime/platform.c @@ -0,0 +1,253 @@ 
+/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2016 Indian Institute of Technology, Madras */ +/* Copyright 2016 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ +#define CAML_INTERNALS + +#include +#include +#include +#include +#include "caml/platform.h" +#include "caml/fail.h" +#ifdef HAS_SYS_MMAN_H +#include +#endif +#ifdef _WIN32 +#include +#endif + +/* Mutexes */ + +void caml_plat_mutex_init(caml_plat_mutex * m) +{ + int rc; + pthread_mutexattr_t attr; + rc = pthread_mutexattr_init(&attr); + if (rc != 0) goto error1; + rc = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); + if (rc != 0) goto error2; + rc = pthread_mutex_init(m, &attr); + // fall through +error2: + pthread_mutexattr_destroy(&attr); +error1: + check_err("mutex_init", rc); +} + +void caml_plat_assert_locked(caml_plat_mutex* m) +{ +#ifdef DEBUG + int r = pthread_mutex_trylock(m); + if (r == EBUSY) { + /* ok, it was locked */ + return; + } else if (r == 0) { + caml_fatal_error("Required mutex not locked"); + } else { + check_err("assert_locked", r); + } +#endif +} + +void caml_plat_assert_all_locks_unlocked(void) +{ +#ifdef DEBUG + if (lockdepth) caml_fatal_error("Locks still locked at termination"); +#endif +} + +void caml_plat_mutex_free(caml_plat_mutex* m) +{ + check_err("mutex_free", pthread_mutex_destroy(m)); +} + +static void caml_plat_cond_init_aux(caml_plat_cond *cond) +{ + pthread_condattr_t attr; + pthread_condattr_init(&attr); +#if defined(_POSIX_TIMERS) && \ + defined(_POSIX_MONOTONIC_CLOCK) && \ + 
_POSIX_MONOTONIC_CLOCK != (-1) + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); +#endif + pthread_cond_init(&cond->cond, &attr); +} + +/* Condition variables */ +void caml_plat_cond_init(caml_plat_cond* cond, caml_plat_mutex* m) +{ + caml_plat_cond_init_aux(cond); + cond->mutex = m; +} + +void caml_plat_wait(caml_plat_cond* cond) +{ + caml_plat_assert_locked(cond->mutex); + check_err("wait", pthread_cond_wait(&cond->cond, cond->mutex)); +} + +void caml_plat_broadcast(caml_plat_cond* cond) +{ + caml_plat_assert_locked(cond->mutex); + check_err("cond_broadcast", pthread_cond_broadcast(&cond->cond)); +} + +void caml_plat_signal(caml_plat_cond* cond) +{ + caml_plat_assert_locked(cond->mutex); + check_err("cond_signal", pthread_cond_signal(&cond->cond)); +} + +void caml_plat_cond_free(caml_plat_cond* cond) +{ + check_err("cond_free", pthread_cond_destroy(&cond->cond)); + cond->mutex=0; +} + + +/* Memory management */ + +static uintnat round_up(uintnat size, uintnat align) { + CAMLassert(Is_power_of_2(align)); + return (size + align - 1) & ~(align - 1); +} + +long caml_sys_pagesize = 0; + +uintnat caml_mem_round_up_pages(uintnat size) +{ + return round_up(size, caml_sys_pagesize); +} + +#ifdef _WIN32 +#define MAP_FAILED 0 +#endif + +void* caml_mem_map(uintnat size, uintnat alignment, int reserve_only) +{ + uintnat alloc_sz = caml_mem_round_up_pages(size + alignment); + void* mem; + uintnat base, aligned_start, aligned_end; + + CAMLassert(Is_power_of_2(alignment)); + alignment = caml_mem_round_up_pages(alignment); + + CAMLassert (alloc_sz > size); +#ifdef _WIN32 + /* Memory is only reserved at this point. It'll be committed after the + trim. */ + mem = VirtualAlloc(NULL, alloc_sz, MEM_RESERVE, PAGE_NOACCESS); +#else + mem = mmap(0, alloc_sz, reserve_only ? 
PROT_NONE : (PROT_READ | PROT_WRITE), + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#endif + if (mem == MAP_FAILED) { + return 0; + } + + /* trim to an aligned region */ + base = (uintnat)mem; + aligned_start = round_up(base, alignment); + aligned_end = aligned_start + caml_mem_round_up_pages(size); +#ifdef _WIN32 + /* VirtualFree can be used to decommit portions of memory, but it can only + release the entire block of memory. For Windows, repeat the call but this + time specify the address. */ + if (!VirtualFree(mem, 0, MEM_RELEASE)) + printf("The world seems to be upside down\n"); + mem = VirtualAlloc((void*)aligned_start, + aligned_end - aligned_start + 1, + MEM_RESERVE | (reserve_only ? 0 : MEM_COMMIT), + reserve_only ? PAGE_NOACCESS : PAGE_READWRITE); + if (!mem) + printf("Trimming failed\n"); + else if (mem != (void*)aligned_start) + printf("Hang on a sec - it's allocated a different block?!\n"); +#else + caml_mem_unmap((void*)base, aligned_start - base); + caml_mem_unmap((void*)aligned_end, (base + alloc_sz) - aligned_end); +#endif + return (void*)aligned_start; +} + +#ifndef _WIN32 +static void* map_fixed(void* mem, uintnat size, int prot) +{ + if (mmap((void*)mem, size, prot, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + -1, 0) == MAP_FAILED) { + return 0; + } else { + return mem; + } +} +#endif + +void* caml_mem_commit(void* mem, uintnat size) +{ +#ifdef _WIN32 + return VirtualAlloc(mem, size, MEM_COMMIT, PAGE_READWRITE); +#else + void* p = map_fixed(mem, size, PROT_READ | PROT_WRITE); + /* + FIXME: On Linux, with overcommit, you stand a better + chance of getting good error messages in OOM conditions + by forcing the kernel to allocate actual memory by touching + all the pages. Not sure whether this is a good idea, though. 
+ + if (p) memset(p, 0, size); + */ + return p; +#endif +} + +void caml_mem_decommit(void* mem, uintnat size) +{ +#ifdef _WIN32 + if (!VirtualFree(mem, size, MEM_DECOMMIT)) + printf("VirtualFree failed to decommit\n"); +#else + map_fixed(mem, size, PROT_NONE); +#endif +} + +void caml_mem_unmap(void* mem, uintnat size) +{ +#ifdef _WIN32 + if (!VirtualFree(mem, size, MEM_RELEASE)) + printf("VirtualFree failed\n"); +#else + munmap(mem, size); +#endif +} + +#define Min_sleep_ns 10000 // 10 us +#define Slow_sleep_ns 1000000 // 1 ms +#define Max_sleep_ns 1000000000 // 1 s + +unsigned caml_plat_spin_wait(unsigned spins, + const char* file, int line, + const char* function) +{ + unsigned next_spins; + if (spins < Min_sleep_ns) spins = Min_sleep_ns; + if (spins > Max_sleep_ns) spins = Max_sleep_ns; + next_spins = spins + spins / 4; + if (spins < Slow_sleep_ns && Slow_sleep_ns <= next_spins) { + caml_gc_log("Slow spin-wait loop in %s at %s:%d", function, file, line); + } + usleep(spins/1000); + return next_spins; +} diff --git a/runtime/printexc.c b/runtime/printexc.c index 2828fdbc5adc..13ae9ca5a396 100644 --- a/runtime/printexc.c +++ b/runtime/printexc.c @@ -142,12 +142,6 @@ void caml_fatal_uncaught_exception(value exn) handle_uncaught_exception = caml_named_value("Printexc.handle_uncaught_exception"); - /* If the callback allocates, memprof could be called. In this case, - memprof's callback could raise an exception while - [handle_uncaught_exception] is running, so that the printing of - the exception fails. */ - caml_memprof_set_suspended(1); - if (handle_uncaught_exception != NULL) /* [Printexc.handle_uncaught_exception] does not raise exception. 
*/ caml_callback2(*handle_uncaught_exception, exn, Val_bool(DEBUGGER_IN_USE)); diff --git a/runtime/roots.c b/runtime/roots.c new file mode 100644 index 000000000000..3fd7908a2952 --- /dev/null +++ b/runtime/roots.c @@ -0,0 +1,77 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy and Damien Doligez, INRIA Rocquencourt */ +/* */ +/* Copyright 1996 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#define CAML_INTERNALS + +/* To walk the memory roots for garbage collection */ + +#include "caml/finalise.h" +#include "caml/globroots.h" +#include "caml/major_gc.h" +#include "caml/memory.h" +#include "caml/minor_gc.h" +#include "caml/misc.h" +#include "caml/mlvalues.h" +#include "caml/roots.h" +#include "caml/major_gc.h" +#include "caml/shared_heap.h" +#include "caml/fiber.h" + +#ifdef NATIVE_CODE +#include "caml/stack.h" +/* Communication with [caml_start_program] and [caml_call_gc]. */ + +/* The global roots. + FIXME: These should be promoted, and not scanned here. + FIXME: caml_globals_inited makes assumptions about store ordering. + XXX KC : What to do here? 
+*/ + +intnat caml_globals_inited = 0; +#endif + +CAMLexport void (*caml_scan_roots_hook) + (scanning_action, void* fdata, caml_domain_state *) = NULL; + +void caml_do_roots (scanning_action f, void* fdata, caml_domain_state* d, + int do_final_val) +{ + caml_do_local_roots(f, fdata, d->local_roots, d->current_stack, d->gc_regs); + if (caml_scan_roots_hook != NULL) (*caml_scan_roots_hook)(f, fdata, d); + caml_final_do_roots(f, fdata, d, do_final_val); + +} + +CAMLexport void caml_do_local_roots (scanning_action f, void* fdata, + struct caml__roots_block *local_roots, + struct stack_info *current_stack, + value * v_gc_regs) +{ + struct caml__roots_block *lr; + int i, j; + value* sp; + + for (lr = local_roots; lr != NULL; lr = lr->next) { + for (i = 0; i < lr->ntables; i++){ + for (j = 0; j < lr->nitems; j++){ + sp = &(lr->tables[i][j]); + if (*sp != 0) { + f (fdata, *sp, sp); + } + } + } + } + caml_scan_stack(f, fdata, current_stack, v_gc_regs); +} diff --git a/runtime/roots_byt.c b/runtime/roots_byt.c deleted file mode 100644 index 9d65e0806a2a..000000000000 --- a/runtime/roots_byt.c +++ /dev/null @@ -1,144 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Xavier Leroy and Damien Doligez, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. 
*/ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -/* To walk the memory roots for garbage collection */ - -#include "caml/codefrag.h" -#include "caml/finalise.h" -#include "caml/globroots.h" -#include "caml/major_gc.h" -#include "caml/memory.h" -#include "caml/minor_gc.h" -#include "caml/misc.h" -#include "caml/mlvalues.h" -#include "caml/roots.h" -#include "caml/stacks.h" -#include "caml/memprof.h" -#include "caml/eventlog.h" - -CAMLexport void (*caml_scan_roots_hook) (scanning_action f) = NULL; - -/* FIXME should rename to [caml_oldify_minor_roots] and synchronise with - roots_nat.c */ -/* Call [caml_oldify_one] on (at least) all the roots that point to the minor - heap. */ -void caml_oldify_local_roots (void) -{ - register value * sp; - struct caml__roots_block *lr; - intnat i, j; - - /* The stack */ - /* [caml_oldify_one] acts only on pointers into the minor heap. - So, it is safe to pass code pointers to [caml_oldify_one], - even in no-naked-pointers mode */ - for (sp = Caml_state->extern_sp; sp < Caml_state->stack_high; sp++) { - caml_oldify_one (*sp, sp); - } - /* Local C roots */ /* FIXME do the old-frame trick ? 
*/ - for (lr = Caml_state->local_roots; lr != NULL; lr = lr->next) { - for (i = 0; i < lr->ntables; i++){ - for (j = 0; j < lr->nitems; j++){ - sp = &(lr->tables[i][j]); - caml_oldify_one (*sp, sp); - } - } - } - /* Global C roots */ - caml_scan_global_young_roots(&caml_oldify_one); - /* Finalised values */ - caml_final_oldify_young_roots (); - /* Memprof */ - caml_memprof_oldify_young_roots (); - /* Hook */ - if (caml_scan_roots_hook != NULL) (*caml_scan_roots_hook)(&caml_oldify_one); -} - -/* Call [caml_darken] on all roots */ - -void caml_darken_all_roots_start (void) -{ - caml_do_roots (caml_darken, 1); -} - -uintnat caml_incremental_roots_count = 1; - -intnat caml_darken_all_roots_slice (intnat work) -{ - return work; -} - -/* Note, in byte-code there is only one global root, so [do_globals] is - ignored and [caml_darken_all_roots_slice] does nothing. */ -void caml_do_roots (scanning_action f, int do_globals) -{ - /* Global variables */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_GLOBAL); - f(caml_global_data, &caml_global_data); - CAML_EV_END(EV_MAJOR_ROOTS_GLOBAL); - /* The stack and the local C roots */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_LOCAL); - caml_do_local_roots_byt(f, Caml_state->extern_sp, Caml_state->stack_high, - Caml_state->local_roots); - CAML_EV_END(EV_MAJOR_ROOTS_LOCAL); - /* Global C roots */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_C); - caml_scan_global_roots(f); - CAML_EV_END(EV_MAJOR_ROOTS_C); - /* Finalised values */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_FINALISED); - caml_final_do_roots (f); - CAML_EV_END(EV_MAJOR_ROOTS_FINALISED); - /* Memprof */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_MEMPROF); - caml_memprof_do_roots (f); - CAML_EV_END(EV_MAJOR_ROOTS_MEMPROF); - /* Hook */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_HOOK); - if (caml_scan_roots_hook != NULL) (*caml_scan_roots_hook)(f); - CAML_EV_END(EV_MAJOR_ROOTS_HOOK); -} - -CAMLexport void caml_do_local_roots_byt (scanning_action f, value *stack_low, - value *stack_high, - struct caml__roots_block *local_roots) -{ - register value * sp; 
- struct caml__roots_block *lr; - int i, j; - - for (sp = stack_low; sp < stack_high; sp++) { -#ifdef NO_NAKED_POINTERS - /* Code pointers inside the stack are naked pointers. - We must avoid passing them to function [f]. */ - value v = *sp; - if (Is_block(v) && caml_find_code_fragment_by_pc((char *) v) == NULL) { - f(v, sp); - } -#else - f (*sp, sp); -#endif - } - for (lr = local_roots; lr != NULL; lr = lr->next) { - for (i = 0; i < lr->ntables; i++){ - for (j = 0; j < lr->nitems; j++){ - sp = &(lr->tables[i][j]); - f (*sp, sp); - } - } - } -} diff --git a/runtime/roots_nat.c b/runtime/roots_nat.c deleted file mode 100644 index aba070619a2c..000000000000 --- a/runtime/roots_nat.c +++ /dev/null @@ -1,524 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. 
*/ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -/* To walk the memory roots for garbage collection */ - -#include "caml/finalise.h" -#include "caml/globroots.h" -#include "caml/memory.h" -#include "caml/major_gc.h" -#include "caml/minor_gc.h" -#include "caml/misc.h" -#include "caml/mlvalues.h" -#include "caml/stack.h" -#include "caml/roots.h" -#include "caml/memprof.h" -#include "caml/eventlog.h" -#include -#include - -/* Roots registered from C functions */ - -void (*caml_scan_roots_hook) (scanning_action) = NULL; - -/* The hashtable of frame descriptors */ -frame_descr ** caml_frame_descriptors = NULL; -uintnat caml_frame_descriptors_mask = 0; - -/* Linked-list */ - -typedef struct link { - void *data; - struct link *next; -} link; - -static link *cons(void *data, link *tl) { - link *lnk = caml_stat_alloc(sizeof(link)); - lnk->data = data; - lnk->next = tl; - return lnk; -} - -#define iter_list(list,lnk) \ - for (lnk = list; lnk != NULL; lnk = lnk->next) - -/* Linked-list of frametables */ - -static link *frametables = NULL; -static intnat num_descr = 0; - -static intnat count_descriptors(link *list) { - intnat num_descr = 0; - link *lnk; - iter_list(list,lnk) { - num_descr += *((intnat*) lnk->data); - } - return num_descr; -} - -static link* frametables_list_tail(link *list) { - link *lnk, *tail = NULL; - iter_list(list,lnk) { - tail = lnk; - } - return tail; -} - -static frame_descr * next_frame_descr(frame_descr * d) { - unsigned char num_allocs = 0, *p; - CAMLassert(d->retaddr >= 4096); - /* Skip to end of live_ofs */ - p = (unsigned char*)&d->live_ofs[d->num_live]; - /* Skip alloc_lengths if present */ - if (d->frame_size & 2) { - num_allocs = *p; - p += num_allocs + 1; - } - /* Skip debug info if present */ - if (d->frame_size & 1) { - /* Align to 32 bits */ - p = Align_to(p, uint32_t); - p += sizeof(uint32_t) * (d->frame_size & 2 ? 
num_allocs : 1); - } - /* Align to word size */ - p = Align_to(p, void*); - return ((frame_descr*) p); -} - -static void fill_hashtable(link *frametables) { - intnat len, j; - intnat * tbl; - frame_descr * d; - uintnat h; - link *lnk = NULL; - - iter_list(frametables,lnk) { - tbl = (intnat*) lnk->data; - len = *tbl; - d = (frame_descr *)(tbl + 1); - for (j = 0; j < len; j++) { - h = Hash_retaddr(d->retaddr); - while (caml_frame_descriptors[h] != NULL) { - h = (h+1) & caml_frame_descriptors_mask; - } - caml_frame_descriptors[h] = d; - d = next_frame_descr(d); - } - } -} - -static void init_frame_descriptors(link *new_frametables) -{ - intnat tblsize, increase, i; - link *tail = NULL; - - CAMLassert(new_frametables); - - tail = frametables_list_tail(new_frametables); - increase = count_descriptors(new_frametables); - tblsize = caml_frame_descriptors_mask + 1; - - /* Reallocate the caml_frame_descriptor table if it is too small */ - if(tblsize < (num_descr + increase) * 2) { - - /* Merge both lists */ - tail->next = frametables; - frametables = NULL; - - /* [num_descr] can be less than [num_descr + increase] if frame - tables where unregistered */ - num_descr = count_descriptors(new_frametables); - - tblsize = 4; - while (tblsize < 2 * num_descr) tblsize *= 2; - - caml_frame_descriptors_mask = tblsize - 1; - if(caml_frame_descriptors) caml_stat_free(caml_frame_descriptors); - caml_frame_descriptors = - (frame_descr **) caml_stat_alloc(tblsize * sizeof(frame_descr *)); - for (i = 0; i < tblsize; i++) caml_frame_descriptors[i] = NULL; - - fill_hashtable(new_frametables); - } else { - num_descr += increase; - fill_hashtable(new_frametables); - tail->next = frametables; - } - - frametables = new_frametables; -} - -void caml_init_frame_descriptors(void) { - intnat i; - link *new_frametables = NULL; - for (i = 0; caml_frametable[i] != 0; i++) - new_frametables = cons(caml_frametable[i],new_frametables); - init_frame_descriptors(new_frametables); -} - -void 
caml_register_frametable(intnat *table) { - link *new_frametables = cons(table,NULL); - init_frame_descriptors(new_frametables); -} - -static void remove_entry(frame_descr * d) { - uintnat i; - uintnat r; - uintnat j; - - i = Hash_retaddr(d->retaddr); - while (caml_frame_descriptors[i] != d) { - i = (i+1) & caml_frame_descriptors_mask; - } - - r1: - j = i; - caml_frame_descriptors[i] = NULL; - r2: - i = (i+1) & caml_frame_descriptors_mask; - // r3 - if(caml_frame_descriptors[i] == NULL) return; - r = Hash_retaddr(caml_frame_descriptors[i]->retaddr); - /* If r is between i and j (cyclically), i.e. if - caml_frame_descriptors[i]->retaddr don't need to be moved */ - if(( ( j < r ) && ( r <= i ) ) || - ( ( i < j ) && ( j < r ) ) || /* i cycled, r not */ - ( ( r <= i ) && ( i < j ) ) ) { /* i and r cycled */ - goto r2; - } - // r4 - caml_frame_descriptors[j] = caml_frame_descriptors[i]; - goto r1; -} - -void caml_unregister_frametable(intnat *table) { - intnat len, j; - link *lnk; - link *previous = frametables; - frame_descr * d; - - len = *table; - d = (frame_descr *)(table + 1); - for (j = 0; j < len; j++) { - remove_entry(d); - d = next_frame_descr(d); - } - - iter_list(frametables,lnk) { - if(lnk->data == table) { - previous->next = lnk->next; - caml_stat_free(lnk); - break; - } - previous = lnk; - } -} - -/* Communication with [caml_start_program] and [caml_call_gc]. */ - -intnat caml_globals_inited = 0; -static intnat caml_globals_scanned = 0; -static link * caml_dyn_globals = NULL; - -void caml_register_dyn_global(void *v) { - caml_dyn_globals = cons((void*) v,caml_dyn_globals); -} - -/* Call [caml_oldify_one] on (at least) all the roots that point to the minor - heap. 
*/ -void caml_oldify_local_roots (void) -{ - char * sp; - uintnat retaddr; - value * regs; - frame_descr * d; - uintnat h; - intnat i, j; - int n, ofs; - unsigned short * p; - value * glob; - value * root; - struct caml__roots_block *lr; - link *lnk; - - /* The global roots */ - for (i = caml_globals_scanned; - i <= caml_globals_inited && caml_globals[i] != 0; - i++) { - for(glob = caml_globals[i]; *glob != 0; glob++) { - for (j = 0; j < Wosize_val(*glob); j++){ - Oldify (&Field (*glob, j)); - } - } - } - caml_globals_scanned = caml_globals_inited; - - /* Dynamic global roots */ - iter_list(caml_dyn_globals, lnk) { - for(glob = (value *) lnk->data; *glob != 0; glob++) { - for (j = 0; j < Wosize_val(*glob); j++){ - Oldify (&Field (*glob, j)); - } - } - } - - /* The stack and local roots */ - sp = Caml_state->bottom_of_stack; - retaddr = Caml_state->last_return_address; - regs = Caml_state->gc_regs; - if (sp != NULL) { - while (1) { - /* Find the descriptor corresponding to the return address */ - h = Hash_retaddr(retaddr); - while(1) { - d = caml_frame_descriptors[h]; - if (d->retaddr == retaddr) break; - h = (h+1) & caml_frame_descriptors_mask; - } - if (d->frame_size != 0xFFFF) { - /* Scan the roots in this frame */ - for (p = d->live_ofs, n = d->num_live; n > 0; n--, p++) { - ofs = *p; - if (ofs & 1) { - root = regs + (ofs >> 1); - } else { - root = (value *)(sp + ofs); - } - Oldify (root); - } - /* Move to next frame */ - sp += (d->frame_size & 0xFFFC); - retaddr = Saved_return_address(sp); -#ifdef Already_scanned - /* Stop here if the frame has been scanned during earlier GCs */ - if (Already_scanned(sp, retaddr)) break; - /* Mark frame as already scanned */ - Mark_scanned(sp, retaddr); -#endif - } else { - /* This marks the top of a stack chunk for an ML callback. - Skip C portion of stack and continue with next ML stack chunk. 
*/ - struct caml_context * next_context = Callback_link(sp); - sp = next_context->bottom_of_stack; - retaddr = next_context->last_retaddr; - regs = next_context->gc_regs; - /* A null sp means no more ML stack chunks; stop here. */ - if (sp == NULL) break; - } - } - } - /* Local C roots */ - for (lr = Caml_state->local_roots; lr != NULL; lr = lr->next) { - for (i = 0; i < lr->ntables; i++){ - for (j = 0; j < lr->nitems; j++){ - root = &(lr->tables[i][j]); - Oldify (root); - } - } - } - /* Global C roots */ - caml_scan_global_young_roots(&caml_oldify_one); - /* Finalised values */ - caml_final_oldify_young_roots (); - /* Memprof */ - caml_memprof_oldify_young_roots (); - /* Hook */ - if (caml_scan_roots_hook != NULL) (*caml_scan_roots_hook)(&caml_oldify_one); -} - -uintnat caml_incremental_roots_count = 0; - -/* Call [caml_darken] on all roots, incrementally: - [caml_darken_all_roots_start] does the non-incremental part and - sets things up for [caml_darken_all_roots_slice]. -*/ -void caml_darken_all_roots_start (void) -{ - caml_do_roots (caml_darken, 0); -} - -/* Call [caml_darken] on at most [work] global roots. Return the - amount of work not done, if any. If this is strictly positive, - the darkening is done. - */ -intnat caml_darken_all_roots_slice (intnat work) -{ - static int i, j; - static value *glob; - static int do_resume = 0; - static mlsize_t roots_count = 0; - intnat remaining_work = work; - CAML_EV_BEGIN(EV_MAJOR_MARK_GLOBAL_ROOTS_SLICE); - - /* If the loop was started in a previous call, resume it. */ - if (do_resume) goto resume; - - /* This is the same loop as in [caml_do_roots], but we make it - suspend itself when [work] reaches 0. 
*/ - for (i = 0; caml_globals[i] != 0; i++) { - for(glob = caml_globals[i]; *glob != 0; glob++) { - for (j = 0; j < Wosize_val(*glob); j++){ - caml_darken (Field (*glob, j), &Field (*glob, j)); - -- remaining_work; - if (remaining_work == 0){ - roots_count += work; - do_resume = 1; - goto suspend; - } - resume: ; - } - } - } - - /* The loop finished normally, so all roots are now darkened. */ - caml_incremental_roots_count = roots_count + work - remaining_work; - /* Prepare for the next run. */ - do_resume = 0; - roots_count = 0; - - suspend: - /* Do this in both cases. */ - CAML_EV_END(EV_MAJOR_MARK_GLOBAL_ROOTS_SLICE); - return remaining_work; -} - -void caml_do_roots (scanning_action f, int do_globals) -{ - int i, j; - value * glob; - link *lnk; - - CAML_EV_BEGIN(EV_MAJOR_ROOTS_DYNAMIC_GLOBAL); - if (do_globals){ - /* The global roots */ - for (i = 0; caml_globals[i] != 0; i++) { - for(glob = caml_globals[i]; *glob != 0; glob++) { - for (j = 0; j < Wosize_val(*glob); j++) - f (Field (*glob, j), &Field (*glob, j)); - } - } - } - /* Dynamic global roots */ - iter_list(caml_dyn_globals, lnk) { - for(glob = (value *) lnk->data; *glob != 0; glob++) { - for (j = 0; j < Wosize_val(*glob); j++){ - f (Field (*glob, j), &Field (*glob, j)); - } - } - } - CAML_EV_END(EV_MAJOR_ROOTS_DYNAMIC_GLOBAL); - /* The stack and local roots */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_LOCAL); - caml_do_local_roots_nat(f, Caml_state->bottom_of_stack, - Caml_state->last_return_address, Caml_state->gc_regs, - Caml_state->local_roots); - CAML_EV_END(EV_MAJOR_ROOTS_LOCAL); - /* Global C roots */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_C); - caml_scan_global_roots(f); - CAML_EV_END(EV_MAJOR_ROOTS_C); - /* Finalised values */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_FINALISED); - caml_final_do_roots (f); - CAML_EV_END(EV_MAJOR_ROOTS_FINALISED); - /* Memprof */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_MEMPROF); - caml_memprof_do_roots (f); - CAML_EV_END(EV_MAJOR_ROOTS_MEMPROF); - /* Hook */ - CAML_EV_BEGIN(EV_MAJOR_ROOTS_HOOK); - if 
(caml_scan_roots_hook != NULL) (*caml_scan_roots_hook)(f); - CAML_EV_END(EV_MAJOR_ROOTS_HOOK); -} - -void caml_do_local_roots_nat(scanning_action f, char * bottom_of_stack, - uintnat last_retaddr, value * gc_regs, - struct caml__roots_block * local_roots) -{ - char * sp; - uintnat retaddr; - value * regs; - frame_descr * d; - uintnat h; - int i, j, n, ofs; - unsigned short * p; - value * root; - struct caml__roots_block *lr; - - sp = bottom_of_stack; - retaddr = last_retaddr; - regs = gc_regs; - if (sp != NULL) { - while (1) { - /* Find the descriptor corresponding to the return address */ - h = Hash_retaddr(retaddr); - while(1) { - d = caml_frame_descriptors[h]; - if (d->retaddr == retaddr) break; - h = (h+1) & caml_frame_descriptors_mask; - } - if (d->frame_size != 0xFFFF) { - /* Scan the roots in this frame */ - for (p = d->live_ofs, n = d->num_live; n > 0; n--, p++) { - ofs = *p; - if (ofs & 1) { - root = regs + (ofs >> 1); - } else { - root = (value *)(sp + ofs); - } - f (*root, root); - } - /* Move to next frame */ - sp += (d->frame_size & 0xFFFC); - retaddr = Saved_return_address(sp); -#ifdef Mask_already_scanned - retaddr = Mask_already_scanned(retaddr); -#endif - } else { - /* This marks the top of a stack chunk for an ML callback. - Skip C portion of stack and continue with next ML stack chunk. */ - struct caml_context * next_context = Callback_link(sp); - sp = next_context->bottom_of_stack; - retaddr = next_context->last_retaddr; - regs = next_context->gc_regs; - /* A null sp means no more ML stack chunks; stop here. 
*/ - if (sp == NULL) break; - } - } - } - /* Local C roots */ - for (lr = local_roots; lr != NULL; lr = lr->next) { - for (i = 0; i < lr->ntables; i++){ - for (j = 0; j < lr->nitems; j++){ - root = &(lr->tables[i][j]); - f (*root, root); - } - } - } -} - -uintnat (*caml_stack_usage_hook)(void) = NULL; - -uintnat caml_stack_usage (void) -{ - uintnat sz; - sz = (value *) Caml_state->top_of_stack - - (value *) Caml_state->bottom_of_stack; - if (caml_stack_usage_hook != NULL) - sz += (*caml_stack_usage_hook)(); - return sz; -} diff --git a/runtime/shared_heap.c b/runtime/shared_heap.c new file mode 100644 index 000000000000..2a53d61f450d --- /dev/null +++ b/runtime/shared_heap.c @@ -0,0 +1,886 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* KC Sivaramakrishnan, Indian Institute of Technology, Madras */ +/* Stephen Dolan, University of Cambridge */ +/* */ +/* Copyright 2015 Indian Institute of Technology, Madras */ +/* Copyright 2015 University of Cambridge */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ +#define CAML_INTERNALS + +#include +#include + +#include "caml/addrmap.h" +#include "caml/custom.h" +#include "caml/fail.h" +#include "caml/fiber.h" /* for verification */ +#include "caml/gc.h" +#include "caml/globroots.h" +#include "caml/memory.h" +#include "caml/mlvalues.h" +#include "caml/platform.h" +#include "caml/roots.h" +#include "caml/shared_heap.h" +#include "caml/sizeclasses.h" +#include "caml/startup_aux.h" + +typedef unsigned int sizeclass; +struct global_heap_state caml_global_heap_state = {0 << 8, 1 << 8, 2 << 8}; + +typedef struct pool { + struct pool* next; + value* next_obj; + caml_domain_state* owner; + sizeclass sz; +} pool; +CAML_STATIC_ASSERT(sizeof(pool) == Bsize_wsize(POOL_HEADER_WSIZE)); +#define POOL_HEADER_SZ sizeof(pool) + +typedef struct large_alloc { + caml_domain_state* owner; + struct large_alloc* next; +} large_alloc; +CAML_STATIC_ASSERT(sizeof(large_alloc) % sizeof(value) == 0); +#define LARGE_ALLOC_HEADER_SZ sizeof(large_alloc) + +static struct { + caml_plat_mutex lock; + pool* free; + + /* these only contain swept memory of terminated domains*/ + struct heap_stats stats; + pool* global_avail_pools[NUM_SIZECLASSES]; + pool* global_full_pools[NUM_SIZECLASSES]; + large_alloc* global_large; +} pool_freelist = { + CAML_PLAT_MUTEX_INITIALIZER, + NULL, + { 0, }, + { 0, }, + { 0, }, + NULL +}; + +/* readable and writable only by the current thread */ +struct caml_heap_state { + pool* avail_pools[NUM_SIZECLASSES]; + pool* full_pools[NUM_SIZECLASSES]; + pool* unswept_avail_pools[NUM_SIZECLASSES]; + pool* unswept_full_pools[NUM_SIZECLASSES]; + + large_alloc* swept_large; + large_alloc* unswept_large; + + sizeclass next_to_sweep; + + caml_domain_state* owner; + + struct heap_stats stats; +}; + +struct caml_heap_state* caml_init_shared_heap (void) { + int i; + struct caml_heap_state* heap; + + heap = caml_stat_alloc_noexc(sizeof(struct caml_heap_state)); 
+ if(heap != NULL) { + for (i = 0; iavail_pools[i] = heap->full_pools[i] = + heap->unswept_avail_pools[i] = heap->unswept_full_pools[i] = 0; + } + heap->next_to_sweep = 0; + heap->swept_large = NULL; + heap->unswept_large = NULL; + heap->owner = Caml_state; + memset(&heap->stats, 0, sizeof(heap->stats)); + } + return heap; +} + +static int move_all_pools(pool** src, pool** dst, caml_domain_state* new_owner){ + int count = 0; + while (*src) { + pool* p = *src; + *src = p->next; + p->owner = new_owner; + p->next = *dst; + *dst = p; + count++; + } + return count; +} + +void caml_teardown_shared_heap(struct caml_heap_state* heap) { + int i; + int released = 0, released_large = 0; + caml_plat_lock(&pool_freelist.lock); + for (i = 0; i < NUM_SIZECLASSES; i++) { + released += + move_all_pools(&heap->avail_pools[i], + &pool_freelist.global_avail_pools[i], NULL); + + released += + move_all_pools(&heap->full_pools[i], + &pool_freelist.global_full_pools[i], NULL); + + /* should be swept by now */ + CAMLassert(!heap->unswept_avail_pools[i]); + CAMLassert(!heap->unswept_full_pools[i]); + } + CAMLassert(!heap->unswept_large); + while (heap->swept_large) { + large_alloc* a = heap->swept_large; + heap->swept_large = a->next; + a->next = pool_freelist.global_large; + pool_freelist.global_large = a; + released_large++; + } + caml_accum_heap_stats(&pool_freelist.stats, &heap->stats); + caml_plat_unlock(&pool_freelist.lock); + caml_stat_free(heap); + caml_gc_log("Shutdown shared heap. Released %d active pools, %d large", + released, released_large); +} + +void caml_sample_heap_stats(struct caml_heap_state* local, struct heap_stats* h) +{ + *h = local->stats; +} + + +/* Allocating and deallocating pools from the global freelist. 
*/ + +#define POOLS_PER_ALLOCATION 16 +static pool* pool_acquire(struct caml_heap_state* local) { + pool* r; + + caml_plat_lock(&pool_freelist.lock); + if (!pool_freelist.free) { + void* mem = caml_mem_map(Bsize_wsize(POOL_WSIZE) * POOLS_PER_ALLOCATION, + Bsize_wsize(POOL_WSIZE), 0 /* allocate */); + int i; + if (mem) { + CAMLassert(pool_freelist.free == NULL); + for (i=0; inext = pool_freelist.free; + r->owner = NULL; + pool_freelist.free = r; + } + } + } + r = pool_freelist.free; + if (r) + pool_freelist.free = r->next; + caml_plat_unlock(&pool_freelist.lock); + + if (r) CAMLassert (r->owner == NULL); + return r; +} + +static void pool_release(struct caml_heap_state* local, + pool* pool, + sizeclass sz) { + pool->owner = NULL; + CAMLassert(pool->sz == sz); + local->stats.pool_words -= POOL_WSIZE; + local->stats.pool_frag_words -= POOL_HEADER_WSIZE + wastage_sizeclass[sz]; + /* TODO: give free pools back to the OS. Issue #698 */ + caml_plat_lock(&pool_freelist.lock); + pool->next = pool_freelist.free; + pool_freelist.free = pool; + caml_plat_unlock(&pool_freelist.lock); +} + +static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s) { + value* p = (value*)((char*)a + POOL_HEADER_SZ); + value* end = (value*)a + POOL_WSIZE; + mlsize_t wh = wsize_sizeclass[sz]; + s->pool_frag_words += Wsize_bsize(POOL_HEADER_SZ); + + while (p + wh <= end) { + header_t hd = (header_t)*p; + if (hd) { + s->pool_live_words += Whsize_hd(hd); + s->pool_frag_words += wh - Whsize_hd(hd); + s->pool_live_blocks++; + } + + p += wh; + } + CAMLassert(end - p == wastage_sizeclass[sz]); + s->pool_frag_words += end - p; + s->pool_words += POOL_WSIZE; +} + +/* Initialize a pool and its object freelist */ +Caml_inline void pool_initialize(pool* r, + sizeclass sz, + caml_domain_state* owner) +{ + mlsize_t wh = wsize_sizeclass[sz]; + value* p = (value*)((char*)r + POOL_HEADER_SZ); + value* end = (value*)((char*)r + Bsize_wsize(POOL_WSIZE)); + + r->next = 0; + r->owner = owner; + 
r->next_obj = 0; + r->sz = sz; + + p[0] = 0; + p[1] = 0; + p += wh; + + while (p + wh <= end) { + p[0] = 0; /* zero header indicates free object */ + p[1] = (value)(p - wh); + p += wh; + } + r->next_obj = p - wh; +} + +/* Allocating an object from a pool */ +static intnat pool_sweep(struct caml_heap_state* local, + pool**, + sizeclass sz , + int release_to_global_pool); + +/* Adopt pool from the pool_freelist avail and full pools + to satisfy an alloction */ +static pool* pool_global_adopt(struct caml_heap_state* local, sizeclass sz) +{ + pool* r = NULL; + int adopted_pool = 0; + + /* probably no available pools out there to be had */ + if( !pool_freelist.global_avail_pools[sz] && + !pool_freelist.global_full_pools[sz] ) + return NULL; + + /* Haven't managed to find a pool locally, try the global ones */ + caml_plat_lock(&pool_freelist.lock); + if( pool_freelist.global_avail_pools[sz] ) { + r = pool_freelist.global_avail_pools[sz]; + + if( r ) { + struct heap_stats tmp_stats = { 0 }; + pool_freelist.global_avail_pools[sz] = r->next; + r->next = 0; + local->avail_pools[sz] = r; + + #ifdef DEBUG + { + value* next_obj = r->next_obj; + while( next_obj ) { + CAMLassert(next_obj[0] == 0); + next_obj = (value*)next_obj[1]; + } + } + #endif + + calc_pool_stats(r, sz, &tmp_stats); + caml_accum_heap_stats(&local->stats, &tmp_stats); + caml_remove_heap_stats(&pool_freelist.stats, &tmp_stats); + + if (local->stats.pool_words > local->stats.pool_max_words) + local->stats.pool_max_words = local->stats.pool_words; + } + } + + /* There were no global avail pools, so let's adopt one of the full ones and + try our luck sweeping it later on */ + if( !r ) { + struct heap_stats tmp_stats = { 0 }; + + r = pool_freelist.global_full_pools[sz]; + + if( r ) { + pool_freelist.global_full_pools[sz] = r->next; + r->next = local->full_pools[sz]; + local->full_pools[sz] = r; + + calc_pool_stats(r, sz, &tmp_stats); + caml_accum_heap_stats(&local->stats, &tmp_stats); + 
caml_remove_heap_stats(&pool_freelist.stats, &tmp_stats); + + adopted_pool = 1; + r = 0; // this pool is full + + if (local->stats.pool_words > local->stats.pool_max_words) { + local->stats.pool_max_words = local->stats.pool_words; + } + } + } + + caml_plat_unlock(&pool_freelist.lock); + + if( !r && adopted_pool ) { + local->owner->major_work_todo -= + pool_sweep(local, &local->full_pools[sz], sz, 0); + r = local->avail_pools[sz]; + } + return r; +} + +/* Allocating an object from a pool */ +static pool* pool_find(struct caml_heap_state* local, sizeclass sz) { + pool* r; + + /* Hopefully we have a pool we can use directly */ + r = local->avail_pools[sz]; + if (r) return r; + + /* Otherwise, try to sweep until we find one */ + while (!local->avail_pools[sz] && local->unswept_avail_pools[sz]) { + local->owner->major_work_todo -= + pool_sweep(local, &local->unswept_avail_pools[sz], sz, 0); + } + + r = local->avail_pools[sz]; + if (r) return r; + + /* Haven't managed to find a pool locally, try the global ones */ + r = pool_global_adopt(local, sz); + if (r) return r; + + /* Failing that, we need to allocate a new pool */ + r = pool_acquire(local); + if (!r) return 0; /* if we can't allocate, give up */ + + local->stats.pool_words += POOL_WSIZE; + if (local->stats.pool_words > local->stats.pool_max_words) + local->stats.pool_max_words = local->stats.pool_words; + local->stats.pool_frag_words += POOL_HEADER_WSIZE + wastage_sizeclass[sz]; + + /* Having allocated a new pool, set it up for size sz */ + local->avail_pools[sz] = r; + pool_initialize(r, sz, local->owner); + + return r; +} + +static void* pool_allocate(struct caml_heap_state* local, sizeclass sz) { + value* p; + value* next; + pool* r = pool_find(local, sz); + + if (!r) return 0; + + + p = r->next_obj; + next = (value*)p[1]; + r->next_obj = next; + CAMLassert(p[0] == 0); + if (!next) { + local->avail_pools[sz] = r->next; + r->next = local->full_pools[sz]; + local->full_pools[sz] = r; + } + + 
CAMLassert(r->next_obj == 0 || *r->next_obj == 0); + return p; +} + +static void* large_allocate(struct caml_heap_state* local, mlsize_t sz) { + large_alloc* a = malloc(sz + LARGE_ALLOC_HEADER_SZ); + if (!a) return NULL; + local->stats.large_words += Wsize_bsize(sz + LARGE_ALLOC_HEADER_SZ); + if (local->stats.large_words > local->stats.large_max_words) + local->stats.large_max_words = local->stats.large_words; + local->stats.large_blocks++; + a->owner = local->owner; + a->next = local->swept_large; + local->swept_large = a; + return (char*)a + LARGE_ALLOC_HEADER_SZ; +} + +value* caml_shared_try_alloc(struct caml_heap_state* local, mlsize_t wosize, + tag_t tag, int pinned) +{ + mlsize_t whsize = Whsize_wosize(wosize); + value* p; + uintnat colour; + + CAMLassert (wosize > 0); + CAMLassert (tag != Infix_tag); + if (whsize <= SIZECLASS_MAX) { + struct heap_stats* s; + sizeclass sz = sizeclass_wsize[whsize]; + CAMLassert(wsize_sizeclass[sz] >= whsize); + p = pool_allocate(local, sz); + if (!p) return 0; + s = &local->stats; + s->pool_live_blocks++; + s->pool_live_words += whsize; + s->pool_frag_words += wsize_sizeclass[sz] - whsize; + } else { + p = large_allocate(local, Bsize_wsize(whsize)); + if (!p) return 0; + } + colour = pinned ? 
NOT_MARKABLE : caml_global_heap_state.MARKED; + Hd_hp (p) = Make_header(wosize, tag, colour); +#ifdef DEBUG + { + int i; + for (i = 0; i < wosize; i++) { + Field(Val_hp(p), i) = Debug_free_major; + } + } +#endif + return p; +} + +struct pool* caml_pool_of_shared_block(value v) +{ + mlsize_t whsize; + CAMLassert (Is_block(v) && !Is_young(v)); + whsize = Whsize_wosize(Wosize_val(v)); + if (whsize > 0 && whsize <= SIZECLASS_MAX) { + return (pool*)((uintnat)v &~(POOL_WSIZE * sizeof(value) - 1)); + } else { + return 0; + } +} + +/* Sweeping */ + +static intnat pool_sweep(struct caml_heap_state* local, pool** plist, + sizeclass sz, int release_to_global_pool) { + intnat work = 0; + pool* a = *plist; + if (!a) return 0; + *plist = a->next; + + { + value* p = (value*)((char*)a + POOL_HEADER_SZ); + value* end = (value*)a + POOL_WSIZE; + mlsize_t wh = wsize_sizeclass[sz]; + int all_used = 1; + struct heap_stats* s = &local->stats; + + while (p + wh <= end) { + header_t hd = (header_t)*p; + if (hd == 0) { + /* already on freelist */ + all_used = 0; + } else if (Has_status_hd(hd, caml_global_heap_state.GARBAGE)) { + CAMLassert(Whsize_hd(hd) <= wh); + if (Tag_hd (hd) == Custom_tag) { + void (*final_fun)(value) = Custom_ops_val(Val_hp(p))->finalize; + if (final_fun != NULL) final_fun(Val_hp(p)); + } + /* add to freelist */ + p[0] = 0; + p[1] = (value)a->next_obj; + CAMLassert(Is_block((value)p)); +#ifdef DEBUG + { + int i; + mlsize_t wo = Wosize_whsize(wh); + for (i = 2; i < wo; i++) { + Field(Val_hp(p), i) = Debug_free_major; + } + } +#endif + a->next_obj = p; + all_used = 0; + /* update stats */ + s->pool_live_blocks--; + s->pool_live_words -= Whsize_hd(hd); + local->owner->swept_words += Whsize_hd(hd); + s->pool_frag_words -= (wh - Whsize_hd(hd)); + } else { + /* still live, the pool can't be released to the global freelist */ + release_to_global_pool = 0; + } + p += wh; + work += wh; + } + + if (release_to_global_pool) { + pool_release(local, a, sz); + } else { + pool** list 
= all_used ? &local->full_pools[sz] : &local->avail_pools[sz]; + a->next = *list; + *list = a; + } + } + + return work; +} + +static intnat large_alloc_sweep(struct caml_heap_state* local) { + value* p; + header_t hd; + large_alloc* a = local->unswept_large; + if (!a) return 0; + local->unswept_large = a->next; + + p = (value*)((char*)a + LARGE_ALLOC_HEADER_SZ); + hd = (header_t)*p; + if (Has_status_hd(hd, caml_global_heap_state.GARBAGE)) { + if (Tag_hd (hd) == Custom_tag) { + void (*final_fun)(value) = Custom_ops_val(Val_hp(p))->finalize; + if (final_fun != NULL) final_fun(Val_hp(p)); + } + + local->stats.large_words -= + Whsize_hd(hd) + Wsize_bsize(LARGE_ALLOC_HEADER_SZ); + local->owner->swept_words += + Whsize_hd(hd) + Wsize_bsize(LARGE_ALLOC_HEADER_SZ); + local->stats.large_blocks--; + free(a); + } else { + a->next = local->swept_large; + local->swept_large = a; + } + + return Whsize_hd(hd); +} + +static void verify_swept(struct caml_heap_state*); + +intnat caml_sweep(struct caml_heap_state* local, intnat work) { + /* Sweep local pools */ + while (work > 0 && local->next_to_sweep < NUM_SIZECLASSES) { + sizeclass sz = local->next_to_sweep; + intnat full_sweep_work = 0; + intnat avail_sweep_work = + pool_sweep(local, &local->unswept_avail_pools[sz], sz, 1); + work -= avail_sweep_work; + + if (work > 0) { + full_sweep_work = pool_sweep(local, + &local->unswept_full_pools[sz], + sz, 1); + + work -= full_sweep_work; + } + + if(full_sweep_work+avail_sweep_work == 0) { + local->next_to_sweep++; + } + } + + /* Sweep global pools */ + while (work > 0 && local->unswept_large) { + work -= large_alloc_sweep(local); + } + + if (caml_params->verify_heap && work > 0) { + /* sweeping is complete, check everything worked */ + verify_swept(local); + } + return work; +} + +uintnat caml_heap_size(struct caml_heap_state* local) { + return Bsize_wsize(local->stats.pool_words + local->stats.large_words); +} + +uintnat caml_top_heap_words(struct caml_heap_state* local) { + return 
local->stats.pool_max_words + local->stats.large_max_words; +} + + +uintnat caml_heap_blocks(struct caml_heap_state* local) { + return local->stats.pool_live_blocks + local->stats.large_blocks; +} + +void caml_redarken_pool(struct pool* r, scanning_action f, void* fdata) { + mlsize_t wh = wsize_sizeclass[r->sz]; + value* p = (value*)((char*)r + POOL_HEADER_SZ); + value* end = (value*)((char*)r + Bsize_wsize(POOL_WSIZE)); + + while (p + wh <= end) { + header_t hd = p[0]; + if (hd != 0 && Has_status_hd(hd, caml_global_heap_state.MARKED)) { + f(fdata, Val_hp(p), 0); + } + p += wh; + } +} + + +static const header_t atoms[256] = { +#define A(i) Make_header(0, i, NOT_MARKABLE) +A(0),A(1),A(2),A(3),A(4),A(5),A(6),A(7),A(8),A(9),A(10), +A(11),A(12),A(13),A(14),A(15),A(16),A(17),A(18),A(19),A(20), +A(21),A(22),A(23),A(24),A(25),A(26),A(27),A(28),A(29),A(30), +A(31),A(32),A(33),A(34),A(35),A(36),A(37),A(38),A(39),A(40), +A(41),A(42),A(43),A(44),A(45),A(46),A(47),A(48),A(49),A(50), +A(51),A(52),A(53),A(54),A(55),A(56),A(57),A(58),A(59),A(60), +A(61),A(62),A(63),A(64),A(65),A(66),A(67),A(68),A(69),A(70), +A(71),A(72),A(73),A(74),A(75),A(76),A(77),A(78),A(79),A(80), +A(81),A(82),A(83),A(84),A(85),A(86),A(87),A(88),A(89),A(90), +A(91),A(92),A(93),A(94),A(95),A(96),A(97),A(98),A(99),A(100), +A(101),A(102),A(103),A(104),A(105),A(106),A(107),A(108),A(109), +A(110),A(111),A(112),A(113),A(114),A(115),A(116),A(117),A(118), +A(119),A(120),A(121),A(122),A(123),A(124),A(125),A(126),A(127), +A(128),A(129),A(130),A(131),A(132),A(133),A(134),A(135),A(136), +A(137),A(138),A(139),A(140),A(141),A(142),A(143),A(144),A(145), +A(146),A(147),A(148),A(149),A(150),A(151),A(152),A(153),A(154), +A(155),A(156),A(157),A(158),A(159),A(160),A(161),A(162),A(163), +A(164),A(165),A(166),A(167),A(168),A(169),A(170),A(171),A(172), +A(173),A(174),A(175),A(176),A(177),A(178),A(179),A(180),A(181), +A(182),A(183),A(184),A(185),A(186),A(187),A(188),A(189),A(190), 
+A(191),A(192),A(193),A(194),A(195),A(196),A(197),A(198),A(199), +A(200),A(201),A(202),A(203),A(204),A(205),A(206),A(207),A(208), +A(209),A(210),A(211),A(212),A(213),A(214),A(215),A(216),A(217), +A(218),A(219),A(220),A(221),A(222),A(223),A(224),A(225),A(226), +A(227),A(228),A(229),A(230),A(231),A(232),A(233),A(234),A(235), +A(236),A(237),A(238),A(239),A(240),A(241),A(242),A(243),A(244), +A(245),A(246),A(247),A(248),A(249),A(250),A(251),A(252),A(253), +A(254),A(255) +#undef A +}; + +CAMLexport value caml_atom(tag_t tag) { + return Val_hp(&atoms[tag]); +} + +void caml_init_major_heap (asize_t size) { +} + + +/* Verify heap invariants. + + Verification happens just after the heap is cycled during STW, so + everything should be unmarked. If something reachable marked after + cycling the heap, it means that garbage was reachable beforehand. +*/ +struct heap_verify_state { + value* stack; + int stack_len; + int sp; + intnat objs; + struct addrmap seen; +}; + +struct heap_verify_state* caml_verify_begin (void) +{ + struct heap_verify_state init = {0, 0, 0, 0, ADDRMAP_INIT}; + struct heap_verify_state* st = caml_stat_alloc(sizeof init); + *st = init; + return st; +} + +static void verify_push (void* st_v, value v, value* p) +{ + struct heap_verify_state* st = st_v; + if (!Is_block(v)) return; + + if (st->sp == st->stack_len) { + st->stack_len = st->stack_len * 2 + 100; + st->stack = caml_stat_resize(st->stack, + sizeof(value*) * st->stack_len); + } + st->stack[st->sp++] = v; +} + +void caml_verify_root(void* state, value v, value* p) +{ + verify_push(state, v, p); +} + +static void verify_object(struct heap_verify_state* st, value v) { + intnat* entry; + if (!Is_block(v)) return; + + CAMLassert (!Is_young(v)); + CAMLassert (Hd_val(v)); + + if (Tag_val(v) == Infix_tag) { + v -= Infix_offset_val(v); + CAMLassert(Tag_val(v) == Closure_tag); + } + + entry = caml_addrmap_insert_pos(&st->seen, v); + if (*entry != ADDRMAP_NOT_PRESENT) return; + *entry = 1; + + if 
(Has_status_hd(Hd_val(v), NOT_MARKABLE)) return; + st->objs++; + + CAMLassert(Has_status_hd(Hd_val(v), caml_global_heap_state.UNMARKED)); + + if (Tag_val(v) == Cont_tag) { + struct stack_info* stk = Ptr_val(Field(v, 0)); + if (stk != NULL) + caml_scan_stack(verify_push, st, stk, 0); + } else if (Tag_val(v) < No_scan_tag) { + int i = 0; + if (Tag_val(v) == Closure_tag) { + i = Start_env_closinfo(Closinfo_val(v)); + } + for (; i < Wosize_val(v); i++) { + value f = Field(v, i); + if (Is_block(f)) verify_push(st, f, Op_val(v)+i); + } + } +} + +void caml_verify_heap(struct heap_verify_state* st) { + while (st->sp) verify_object(st, st->stack[--st->sp]); + + caml_addrmap_clear(&st->seen); + caml_stat_free(st->stack); + caml_stat_free(st); +} + + +struct mem_stats { + /* unit is words */ + uintnat alloced; + uintnat live; + uintnat free; + uintnat overhead; + + uintnat live_blocks; +}; + +static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) { + value* v; + for (v = a->next_obj; v; v = (value*)v[1]) { + CAMLassert(*v == 0); + } + + { + value* p = (value*)((char*)a + POOL_HEADER_SZ); + value* end = (value*)a + POOL_WSIZE; + mlsize_t wh = wsize_sizeclass[sz]; + s->overhead += Wsize_bsize(POOL_HEADER_SZ); + + while (p + wh <= end) { + header_t hd = (header_t)*p; + CAMLassert(hd == 0 || !Has_status_hd(hd, caml_global_heap_state.GARBAGE)); + if (hd) { + s->live += Whsize_hd(hd); + s->overhead += wh - Whsize_hd(hd); + s->live_blocks++; + } else { + s->free += wh; + } + p += wh; + } + CAMLassert(end - p == wastage_sizeclass[sz]); + s->overhead += end - p; + s->alloced += POOL_WSIZE; + } +} + +static void verify_large(large_alloc* a, struct mem_stats* s) { + for (; a; a = a->next) { + header_t hd = *(header_t*)((char*)a + LARGE_ALLOC_HEADER_SZ); + CAMLassert (!Has_status_hd(hd, caml_global_heap_state.GARBAGE)); + s->alloced += Wsize_bsize(LARGE_ALLOC_HEADER_SZ) + Whsize_hd(hd); + s->overhead += Wsize_bsize(LARGE_ALLOC_HEADER_SZ); + s->live_blocks++; + } +} + +static 
void verify_swept (struct caml_heap_state* local) { + int i; + struct mem_stats pool_stats = {}, large_stats = {}; + + /* sweeping should be done by this point */ + CAMLassert(local->next_to_sweep == NUM_SIZECLASSES); + for (i = 0; i < NUM_SIZECLASSES; i++) { + pool* p; + CAMLassert(local->unswept_avail_pools[i] == NULL && + local->unswept_full_pools[i] == NULL); + for (p = local->avail_pools[i]; p; p = p->next) + verify_pool(p, i, &pool_stats); + for (p = local->full_pools[i]; p; p = p->next) { + CAMLassert(p->next_obj == NULL); + verify_pool(p, i, &pool_stats); + } + } + caml_gc_log("Pooled memory: %" ARCH_INTNAT_PRINTF_FORMAT + "u alloced, %" ARCH_INTNAT_PRINTF_FORMAT + "u free, %" ARCH_INTNAT_PRINTF_FORMAT + "u fragmentation", + pool_stats.alloced, pool_stats.free, pool_stats.overhead); + + verify_large(local->swept_large, &large_stats); + CAMLassert(local->unswept_large == NULL); + caml_gc_log("Large memory: %" ARCH_INTNAT_PRINTF_FORMAT + "u alloced, %" ARCH_INTNAT_PRINTF_FORMAT + "u free, %" ARCH_INTNAT_PRINTF_FORMAT + "u fragmentation", + large_stats.alloced, large_stats.free, large_stats.overhead); + + /* Check stats are being computed correctly */ + CAMLassert(local->stats.pool_words == pool_stats.alloced); + CAMLassert(local->stats.pool_live_words == pool_stats.live); + CAMLassert(local->stats.pool_live_blocks == pool_stats.live_blocks); + CAMLassert(local->stats.pool_frag_words == pool_stats.overhead); + CAMLassert(local->stats.pool_words - + (local->stats.pool_live_words + local->stats.pool_frag_words) + == pool_stats.free); + CAMLassert(local->stats.large_words == large_stats.alloced); + CAMLassert(local->stats.large_blocks == large_stats.live_blocks); +} + +void caml_cycle_heap_stw (void) { + struct global_heap_state oldg = caml_global_heap_state; + struct global_heap_state newg; + newg.UNMARKED = oldg.MARKED; + newg.GARBAGE = oldg.UNMARKED; + newg.MARKED = oldg.GARBAGE; /* should be empty because + garbage was swept */ + caml_global_heap_state = 
newg; +} + +void caml_cycle_heap(struct caml_heap_state* local) { + int i, received_p = 0, received_l = 0; + + caml_gc_log("Cycling heap [%02d]", local->owner->id); + for (i = 0; i < NUM_SIZECLASSES; i++) { + CAMLassert(local->unswept_avail_pools[i] == NULL); + local->unswept_avail_pools[i] = local->avail_pools[i]; + local->avail_pools[i] = NULL; + CAMLassert(local->unswept_full_pools[i] == NULL); + local->unswept_full_pools[i] = local->full_pools[i]; + local->full_pools[i] = NULL; + } + CAMLassert(local->unswept_large == NULL); + local->unswept_large = local->swept_large; + local->swept_large = NULL; + + caml_plat_lock(&pool_freelist.lock); + for (i = 0; i < NUM_SIZECLASSES; i++) { + received_p += move_all_pools(&pool_freelist.global_avail_pools[i], + &local->unswept_avail_pools[i], + local->owner); + received_p += move_all_pools(&pool_freelist.global_full_pools[i], + &local->unswept_full_pools[i], + local->owner); + } + while (pool_freelist.global_large) { + large_alloc* a = pool_freelist.global_large; + pool_freelist.global_large = a->next; + a->owner = local->owner; + a->next = local->unswept_large; + local->unswept_large = a; + received_l++; + } + if (received_p || received_l) { + caml_accum_heap_stats(&local->stats, &pool_freelist.stats); + memset(&pool_freelist.stats, 0, sizeof(pool_freelist.stats)); + } + caml_plat_unlock(&pool_freelist.lock); + if (received_p || received_l) + caml_gc_log("Received %d new pools, %d new large allocs", + received_p, received_l); + + local->next_to_sweep = 0; +} diff --git a/runtime/signals.c b/runtime/signals.c index 7cf746f275a6..df80557c9beb 100644 --- a/runtime/signals.c +++ b/runtime/signals.c @@ -37,32 +37,18 @@ #define NSIG 64 #endif -CAMLexport int volatile caml_something_to_do = 0; - /* The set of pending signals (received but not yet processed) */ -static intnat volatile signals_are_pending = 0; -CAMLexport intnat volatile caml_pending_signals[NSIG]; +CAMLexport atomic_intnat caml_pending_signals[NSIG]; +static 
caml_plat_mutex signal_install_mutex = CAML_PLAT_MUTEX_INITIALIZER; -#ifdef POSIX_SIGNALS -/* This wrapper makes [sigprocmask] compatible with - [pthread_sigmask]. Indeed, the latter returns the error code while - the former sets [errno]. - */ -static int sigprocmask_wrapper(int how, const sigset_t *set, sigset_t *oldset) { - if(sigprocmask(how, set, oldset) != 0) return errno; - else return 0; -} - -CAMLexport int (*caml_sigmask_hook)(int, const sigset_t *, sigset_t *) - = sigprocmask_wrapper; -#endif - -static int check_for_pending_signals(void) +int caml_check_for_pending_signals(void) { int i; + for (i = 0; i < NSIG; i++) { - if (caml_pending_signals[i]) return 1; + if (atomic_load_explicit(&caml_pending_signals[i], memory_order_seq_cst)) + return 1; } return 0; } @@ -72,75 +58,81 @@ static int check_for_pending_signals(void) CAMLexport value caml_process_pending_signals_exn(void) { int i; + intnat specific_signal_pending; + value exn; #ifdef POSIX_SIGNALS sigset_t set; #endif - if(!signals_are_pending) - return Val_unit; - signals_are_pending = 0; - /* Check that there is indeed a pending signal before issuing the - syscall in [caml_sigmask_hook]. */ - if (!check_for_pending_signals()) + syscall in [pthread_sigmask]. 
*/ + if (!caml_check_for_pending_signals()) return Val_unit; #ifdef POSIX_SIGNALS - caml_sigmask_hook(/* dummy */ SIG_BLOCK, NULL, &set); + pthread_sigmask(/* dummy */ SIG_BLOCK, NULL, &set); #endif for (i = 0; i < NSIG; i++) { - if (!caml_pending_signals[i]) + if ( atomic_load_explicit + (&caml_pending_signals[i], memory_order_seq_cst) == 0 ) continue; #ifdef POSIX_SIGNALS if(sigismember(&set, i)) continue; #endif - caml_pending_signals[i] = 0; - { - value exn = caml_execute_signal_exn(i, 0); + again: + specific_signal_pending = atomic_load_explicit + (&caml_pending_signals[i], memory_order_seq_cst); + if( specific_signal_pending > 0 ) { + if( !atomic_compare_exchange_strong( + &caml_pending_signals[i], + &specific_signal_pending, 0) ) { + /* We failed our CAS because another thread beat us to processing + this signal. Try again to see if there are more of this signal + to process. */ + goto again; + } + + exn = caml_execute_signal_exn(i, 0); if (Is_exception_result(exn)) return exn; } } return Val_unit; } -CAMLno_tsan /* When called from [caml_record_signal], these memory - accesses may not be synchronized. */ -void caml_set_action_pending(void) -{ - caml_something_to_do = 1; - - /* When this function is called without [caml_c_call] (e.g., in - [caml_modify]), this is only moderately effective on ports that cache - [Caml_state->young_limit] in a register, so it may take a while before the - register is reloaded from [Caml_state->young_limit]. */ - Caml_state->young_limit = Caml_state->young_alloc_end; +CAMLexport void caml_process_pending_signals(void) { + value exn = caml_process_pending_signals_exn(); + caml_raise_if_exception(exn); } /* Record the delivery of a signal, and arrange for it to be processed as soon as possible: - - via caml_something_to_do, processed in - caml_process_pending_actions_exn. + - via the pending signal counters, processed in + caml_process_pending_signals_exn. 
- by playing with the allocation limit, processed in - caml_garbage_collection and caml_alloc_small_dispatch. + caml_garbage_collection */ -CAMLno_tsan CAMLexport void caml_record_signal(int signal_number) { - caml_pending_signals[signal_number] = 1; - signals_are_pending = 1; - caml_set_action_pending(); + atomic_store_explicit + (&caml_pending_signals[signal_number], 1, memory_order_seq_cst); + + caml_interrupt_self(); } /* Management of blocking sections. */ static void caml_enter_blocking_section_default(void) { + caml_bt_exit_ocaml(); + caml_release_domain_lock(); } static void caml_leave_blocking_section_default(void) { + caml_bt_enter_ocaml(); + caml_acquire_domain_lock(); } CAMLexport void (*caml_enter_blocking_section_hook)(void) = @@ -148,16 +140,15 @@ CAMLexport void (*caml_enter_blocking_section_hook)(void) = CAMLexport void (*caml_leave_blocking_section_hook)(void) = caml_leave_blocking_section_default; -CAMLno_tsan /* The read of [caml_something_to_do] is not synchronized. */ CAMLexport void caml_enter_blocking_section(void) { while (1){ /* Process all pending signals now */ - caml_raise_if_exception(caml_process_pending_signals_exn()); + caml_process_pending_signals(); caml_enter_blocking_section_hook (); /* Check again for pending signals. If none, done; otherwise, try again */ - if (! signals_are_pending) break; + if (!caml_check_for_pending_signals()) break; caml_leave_blocking_section_hook (); } } @@ -174,29 +165,21 @@ CAMLexport void caml_leave_blocking_section(void) saved_errno = errno; caml_leave_blocking_section_hook (); - /* Some other thread may have switched - [signals_are_pending] to 0 even though there are still - pending signals (masked in the other thread). To handle this - case, we force re-examination of all signals by setting it back - to 1. 
- - Another case where this is necessary (even in a single threaded - setting) is when the blocking section unmasks a pending signal: - If the signal is pending and masked but has already been - examined by [caml_process_pending_signals_exn], then - [signals_are_pending] is 0 but the signal needs to be - handled at this point. */ - if (check_for_pending_signals()) { - signals_are_pending = 1; - caml_set_action_pending(); - } - errno = saved_errno; } -/* Execute a signal handler immediately */ +static value caml_signal_handlers; + +void caml_init_signal_handling(void) { + mlsize_t i; -static value caml_signal_handlers = 0; + caml_signal_handlers = caml_alloc_shr(NSIG, 0); + for (i = 0; i < NSIG; i++) + Field(caml_signal_handlers, i) = Val_unit; + caml_register_generational_global_root(&caml_signal_handlers); +} + +/* Execute a signal handler immediately */ value caml_execute_signal_exn(int signal_number, int in_signal_handler) { @@ -208,7 +191,7 @@ value caml_execute_signal_exn(int signal_number, int in_signal_handler) the original signal mask */ sigemptyset(&nsigs); sigaddset(&nsigs, signal_number); - caml_sigmask_hook(SIG_BLOCK, &nsigs, &sigs); + pthread_sigmask(SIG_BLOCK, &nsigs, &sigs); #endif handler = Field(caml_signal_handlers, signal_number); res = caml_callback_exn( @@ -217,120 +200,40 @@ value caml_execute_signal_exn(int signal_number, int in_signal_handler) #ifdef POSIX_SIGNALS if (! 
in_signal_handler) { /* Restore the original signal mask */ - caml_sigmask_hook(SIG_SETMASK, &sigs, NULL); + pthread_sigmask(SIG_SETMASK, &sigs, NULL); } else if (Is_exception_result(res)) { /* Restore the original signal mask and unblock the signal itself */ sigdelset(&sigs, signal_number); - caml_sigmask_hook(SIG_SETMASK, &sigs, NULL); + pthread_sigmask(SIG_SETMASK, &sigs, NULL); } #endif return res; } -void caml_update_young_limit (void) -{ - CAMLassert(Caml_state->young_alloc_start <= caml_memprof_young_trigger && - caml_memprof_young_trigger <= Caml_state->young_alloc_end); - CAMLassert(Caml_state->young_alloc_start <= Caml_state->young_trigger && - Caml_state->young_trigger < Caml_state->young_alloc_end); - - /* The minor heap grows downwards. The first trigger is the largest one. */ - Caml_state->young_limit = - caml_memprof_young_trigger < Caml_state->young_trigger ? - Caml_state->young_trigger : caml_memprof_young_trigger; - - if(caml_something_to_do) - Caml_state->young_limit = Caml_state->young_alloc_end; -} - /* Arrange for a garbage collection to be performed as soon as possible */ void caml_request_major_slice (void) { Caml_state->requested_major_slice = 1; - caml_set_action_pending(); + caml_interrupt_self(); } void caml_request_minor_gc (void) { Caml_state->requested_minor_gc = 1; - caml_set_action_pending(); -} - -value caml_do_pending_actions_exn(void) -{ - value exn; - - caml_something_to_do = 0; - - // Do any pending minor collection or major slice - caml_check_urgent_gc(Val_unit); - - caml_update_young_limit(); - - // Call signal handlers first - exn = caml_process_pending_signals_exn(); - if (Is_exception_result(exn)) goto exception; - - // Call memprof callbacks - exn = caml_memprof_handle_postponed_exn(); - if (Is_exception_result(exn)) goto exception; - - // Call finalisers - exn = caml_final_do_calls_exn(); - if (Is_exception_result(exn)) goto exception; - - return Val_unit; - -exception: - /* If an exception is raised during an 
asynchronous callback, then - it might be the case that we did not run all the callbacks we - needed. Therefore, we set [caml_something_to_do] again in order - to force reexamination of callbacks. */ - caml_set_action_pending(); - return exn; + caml_interrupt_self(); } -CAMLno_tsan /* The access to [caml_something_to_do] is not synchronized. */ -Caml_inline value process_pending_actions_with_root_exn(value extra_root) +CAMLextern value caml_process_pending_signals_with_root_exn(value extra_root) { - if (caml_something_to_do) { - CAMLparam1(extra_root); - value exn = caml_do_pending_actions_exn(); - if (Is_exception_result(exn)) - CAMLreturn(exn); - CAMLdrop; - } + CAMLparam1(extra_root); + value exn = caml_process_pending_signals_exn(); + if (Is_exception_result(exn)) + CAMLreturn(exn); + CAMLdrop; return extra_root; } -CAMLno_tsan /* The access to [caml_something_to_do] is not synchronized. */ -int caml_check_pending_actions() -{ - return caml_something_to_do; -} - -value caml_process_pending_actions_with_root_exn(value extra_root) -{ - return process_pending_actions_with_root_exn(extra_root); -} - -value caml_process_pending_actions_with_root(value extra_root) -{ - value res = process_pending_actions_with_root_exn(extra_root); - return caml_raise_if_exception(res); -} - -CAMLexport value caml_process_pending_actions_exn(void) -{ - return process_pending_actions_with_root_exn(Val_unit); -} - -CAMLexport void caml_process_pending_actions(void) -{ - value exn = process_pending_actions_with_root_exn(Val_unit); - caml_raise_if_exception(exn); -} /* OS-independent numbering of signals */ @@ -442,12 +345,60 @@ CAMLexport int caml_rev_convert_signal_number(int signo) return signo; } +int caml_init_signal_stack(void) +{ +#ifdef POSIX_SIGNALS + stack_t stk; + stk.ss_flags = 0; + stk.ss_size = SIGSTKSZ; + stk.ss_sp = caml_stat_alloc_noexc(stk.ss_size); + if(stk.ss_sp == NULL) { + return -1; + } + if (sigaltstack(&stk, NULL) < 0) { + caml_stat_free(stk.ss_sp); + return -1; + 
} + + /* gprof installs a signal handler for SIGPROF. + Make it run on the alternate signal stack, to prevent segfaults. */ + { + struct sigaction act; + sigaction(SIGPROF, NULL, &act); + if ((act.sa_flags & SA_SIGINFO) || + (act.sa_handler != SIG_IGN && act.sa_handler != SIG_DFL)) { + /* found a handler */ + if ((act.sa_flags & SA_ONSTACK) == 0) { + act.sa_flags |= SA_ONSTACK; + sigaction(SIGPROF, &act, NULL); + } + } + } +#endif + return 0; +} + +void caml_free_signal_stack(void) +{ +#ifdef POSIX_SIGNALS + stack_t stk, disable = {0}; + disable.ss_flags = SS_DISABLE; + /* POSIX says ss_size is ignored when SS_DISABLE is set, + but OSX/Darwin fails if the size isn't set. */ + disable.ss_size = SIGSTKSZ; + if (sigaltstack(&disable, &stk) < 0) { + caml_fatal_error_arg("Failed to reset signal stack: %s", strerror(errno)); + } + caml_stat_free(stk.ss_sp); +#endif +} + /* Installation of a signal handler (as per [Sys.signal]) */ CAMLprim value caml_install_signal_handler(value signal_number, value action) { CAMLparam2 (signal_number, action); - CAMLlocal1 (res); + CAMLlocal2 (res, tmp_signal_handlers); int sig, act, oldact; sig = caml_convert_signal_number(Int_val(signal_number)); @@ -480,12 +431,21 @@ CAMLprim value caml_install_signal_handler(value signal_number, value action) caml_sys_error(NO_ARG); } if (Is_block(action)) { + /* Speculatively allocate this so we don't hold the lock for + a GC */ + if (caml_signal_handlers == 0) { + tmp_signal_handlers = caml_alloc(NSIG, 0); + } + caml_plat_lock(&signal_install_mutex); if (caml_signal_handlers == 0) { - caml_signal_handlers = caml_alloc(NSIG, 0); + /* caml_alloc cannot raise asynchronous exceptions from signals + so this is safe */ + caml_signal_handlers = tmp_signal_handlers; caml_register_global_root(&caml_signal_handlers); } caml_modify(&Field(caml_signal_handlers, sig), Field(action, 0)); + caml_plat_unlock(&signal_install_mutex); } - caml_raise_if_exception(caml_process_pending_signals_exn()); + 
caml_process_pending_signals(); CAMLreturn (res); } diff --git a/runtime/signals_byt.c b/runtime/signals_byt.c index 439fb5640481..b0a2187a3e26 100644 --- a/runtime/signals_byt.c +++ b/runtime/signals_byt.c @@ -80,8 +80,3 @@ int caml_set_signal_action(int signo, int action) else return 0; } - -CAMLexport int caml_setup_stack_overflow_detection(void) { return 0; } -CAMLexport int caml_stop_stack_overflow_detection(void) { return 0; } -CAMLexport void caml_init_signals(void) { } -CAMLexport void caml_terminate_signals(void) { } diff --git a/runtime/signals_nat.c b/runtime/signals_nat.c index 76e6e179be3b..12c431c85969 100644 --- a/runtime/signals_nat.c +++ b/runtime/signals_nat.c @@ -27,15 +27,15 @@ #include #include #include "caml/codefrag.h" +#include "caml/domain.h" #include "caml/fail.h" +#include "caml/fiber.h" +#include "caml/frame_descriptors.h" #include "caml/memory.h" #include "caml/osdeps.h" #include "caml/signals.h" -#include "caml/signals_machdep.h" #include "signals_osdep.h" #include "caml/stack.h" -#include "caml/memprof.h" -#include "caml/finalise.h" #ifndef NSIG #define NSIG 64 @@ -46,7 +46,6 @@ typedef void (*signal_handler)(int signo); #ifdef _WIN32 extern signal_handler caml_win32_signal(int sig, signal_handler action); #define signal(sig,act) caml_win32_signal(sig,act) -extern void caml_win32_overflow_detection(); #endif /* This routine is the common entry point for garbage collection @@ -61,40 +60,66 @@ extern void caml_win32_overflow_detection(); void caml_garbage_collection(void) { frame_descr* d; - intnat allocsz = 0, i, nallocs; - unsigned char* alloc_len; + intnat allocsz = 0; + char *sp; + uintnat retaddr; + intnat whsize; + + caml_frame_descrs fds = caml_get_frame_descrs(); + struct stack_info* stack = Caml_state->current_stack; + + sp = (char*)stack->sp; + retaddr = *(uintnat*)sp; { /* Find the frame descriptor for the current allocation */ - uintnat h = Hash_retaddr(Caml_state->last_return_address); + uintnat h = Hash_retaddr(retaddr, 
fds.mask); while (1) { - d = caml_frame_descriptors[h]; - if (d->retaddr == Caml_state->last_return_address) break; - h = (h + 1) & caml_frame_descriptors_mask; + d = fds.descriptors[h]; + if (d->retaddr == retaddr) break; + h = (h + 1) & fds.mask; } /* Must be an allocation frame */ CAMLassert(d && d->frame_size != 0xFFFF && (d->frame_size & 2)); } - /* Compute the total allocation size at this point, - including allocations combined by Comballoc */ - alloc_len = (unsigned char*)(&d->live_ofs[d->num_live]); - nallocs = *alloc_len++; + { /* Compute the total allocation size at this point, + including allocations combined by Comballoc */ + unsigned char* alloc_len = (unsigned char*)(&d->live_ofs[d->num_live]); + int i, nallocs = *alloc_len++; - if (nallocs == 0) { - /* This is a poll */ - caml_process_pending_actions(); - } - else - { - for (i = 0; i < nallocs; i++) { - allocsz += Whsize_wosize(Wosize_encoded_alloc_len(alloc_len[i])); + if (nallocs == 0) { + /* This is a poll */ + caml_process_pending_actions(); + return; + } + else + { + for (i = 0; i < nallocs; i++) { + allocsz += Whsize_wosize(Wosize_encoded_alloc_len(alloc_len[i])); + } + /* We have computed whsize (including header) + but need wosize (without) */ + allocsz -= 1; } - /* We have computed whsize (including header), but need wosize (without) */ - allocsz -= 1; + whsize = Whsize_wosize(allocsz); - caml_alloc_small_dispatch(allocsz, CAML_DO_TRACK | CAML_FROM_CAML, - nallocs, alloc_len); + /* Put the young pointer back to what is was before our tiggering + allocation */ + Caml_state->young_ptr += whsize; + + /* When caml_garbage_collection returns, we assume there is enough space in + the minor heap for the triggering allocation. Due to finalisers in the + major heap, it is possible for there to be a sequence of events where a + single call to caml_handle_gc_interrupt does not lead to that. We do it + in a loop to ensure it. 
*/ + do { + caml_process_pending_actions(); + } while + ( (uintnat)(Caml_state->young_ptr - whsize) <= Caml_state->young_limit ); + + /* Re-do the allocation: we now have enough space in the minor heap. */ + Caml_state->young_ptr -= whsize; } } @@ -153,204 +178,3 @@ int caml_set_signal_action(int signo, int action) else return 0; } - -/* Machine- and OS-dependent handling of bound check trap */ - -#if defined(TARGET_power) \ - || defined(TARGET_s390x) -DECLARE_SIGNAL_HANDLER(trap_handler) -{ -#if defined(SYS_rhapsody) - /* Unblock SIGTRAP */ - { sigset_t mask; - sigemptyset(&mask); - sigaddset(&mask, SIGTRAP); - caml_sigmask_hook(SIG_UNBLOCK, &mask, NULL); - } -#endif - Caml_state->exception_pointer = (char *) CONTEXT_EXCEPTION_POINTER; - Caml_state->young_ptr = (value *) CONTEXT_YOUNG_PTR; - Caml_state->bottom_of_stack = (char *) CONTEXT_SP; - Caml_state->last_return_address = (uintnat) CONTEXT_PC; - caml_array_bound_error(); -} -#endif - -/* Machine- and OS-dependent handling of stack overflow */ - -#ifdef HAS_STACK_OVERFLOW_DETECTION -#ifndef CONTEXT_SP -#error "CONTEXT_SP is required if HAS_STACK_OVERFLOW_DETECTION is defined" -#endif - -/* Code compiled with ocamlopt never accesses more than - EXTRA_STACK bytes below the stack pointer. */ -#define EXTRA_STACK 256 - -#ifdef RETURN_AFTER_STACK_OVERFLOW -extern void caml_stack_overflow(caml_domain_state*); -#endif - -/* Address sanitizer is confused when running the stack overflow - handler in an alternate stack. We deactivate it for all the - functions used by the stack overflow handler. 
*/ -CAMLno_asan -DECLARE_SIGNAL_HANDLER(segv_handler) -{ - struct sigaction act; - char * fault_addr; - - /* Sanity checks: - - faulting address is word-aligned - - faulting address is on the stack, or within EXTRA_STACK of it - - we are in OCaml code */ - fault_addr = CONTEXT_FAULTING_ADDRESS; - if (((uintnat) fault_addr & (sizeof(intnat) - 1)) == 0 - && fault_addr < Caml_state->top_of_stack - && (uintnat)fault_addr >= CONTEXT_SP - EXTRA_STACK -#ifdef CONTEXT_PC - && caml_find_code_fragment_by_pc((char *) CONTEXT_PC) != NULL -#endif - ) { -#ifdef RETURN_AFTER_STACK_OVERFLOW - /* Tweak the PC part of the context so that on return from this - handler, we jump to the asm function [caml_stack_overflow] - (from $ARCH.S). */ -#ifdef CONTEXT_PC - CONTEXT_C_ARG_1 = (context_reg) Caml_state; - CONTEXT_PC = (context_reg) &caml_stack_overflow; -#else -#error "CONTEXT_PC must be defined if RETURN_AFTER_STACK_OVERFLOW is" -#endif -#else - /* Raise a Stack_overflow exception straight from this signal handler */ -#if defined(CONTEXT_YOUNG_PTR) - Caml_state->young_ptr = (value *) CONTEXT_YOUNG_PTR; -#endif -#if defined(CONTEXT_EXCEPTION_POINTER) - Caml_state->exception_pointer = (char *) CONTEXT_EXCEPTION_POINTER; -#endif - caml_raise_stack_overflow(); -#endif -#ifdef NAKED_POINTERS_CHECKER - } else if (Caml_state->checking_pointer_pc) { -#ifdef CONTEXT_PC - CONTEXT_PC = (context_reg)Caml_state->checking_pointer_pc; -#else -#error "CONTEXT_PC must be defined if RETURN_AFTER_STACK_OVERFLOW is" -#endif /* CONTEXT_PC */ -#endif /* NAKED_POINTERS_CHECKER */ - } else { - /* Otherwise, deactivate our exception handler and return, - causing fatal signal to be generated at point of error. 
*/ - act.sa_handler = SIG_DFL; - act.sa_flags = 0; - sigemptyset(&act.sa_mask); - sigaction(SIGSEGV, &act, NULL); - } -} - -#endif - -/* Initialization of signal stuff */ - -void caml_init_signals(void) -{ - /* Bound-check trap handling */ - -#if defined(TARGET_power) - { struct sigaction act; - sigemptyset(&act.sa_mask); - SET_SIGACT(act, trap_handler); -#if !defined(SYS_rhapsody) - act.sa_flags |= SA_NODEFER; -#endif - sigaction(SIGTRAP, &act, NULL); - } -#endif - -#if defined(TARGET_s390x) - { struct sigaction act; - sigemptyset(&act.sa_mask); - SET_SIGACT(act, trap_handler); - sigaction(SIGFPE, &act, NULL); - } -#endif - -#ifdef HAS_STACK_OVERFLOW_DETECTION - if (caml_setup_stack_overflow_detection() != -1) { - struct sigaction act; - SET_SIGACT(act, segv_handler); - act.sa_flags |= SA_ONSTACK | SA_NODEFER; - sigemptyset(&act.sa_mask); - sigaction(SIGSEGV, &act, NULL); - } -#endif -} - -/* Termination of signal stuff */ - -#if defined(TARGET_power) || defined(TARGET_s390x) \ - || defined(HAS_STACK_OVERFLOW_DETECTION) -static void set_signal_default(int signum) -{ - struct sigaction act; - sigemptyset(&act.sa_mask); - act.sa_handler = SIG_DFL; - act.sa_flags = 0; - sigaction(signum, &act, NULL); -} -#endif - -void caml_terminate_signals(void) -{ -#if defined(TARGET_power) - set_signal_default(SIGTRAP); -#endif - -#if defined(TARGET_s390x) - set_signal_default(SIGFPE); -#endif - -#ifdef HAS_STACK_OVERFLOW_DETECTION - set_signal_default(SIGSEGV); - caml_stop_stack_overflow_detection(); -#endif -} - -/* Allocate and select an alternate stack for handling signals, - especially SIGSEGV signals. - Each thread needs its own alternate stack. - The alternate stack used to be statically-allocated for the main thread, - but this is incompatible with Glibc 2.34 and newer, where SIGSTKSZ - may not be a compile-time constant (issue #10250). 
*/ - -CAMLexport int caml_setup_stack_overflow_detection(void) -{ -#ifdef HAS_STACK_OVERFLOW_DETECTION - stack_t stk; - stk.ss_sp = malloc(SIGSTKSZ); - if (stk.ss_sp == NULL) return -1; - stk.ss_size = SIGSTKSZ; - stk.ss_flags = 0; - return sigaltstack(&stk, NULL); -#else - return 0; -#endif -} - -CAMLexport int caml_stop_stack_overflow_detection(void) -{ -#ifdef HAS_STACK_OVERFLOW_DETECTION - stack_t oldstk, stk; - stk.ss_flags = SS_DISABLE; - if (sigaltstack(&stk, &oldstk) == -1) return -1; - /* If caml_setup_stack_overflow_detection failed, we are not using - an alternate signal stack. SS_DISABLE will be set in oldstk, - and there is nothing to free in this case. */ - if (! (oldstk.ss_flags & SS_DISABLE)) free(oldstk.ss_sp); - return 0; -#else - return 0; -#endif -} diff --git a/runtime/stacks.c b/runtime/stacks.c deleted file mode 100644 index a1409b2abd7f..000000000000 --- a/runtime/stacks.c +++ /dev/null @@ -1,114 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. 
*/ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -/* To initialize and resize the stacks */ - -#include -#include "caml/config.h" -#include "caml/fail.h" -#include "caml/misc.h" -#include "caml/mlvalues.h" -#include "caml/stacks.h" - -value caml_global_data = 0; - -uintnat caml_max_stack_size; /* also used in gc_ctrl.c */ - -void caml_init_stack (uintnat initial_max_size) -{ - Caml_state->stack_low = (value *) caml_stat_alloc(Stack_size); - Caml_state->stack_high = Caml_state->stack_low + Stack_size / sizeof (value); - Caml_state->stack_threshold = - Caml_state->stack_low + Stack_threshold / sizeof (value); - Caml_state->extern_sp = Caml_state->stack_high; - Caml_state->trapsp = Caml_state->stack_high; - Caml_state->trap_barrier = Caml_state->stack_high + 1; - caml_max_stack_size = initial_max_size; - caml_gc_message (0x08, "Initial stack limit: %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - caml_max_stack_size / 1024 * sizeof (value)); -} - -void caml_realloc_stack(asize_t required_space) -{ - asize_t size; - value * new_low, * new_high, * new_sp; - - CAMLassert(Caml_state->extern_sp >= Caml_state->stack_low); - size = Caml_state->stack_high - Caml_state->stack_low; - do { - if (size >= caml_max_stack_size) caml_raise_stack_overflow(); - size *= 2; - } while (size < Caml_state->stack_high - Caml_state->extern_sp - + required_space); - caml_gc_message (0x08, "Growing stack to %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - (uintnat) size * sizeof(value) / 1024); - new_low = (value *) caml_stat_alloc(size * sizeof(value)); - new_high = new_low + size; - -#define shift(ptr) \ - ((char *) new_high - ((char *) Caml_state->stack_high - (char *) (ptr))) - - new_sp = (value *) shift(Caml_state->extern_sp); - memmove((char *) new_sp, - (char *) Caml_state->extern_sp, - (Caml_state->stack_high - Caml_state->extern_sp) * sizeof(value)); - caml_stat_free(Caml_state->stack_low); - Caml_state->trapsp = (value *) 
shift(Caml_state->trapsp); - Caml_state->trap_barrier = (value *) shift(Caml_state->trap_barrier); - Caml_state->stack_low = new_low; - Caml_state->stack_high = new_high; - Caml_state->stack_threshold = - Caml_state->stack_low + Stack_threshold / sizeof (value); - Caml_state->extern_sp = new_sp; - -#undef shift -} - -CAMLprim value caml_ensure_stack_capacity(value required_space) -{ - asize_t req = Long_val(required_space); - if (Caml_state->extern_sp - req < Caml_state->stack_low) - caml_realloc_stack(req); - return Val_unit; -} - -void caml_change_max_stack_size (uintnat new_max_size) -{ - asize_t size = Caml_state->stack_high - Caml_state->extern_sp - + Stack_threshold / sizeof (value); - - if (new_max_size < size) new_max_size = size; - if (new_max_size != caml_max_stack_size){ - caml_gc_message (0x08, "Changing stack limit to %" - ARCH_INTNAT_PRINTF_FORMAT "uk bytes\n", - new_max_size * sizeof (value) / 1024); - } - caml_max_stack_size = new_max_size; -} - -CAMLexport uintnat (*caml_stack_usage_hook)(void) = NULL; - -uintnat caml_stack_usage(void) -{ - uintnat sz; - sz = Caml_state->stack_high - Caml_state->extern_sp; - if (caml_stack_usage_hook != NULL) - sz += (*caml_stack_usage_hook)(); - return sz; -} diff --git a/runtime/startup_aux.c b/runtime/startup_aux.c index 3353004e4350..771267e1213d 100644 --- a/runtime/startup_aux.c +++ b/runtime/startup_aux.c @@ -28,64 +28,48 @@ #endif #include "caml/osdeps.h" #include "caml/startup_aux.h" - +#include "caml/prims.h" #ifdef _WIN32 extern void caml_win32_unregister_overflow_detection (void); #endif -CAMLexport header_t *caml_atom_table = NULL; +/* Configuration parameters and flags */ + +static struct caml_params params; +const struct caml_params* const caml_params = ¶ms; -/* Initialize the atom table */ -void caml_init_atom_table(void) +static void init_startup_params(void) { - caml_stat_block b; - int i; - - /* PR#9128: We need to give the atom table its own page to make sure - it does not share a page with a 
non-value, which would break code - which depend on the correctness of the page table. For example, - if the atom table shares a page with bytecode, then functions in - the runtime may decide to follow a code pointer in a closure, as - if it were a pointer to a value. - - We add 1 padding at the end of the atom table because the atom - pointer actually points to the word *following* the corresponding - entry in the table (the entry is an empty block *header*). - */ - asize_t request = (256 + 1) * sizeof(header_t); - request = (request + Page_size - 1) / Page_size * Page_size; - caml_atom_table = - caml_stat_alloc_aligned_noexc(request, 0, &b); - - for(i = 0; i < 256; i++) { - caml_atom_table[i] = Make_header(0, i, Caml_black); - } - if (caml_page_table_add(In_static_data, - caml_atom_table, caml_atom_table + 256 + 1) != 0) { - caml_fatal_error("not enough memory for initial page table"); +#ifndef NATIVE_CODE + char_os * cds_file; +#endif + + params.init_percent_free = Percent_free_def; + params.init_max_percent_free = Max_percent_free_def; + params.init_minor_heap_wsz = Minor_heap_def; + params.init_heap_chunk_sz = Heap_chunk_def; + params.init_heap_wsz = Init_heap_def; + params.init_custom_major_ratio = Custom_major_ratio_def; + params.init_custom_minor_ratio = Custom_minor_ratio_def; + params.init_custom_minor_max_bsz = Custom_minor_max_bsz_def; + params.init_max_stack_wsz = Max_stack_def; + params.init_fiber_wsz = (Stack_threshold * 2) / sizeof(value); +#ifdef DEBUG + params.verb_gc = 0x3F; +#endif +#ifndef NATIVE_CODE + cds_file = caml_secure_getenv(T("CAML_DEBUG_FILE")); + if (cds_file != NULL) { + params.cds_file = caml_stat_strdup_os(cds_file); } +#endif + params.trace_level = 0; + params.cleanup_on_exit = 0; + params.print_magic = 0; + params.print_config = 0; } - -/* Parse the OCAMLRUNPARAM environment variable. 
*/ - -uintnat caml_init_percent_free = Percent_free_def; -uintnat caml_init_max_percent_free = Max_percent_free_def; -uintnat caml_init_minor_heap_wsz = Minor_heap_def; -uintnat caml_init_heap_chunk_sz = Heap_chunk_def; -uintnat caml_init_heap_wsz = Init_heap_def; -uintnat caml_init_max_stack_wsz = Max_stack_def; -uintnat caml_init_major_window = Major_window_def; -uintnat caml_init_custom_major_ratio = Custom_major_ratio_def; -uintnat caml_init_custom_minor_ratio = Custom_minor_ratio_def; -uintnat caml_init_custom_minor_max_bsz = Custom_minor_max_bsz_def; -uintnat caml_init_policy = Allocation_policy_def; -extern int caml_parser_trace; -uintnat caml_trace_level = 0; -int caml_cleanup_on_exit = 0; - - static void scanmult (char_os *opt, uintnat *var) { char_os mult = ' '; @@ -103,34 +87,39 @@ static void scanmult (char_os *opt, uintnat *var) void caml_parse_ocamlrunparam(void) { char_os *opt = caml_secure_getenv (T("OCAMLRUNPARAM")); - uintnat p; + + init_startup_params(); if (opt == NULL) opt = caml_secure_getenv (T("CAMLRUNPARAM")); if (opt != NULL){ while (*opt != '\0'){ switch (*opt++){ - case 'a': scanmult (opt, &caml_init_policy); break; - case 'b': scanmult (opt, &p); caml_record_backtraces(p); break; - case 'c': scanmult (opt, &p); caml_cleanup_on_exit = (p != 0); break; - case 'h': scanmult (opt, &caml_init_heap_wsz); break; - case 'H': scanmult (opt, &caml_use_huge_pages); break; - case 'i': scanmult (opt, &caml_init_heap_chunk_sz); break; - case 'l': scanmult (opt, &caml_init_max_stack_wsz); break; - case 'M': scanmult (opt, &caml_init_custom_major_ratio); break; - case 'm': scanmult (opt, &caml_init_custom_minor_ratio); break; - case 'n': scanmult (opt, &caml_init_custom_minor_max_bsz); break; - case 'o': scanmult (opt, &caml_init_percent_free); break; - case 'O': scanmult (opt, &caml_init_max_percent_free); break; - case 'p': scanmult (opt, &p); caml_parser_trace = (p != 0); break; + //case 'a': scanmult (opt, &p); caml_set_allocation_policy (p); 
break; + case 'b': scanmult (opt, ¶ms.backtrace_enabled); break; + case 'c': scanmult (opt, ¶ms.cleanup_on_exit); break; + case 'e': scanmult (opt, ¶ms.eventlog_enabled); break; + case 'f': scanmult (opt, ¶ms.init_fiber_wsz); break; + case 'h': scanmult (opt, ¶ms.init_heap_wsz); break; + //case 'H': scanmult (opt, &caml_use_huge_pages); break; + case 'i': scanmult (opt, ¶ms.init_heap_chunk_sz); break; + case 'l': scanmult (opt, ¶ms.init_max_stack_wsz); break; + case 'M': scanmult (opt, ¶ms.init_custom_major_ratio); break; + case 'm': scanmult (opt, ¶ms.init_custom_minor_ratio); break; + case 'n': scanmult (opt, ¶ms.init_custom_minor_max_bsz); break; + case 'o': scanmult (opt, ¶ms.init_percent_free); break; + case 'O': scanmult (opt, ¶ms.init_max_percent_free); break; + case 'p': scanmult (opt, ¶ms.parser_trace); break; case 'R': break; /* see stdlib/hashtbl.mli */ - case 's': scanmult (opt, &caml_init_minor_heap_wsz); break; - case 't': scanmult (opt, &caml_trace_level); break; - case 'v': scanmult (opt, &caml_verb_gc); break; - case 'w': scanmult (opt, &caml_init_major_window); break; + case 's': scanmult (opt, ¶ms.init_minor_heap_wsz); break; + case 't': scanmult (opt, ¶ms.trace_level); break; + case 'v': scanmult (opt, ¶ms.verb_gc); break; + case 'V': scanmult (opt, ¶ms.verify_heap); break; + //case 'w': scanmult (opt, &caml_init_major_window); break; case 'W': scanmult (opt, &caml_runtime_warnings); break; case ',': continue; } + --opt; /* to handle patterns like ",b=1" */ while (*opt != '\0'){ if (*opt++ == ',') break; } @@ -196,3 +185,15 @@ CAMLexport void caml_shutdown(void) shutdown_happened = 1; } + +void caml_init_exe_name(const char_os* exe_name) +{ + params.exe_name = exe_name; +} + +void caml_init_section_table(const char* section_table, + asize_t section_table_size) +{ + params.section_table = section_table; + params.section_table_size = section_table_size; +} diff --git a/runtime/startup_byt.c b/runtime/startup_byt.c index de7549748ac1..57466e7de1dc 
100644 --- a/runtime/startup_byt.c +++ b/runtime/startup_byt.c @@ -32,15 +32,16 @@ #include "caml/alloc.h" #include "caml/backtrace.h" #include "caml/callback.h" +#include "caml/codefrag.h" #include "caml/custom.h" #include "caml/debugger.h" -#include "caml/domain.h" +#include "caml/domain_state.h" #include "caml/dynlink.h" #include "caml/eventlog.h" #include "caml/exec.h" #include "caml/fail.h" +#include "caml/fiber.h" #include "caml/fix_code.h" -#include "caml/freelist.h" #include "caml/gc_ctrl.h" #include "caml/instrtrace.h" #include "caml/interp.h" @@ -55,7 +56,6 @@ #include "caml/printexc.h" #include "caml/reverse.h" #include "caml/signals.h" -#include "caml/stacks.h" #include "caml/sys.h" #include "caml/startup.h" #include "caml/startup_aux.h" @@ -72,8 +72,6 @@ #endif static char magicstr[EXEC_MAGIC_LENGTH+1]; -static int print_magic = 0; -static int print_config = 0; /* Print the specified error message followed by an end-of-line and exit */ static void error(char *msg, ...) @@ -105,7 +103,7 @@ static int read_trailer(int fd, struct exec_trailer *trail) memcpy(magicstr, trail->magic, EXEC_MAGIC_LENGTH); magicstr[EXEC_MAGIC_LENGTH] = 0; - if (print_magic) { + if (caml_params->print_magic) { printf("%s\n", magicstr); exit(0); } @@ -301,6 +299,9 @@ static void do_print_help(void) static int parse_command_line(char_os **argv) { int i, j, len, parsed; + /* cast to make caml_params mutable; this assumes we are only called + by one thread at startup */ + struct caml_params* params = (struct caml_params*)caml_params; for(i = 1; argv[i] != NULL && argv[i][0] == '-'; i++) { len = strlen_os(argv[i]); @@ -312,10 +313,10 @@ static int parse_command_line(char_os **argv) return i + 1; break; case 't': - ++ caml_trace_level; /* ignored unless DEBUG mode */ + params->trace_level += 1; /* ignored unless DEBUG mode */ break; case 'v': - caml_verb_gc = 0x001+0x004+0x008+0x010+0x020; + params->verb_gc = 0x001+0x004+0x008+0x010+0x020; break; case 'p': for (j = 0; 
caml_names_of_builtin_cprim[j] != NULL; j++) @@ -334,7 +335,7 @@ static int parse_command_line(char_os **argv) } break; case 'm': - print_magic = 1; + params->print_magic = 1; break; case 'M': printf("%s\n", EXEC_MAGIC); @@ -356,7 +357,7 @@ static int parse_command_line(char_os **argv) do_print_help(); exit(0); } else if (!strcmp_os(argv[i], T("-config"))) { - print_config = 1; + params->print_config = 1; } else { parsed = 0; } @@ -410,12 +411,7 @@ static void do_print_config(void) #else "false"); #endif - printf("no_naked_pointers: %s\n", -#ifdef NO_NAKED_POINTERS - "true"); -#else - "false"); -#endif + printf("no_naked_pointers: true\n"); printf("profinfo: %s\n" "profinfo_width: %d\n", #ifdef WITH_PROFINFO @@ -448,7 +444,7 @@ extern void caml_signal_thread(void * lpParam); #if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L /* PR 4887: avoid crash box of windows runtime on some system calls */ -extern void caml_install_invalid_parameter_handler(); +extern void caml_install_invalid_parameter_handler(void); #endif @@ -465,25 +461,25 @@ CAMLexport void caml_main(char_os **argv) char_os * exe_name, * proc_self_exe; /* Initialize the domain */ - caml_init_domain(); + CAML_INIT_DOMAIN_STATE; /* Determine options */ -#ifdef DEBUG - caml_verb_gc = 0x3F; -#endif caml_parse_ocamlrunparam(); CAML_EVENTLOG_INIT(); #ifdef DEBUG caml_gc_message (-1, "### OCaml runtime: debug mode ###\n"); #endif - if (!caml_startup_aux(/* pooling */ caml_cleanup_on_exit)) + if (!caml_startup_aux(/* pooling */ caml_params->cleanup_on_exit)) return; + caml_init_codefrag(); + caml_init_locale(); #if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L caml_install_invalid_parameter_handler(); #endif caml_init_custom_operations(); + caml_init_os_params(); caml_ext_table_init(&caml_shared_libs_path, 8); /* Determine position of bytecode file */ @@ -506,7 +502,7 @@ CAMLexport void caml_main(char_os **argv) if (fd < 0) { pos = parse_command_line(argv); - if (print_config) { + if 
(caml_params->print_config) { do_print_config(); exit(0); } @@ -538,14 +534,8 @@ CAMLexport void caml_main(char_os **argv) /* Read the table of contents (section descriptors) */ caml_read_section_descriptors(fd, &trail); /* Initialize the abstract machine */ - caml_init_gc (caml_init_minor_heap_wsz, caml_init_heap_wsz, - caml_init_heap_chunk_sz, caml_init_percent_free, - caml_init_max_percent_free, caml_init_major_window, - caml_init_custom_major_ratio, caml_init_custom_minor_ratio, - caml_init_custom_minor_max_bsz, caml_init_policy); - caml_init_stack (caml_init_max_stack_wsz); - caml_init_atom_table(); - caml_init_backtrace(); + caml_init_gc (); + Caml_state->external_raise = NULL; /* Initialize the interpreter */ caml_interprete(NULL, 0); /* Initialize the debugger, if needed */ @@ -566,18 +556,17 @@ CAMLexport void caml_main(char_os **argv) /* Load the globals */ caml_seek_section(fd, &trail, "DATA"); chan = caml_open_descriptor_in(fd); - Lock(chan); - caml_global_data = caml_input_val(chan); - Unlock(chan); + /* TODO: do we need multicore Lock here */ + caml_modify_generational_global_root(&caml_global_data, caml_input_val(chan)); + /* TODO: do we need multicore Unlock here */ caml_close_channel(chan); /* this also closes fd */ caml_stat_free(trail.section); - /* Ensure that the globals are in the major heap. 
*/ - caml_oldify_one (caml_global_data, &caml_global_data); - caml_oldify_mopup (); /* Initialize system libraries */ caml_sys_init(exe_name, argv + pos); /* Load debugging info, if b>=2 */ caml_load_main_debug_info(); + /* ensure all globals are in major heap */ + caml_minor_collection(); #ifdef _WIN32 /* Start a thread to handle signals */ if (caml_secure_getenv(T("CAMLSIGPIPE"))) @@ -587,13 +576,13 @@ CAMLexport void caml_main(char_os **argv) caml_debugger(PROGRAM_START, Val_unit); res = caml_interprete(caml_start_code, caml_code_size); if (Is_exception_result(res)) { - Caml_state->exn_bucket = Extract_exception(res); + value exn = Extract_exception(res); if (caml_debugger_in_use) { - Caml_state->extern_sp = &Caml_state->exn_bucket; /* The debugger needs the - exception value.*/ + Caml_state->current_stack->sp = &exn; /* The debugger needs the + exception value.*/ caml_debugger(UNCAUGHT_EXC, Val_unit); } - caml_fatal_uncaught_exception(Caml_state->exn_bucket); + caml_fatal_uncaught_exception(exn); } } @@ -606,45 +595,40 @@ CAMLexport value caml_startup_code_exn( int pooling, char_os **argv) { - char_os * cds_file; char_os * exe_name; /* Initialize the domain */ - caml_init_domain(); + CAML_INIT_DOMAIN_STATE; + /* Determine options */ -#ifdef DEBUG - caml_verb_gc = 0x3F; -#endif caml_parse_ocamlrunparam(); CAML_EVENTLOG_INIT(); #ifdef DEBUG caml_gc_message (-1, "### OCaml runtime: debug mode ###\n"); #endif - if (caml_cleanup_on_exit) + if (caml_params->cleanup_on_exit) pooling = 1; if (!caml_startup_aux(pooling)) return Val_unit; + caml_init_codefrag(); + caml_init_locale(); #if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L caml_install_invalid_parameter_handler(); #endif caml_init_custom_operations(); - cds_file = caml_secure_getenv(T("CAML_DEBUG_FILE")); - if (cds_file != NULL) { - caml_cds_file = caml_stat_strdup_os(cds_file); - } + caml_init_os_params(); + + /* Initialize the abstract machine */ + caml_init_gc (); exe_name = caml_executable_name(); if 
(exe_name == NULL) exe_name = caml_search_exe_in_path(argv[0]); - /* Initialize the abstract machine */ - caml_init_gc (caml_init_minor_heap_wsz, caml_init_heap_wsz, - caml_init_heap_chunk_sz, caml_init_percent_free, - caml_init_max_percent_free, caml_init_major_window, - caml_init_custom_major_ratio, caml_init_custom_minor_ratio, - caml_init_custom_minor_max_bsz, caml_init_policy); - caml_init_stack (caml_init_max_stack_wsz); - caml_init_atom_table(); - caml_init_backtrace(); + Caml_state->external_raise = NULL; + caml_sys_init(exe_name, argv); + /* Load debugging info, if b>=2 */ + caml_load_main_debug_info(); + Caml_state->external_raise = NULL; /* Initialize the interpreter */ caml_interprete(NULL, 0); /* Initialize the debugger, if needed */ @@ -660,13 +644,11 @@ CAMLexport value caml_startup_code_exn( /* Use the builtin table of primitives */ caml_build_primitive_table_builtin(); /* Load the globals */ - caml_global_data = caml_input_value_from_block(data, data_size); - /* Ensure that the globals are in the major heap. 
*/ - caml_oldify_one (caml_global_data, &caml_global_data); - caml_oldify_mopup (); + caml_modify_generational_global_root + (&caml_global_data, caml_input_value_from_block(data, data_size)); + caml_minor_collection(); /* ensure all globals are in major heap */ /* Record the sections (for caml_get_section_table in meta.c) */ - caml_section_table = section_table; - caml_section_table_size = section_table_size; + caml_init_section_table(section_table, section_table_size); /* Initialize system libraries */ caml_sys_init(exe_name, argv); /* Load debugging info, if b>=2 */ @@ -689,12 +671,12 @@ CAMLexport void caml_startup_code( section_table, section_table_size, pooling, argv); if (Is_exception_result(res)) { - Caml_state->exn_bucket = Extract_exception(res); + value exn = Extract_exception(res); if (caml_debugger_in_use) { - Caml_state->extern_sp = &Caml_state->exn_bucket; /* The debugger needs the - exception value.*/ + Caml_state->current_stack->sp = &exn; /* The debugger needs the + exception value.*/ caml_debugger(UNCAUGHT_EXC, Val_unit); } - caml_fatal_uncaught_exception(Caml_state->exn_bucket); + caml_fatal_uncaught_exception(exn); } } diff --git a/runtime/startup_nat.c b/runtime/startup_nat.c index b75732596af9..5743aebf3f80 100644 --- a/runtime/startup_nat.c +++ b/runtime/startup_nat.c @@ -20,14 +20,12 @@ #include #include #include "caml/callback.h" -#include "caml/backtrace.h" #include "caml/custom.h" #include "caml/codefrag.h" #include "caml/debugger.h" -#include "caml/domain.h" #include "caml/eventlog.h" +#include "caml/fiber.h" #include "caml/fail.h" -#include "caml/freelist.h" #include "caml/gc.h" #include "caml/gc_ctrl.h" #include "caml/intext.h" @@ -47,28 +45,16 @@ extern int caml_parser_trace; extern char caml_system__code_begin, caml_system__code_end; -/* Initialize the atom table and the static data and code area limits. */ +/* Initialize the static data and code area limits. 
*/ struct segment { char * begin; char * end; }; -static void init_static(void) +static void init_segments(void) { - extern struct segment caml_data_segments[], caml_code_segments[]; - + extern struct segment caml_code_segments[]; char * caml_code_area_start, * caml_code_area_end; int i; - caml_init_atom_table (); - - for (i = 0; caml_data_segments[i].begin != 0; i++) { - /* PR#5509: we must include the zero word at end of data segment, - because pointers equal to caml_data_segments[i].end are static data. */ - if (caml_page_table_add(In_static_data, - caml_data_segments[i].begin, - caml_data_segments[i].end + sizeof(value)) != 0) - caml_fatal_error("not enough memory for initial page table"); - } - caml_code_area_start = caml_code_segments[0].begin; caml_code_area_end = caml_code_segments[0].end; for (i = 1; caml_code_segments[i].begin != 0; i++) { @@ -99,50 +85,40 @@ extern void caml_win32_overflow_detection (void); #if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L /* PR 4887: avoid crash box of windows runtime on some system calls */ -extern void caml_install_invalid_parameter_handler(); +extern void caml_install_invalid_parameter_handler(void); #endif value caml_startup_common(char_os **argv, int pooling) { char_os * exe_name, * proc_self_exe; - value res; - char tos; /* Initialize the domain */ - caml_init_domain(); + CAML_INIT_DOMAIN_STATE; + /* Determine options */ -#ifdef DEBUG - caml_verb_gc = 0x3F; -#endif caml_parse_ocamlrunparam(); - CAML_EVENTLOG_INIT(); #ifdef DEBUG caml_gc_message (-1, "### OCaml runtime: debug mode ###\n"); #endif - if (caml_cleanup_on_exit) + if (caml_params->cleanup_on_exit) pooling = 1; if (!caml_startup_aux(pooling)) return Val_unit; - caml_init_frame_descriptors(); + caml_init_codefrag(); caml_init_locale(); #if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L caml_install_invalid_parameter_handler(); #endif caml_init_custom_operations(); - Caml_state->top_of_stack = &tos; - caml_init_gc (caml_init_minor_heap_wsz, 
caml_init_heap_wsz, - caml_init_heap_chunk_sz, caml_init_percent_free, - caml_init_max_percent_free, caml_init_major_window, - caml_init_custom_major_ratio, caml_init_custom_minor_ratio, - caml_init_custom_minor_max_bsz, caml_init_policy); - init_static(); - caml_init_signals(); + caml_init_os_params(); + caml_init_gc (); + + init_segments(); #ifdef _WIN32 caml_win32_overflow_detection(); #endif - caml_init_backtrace(); caml_debugger_init (); /* force debugger.o stub to be linked */ exe_name = argv[0]; if (exe_name == NULL) exe_name = T(""); @@ -153,13 +129,12 @@ value caml_startup_common(char_os **argv, int pooling) exe_name = caml_search_exe_in_path(exe_name); caml_sys_init(exe_name, argv); if (sigsetjmp(caml_termination_jmpbuf.buf, 0)) { - caml_terminate_signals(); if (caml_termination_hook != NULL) caml_termination_hook(NULL); return Val_unit; } - res = caml_start_program(Caml_state); - caml_terminate_signals(); - return res; + + caml_maybe_expand_stack(); + return caml_start_program(Caml_state); } value caml_startup_exn(char_os **argv) diff --git a/runtime/sync.c b/runtime/sync.c new file mode 100644 index 000000000000..d9e54999dffb --- /dev/null +++ b/runtime/sync.c @@ -0,0 +1,235 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy and Damien Doligez, INRIA Rocquencourt */ +/* */ +/* Copyright 1996 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. 
*/ +/* */ +/**************************************************************************/ + +#define CAML_INTERNALS + +#include +#include +#include + +#include "caml/alloc.h" +#include "caml/custom.h" +#include "caml/domain_state.h" +#include "caml/fail.h" +#include "caml/memory.h" + +#include "caml/sync.h" +#include "caml/eventlog.h" + +/* Mutex operations */ + +static int sync_mutex_create(sync_mutex * res) +{ + int rc; + pthread_mutexattr_t attr; + sync_mutex m; + + rc = pthread_mutexattr_init(&attr); + if (rc != 0) goto error1; + rc = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); + if (rc != 0) goto error2; + m = caml_stat_alloc_noexc(sizeof(pthread_mutex_t)); + if (m == NULL) { rc = ENOMEM; goto error2; } + rc = pthread_mutex_init(m, &attr); + if (rc != 0) goto error3; + pthread_mutexattr_destroy(&attr); + *res = m; + return 0; +error3: + caml_stat_free(m); +error2: + pthread_mutexattr_destroy(&attr); +error1: + return rc; +} + +static int sync_mutex_destroy(sync_mutex m) +{ + int rc; + rc = pthread_mutex_destroy(m); + caml_stat_free(m); + return rc; +} + +static void caml_mutex_finalize(value wrapper) +{ + sync_mutex_destroy(Mutex_val(wrapper)); +} + +static int caml_mutex_compare(value wrapper1, value wrapper2) +{ + sync_mutex mut1 = Mutex_val(wrapper1); + sync_mutex mut2 = Mutex_val(wrapper2); + return mut1 == mut2 ? 0 : mut1 < mut2 ? 
-1 : 1; +} + +static intnat caml_mutex_hash(value wrapper) +{ + return (intnat) (Mutex_val(wrapper)); +} + +static const struct custom_operations caml_mutex_ops = { + "_mutex", + caml_mutex_finalize, + caml_mutex_compare, + caml_mutex_hash, + custom_serialize_default, + custom_deserialize_default, + custom_compare_ext_default, + custom_fixed_length_default +}; + +CAMLprim value caml_ml_mutex_new(value unit) /* ML */ +{ + sync_mutex mut = NULL; + value wrapper; + + sync_check_error(sync_mutex_create(&mut), "Mutex.create"); + wrapper = caml_alloc_custom(&caml_mutex_ops, sizeof(pthread_mutex_t *), + 0, 1); + Mutex_val(wrapper) = mut; + return wrapper; +} + +CAMLprim value caml_ml_mutex_lock(value wrapper) /* ML */ +{ + sync_retcode retcode; + sync_mutex mut = Mutex_val(wrapper); + + /* PR#4351: first try to acquire mutex without releasing the master lock */ + if (sync_mutex_trylock(mut) == MUTEX_PREVIOUSLY_UNLOCKED) return Val_unit; + /* If unsuccessful, block on mutex */ + Begin_root(wrapper) + caml_enter_blocking_section(); + retcode = sync_mutex_lock(mut); + caml_leave_blocking_section(); + End_roots(); + sync_check_error(retcode, "Mutex.lock"); + return Val_unit; +} + +CAMLprim value caml_ml_mutex_unlock(value wrapper) /* ML */ +{ + sync_retcode retcode; + sync_mutex mut = Mutex_val(wrapper); + /* PR#4351: no need to release and reacquire master lock */ + retcode = sync_mutex_unlock(mut); + sync_check_error(retcode, "Mutex.unlock"); + return Val_unit; +} + +CAMLprim value caml_ml_mutex_try_lock(value wrapper) /* ML */ +{ + sync_mutex mut = Mutex_val(wrapper); + sync_retcode retcode; + retcode = sync_mutex_trylock(mut); + if (retcode == MUTEX_ALREADY_LOCKED) return Val_false; + sync_check_error(retcode, "Mutex.try_lock"); + return Val_true; +} + + +/* Conditions operations */ + +static int sync_condvar_create(sync_condvar * res) +{ + int rc; + sync_condvar c = caml_stat_alloc_noexc(sizeof(pthread_cond_t)); + if (c == NULL) return ENOMEM; + rc = pthread_cond_init(c, 
NULL); + if (rc != 0) { caml_stat_free(c); return rc; } + *res = c; + return 0; +} + +static int sync_condvar_destroy(sync_condvar c) +{ + int rc; + rc = pthread_cond_destroy(c); + caml_stat_free(c); + return rc; +} + +static void caml_condition_finalize(value wrapper) +{ + sync_condvar_destroy(Condition_val(wrapper)); +} + +static int caml_condition_compare(value wrapper1, value wrapper2) +{ + sync_condvar cond1 = Condition_val(wrapper1); + sync_condvar cond2 = Condition_val(wrapper2); + return cond1 == cond2 ? 0 : cond1 < cond2 ? -1 : 1; +} + +static intnat caml_condition_hash(value wrapper) +{ + return (intnat) (Condition_val(wrapper)); +} + +static struct custom_operations caml_condition_ops = { + "_condition", + caml_condition_finalize, + caml_condition_compare, + caml_condition_hash, + custom_serialize_default, + custom_deserialize_default, + custom_compare_ext_default, + custom_fixed_length_default +}; + +CAMLprim value caml_ml_condition_new(value unit) /* ML */ +{ + value wrapper; + sync_condvar cond = NULL; + + sync_check_error(sync_condvar_create(&cond), "Condition.create"); + wrapper = caml_alloc_custom(&caml_condition_ops, sizeof(sync_condvar *), + 0, 1); + Condition_val(wrapper) = cond; + return wrapper; +} + +CAMLprim value caml_ml_condition_wait(value wcond, value wmut) /* ML */ +{ + sync_condvar cond = Condition_val(wcond); + sync_mutex mut = Mutex_val(wmut); + sync_retcode retcode; + + CAML_EV_BEGIN(EV_DOMAIN_CONDITION_WAIT); + Begin_roots2(wcond, wmut) + caml_enter_blocking_section(); + retcode = sync_condvar_wait(cond, mut); + caml_leave_blocking_section(); + End_roots(); + sync_check_error(retcode, "Condition.wait"); + CAML_EV_END(EV_DOMAIN_CONDITION_WAIT); + + return Val_unit; +} + +CAMLprim value caml_ml_condition_signal(value wrapper) /* ML */ +{ + sync_check_error(sync_condvar_signal(Condition_val(wrapper)), + "Condition.signal"); + return Val_unit; +} + +CAMLprim value caml_ml_condition_broadcast(value wrapper) /* ML */ +{ + 
sync_check_error(sync_condvar_broadcast(Condition_val(wrapper)), + "Condition.broadcast"); + return Val_unit; +} diff --git a/runtime/sys.c b/runtime/sys.c index a77fd9d1cfb4..9967e77782c5 100644 --- a/runtime/sys.c +++ b/runtime/sys.c @@ -51,15 +51,16 @@ #include "caml/gc_ctrl.h" #include "caml/major_gc.h" #include "caml/io.h" -#include "caml/misc.h" #include "caml/mlvalues.h" #include "caml/osdeps.h" #include "caml/signals.h" -#include "caml/stacks.h" +#include "caml/fiber.h" #include "caml/sys.h" -#include "caml/version.h" +#include "caml/startup.h" #include "caml/callback.h" #include "caml/startup_aux.h" +#include "caml/major_gc.h" +#include "caml/shared_heap.h" static char * error_message(void) { @@ -86,9 +87,9 @@ CAMLexport void caml_sys_error(value arg) mlsize_t err_len = strlen(err); mlsize_t arg_len = caml_string_length(arg); str = caml_alloc_string(arg_len + 2 + err_len); - memmove(&Byte(str, 0), String_val(arg), arg_len); - memmove(&Byte(str, arg_len), ": ", 2); - memmove(&Byte(str, arg_len + 2), err, err_len); + memcpy(&Byte(str, 0), String_val(arg), arg_len); + memcpy(&Byte(str, arg_len), ": ", 2); + memcpy(&Byte(str, arg_len + 2), err, err_len); } caml_raise_sys_error(str); CAMLnoreturn; @@ -115,57 +116,63 @@ static void caml_sys_check_path(value name) CAMLexport void caml_do_exit(int retcode) { - if ((caml_verb_gc & 0x400) != 0) { - /* cf caml_gc_counters */ - double minwords = Caml_state->stat_minor_words - + (double) (Caml_state->young_end - Caml_state->young_ptr); - double prowords = Caml_state->stat_promoted_words; - double majwords = - Caml_state->stat_major_words + (double) caml_allocated_words; - double allocated_words = minwords + majwords - prowords; - intnat mincoll = Caml_state->stat_minor_collections; - intnat majcoll = Caml_state->stat_major_collections; - intnat heap_words = Caml_state->stat_heap_wsz; - intnat heap_chunks = Caml_state->stat_heap_chunks; - intnat top_heap_words = Caml_state->stat_top_heap_wsz; - intnat cpct = 
Caml_state->stat_compactions; - intnat forcmajcoll = Caml_state->stat_forced_major_collections; - caml_gc_message(0x400, "allocated_words: %.0f\n", allocated_words); - caml_gc_message(0x400, "minor_words: %.0f\n", minwords); - caml_gc_message(0x400, "promoted_words: %.0f\n", prowords); - caml_gc_message(0x400, "major_words: %.0f\n", majwords); - caml_gc_message(0x400, "minor_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - mincoll); - caml_gc_message(0x400, "major_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - majcoll); - caml_gc_message(0x400, "heap_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - heap_words); - caml_gc_message(0x400, "heap_chunks: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - heap_chunks); - caml_gc_message(0x400, "top_heap_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - top_heap_words); - caml_gc_message(0x400, "compactions: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - cpct); - caml_gc_message(0x400, - "forced_major_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", - forcmajcoll); + caml_domain_state* domain_state = Caml_state; + struct gc_stats s; + + if ((caml_params->verb_gc & 0x400) != 0) { + caml_sample_gc_stats(&s); + { + /* cf caml_gc_counters */ + double minwords = s.minor_words + + (double) (domain_state->young_end - domain_state->young_ptr); + double majwords = s.major_words + (double) domain_state->allocated_words; + double allocated_words = minwords + majwords - s.promoted_words; + intnat heap_words = + s.major_heap.pool_words + s.major_heap.large_words; + intnat top_heap_words = + s.major_heap.pool_max_words + s.major_heap.large_max_words; + + if (heap_words == 0) { + heap_words = Wsize_bsize(caml_heap_size(Caml_state->shared_heap)); + } + + if (top_heap_words == 0) { + top_heap_words = caml_top_heap_words(Caml_state->shared_heap); + } + + caml_gc_message(0x400, "allocated_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat)allocated_words); + caml_gc_message(0x400, "minor_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat) minwords); + caml_gc_message(0x400, 
"promoted_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat) s.promoted_words); + caml_gc_message(0x400, "major_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat) majwords); + caml_gc_message(0x400, + "minor_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat) s.minor_collections); + caml_gc_message(0x400, + "major_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + domain_state->stat_major_collections); + caml_gc_message(0x400, + "forced_major_collections: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + (intnat)s.forced_major_collections); + caml_gc_message(0x400, "heap_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + heap_words); + caml_gc_message(0x400, "top_heap_words: %"ARCH_INTNAT_PRINTF_FORMAT"d\n", + top_heap_words); + caml_gc_message(0x400, "mean_space_overhead: %lf\n", + caml_mean_space_overhead()); + } } #ifndef NATIVE_CODE caml_debugger(PROGRAM_EXIT, Val_unit); #endif - if (caml_cleanup_on_exit) + if (caml_params->cleanup_on_exit) caml_shutdown(); #ifdef _WIN32 caml_restore_win32_terminal(); -#endif - caml_terminate_signals(); -#ifdef NAKED_POINTERS_CHECKER - if (retcode == 0 && caml_naked_pointers_detected) { - fprintf (stderr, "\nOut-of-heap pointers were detected by the runtime.\n" - "The process would otherwise have terminated normally.\n"); - retcode = 70; /* EX_SOFTWARE; see sysexits.h */ - } #endif exit(retcode); } @@ -189,7 +196,7 @@ CAMLprim value caml_sys_exit(value retcode) #endif #endif -static int sys_open_flags[] = { +const static int sys_open_flags[] = { O_RDONLY, O_WRONLY, O_APPEND | O_WRONLY, O_CREAT, O_TRUNC, O_EXCL, O_BINARY, O_TEXT, O_NONBLOCK }; @@ -416,14 +423,13 @@ CAMLprim value caml_sys_getenv(value var) return val; } -char_os * caml_exe_name; static value main_argv; CAMLprim value caml_sys_get_argv(value unit) { CAMLparam0 (); /* unit is unused */ CAMLlocal2 (exe_name, res); - exe_name = caml_copy_string_of_os(caml_exe_name); + exe_name = caml_copy_string_of_os(caml_params->exe_name); res = caml_alloc_small(2, 0); Field(res, 0) = exe_name; 
Field(res, 1) = main_argv; @@ -443,7 +449,7 @@ CAMLprim value caml_sys_modify_argv(value new_argv) CAMLprim value caml_sys_executable_name(value unit) { - return caml_copy_string_of_os(caml_exe_name); + return caml_copy_string_of_os(caml_params->exe_name); } void caml_sys_init(char_os * exe_name, char_os **argv) @@ -456,7 +462,7 @@ void caml_sys_init(char_os * exe_name, char_os **argv) caml_setup_win32_terminal(); #endif #endif - caml_exe_name = exe_name; + caml_init_exe_name(exe_name); main_argv = caml_alloc_array((void *)caml_copy_string_of_os, (char const **) argv); caml_register_generational_global_root(&main_argv); @@ -719,12 +725,3 @@ CAMLprim value caml_sys_isatty(value chan) return ret; } - -CAMLprim value caml_sys_const_naked_pointers_checked(value unit) -{ -#ifdef NAKED_POINTERS_CHECKER - return Val_true; -#else - return Val_false; -#endif -} diff --git a/runtime/unix.c b/runtime/unix.c index 21715a761ee8..fa4297936ebb 100644 --- a/runtime/unix.c +++ b/runtime/unix.c @@ -24,10 +24,10 @@ #include #include #include -#include -#include #include #include +#include +#include #include #include "caml/config.h" #if defined(SUPPORT_DYNAMIC_LINKING) && !defined(BUILDING_LIBCAMLRUNS) @@ -49,6 +49,7 @@ #ifdef __APPLE__ #include #endif +#include #include "caml/fail.h" #include "caml/memory.h" #include "caml/misc.h" @@ -57,6 +58,7 @@ #include "caml/sys.h" #include "caml/io.h" #include "caml/alloc.h" +#include "caml/platform.h" #ifndef S_ISREG #define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) @@ -434,3 +436,29 @@ int caml_num_rows_fd(int fd) return -1; #endif } + +int caml_thread_setname(const char* name) +{ +#ifdef __APPLE__ + pthread_setname_np(name); + return 0; +#else +#ifdef _GNU_SOURCE + int ret; + pthread_t self = pthread_self(); + + ret = pthread_setname_np(self, name); + if (ret == ERANGE) + return -1; + return 0; +#else /* not glibc, not apple */ + return 0; +#endif +#endif +} + +void caml_init_os_params(void) +{ + caml_sys_pagesize = sysconf(_SC_PAGESIZE); 
+ return; +} diff --git a/runtime/weak.c b/runtime/weak.c index dc0d061270a4..46af88f9361e 100644 --- a/runtime/weak.c +++ b/runtime/weak.c @@ -20,116 +20,50 @@ #include #include "caml/alloc.h" +#include "caml/domain.h" #include "caml/fail.h" #include "caml/major_gc.h" #include "caml/memory.h" #include "caml/mlvalues.h" +#include "caml/shared_heap.h" #include "caml/weak.h" -#include "caml/minor_gc.h" -#include "caml/signals.h" -#include "caml/eventlog.h" - -value caml_ephe_list_head = 0; - -static value ephe_dummy = 0; -value caml_ephe_none = (value) &ephe_dummy; - -#define CAMLassert_valid_ephemeron(eph) do{ \ - CAMLassert (Is_in_heap (eph)); \ - CAMLassert (Tag_val(eph) == Abstract_tag); \ - CAMLassert (CAML_EPHE_FIRST_KEY <= Wosize_val (eph)); \ -}while(0) - -#define CAMLassert_valid_offset(eph, offset) do{ \ - CAMLassert_valid_ephemeron(eph); \ - CAMLassert (0 <= offset); \ - CAMLassert (offset < Wosize_val (eph) - CAML_EPHE_FIRST_KEY); \ -}while(0) - -#ifdef DEBUG -#define CAMLassert_not_dead_value(v) do{ \ - value __v = v; \ - if (caml_gc_phase == Phase_clean \ - && Is_block(__v) \ - && Is_in_heap (__v)) { \ - if (Tag_val (__v) == Infix_tag) __v -= Infix_offset_val (__v); \ - CAMLassert ( !Is_white_val(__v) ); \ - } \ -}while(0) -#else -#define CAMLassert_not_dead_value(v) -#endif - -CAMLexport mlsize_t caml_ephemeron_num_keys(value eph) -{ - CAMLassert_valid_ephemeron(eph); - return Wosize_val (eph) - CAML_EPHE_FIRST_KEY; -} - -/* The minor heap is considered alive. Outside minor and major heap it is - considered alive (out of reach of the GC). 
*/ -Caml_inline int Test_if_its_white(value x){ - CAMLassert (x != caml_ephe_none); -#ifdef NO_NAKED_POINTERS - if (!Is_block(x) || Is_young (x)) return 0; -#else - if (!Is_block(x) || !Is_in_heap(x)) return 0; -#endif - if (Tag_val(x) == Infix_tag) x -= Infix_offset_val(x); - return Is_white_val(x); -} -/* If it is not white during clean phase it is dead, i.e it will be swept */ -Caml_inline int Is_Dead_during_clean(value x) -{ - CAMLassert (caml_gc_phase == Phase_clean); - return Test_if_its_white(x); -} +value caml_dummy[] = + {(value)Make_header(0,Abstract_tag, NOT_MARKABLE), + Val_unit}; +value caml_ephe_none = (value)&caml_dummy[1]; -/** caml_ephe_none is considered as not white */ -Caml_inline int Is_White_During_Mark(value x) -{ - CAMLassert (caml_gc_phase == Phase_mark); - if (x == caml_ephe_none ) return 0; - return Test_if_its_white(x); -} +#define None_val (Val_int(0)) +#define Some_tag 0 -/** The minor heap doesn't have to be marked, outside they should - already be black. Remains the value in the heap to mark. 
-*/ -Caml_inline int Must_be_Marked_during_mark(value x) +struct caml_ephe_info* caml_alloc_ephe_info (void) { - CAMLassert (x != caml_ephe_none); - CAMLassert (caml_gc_phase == Phase_mark); -#ifdef NO_NAKED_POINTERS - return Is_block (x) && !Is_young (x); -#else - return Is_block (x) && Is_in_heap (x); -#endif + struct caml_ephe_info* e = + caml_stat_alloc_noexc (sizeof(struct caml_ephe_info)); + if(e != NULL) + memset (e, 0, sizeof(struct caml_ephe_info)); + return e; } -/* [len] is a number of words (fields) */ -CAMLexport value caml_ephemeron_create (mlsize_t len) +/* [len] is a value that represents a number of words (fields) */ +CAMLprim value caml_ephe_create (value len) { mlsize_t size, i; value res; + caml_domain_state* domain_state = Caml_state; - CAMLassert(len <= CAML_EPHE_MAX_WOSIZE); - size = len + CAML_EPHE_FIRST_KEY; + size = Long_val (len) + + 1 /* weak_list */ + + 1 /* the value */; if (size < CAML_EPHE_FIRST_KEY || size > Max_wosize) caml_invalid_argument ("Weak.create"); res = caml_alloc_shr (size, Abstract_tag); - for (i = 1; i < size; i++) Field (res, i) = caml_ephe_none; - Field (res, CAML_EPHE_LINK_OFFSET) = caml_ephe_list_head; - caml_ephe_list_head = res; - return res; -} -CAMLprim value caml_ephe_create (value len) -{ - value res = caml_ephemeron_create(Long_val(len)); - // run memprof callbacks - return caml_process_pending_actions_with_root(res); + Ephe_link(res) = domain_state->ephe_info->live; + domain_state->ephe_info->live = res; + for (i = CAML_EPHE_DATA_OFFSET; i < size; i++) + Field(res, i) = caml_ephe_none; + return res; } CAMLprim value caml_weak_create (value len) @@ -167,203 +101,167 @@ CAMLprim value caml_weak_create (value len) */ -#define None_val (Val_int(0)) -#define Some_tag 0 - -/* If we are in Phase_clean we need to check if the key +/* If we are in Phase_sweep_ephe we need to check if the key that is going to disappear is dead and so should trigger a cleaning */ -static void do_check_key_clean(value ar, mlsize_t 
offset) +static void do_check_key_clean(value e, mlsize_t offset) { value elt; CAMLassert (offset >= CAML_EPHE_FIRST_KEY); - CAMLassert (caml_gc_phase == Phase_clean); - elt = Field (ar, offset); - if (elt != caml_ephe_none && Is_Dead_during_clean(elt)){ - Field(ar, offset) = caml_ephe_none; - Field(ar, CAML_EPHE_DATA_OFFSET) = caml_ephe_none; - }; -} -/* If we are in Phase_clean we need to do as if the key is empty when - it will be cleaned during this phase */ -Caml_inline int is_ephe_key_none(value ar, mlsize_t offset) -{ - value elt = Field (ar, offset); - if (elt == caml_ephe_none){ - return 1; - }else if (caml_gc_phase == Phase_clean && Is_Dead_during_clean(elt)){ - Field(ar, offset) = caml_ephe_none; - Field(ar, CAML_EPHE_DATA_OFFSET) = caml_ephe_none; - return 1; - } else { - return 0; + if (caml_gc_phase != Phase_sweep_ephe) return; + + elt = Field(e, offset); + if (elt != caml_ephe_none && Is_block (elt) && !Is_young (elt)) { + if (Tag_val(elt) == Infix_tag) elt -= Infix_offset_val(elt); + if (is_unmarked(elt)) { + Field(e, offset) = caml_ephe_none; + Field(e,CAML_EPHE_DATA_OFFSET) = caml_ephe_none; + } } } -static void do_set (value ar, mlsize_t offset, value v) -{ - if (Is_block (v) && Is_young (v)){ - /* modified version of caml_modify */ - value old = Field (ar, offset); - Field (ar, offset) = v; - if (!(Is_block (old) && Is_young (old))){ - add_to_ephe_ref_table (Caml_state->ephe_ref_table, ar, offset); +void caml_ephe_clean (value v) { + value child; + int release_data = 0; + mlsize_t size, i; + header_t hd; + + if (caml_gc_phase != Phase_sweep_ephe) return; + + hd = Hd_val(v); + size = Wosize_hd (hd); + for (i = CAML_EPHE_FIRST_KEY; i < size; i++) { + child = Field(v, i); + ephemeron_again: + if (child != caml_ephe_none && Is_block(child)) { + if (Tag_val (child) == Forward_tag) { + value f = Forward_val (child); + if (Is_block(f)) { + if (Tag_val(f) == Forward_tag || Tag_val(f) == Lazy_tag || + Tag_val(f) == Forcing_tag || Tag_val(f) == 
Double_tag) { + /* Do not short-circuit the pointer */ + } else { + Field(v, i) = child = f; + if (Is_block (f) && Is_young (f)) + add_to_ephe_ref_table(&Caml_state->minor_tables->ephe_ref, v, i); + goto ephemeron_again; + } + } + } + if (Tag_val (child) == Infix_tag) child -= Infix_offset_val (child); + if (!Is_young (child) && is_unmarked(child)) { + release_data = 1; + Field(v, i) = caml_ephe_none; + } } - }else{ - Field (ar, offset) = v; } -} -CAMLexport void caml_ephemeron_set_key(value ar, mlsize_t offset, value k) -{ - CAMLassert_valid_offset(ar, offset); - CAMLassert (Is_in_heap (ar)); - - offset += CAML_EPHE_FIRST_KEY; - - if( caml_gc_phase == Phase_mark - && caml_ephe_list_pure - && Field(ar, CAML_EPHE_DATA_OFFSET) != caml_ephe_none - && !Is_white_val(ar) - && Is_White_During_Mark(Field(ar, offset)) - && !Is_White_During_Mark(k)){ - /* the ephemeron could be in the set (2) only because of a white key and not - have one anymore after set */ - caml_darken(Field(ar, CAML_EPHE_DATA_OFFSET), NULL); - }; - if(caml_gc_phase == Phase_clean) do_check_key_clean(ar, offset); - do_set (ar, offset, k); + child = Field(v, CAML_EPHE_DATA_OFFSET); + if (child != caml_ephe_none) { + if (release_data) { + Field(v, CAML_EPHE_DATA_OFFSET) = caml_ephe_none; + } else { + CAMLassert (!Is_block(child) || !is_unmarked(child)); + } + } } -CAMLprim value caml_ephe_set_key (value ar, value n, value el) +static void clean_field (value e, mlsize_t offset) { - caml_ephemeron_set_key(ar, Long_val(n), el); - return Val_unit; + if (offset == CAML_EPHE_DATA_OFFSET) + caml_ephe_clean(e); + else + do_check_key_clean(e, offset); } -CAMLexport void caml_ephemeron_unset_key(value ar, mlsize_t offset) +static void do_set (value e, mlsize_t offset, value v) { - CAMLassert_valid_offset(ar, offset); - CAMLassert (Is_in_heap (ar)); - - offset += CAML_EPHE_FIRST_KEY; - - if( caml_gc_phase == Phase_mark - && caml_ephe_list_pure - && Field(ar, CAML_EPHE_DATA_OFFSET) != caml_ephe_none - && 
!Is_white_val(ar) - && Is_White_During_Mark(Field(ar, offset)) ){ - /* the ephemeron could be in the set (2) only because of this white key and - not have one anymore after unsetting it */ - caml_darken(Field(ar, CAML_EPHE_DATA_OFFSET), NULL); - }; - - if(caml_gc_phase == Phase_clean) do_check_key_clean(ar, offset); - Field (ar, offset) = caml_ephe_none; + if (Is_block(v) && Is_young(v)) { + value old = Field(e, offset); + Field(e, offset) = v; + if (!(Is_block(old) && Is_young(old))) + add_to_ephe_ref_table (&Caml_state->minor_tables->ephe_ref, + e, offset); + } else { + Field(e, offset) = v; + } } -CAMLprim value caml_ephe_unset_key (value ar, value n) +static value ephe_set_field (value e, mlsize_t offset, value el) { - caml_ephemeron_unset_key(ar, Long_val(n)); - return Val_unit; + CAMLparam2(e,el); + + clean_field(e, offset); + do_set(e, offset, el); + CAMLreturn(Val_unit); } -/* deprecated (03/2016) */ -value caml_ephe_set_key_option (value ar, value n, value el) +CAMLprim value caml_ephe_set_key (value e, value n, value el) { - if (Is_block (el)){ - CAMLassert (Wosize_val (el) == 1); - caml_ephe_set_key(ar, n, Field (el, 0)); - }else{ - CAMLassert (el == None_val); - caml_ephe_unset_key(ar, n); + mlsize_t offset = Long_val (n) + CAML_EPHE_FIRST_KEY; + + if (offset < CAML_EPHE_FIRST_KEY || offset >= Wosize_val (e)){ + caml_invalid_argument ("Weak.set"); } - return Val_unit; + return ephe_set_field (e, offset, el); } -/* deprecated (03/2016) */ -CAMLprim value caml_weak_set (value ar, value n, value el) +CAMLprim value caml_ephe_unset_key (value e, value n) { - return caml_ephe_set_key_option(ar, n, el); + return caml_ephe_set_key (e, n, caml_ephe_none); } -CAMLexport void caml_ephemeron_set_data (value ar, value el) +value caml_ephe_set_key_option (value e, value n, value el) { - value old_data; - CAMLassert_valid_ephemeron(ar); - - old_data = Field (ar, CAML_EPHE_DATA_OFFSET); - if (caml_gc_phase == Phase_mark && !Is_White_During_Mark(old_data)) - 
caml_darken (el, NULL); - if (caml_gc_phase == Phase_clean){ - /* During this phase since we don't know which ephemerons have been - cleaned we always need to check it. */ - caml_ephe_clean(ar); - }; - do_set (ar, CAML_EPHE_DATA_OFFSET, el); + if (el != None_val && Is_block (el)) { + return caml_ephe_set_key (e, n, Field(el, 0)); + } else { + return caml_ephe_unset_key (e, n); + } } -CAMLprim value caml_ephe_set_data (value ar, value el) +CAMLprim value caml_weak_set (value ar, value n, value el) { - caml_ephemeron_set_data (ar, el); - return Val_unit; + return caml_ephe_set_key_option(ar,n,el); } -CAMLexport void caml_ephemeron_unset_data (value ar) +CAMLprim value caml_ephe_set_data (value e, value el) { - CAMLassert_valid_ephemeron(ar); - - Field (ar, CAML_EPHE_DATA_OFFSET) = caml_ephe_none; + return ephe_set_field (e, CAML_EPHE_DATA_OFFSET, el); } -CAMLprim value caml_ephe_unset_data (value ar) +CAMLprim value caml_ephe_unset_data (value e) { - caml_ephemeron_unset_data (ar); - return Val_unit; + return caml_ephe_set_data(e, caml_ephe_none); } -static value optionalize(int status, value *x) +static value ephe_get_field (value e, mlsize_t offset) { - CAMLparam0(); - CAMLlocal2(res, v); - if(status) { - v = *x; - res = caml_alloc_small (1, Some_tag); - Field (res, 0) = v; - } else { - res = None_val; - } - // run memprof callbacks both for the option we are allocating here - // and the calling function. 
- caml_process_pending_actions(); - CAMLreturn(res); -} + CAMLparam1(e); + CAMLlocal2 (res, elt); -CAMLexport int caml_ephemeron_get_key (value ar, mlsize_t offset, value *key) -{ - value elt; - CAMLassert_valid_offset(ar, offset); - - offset += CAML_EPHE_FIRST_KEY; + clean_field(e, offset); + elt = Field(e, offset); - if (is_ephe_key_none(ar, offset)){ - return 0; - }else{ - elt = Field (ar, offset); - if (caml_gc_phase == Phase_mark && Must_be_Marked_during_mark(elt)){ - caml_darken (elt, NULL); - } - *key = elt; - CAMLassert_not_dead_value(elt); - return 1; + if (elt == caml_ephe_none) { + res = None_val; + } else { + elt = Field(e, offset); + caml_darken (0, elt, 0); + res = caml_alloc_shr (1, Some_tag); + caml_initialize(&Field(res, 0), elt); } + CAMLreturn (res); } -CAMLprim value caml_ephe_get_key (value ar, value n) +CAMLprim value caml_ephe_get_key (value e, value n) { - value data; - return optionalize(caml_ephemeron_get_key(ar, Long_val(n), &data), &data); + mlsize_t offset = Long_val (n) + CAML_EPHE_FIRST_KEY; + if (offset < CAML_EPHE_FIRST_KEY || offset >= Wosize_val (e)){ + caml_invalid_argument ("Weak.get"); + } + return ephe_get_field (e, offset); } CAMLprim value caml_weak_get (value ar, value n) @@ -371,307 +269,161 @@ CAMLprim value caml_weak_get (value ar, value n) return caml_ephe_get_key(ar, n); } -CAMLexport int caml_ephemeron_get_data (value ar, value *data) +static value ephe_get_field_copy (value e, mlsize_t offset) { - value elt; - CAMLassert_valid_ephemeron(ar); - - if(caml_gc_phase == Phase_clean) caml_ephe_clean(ar); - elt = Field (ar, CAML_EPHE_DATA_OFFSET); - if (elt == caml_ephe_none){ - return 0; - }else{ - if (caml_gc_phase == Phase_mark && Must_be_Marked_during_mark(elt)){ - caml_darken (elt, NULL); - } - *data = elt; - CAMLassert_not_dead_value(elt); - return 1; - } -} + CAMLparam1 (e); + CAMLlocal2 (res, elt); + mlsize_t i, infix_offs = 0; + value v; /* Caution: this is NOT a local root. 
*/ + value f; -CAMLprim value caml_ephe_get_data (value ar) -{ - value data; - return optionalize(caml_ephemeron_get_data(ar, &data), &data); -} + clean_field(e, offset); + v = Field(e, offset); + if (v == caml_ephe_none) CAMLreturn (None_val); -static void copy_value(value src, value dst) -{ - mlsize_t sz, i; - sz = Wosize_val(src); - if (Tag_val (src) >= No_scan_tag) { - /* Direct copy */ - memcpy (Bp_val (dst), Bp_val (src), Bsize_wsize (sz)); - return; - } - i = 0; - if (Tag_val (src) == Closure_tag) { - /* Direct copy of the code pointers and closure info fields */ - i = Start_env_closinfo(Closinfo_val(src)); - memcpy (Bp_val (dst), Bp_val (src), Bsize_wsize (i)); - } - /* Field-by-field copy and darkening of the remaining fields */ - for (/*nothing*/; i < sz; i++){ - value f = Field (src, i); - if (caml_gc_phase == Phase_mark && Must_be_Marked_during_mark(f)){ - caml_darken (f, NULL); + /** Don't copy custom_block #7279 */ + if (Is_block(v) && Tag_val(v) != Custom_tag) { + if (Tag_val(v) == Infix_tag) { + infix_offs = Infix_offset_val(v); + v -= infix_offs; } - caml_modify (&Field (dst, i), f); - } -} + elt = caml_alloc (Wosize_val(v), Tag_val(v)); -CAMLexport int caml_ephemeron_get_key_copy(value ar, mlsize_t offset, - value *key) -{ - mlsize_t loop = 0, infix_offs; - CAMLparam1(ar); - value elt = Val_unit, v; /* Caution: they are NOT local roots. */ - CAMLassert_valid_offset(ar, offset); - - offset += CAML_EPHE_FIRST_KEY; - - while(1) { - if(is_ephe_key_none(ar, offset)) CAMLreturn(0); - v = Field (ar, offset); - /** Don't copy custom_block #7279 */ - if(!(Is_block (v) && Is_in_value_area(v) && Tag_val(v) != Custom_tag)) { - if ( caml_gc_phase == Phase_mark && Must_be_Marked_during_mark(v) ){ - caml_darken (v, NULL); - }; - *key = v; - CAMLreturn(1); - } - infix_offs = Tag_val(v) == Infix_tag ? 
Infix_offset_val(v) : 0; - v -= infix_offs; - if (elt != Val_unit && - Wosize_val(v) == Wosize_val(elt) && Tag_val(v) == Tag_val(elt)) { - /* The allocation may trigger a finaliser that change the tag - and size of the block. Therefore, in addition to checking - that the pointer is still alive, we have to check that it - still has the same tag and size. - */ - CAMLassert_not_dead_value(v); - copy_value(v, elt); - *key = elt + infix_offs; - CAMLreturn(1); + clean_field(e, offset); + v = Field(e, offset); + if (v == caml_ephe_none) CAMLreturn (None_val); + + if (Tag_val(v) == Infix_tag) { + infix_offs = Infix_offset_val(v); + v -= infix_offs; } - CAMLassert(loop < 10); - if(8 == loop){ /** One minor gc must be enough */ - elt = Val_unit; - CAML_EV_COUNTER (EV_C_FORCE_MINOR_WEAK, 1); - caml_minor_collection (); + if (Tag_val(v) < No_scan_tag) { + i = 0; + if (Tag_val (v) == Closure_tag) { + /* Direct copy of the code pointers and closure info fields */ + i = Start_env_closinfo(Closinfo_val(v)); + memcpy (Bp_val (elt), Bp_val (v), Bsize_wsize (i)); + } + /* Field-by-field copy and darkening of the remaining fields */ + for (/*nothing*/; i < Wosize_val(v); i++) { + f = Field(v, i); + caml_darken (0, f, 0); + Store_field(elt, i, f); + } } else { - /* cases where loop is between 0 to 7 and where loop is equal to 9 */ - elt = caml_alloc (Wosize_val (v), Tag_val (v)); - /* The GC may erase, move or even change v during this call to - caml_alloc. 
*/ + memmove (Bp_val(elt), Bp_val(v), Bosize_val(v)); } - ++loop; + } else { + Field(e, offset) = elt = v; } + res = caml_alloc_shr (1, Some_tag); + caml_initialize(&Field(res, 0), elt + infix_offs); + CAMLreturn(res); } -CAMLprim value caml_ephe_get_key_copy (value ar, value n) +CAMLprim value caml_ephe_get_key_copy (value e, value n) { - value key; - int status = caml_ephemeron_get_key_copy(ar, Long_val(n), &key); - return optionalize(status, &key); + mlsize_t offset = Long_val (n) + CAML_EPHE_FIRST_KEY; + if (offset < CAML_EPHE_FIRST_KEY || offset >= Wosize_val (e)){ + caml_invalid_argument ("Weak.get"); + } + return ephe_get_field_copy(e, offset); } -CAMLprim value caml_weak_get_copy (value ar, value n) -{ - return caml_ephe_get_key_copy(ar, n); +CAMLprim value caml_weak_get_copy (value e, value n){ + return caml_ephe_get_key_copy(e,n); } -CAMLexport int caml_ephemeron_get_data_copy (value ar, value *data) +CAMLprim value caml_ephe_get_data (value e) { - mlsize_t loop = 0, infix_offs; - CAMLparam1 (ar); - value elt = Val_unit, v; /* Caution: they are NOT local roots. */ - CAMLassert_valid_ephemeron(ar); - - while(1) { - if (caml_gc_phase == Phase_clean) caml_ephe_clean(ar); - v = Field (ar, CAML_EPHE_DATA_OFFSET); - if (v == caml_ephe_none) CAMLreturn(0); - /** Don't copy custom_block #7279 */ - if (!(Is_block (v) && Is_in_value_area(v) && Tag_val(v) != Custom_tag)) { - if ( caml_gc_phase == Phase_mark && Must_be_Marked_during_mark(v) ){ - caml_darken (v, NULL); - }; - *data = v; - CAMLreturn(1); - } - infix_offs = Tag_val(v) == Infix_tag ? 
Infix_offset_val(v) : 0; - v -= infix_offs; - if (elt != Val_unit && - Wosize_val(v) == Wosize_val(elt) && Tag_val(v) == Tag_val(elt)) { - /** cf caml_ephemeron_get_key_copy */ - CAMLassert_not_dead_value(v); - copy_value(v, elt); - *data = elt + infix_offs; - CAMLreturn(1); - } - - CAMLassert(loop < 10); - if(8 == loop){ /** One minor gc must be enough */ - elt = Val_unit; - CAML_EV_COUNTER (EV_C_FORCE_MINOR_WEAK, 1); - caml_minor_collection (); - } else { - /* cases where loop is between 0 to 7 and where loop is equal to 9 */ - elt = caml_alloc (Wosize_val (v), Tag_val (v)); - /** cf caml_ephemeron_get_key_copy */ - } - ++loop; - } + return ephe_get_field (e, CAML_EPHE_DATA_OFFSET); } - -CAMLprim value caml_ephe_get_data_copy (value ar) +CAMLprim value caml_ephe_get_data_copy (value e) { - value data; - int status = caml_ephemeron_get_data_copy(ar, &data); - return optionalize(status, &data); + return ephe_get_field_copy (e, CAML_EPHE_DATA_OFFSET); } -CAMLexport int caml_ephemeron_key_is_set(value ar, mlsize_t offset) +static value ephe_check_field (value e, mlsize_t offset) { - CAMLassert_valid_offset(ar, offset); + CAMLparam1(e); + CAMLlocal1(v); - offset += CAML_EPHE_FIRST_KEY; - return !is_ephe_key_none(ar, offset); + clean_field(e, offset); + v = Field(e, offset); + CAMLreturn(Val_bool(v != caml_ephe_none)); } -CAMLprim value caml_ephe_check_key (value ar, value n) +CAMLprim value caml_ephe_check_key (value e, value n) { - return Val_bool (caml_ephemeron_key_is_set(ar, Long_val(n))); + mlsize_t offset = Long_val (n) + CAML_EPHE_FIRST_KEY; + if (offset < CAML_EPHE_FIRST_KEY || offset >= Wosize_val (e)){ + caml_invalid_argument ("Weak.check"); + } + return ephe_check_field (e, offset); } -CAMLprim value caml_weak_check (value ar, value n) +CAMLprim value caml_weak_check (value e, value n) { - return caml_ephe_check_key(ar, n); + return caml_ephe_check_key(e,n); } -CAMLexport int caml_ephemeron_data_is_set (value ar) +CAMLprim value caml_ephe_check_data (value 
e) { - CAMLassert_valid_ephemeron(ar); - - if(caml_gc_phase == Phase_clean) caml_ephe_clean(ar); - return Field (ar, CAML_EPHE_DATA_OFFSET) != caml_ephe_none; + return ephe_check_field (e, CAML_EPHE_DATA_OFFSET); } -CAMLprim value caml_ephe_check_data (value ar) +static value ephe_blit_field (value es, mlsize_t offset_s, + value ed, mlsize_t offset_d, mlsize_t length) { - return Val_bool (caml_ephemeron_data_is_set(ar)); -} + CAMLparam2(es,ed); + CAMLlocal1(ar); + long i; -CAMLexport void caml_ephemeron_blit_key(value ars, mlsize_t offset_s, - value ard, mlsize_t offset_d, - mlsize_t length) -{ - intnat i; /** intnat because the second for-loop stops with i == -1 */ - int dest_has_white_value; - if (length == 0) return; - CAMLassert_valid_offset(ars, offset_s); - CAMLassert_valid_offset(ard, offset_d); - CAMLassert(length <= Wosize_val(ars) - CAML_EPHE_FIRST_KEY); - CAMLassert(length <= Wosize_val(ard) - CAML_EPHE_FIRST_KEY); - CAMLassert(offset_s <= Wosize_val(ars) - CAML_EPHE_FIRST_KEY - length); - CAMLassert(offset_d <= Wosize_val(ard) - CAML_EPHE_FIRST_KEY - length); - - offset_s += CAML_EPHE_FIRST_KEY; - offset_d += CAML_EPHE_FIRST_KEY; - - if ( caml_gc_phase == Phase_mark - && caml_ephe_list_pure - && Field(ard, CAML_EPHE_DATA_OFFSET) != caml_ephe_none - && !Is_white_val(ard) - && !Is_White_During_Mark(Field(ard, CAML_EPHE_DATA_OFFSET)) - ){ - /* We check here if darkening of the data of the destination is needed - because the destination could be in (2). Indeed a white key could - disappear from the destination after blitting and being in (2) requires - if the ephemeron is alive without white key to have a black or none - data. 
*/ - - dest_has_white_value = 0; - - for(i = 0; i < length; i++){ - dest_has_white_value |= Is_White_During_Mark(Field(ard, offset_d + i)); - }; - /* test if the destination can't be in set (2) because of the keys that are - going to be set */ - if(!dest_has_white_value) goto No_darkening; - for(i = 0; i < length; i++){ - /* test if the source is going to bring a white key to replace the one - set */ - if(Is_White_During_Mark(Field(ars, offset_s + i))) goto No_darkening; - }; - /* the destination ephemeron could be in the set (2) because of a white key - replaced and not have one anymore after. */ - caml_darken(Field(ard, CAML_EPHE_DATA_OFFSET),NULL); - } - No_darkening: - - if (caml_gc_phase == Phase_clean){ - caml_ephe_clean_partial(ars, offset_s, offset_s + length); - /* We don't need to clean the keys that are about to be overwritten, - except when cleaning them could result in releasing the data, - which can't happen if data is already released. */ - if (Field (ard, CAML_EPHE_DATA_OFFSET) != caml_ephe_none) - caml_ephe_clean_partial(ard, offset_d, offset_d + length); - } - if (offset_d < offset_s){ - for (i = 0; i < length; i++){ - do_set (ard, offset_d + i, Field (ars, offset_s + i)); + if (length == 0) CAMLreturn(Val_unit); + + caml_ephe_clean(es); + caml_ephe_clean(ed); + + if (offset_d < offset_s) { + for (i = 0; i < length; i++) { + caml_darken(0, Field(es, (offset_s + i)), 0); + do_set(ed, offset_d + i, Field(es, (offset_s + i))); } - }else{ - for (i = length - 1; i >= 0; i--){ - do_set (ard, offset_d + i, Field (ars, offset_s + i)); + } else { + for (i = length - 1; i >= 0; i--) { + caml_darken(0, Field(es, (offset_s + i)), 0); + do_set(ed, offset_d + i, Field(es, (offset_s + i))); } } + CAMLreturn(Val_unit); } -CAMLprim value caml_ephe_blit_key (value ars, value ofs, - value ard, value ofd, value len) +CAMLprim value caml_ephe_blit_key (value es, value ofs, + value ed, value ofd, value len) { - if (Long_val(len) == 0) return Val_unit; + mlsize_t 
offset_s = Long_val (ofs) + CAML_EPHE_FIRST_KEY; + mlsize_t offset_d = Long_val (ofd) + CAML_EPHE_FIRST_KEY; + mlsize_t length = Long_val (len); - caml_ephemeron_blit_key(ars,Long_val(ofs),ard,Long_val(ofd),Long_val(len)); - return Val_unit; -} - -CAMLprim value caml_weak_blit (value ars, value ofs, - value ard, value ofd, value len) -{ - return caml_ephe_blit_key (ars, ofs, ard, ofd, len); + if (offset_s < CAML_EPHE_FIRST_KEY || offset_s + length > Wosize_val (es)){ + caml_invalid_argument ("Weak.blit"); + } + if (offset_d < CAML_EPHE_FIRST_KEY || offset_d + length > Wosize_val (ed)){ + caml_invalid_argument ("Weak.blit"); + } + return ephe_blit_field (es, offset_s, ed, offset_d, length); } -CAMLexport void caml_ephemeron_blit_data (value ars, value ard) +CAMLprim value caml_ephe_blit_data (value es, value ed) { - value data, old_data; - CAMLassert_valid_ephemeron(ars); - CAMLassert_valid_ephemeron(ard); - - if(caml_gc_phase == Phase_clean) { - caml_ephe_clean(ars); - caml_ephe_clean(ard); - }; - - data = Field (ars, CAML_EPHE_DATA_OFFSET); - old_data = Field (ard, CAML_EPHE_DATA_OFFSET); - if (caml_gc_phase == Phase_mark && - data != caml_ephe_none && - !Is_White_During_Mark(old_data)) - caml_darken (data, NULL); - - do_set (ard, CAML_EPHE_DATA_OFFSET, data); + return ephe_blit_field (es, CAML_EPHE_DATA_OFFSET, + ed, CAML_EPHE_DATA_OFFSET, 1); } -CAMLprim value caml_ephe_blit_data (value ars, value ard) +CAMLprim value caml_weak_blit (value es, value ofs, + value ed, value ofd, value len) { - caml_ephemeron_blit_data(ars, ard); - return Val_unit; + return caml_ephe_blit_key (es, ofs, ed, ofd, len); } diff --git a/runtime/win32.c b/runtime/win32.c index 77e5f39fb057..12d6f0893a29 100644 --- a/runtime/win32.c +++ b/runtime/win32.c @@ -39,6 +39,9 @@ #include #include #include +#if defined(DEBUG) || defined(NATIVE_CODE) +#include +#endif #include "caml/alloc.h" #include "caml/codefrag.h" #include "caml/fail.h" @@ -49,6 +52,8 @@ #include "caml/signals.h" #include 
"caml/sys.h" #include "caml/winsupport.h" +#include "caml/startup_aux.h" +#include "caml/platform.h" #include "caml/config.h" @@ -95,7 +100,7 @@ int caml_read_fd(int fd, int flags, void * buf, int n) { int retcode; if ((flags & CHANNEL_FLAG_FROM_SOCKET) == 0) { - caml_enter_blocking_section_no_pending(); + caml_enter_blocking_section(); retcode = read(fd, buf, n); /* Large reads from console can fail with ENOMEM. Reduce requested size and try again. */ @@ -105,7 +110,7 @@ int caml_read_fd(int fd, int flags, void * buf, int n) caml_leave_blocking_section(); if (retcode == -1) caml_sys_io_error(NO_ARG); } else { - caml_enter_blocking_section_no_pending(); + caml_enter_blocking_section(); retcode = recv((SOCKET) _get_osfhandle(fd), buf, n, 0); caml_leave_blocking_section(); if (retcode == -1) caml_win32_sys_error(WSAGetLastError()); @@ -117,12 +122,12 @@ int caml_write_fd(int fd, int flags, void * buf, int n) { int retcode; if ((flags & CHANNEL_FLAG_FROM_SOCKET) == 0) { - caml_enter_blocking_section_no_pending(); + caml_enter_blocking_section(); retcode = write(fd, buf, n); caml_leave_blocking_section(); if (retcode == -1) caml_sys_io_error(NO_ARG); } else { - caml_enter_blocking_section_no_pending(); + caml_enter_blocking_section(); retcode = send((SOCKET) _get_osfhandle(fd), buf, n, 0); caml_leave_blocking_section(); if (retcode == -1) caml_win32_sys_error(WSAGetLastError()); @@ -230,7 +235,7 @@ void * caml_dlopen(wchar_t * libname, int for_execution, int global) int flags = (global ? 
FLEXDLL_RTLD_GLOBAL : 0); if (!for_execution) flags |= FLEXDLL_RTLD_NOEXEC; handle = flexdll_wdlopen(libname, flags); - if ((handle != NULL) && ((caml_verb_gc & 0x100) != 0)) { + if ((handle != NULL) && ((caml_params->verb_gc & 0x100) != 0)) { flexdll_dump_exports(handle); fflush(stdout); } @@ -645,7 +650,7 @@ static void invalid_parameter_handler(const wchar_t* expression, } -void caml_install_invalid_parameter_handler() +void caml_install_invalid_parameter_handler(void) { _set_invalid_parameter_handler(invalid_parameter_handler); } @@ -1087,3 +1092,70 @@ CAMLexport clock_t caml_win32_clock(void) clocks_per_sec = INT64_LITERAL(10000000U) / (ULONGLONG)CLOCKS_PER_SEC; return (clock_t)(total / clocks_per_sec); } + +int caml_thread_setname(const char* name) +{ + int ret; + /* XXX Duplicates unix.c, but MSVC will add some specific code here */ + pthread_t self = pthread_self(); + + ret = pthread_setname_np(self, name); + if (ret == ERANGE) + return -1; + return 0; +} + +static LARGE_INTEGER frequency; +static LARGE_INTEGER clock_offset; +typedef void (WINAPI *LPFN_GETSYSTEMTIME) (LPFILETIME); + +void caml_init_os_params(void) +{ + SYSTEM_INFO si; + LPFN_GETSYSTEMTIME pGetSystemTime; + FILETIME stamp; + ULARGE_INTEGER now; + LARGE_INTEGER counter; + + /* Get the system page size */ + GetSystemInfo(&si); + caml_sys_pagesize = si.dwPageSize; + + /* Get the number of nanoseconds for each tick in QueryPerformanceCounter */ + QueryPerformanceFrequency(&frequency); + /* Convert the frequency to the duration of 1 tick in ns */ + frequency.QuadPart = 1000000000LL / frequency.QuadPart; + + /* Get the current time as accurately as we can. + GetSystemTimePreciseAsFileTime is available on Windows 8 / Server 2012+ and + gives <1us precision. For Windows 7 and earlier, which is only accurate to + 10-100ms. 
*/ + pGetSystemTime = + (LPFN_GETSYSTEMTIME)GetProcAddress(GetModuleHandle(L"kernel32"), + "GetSystemTimePreciseAsFileTime"); + if (!pGetSystemTime) + pGetSystemTime = GetSystemTimeAsFileTime; + + /* Get the time and the performance counter. Get the performance counter first + to ensure no quantum effects */ + QueryPerformanceCounter(&counter); + pGetSystemTime(&stamp); + + now.LowPart = stamp.dwLowDateTime; + now.HighPart = stamp.dwHighDateTime; + + /* Convert a FILETIME in 100ns ticks since 1 January 1601 to + ns since 1 Jan 1970. */ + clock_offset.QuadPart = + ((now.QuadPart - INT64_LITERAL(0x19DB1DED53E8000U)) * 100); + + /* Get the offset between QueryPerformanceCounter and + GetSystemTimePreciseAsFileTime in order to return a true timestamp, rather + than just a monotonic time source */ + clock_offset.QuadPart -= (counter.QuadPart * frequency.QuadPart); + + GetSystemTimePreciseAsFileTime(&stamp); + now.LowPart = stamp.dwLowDateTime; + now.HighPart = stamp.dwHighDateTime; + now.QuadPart *= 100; +} diff --git a/stdlib/.depend b/stdlib/.depend index 06cb5627bce5..08a386621956 100644 --- a/stdlib/.depend +++ b/stdlib/.depend @@ -193,6 +193,14 @@ stdlib__Complex.cmo : complex.ml \ stdlib__Complex.cmx : complex.ml \ stdlib__Complex.cmi stdlib__Complex.cmi : complex.mli +stdlib__Condition.cmo : condition.ml \ + stdlib__Mutex.cmi \ + stdlib__Condition.cmi +stdlib__Condition.cmx : condition.ml \ + stdlib__Mutex.cmx \ + stdlib__Condition.cmi +stdlib__Condition.cmi : condition.mli \ + stdlib__Mutex.cmi stdlib__Digest.cmo : digest.ml \ stdlib__String.cmi \ stdlib__Char.cmi \ @@ -204,6 +212,33 @@ stdlib__Digest.cmx : digest.ml \ stdlib__Bytes.cmx \ stdlib__Digest.cmi stdlib__Digest.cmi : digest.mli +stdlib__Domain.cmo : domain.ml \ + stdlib__Sys.cmi \ + stdlib.cmi \ + stdlib__Obj.cmi \ + stdlib__Mutex.cmi \ + stdlib__Atomic.cmi \ + stdlib__Array.cmi \ + stdlib__Domain.cmi +stdlib__Domain.cmx : domain.ml \ + stdlib__Sys.cmx \ + stdlib.cmx \ + stdlib__Obj.cmx \ + 
stdlib__Mutex.cmx \ + stdlib__Atomic.cmx \ + stdlib__Array.cmx \ + stdlib__Domain.cmi +stdlib__Domain.cmi : domain.mli +stdlib__EffectHandlers.cmo : effectHandlers.ml \ + stdlib__Printexc.cmi \ + stdlib__Obj.cmi \ + stdlib__EffectHandlers.cmi +stdlib__EffectHandlers.cmx : effectHandlers.ml \ + stdlib__Printexc.cmx \ + stdlib__Obj.cmx \ + stdlib__EffectHandlers.cmi +stdlib__EffectHandlers.cmi : effectHandlers.mli \ + stdlib__Printexc.cmi stdlib__Either.cmo : either.ml \ stdlib__Either.cmi stdlib__Either.cmx : either.ml \ @@ -240,7 +275,7 @@ stdlib__Filename.cmo : filename.ml \ stdlib__Random.cmi \ stdlib__Printf.cmi \ stdlib__List.cmi \ - stdlib__Lazy.cmi \ + stdlib__Domain.cmi \ stdlib__Buffer.cmi \ stdlib__Filename.cmi stdlib__Filename.cmx : filename.ml \ @@ -249,7 +284,7 @@ stdlib__Filename.cmx : filename.ml \ stdlib__Random.cmx \ stdlib__Printf.cmx \ stdlib__List.cmx \ - stdlib__Lazy.cmx \ + stdlib__Domain.cmx \ stdlib__Buffer.cmx \ stdlib__Filename.cmi stdlib__Filename.cmi : filename.mli @@ -277,6 +312,7 @@ stdlib__Format.cmo : format.ml \ stdlib__List.cmi \ stdlib__Int.cmi \ stdlib__Either.cmi \ + stdlib__Domain.cmi \ camlinternalFormatBasics.cmi \ camlinternalFormat.cmi \ stdlib__Bytes.cmi \ @@ -291,6 +327,7 @@ stdlib__Format.cmx : format.ml \ stdlib__List.cmx \ stdlib__Int.cmx \ stdlib__Either.cmx \ + stdlib__Domain.cmx \ camlinternalFormatBasics.cmx \ camlinternalFormat.cmx \ stdlib__Bytes.cmx \ @@ -300,6 +337,7 @@ stdlib__Format.cmi : format.mli \ stdlib.cmi \ stdlib__Seq.cmi \ stdlib__Either.cmi \ + stdlib__Domain.cmi \ stdlib__Buffer.cmi stdlib__Fun.cmo : fun.ml \ stdlib__Printexc.cmi \ @@ -346,8 +384,8 @@ stdlib__Hashtbl.cmo : hashtbl.ml \ stdlib__Seq.cmi \ stdlib__Random.cmi \ stdlib__Obj.cmi \ - stdlib__Lazy.cmi \ stdlib__Int.cmi \ + stdlib__Domain.cmi \ stdlib__Array.cmi \ stdlib__Hashtbl.cmi stdlib__Hashtbl.cmx : hashtbl.ml \ @@ -356,8 +394,8 @@ stdlib__Hashtbl.cmx : hashtbl.ml \ stdlib__Seq.cmx \ stdlib__Random.cmx \ stdlib__Obj.cmx \ - 
stdlib__Lazy.cmx \ stdlib__Int.cmx \ + stdlib__Domain.cmx \ stdlib__Array.cmx \ stdlib__Hashtbl.cmi stdlib__Hashtbl.cmi : hashtbl.mli \ @@ -476,6 +514,11 @@ stdlib__MoreLabels.cmi : moreLabels.mli \ stdlib__Seq.cmi \ stdlib__Map.cmi \ stdlib__Hashtbl.cmi +stdlib__Mutex.cmo : mutex.ml \ + stdlib__Mutex.cmi +stdlib__Mutex.cmx : mutex.ml \ + stdlib__Mutex.cmi +stdlib__Mutex.cmi : mutex.mli stdlib__Nativeint.cmo : nativeint.ml \ stdlib__Sys.cmi \ stdlib.cmi \ @@ -584,6 +627,7 @@ stdlib__Random.cmo : random.ml \ stdlib__Int64.cmi \ stdlib__Int32.cmi \ stdlib__Int.cmi \ + stdlib__Domain.cmi \ stdlib__Digest.cmi \ stdlib__Char.cmi \ stdlib__Array.cmi \ @@ -595,6 +639,7 @@ stdlib__Random.cmx : random.ml \ stdlib__Int64.cmx \ stdlib__Int32.cmx \ stdlib__Int.cmx \ + stdlib__Domain.cmx \ stdlib__Digest.cmx \ stdlib__Char.cmx \ stdlib__Array.cmx \ @@ -635,6 +680,15 @@ stdlib__Scanf.cmx : scanf.ml \ stdlib__Scanf.cmi stdlib__Scanf.cmi : scanf.mli \ stdlib.cmi +stdlib__Semaphore.cmo : semaphore.ml \ + stdlib__Mutex.cmi \ + stdlib__Condition.cmi \ + stdlib__Semaphore.cmi +stdlib__Semaphore.cmx : semaphore.ml \ + stdlib__Mutex.cmx \ + stdlib__Condition.cmx \ + stdlib__Semaphore.cmi +stdlib__Semaphore.cmi : semaphore.mli stdlib__Seq.cmo : seq.ml \ stdlib__Lazy.cmi \ stdlib__Either.cmi \ diff --git a/stdlib/StdlibModules b/stdlib/StdlibModules index fb846b097382..12c546b068f2 100644 --- a/stdlib/StdlibModules +++ b/stdlib/StdlibModules @@ -40,11 +40,12 @@ STDLIB_MODULE_BASENAMES = \ seq option result bool char uchar \ list int bytes string unit marshal array float int32 int64 nativeint \ lexing parsing set map stack queue stream buffer \ - camlinternalFormat printf arg atomic \ + atomic mutex condition semaphore domain \ + camlinternalFormat printf arg \ printexc fun gc digest random hashtbl weak \ format scanf callback camlinternalOO oo camlinternalMod genlex ephemeron \ filename complex arrayLabels listLabels bytesLabels stringLabels moreLabels \ - stdLabels bigarray in_channel 
out_channel + stdLabels bigarray in_channel out_channel effectHandlers STDLIB_PREFIXED_MODULES = \ $(filter-out stdlib camlinternal%, $(STDLIB_MODULE_BASENAMES)) diff --git a/stdlib/camlinternalAtomic.ml b/stdlib/camlinternalAtomic.ml index b7e74a53faec..7cb95547514d 100644 --- a/stdlib/camlinternalAtomic.ml +++ b/stdlib/camlinternalAtomic.ml @@ -2,7 +2,8 @@ (* *) (* OCaml *) (* *) -(* Gabriel Scherer, projet Partout, INRIA Paris-Saclay *) +(* Stephen Dolan, University of Cambridge *) +(* Guillaume Munch-Maccagnoni, projet Gallinette, INRIA *) (* *) (* Copyright 2020 Institut National de Recherche en Informatique et *) (* en Automatique. *) @@ -15,46 +16,17 @@ (* CamlinternalAtomic is a dependency of Stdlib, so it is compiled with -nopervasives. *) -external ( == ) : 'a -> 'a -> bool = "%eq" -external ( + ) : int -> int -> int = "%addint" -external ignore : 'a -> unit = "%ignore" - -(* We are not reusing ('a ref) directly to make it easier to reason - about atomicity if we wish to: even in a sequential implementation, - signals and other asynchronous callbacks might break atomicity. *) -type 'a t = {mutable v: 'a} - -let make v = {v} -let get r = r.v -let set r v = r.v <- v - -(* The following functions are set to never be inlined: Flambda is - allowed to move surrounding code inside the critical section, - including allocations. *) +type !'a t -let[@inline never] exchange r v = - (* BEGIN ATOMIC *) - let cur = r.v in - r.v <- v; - (* END ATOMIC *) - cur - -let[@inline never] compare_and_set r seen v = - (* BEGIN ATOMIC *) - let cur = r.v in - if cur == seen then ( - r.v <- v; - (* END ATOMIC *) - true - ) else - false - -let[@inline never] fetch_and_add r n = - (* BEGIN ATOMIC *) - let cur = r.v in - r.v <- (cur + n); - (* END ATOMIC *) - cur +(* Atomic is a dependency of Stdlib, so it is compiled with + -nopervasives. 
*) +external make : 'a -> 'a t = "%makemutable" +external get : 'a t -> 'a = "%atomic_load" +external exchange : 'a t -> 'a -> 'a = "%atomic_exchange" +external compare_and_set : 'a t -> 'a -> 'a -> bool = "%atomic_cas" +external fetch_and_add : int t -> int -> int = "%atomic_fetch_add" +external ignore : 'a -> unit = "%ignore" +let set r x = ignore (exchange r x) let incr r = ignore (fetch_and_add r 1) let decr r = ignore (fetch_and_add r (-1)) diff --git a/stdlib/camlinternalLazy.ml b/stdlib/camlinternalLazy.ml index f03272e62124..b8764064c5de 100644 --- a/stdlib/camlinternalLazy.ml +++ b/stdlib/camlinternalLazy.ml @@ -19,55 +19,78 @@ type 'a t = 'a lazy_t exception Undefined -let raise_undefined = Obj.repr (fun () -> raise Undefined) +(* [update_to_forcing blk] tries to update a [blk] with [lazy_tag] to + [forcing_tag] using compare-and-swap (CAS), taking care to handle concurrent + marking of the header word by a concurrent GC thread. Returns [0] if the + CAS is successful. If the CAS fails, then the tag was observed to be + something other than [lazy_tag] due to a concurrent mutator. In this case, + the function returns [1]. *) +external update_to_forcing : Obj.t -> int = + "caml_lazy_update_to_forcing" [@@noalloc] -external make_forward : Obj.t -> Obj.t -> unit = "caml_obj_make_forward" +(* [reset_to_lazy blk] expects [blk] to be a lazy object with [Obj.forcing_tag] + and updates the tag to [Obj.lazy_tag], taking care to handle concurrent + marking of this object's header by a concurrent GC thread. 
*) +external reset_to_lazy : Obj.t -> unit = "caml_lazy_reset_to_lazy" [@@noalloc] -(* Assume [blk] is a block with tag lazy *) -let force_lazy_block (blk : 'arg lazy_t) = - let closure = (Obj.obj (Obj.field (Obj.repr blk) 0) : unit -> 'arg) in - Obj.set_field (Obj.repr blk) 0 raise_undefined; +(* [update_to_forward blk] expects [blk] to be a lazy object with + [Obj.forcing_tag] and updates the tag to [Obj.forward_tag], taking care to + handle concurrent marking of this object's header by a concurrent GC thread. + *) +external update_to_forward : Obj.t -> unit = + "caml_lazy_update_to_forward" [@@noalloc] + +(* Assumes [blk] is a block with tag forcing *) +let do_force_block blk = + let b = Obj.repr blk in + let closure = (Obj.obj (Obj.field b 0) : unit -> 'arg) in + Obj.set_field b 0 (Obj.repr ()); (* Release the closure *) try let result = closure () in - make_forward (Obj.repr blk) (Obj.repr result); + Obj.set_field b 0 (Obj.repr result); + update_to_forward b; result with e -> - Obj.set_field (Obj.repr blk) 0 (Obj.repr (fun () -> raise e)); + Obj.set_field b 0 (Obj.repr (fun () -> raise e)); + reset_to_lazy b; raise e - -(* Assume [blk] is a block with tag lazy *) -let force_val_lazy_block (blk : 'arg lazy_t) = - let closure = (Obj.obj (Obj.field (Obj.repr blk) 0) : unit -> 'arg) in - Obj.set_field (Obj.repr blk) 0 raise_undefined; +(* Assumes [blk] is a block with tag forcing *) +let do_force_val_block blk = + let b = Obj.repr blk in + let closure = (Obj.obj (Obj.field b 0) : unit -> 'arg) in + Obj.set_field b 0 (Obj.repr ()); (* Release the closure *) let result = closure () in - make_forward (Obj.repr blk) (Obj.repr result); + Obj.set_field b 0 (Obj.repr result); + update_to_forward b; result +(* Called by [force_gen] *) +let force_gen_lazy_block ~only_val (blk : 'arg lazy_t) = + (* We expect the tag to be [lazy_tag], but may be other tags due to + concurrent forcing of lazy values. 
*) + match update_to_forcing (Obj.repr blk) with + | 0 when only_val -> do_force_val_block blk + | 0 -> do_force_block blk + | _ -> raise Undefined -(* [force] is not used, since [Lazy.force] is declared as a primitive - whose code inlines the tag tests of its argument, except when afl - instrumentation is turned on. *) +(* used in the %lazy_force primitive *) +let force_lazy_block blk = force_gen_lazy_block ~only_val:false blk -let force (lzv : 'arg lazy_t) = - (* Using [Sys.opaque_identity] prevents two potential problems: - - If the value is known to have Forward_tag, then its tag could have - changed during GC, so that information must be forgotten (see GPR#713 - and issue #7301) - - If the value is known to be immutable, then if the compiler - cannot prove that the last branch is not taken it will issue a - warning 59 (modification of an immutable value) *) +(* [force_gen ~only_val:false] is not used, since [Lazy.force] is + declared as a primitive whose code inlines the tag tests of its + argument, except when afl instrumentation is turned on. *) +let force_gen ~only_val (lzv : 'arg lazy_t) = let lzv = Sys.opaque_identity lzv in let x = Obj.repr lzv in + (* START no safe points. If a GC occurs here, then the object [x] may be + short-circuited, and getting the first field of [x] would get us the wrong + value. Luckily, the compiler does not insert GC safe points at this place, + so it is ok. 
*) let t = Obj.tag x in - if t = Obj.forward_tag then (Obj.obj (Obj.field x 0) : 'arg) else - if t <> Obj.lazy_tag then (Obj.obj x : 'arg) - else force_lazy_block lzv - - -let force_val (lzv : 'arg lazy_t) = - let x = Obj.repr lzv in - let t = Obj.tag x in - if t = Obj.forward_tag then (Obj.obj (Obj.field x 0) : 'arg) else - if t <> Obj.lazy_tag then (Obj.obj x : 'arg) - else force_val_lazy_block lzv + if t = Obj.forward_tag then + (Obj.obj (Obj.field x 0) : 'arg) + (* END no safe points *) + else if t = Obj.forcing_tag then raise Undefined + else if t <> Obj.lazy_tag then (Obj.obj x : 'arg) + else force_gen_lazy_block ~only_val lzv diff --git a/stdlib/camlinternalLazy.mli b/stdlib/camlinternalLazy.mli index 7d04087f44a3..ac8b7a8bee2d 100644 --- a/stdlib/camlinternalLazy.mli +++ b/stdlib/camlinternalLazy.mli @@ -17,13 +17,10 @@ All functions in this module are for system use only, not for the casual user. *) -exception Undefined - type 'a t = 'a lazy_t -val force_lazy_block : 'a lazy_t -> 'a +exception Undefined -val force_val_lazy_block : 'a lazy_t -> 'a +val force_lazy_block : 'a lazy_t -> 'a -val force : 'a lazy_t -> 'a -val force_val : 'a lazy_t -> 'a +val force_gen : only_val:bool -> 'a lazy_t -> 'a diff --git a/otherlibs/systhreads/condition.ml b/stdlib/condition.ml similarity index 82% rename from otherlibs/systhreads/condition.ml rename to stdlib/condition.ml index 9a014528abf0..2b0c87ed6568 100644 --- a/otherlibs/systhreads/condition.ml +++ b/stdlib/condition.ml @@ -14,7 +14,7 @@ (**************************************************************************) type t -external create: unit -> t = "caml_condition_new" -external wait: t -> Mutex.t -> unit = "caml_condition_wait" -external signal: t -> unit = "caml_condition_signal" -external broadcast: t -> unit = "caml_condition_broadcast" +external create: unit -> t = "caml_ml_condition_new" +external wait: t -> Mutex.t -> unit = "caml_ml_condition_wait" +external signal: t -> unit = "caml_ml_condition_signal" 
+external broadcast: t -> unit = "caml_ml_condition_broadcast" diff --git a/otherlibs/systhreads/condition.mli b/stdlib/condition.mli similarity index 100% rename from otherlibs/systhreads/condition.mli rename to stdlib/condition.mli diff --git a/stdlib/domain.ml b/stdlib/domain.ml new file mode 100644 index 000000000000..fd9538516ff2 --- /dev/null +++ b/stdlib/domain.ml @@ -0,0 +1,224 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* KC Sivaramakrishnan, Indian Institute of Technology, Madras *) +(* Stephen Dolan, University of Cambridge *) +(* Tom Kelly, OCaml Labs Consultancy *) +(* *) +(* Copyright 2019 Indian Institute of Technology, Madras *) +(* Copyright 2014 University of Cambridge *) +(* Copyright 2021 OCaml Labs Consultancy Ltd *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. 
*) +(* *) +(**************************************************************************) + +module Raw = struct + (* Low-level primitives provided by the runtime *) + type t = private int + external spawn : (unit -> unit) -> Mutex.t -> t + = "caml_domain_spawn" + external self : unit -> t + = "caml_ml_domain_id" + external cpu_relax : unit -> unit + = "caml_ml_domain_cpu_relax" +end + +let cpu_relax () = Raw.cpu_relax () + +type id = Raw.t + +type 'a state = +| Running +| Joining of ('a, exn) result option ref +| Finished of ('a, exn) result +| Joined + +type 'a t = { + domain : Raw.t; + termination_mutex: Mutex.t; + state: 'a state Atomic.t } + + +module DLS = struct + + type dls_state = Obj.t array + + let unique_value = Obj.repr (ref 0) + + external get_dls_state : unit -> dls_state = "%dls_get" + + external set_dls_state : dls_state -> unit = + "caml_domain_dls_set" [@@noalloc] + + let create_dls () = + let st = Array.make 8 unique_value in + set_dls_state st + + let _ = create_dls () + + type 'a key = int * (unit -> 'a) + + let key_counter = Atomic.make 0 + + let new_key f = + let k = Atomic.fetch_and_add key_counter 1 in + (k, f) + + (* If necessary, grow the current domain's local state array such that [idx] + * is a valid index in the array. *) + let maybe_grow idx = + let st = get_dls_state () in + let sz = Array.length st in + if idx < sz then st + else begin + let rec compute_new_size s = + if idx < s then s else compute_new_size (2 * s) + in + let new_sz = compute_new_size sz in + let new_st = Array.make new_sz unique_value in + Array.blit st 0 new_st 0 sz; + set_dls_state new_st; + new_st + end + + let set (idx, _init) x = + let st = maybe_grow idx in + (* [Sys.opaque_identity] ensures that flambda does not look at the type of + * [x], which may be a [float] and conclude that the [st] is a float array. + * We do not want OCaml's float array optimisation kicking in here. 
*) + st.(idx) <- Obj.repr (Sys.opaque_identity x) + + let get (idx, init) = + let st = maybe_grow idx in + let v = st.(idx) in + if v == unique_value then + let v' = Obj.repr (init ()) in + st.(idx) <- (Sys.opaque_identity v'); + Obj.magic v' + else Obj.magic v + +end + +(* first spawn, domain startup and at exit functionality *) +let first_domain_spawned = Atomic.make false + +let first_spawn_function = ref (fun () -> ()) + +let at_first_spawn f = + if Atomic.get first_domain_spawned then + raise (Invalid_argument "First domain already spawned") + else begin + let old_f = !first_spawn_function in + let new_f () = f (); old_f () in + first_spawn_function := new_f + end + +let do_at_first_spawn () = + if not (Atomic.get first_domain_spawned) then begin + Atomic.set first_domain_spawned true; + !first_spawn_function(); + (* Release the old function *) + first_spawn_function := (fun () -> ()) + end + +let exit_function = Atomic.make (fun () -> ()) + +let rec at_exit f = + let wrapped_f () = try f () with _ -> () in + let old_exit = Atomic.get exit_function in + let new_exit () = wrapped_f (); old_exit () in + let success = Atomic.compare_and_set exit_function old_exit new_exit in + if success then + Stdlib.at_exit wrapped_f + else at_exit f + +let do_at_exit () = (Atomic.get exit_function) () + +let startup_function = Atomic.make (fun () -> ()) + +let rec at_startup f = + let old_startup = Atomic.get startup_function in + let new_startup () = f (); old_startup () in + let success = + Atomic.compare_and_set startup_function old_startup new_startup + in + if success then + () + else + at_startup f + +(* Spawn and join functionality *) +exception Retry +let rec spin f = + try f () with Retry -> + cpu_relax (); + spin f + +let cas r vold vnew = + if not (Atomic.compare_and_set r vold vnew) then raise Retry + +let spawn f = + do_at_first_spawn (); + (* the termination_mutex is used to block a joining thread *) + let termination_mutex = Mutex.create () in + let state = 
Atomic.make Running in + let at_startup = Atomic.get startup_function in + let body () = + let result = match DLS.create_dls (); at_startup (); f () with + | x -> Ok x + | exception ex -> Error ex + in + do_at_exit (); + spin (fun () -> + match Atomic.get state with + | Running -> + cas state Running (Finished result) + | Joining x as old -> + cas state old Joined; + x := Some result + | Joined | Finished _ -> + failwith "internal error: I'm already finished?") + in + { domain = Raw.spawn body termination_mutex; termination_mutex; state } + +let termination_wait termination_mutex = + (* Raw.spawn returns with the mutex locked, so this will block if the + domain has not terminated yet *) + Mutex.lock termination_mutex; + Mutex.unlock termination_mutex + +let join { termination_mutex; state; _ } = + let res = spin (fun () -> + match Atomic.get state with + | Running -> begin + let x = ref None in + cas state Running (Joining x); + termination_wait termination_mutex; + match !x with + | None -> + failwith "internal error: termination signaled but result not passed" + | Some r -> r + end + | Finished x as old -> + cas state old Joined; + termination_wait termination_mutex; + x + | Joining _ | Joined -> + raise (Invalid_argument "This domain has already been joined") + ) + in + match res with + | Ok x -> x + | Error ex -> raise ex + +let get_id { domain; _ } = domain + +let self () = Raw.self () + +external set_name : string -> unit = "caml_ml_domain_set_name" + +let is_main_domain () = (self () :> int) == 0 diff --git a/stdlib/domain.mli b/stdlib/domain.mli new file mode 100644 index 000000000000..02ac8dc42b27 --- /dev/null +++ b/stdlib/domain.mli @@ -0,0 +1,91 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* KC Sivaramakrishnan, Indian Institute of Technology, Madras *) +(* Stephen Dolan, University of Cambridge *) +(* Tom Kelly, OCaml Labs Consultancy *) +(* *) +(* Copyright 2019 Indian Institute of 
Technology, Madras *) +(* Copyright 2014 University of Cambridge *) +(* Copyright 2021 OCaml Labs Consultancy Ltd *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +type 'a t +(** A domain of type ['a t] runs independently, eventually producing a + result of type 'a, or an exception *) + +val spawn : (unit -> 'a) -> 'a t +(** [spawn f] creates a new domain that runs in parallel with the + current domain. *) + +val join : 'a t -> 'a +(** [join d] blocks until domain [d] runs to completion. + If [d] results in a value, then that is returned by [join d]. + If [d] raises an uncaught exception, then that is thrown by [join d]. + Domains may only be joined once: subsequent uses of [join d] + raise Invalid_argument. *) + +type id = private int +(** Domains have unique integer identifiers *) + +val get_id : 'a t -> id +(** [get_id d] returns the identifier of the domain [d] *) + +val self : unit -> id +(** [self ()] is the identifier of the currently running domain *) + +val at_first_spawn : (unit -> unit) -> unit +(** Register the given function to be called before the first domain is + spawned. + + @raise Invalid_argument if the first domain has already been spawned. *) + +val at_exit : (unit -> unit) -> unit +(** Register the given function to be called at when a domain exits. This + function is also registered with {!Stdlib.at_exit}. If the registered + function raises an exception, the exceptions are ignored. *) + +val at_startup : (unit -> unit) -> unit +(** Register the given function to be called when a domain starts. This + function is called before the callback specified to [spawn f] is + executed. 
*) + +val cpu_relax : unit -> unit +(** If busy-waiting, calling cpu_relax () between iterations + will improve performance on some CPU architectures *) + +val set_name : string -> unit +(** [set_name s] set the domain's thread name to [s]. [s] should not be longer + than 15 characters. If [s] is longer than 15 characters, + raise Invalid_argument. *) + +val is_main_domain : unit -> bool +(** [is_main_domain ()] returns true if called from the initial domain. *) + +module DLS : sig +(** Domain-local Storage *) + + type 'a key + (** Type of a DLS key *) + + val new_key : (unit -> 'a) -> 'a key + (** [new_key f] returns a new key bound to initialiser [f] for accessing + domain-local variable. *) + + val set : 'a key -> 'a -> unit + (** [set k v] updates the calling domain's domain-local state to associate + the key [k] with value [v]. It overwrites any previous values associated + to [k], which cannot be restored later. *) + + val get : 'a key -> 'a + (** [get k] returns [v] if a value [v] is associated to the key [k] on + the calling domain's domain-local state. Sets [k]'s value with its + initialiser and returns it otherwise. *) + + end diff --git a/stdlib/effectHandlers.ml b/stdlib/effectHandlers.ml new file mode 100644 index 000000000000..25ac377fbe4a --- /dev/null +++ b/stdlib/effectHandlers.ml @@ -0,0 +1,147 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* KC Sivaramakrishnan, Indian Institute of Technology, Madras *) +(* *) +(* Copyright 2021 Indian Institute of Technology, Madras *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +type _ eff = .. 
+external perform : 'a eff -> 'a = "%perform" + +type ('a, 'b) stack + +external resume : ('a, 'b) stack -> ('c -> 'a) -> 'c -> 'b = "%resume" +external runstack : ('a, 'b) stack -> ('c -> 'a) -> 'c -> 'b = "%runstack" + +module Deep = struct + + type ('a,'b) continuation + type last_fiber + + external take_cont_noexc : ('a, 'b) continuation -> ('a, 'b) stack = + "caml_continuation_use_noexc" [@@noalloc] + external alloc_stack : + ('a -> 'b) -> + (exn -> 'b) -> + ('c eff -> ('c, 'b) continuation -> last_fiber -> 'b) -> + ('a, 'b) stack = "caml_alloc_stack" + + let continue k v = resume (take_cont_noexc k) (fun x -> x) v + + let discontinue k e = resume (take_cont_noexc k) (fun e -> raise e) e + + let discontinue_with_backtrace k e bt = resume (take_cont_noexc k) (fun e -> + Printexc.raise_with_backtrace e bt) e + + type ('a,'b) handler = + { retc: 'a -> 'b; + exnc: exn -> 'b; + effc: 'c.'c eff -> (('c,'b) continuation -> 'b) option } + + external reperform : + 'a eff -> ('a, 'b) continuation -> last_fiber -> 'b = "%reperform" + + let match_with comp arg handler = + let effc eff k last_fiber = + match handler.effc eff with + | Some f -> f k + | None -> reperform eff k last_fiber + in + let s = alloc_stack handler.retc handler.exnc effc in + runstack s comp arg + + type 'a effect_handler = + { effc: 'b. 
'b eff -> (('b,'a) continuation -> 'a) option } + + let try_with comp arg handler = + let effc' eff k last_fiber = + match handler.effc eff with + | Some f -> f k + | None -> reperform eff k last_fiber + in + let s = alloc_stack (fun x -> x) (fun e -> raise e) effc' in + runstack s comp arg + + external get_callstack : + ('a,'b) continuation -> int -> Printexc.raw_backtrace = + "caml_get_continuation_callstack" +end + +module Shallow = struct + + type ('a,'b) continuation + type last_fiber + + external alloc_stack : + ('a -> 'b) -> + (exn -> 'b) -> + ('c eff -> ('c, 'b) continuation -> last_fiber -> 'b) -> + ('a, 'b) stack = "caml_alloc_stack" + + + let fiber : type a b. (a -> b) -> (a, b) continuation = fun f -> + let module M = struct type _ eff += Initial_setup__ : a eff end in + let exception E of (a,b) continuation in + let f' () = f (perform M.Initial_setup__) in + let error _ = failwith "impossible" in + let effc eff k _last_fiber = + match eff with + | M.Initial_setup__ -> raise (E k) + | _ -> error () + in + let s = alloc_stack error error effc in + try Obj.magic (runstack s f' ()) with E k -> k + + type ('a,'b) handler = + { retc: 'a -> 'b; + exnc: exn -> 'b; + effc: 'c.'c eff -> (('c,'a) continuation -> 'b) option } + + external update_handler : + ('a,'b) continuation -> + ('b -> 'c) -> + (exn -> 'c) -> + ('d eff -> ('d,'b) continuation -> last_fiber -> 'c) -> + ('a,'c) stack = "caml_continuation_use_and_update_handler_noexc" [@@noalloc] + + external reperform : + 'a eff -> ('a, 'b) continuation -> last_fiber -> 'c = "%reperform" + + let continue_with k v handler = + let effc eff k last_fiber = + match handler.effc eff with + | Some f -> f k + | None -> reperform eff k last_fiber + in + let stack = update_handler k handler.retc handler.exnc effc in + resume stack (fun x -> x) v + + let discontinue_with k x handler = + let effc eff k last_fiber = + match handler.effc eff with + | Some f -> f k + | None -> reperform eff k last_fiber + in + let stack = 
update_handler k handler.retc handler.exnc effc in + resume stack (fun e -> raise e) x + + let discontinue_with_backtrace k x bt handler = + let effc eff k last_fiber = + match handler.effc eff with + | Some f -> f k + | None -> reperform eff k last_fiber + in + let stack = update_handler k handler.retc handler.exnc effc in + resume stack (fun e -> Printexc.raise_with_backtrace e bt) x + + external get_callstack : + ('a,'b) continuation -> int -> Printexc.raw_backtrace = + "caml_get_continuation_callstack" +end diff --git a/stdlib/effectHandlers.mli b/stdlib/effectHandlers.mli new file mode 100644 index 000000000000..81e89d45abf4 --- /dev/null +++ b/stdlib/effectHandlers.mli @@ -0,0 +1,129 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* KC Sivaramakrishnan, Indian Institute of Technology, Madras *) +(* *) +(* Copyright 2021 Indian Institute of Technology, Madras *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +type _ eff = .. +(* Type of effects *) + +external perform : 'a eff -> 'a = "%perform" +(** [perform e] performs an effect [e]. + + @raises Unhandled if there is no active handler. *) + +module Deep : sig + (** Deep handlers *) + + type ('a,'b) continuation + (** [('a,'b) continuation] is a delimited continuation that expects a ['a] + value and returns a ['b] value. *) + + val continue: ('a, 'b) continuation -> 'a -> 'b + (** [continue k x] resumes the continuation [k] by passing [x] to [k]. + + @raise Continuation_already_taken if the continuation has already been + resumed. *) + + val discontinue: ('a, 'b) continuation -> exn -> 'b + (** [discontinue k e] resumes the continuation [k] by raising the + exception [e] in [k]. 
+ + @raise Continuation_already_taken if the continuation has already been + resumed. *) + + val discontinue_with_backtrace: + ('a, 'b) continuation -> exn -> Printexc.raw_backtrace -> 'b + (** [discontinue_with_backtrace k e bt] resumes the continuation [k] by + raising the exception [e] in [k] using [bt] as the origin for the + exception. + + @raise Continuation_already_taken if the continuation has already been + resumed. *) + + type ('a,'b) handler = + { retc: 'a -> 'b; + exnc: exn -> 'b; + effc: 'c.'c eff -> (('c,'b) continuation -> 'b) option } + (** [('a,'b) handler] is a handler record with three fields -- [retc] + is the value handler, [exnc] handles exceptions, and [effc] handles the + effects performed by the computation enclosed by the handler. *) + + val match_with: ('c -> 'a) -> 'c -> ('a,'b) handler -> 'b + (** [match_with f v h] runs the computation [f v] in the handler [h]. *) + + type 'a effect_handler = + { effc: 'b. 'b eff -> (('b, 'a) continuation -> 'a) option } + (** ['a effect_handler] is a deep handler with an identity value handler + [fun x -> x] and an exception handler that raises any exception + [fun e -> raise e]. *) + + val try_with: ('b -> 'a) -> 'b -> 'a effect_handler -> 'a + (** [try_with f v h] runs the computation [f v] under the handler [h]. *) + + external get_callstack : + ('a,'b) continuation -> int -> Printexc.raw_backtrace = + "caml_get_continuation_callstack" + (** [get_callstack c n] returns a description of the top of the call stack on + the continuation [c], with at most [n] entries. *) +end + +module Shallow : sig + (* Shallow handlers *) + + type ('a,'b) continuation + (** [('a,'b) continuation] is a delimited continuation that expects a ['a] + value and returns a ['b] value. *) + + val fiber : ('a -> 'b) -> ('a, 'b) continuation + (** [fiber f] constructs a continuation that runs the computation [f]. 
*) + + type ('a,'b) handler = + { retc: 'a -> 'b; + exnc: exn -> 'b; + effc: 'c.'c eff -> (('c,'a) continuation -> 'b) option } + (** [('a,'b) handler] is a handler record with three fields -- [retc] + is the value handler, [exnc] handles exceptions, and [effc] handles the + effects performed by the computation enclosed by the handler. *) + + val continue_with : ('c,'a) continuation -> 'c -> ('a,'b) handler -> 'b + (** [continue_with k v h] resumes the continuation [k] with value [v] with + the handler [h]. + + @raise Continuation_already_taken if the continuation has already been + resumed. + *) + + val discontinue_with : ('c,'a) continuation -> exn -> ('a,'b) handler -> 'b + (** [discontinue_with k e h] resumes the continuation [k] by raising the + exception [e] with the handler [h]. + + @raise Continuation_already_taken if the continuation has already been + resumed. + *) + + val discontinue_with_backtrace : + ('a,'b) continuation -> exn -> Printexc.raw_backtrace -> + ('b,'c) handler -> 'c + (** [discontinue_with k e bt h] resumes the continuation [k] by raising the + exception [e] with the handler [h] using the raw backtrace [bt] as the + origin of the exception. + + @raise Continuation_already_taken if the continuation has already been + resumed. + *) + + external get_callstack : + ('a,'b) continuation -> int -> Printexc.raw_backtrace = + "caml_get_continuation_callstack" + (** [get_callstack c n] returns a description of the top of the call stack on + the continuation [c], with at most [n] entries. 
*) +end diff --git a/stdlib/ephemeron.ml b/stdlib/ephemeron.ml index 5c5263890649..df838ebf2035 100644 --- a/stdlib/ephemeron.ml +++ b/stdlib/ephemeron.ml @@ -13,8 +13,6 @@ (* *) (**************************************************************************) -[@@@ocaml.warning "-32"] - module type SeededS = sig type key @@ -30,20 +28,8 @@ module type SeededS = sig val find_all : 'a t -> key -> 'a list val replace : 'a t -> key -> 'a -> unit val mem : 'a t -> key -> bool - val iter : (key -> 'a -> unit) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val filter_map_inplace : (key -> 'a -> 'a option) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val fold : (key -> 'a -> 'b -> 'b) -> 'a t -> 'b -> 'b - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val length : 'a t -> int val stats : 'a t -> Hashtbl.statistics - val to_seq : 'a t -> (key * 'a) Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_keys : _ t -> key Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_values : 'a t -> 'a Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val add_seq : 'a t -> (key * 'a) Seq.t -> unit val replace_seq : 'a t -> (key * 'a) Seq.t -> unit val of_seq : (key * 'a) Seq.t -> 'a t @@ -67,20 +53,8 @@ module type S = sig val find_all : 'a t -> key -> 'a list val replace : 'a t -> key -> 'a -> unit val mem : 'a t -> key -> bool - val iter : (key -> 'a -> unit) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val filter_map_inplace : (key -> 'a -> 'a option) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val fold : (key -> 'a -> 'b -> 'b) -> 'a t -> 'b -> 'b - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val length : 'a t -> int val stats : 'a t -> Hashtbl.statistics - 
val to_seq : 'a t -> (key * 'a) Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_keys : _ t -> key Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_values : 'a t -> 'a Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val add_seq : 'a t -> (key * 'a) Seq.t -> unit val replace_seq : 'a t -> (key * 'a) Seq.t -> unit val of_seq : (key * 'a) Seq.t -> 'a t @@ -102,7 +76,6 @@ module GenHashTable = struct val hash: int -> t -> int val equal: 'a container -> t -> equal val get_data: 'a container -> 'a option - val get_key: 'a container -> t option val set_key_data: 'a container -> t -> 'a -> unit val check_key: 'a container -> bool end) : SeededS with type key = H.t @@ -240,7 +213,7 @@ module GenHashTable = struct (** {!find} don't remove dead keys because it would be surprising for the user that a read-only function mutates the state (eg. concurrent - access). Same for {!iter}, {!fold}, {!mem}. + access). Same for {!mem}. 
*) let rec find_rec key hkey = function | Empty -> @@ -346,59 +319,6 @@ module GenHashTable = struct | Cons(_hk, _c, rest) -> mem_in_bucket rest in mem_in_bucket h.data.(key_index h hkey) - let iter f h = - let rec do_bucket = function - | Empty -> - () - | Cons(_, c, rest) -> - begin match H.get_key c, H.get_data c with - | None, _ | _, None -> () - | Some k, Some d -> f k d - end; do_bucket rest in - let d = h.data in - for i = 0 to Array.length d - 1 do - do_bucket d.(i) - done - - let fold f h init = - let rec do_bucket b accu = - match b with - Empty -> - accu - | Cons(_, c, rest) -> - let accu = begin match H.get_key c, H.get_data c with - | None, _ | _, None -> accu - | Some k, Some d -> f k d accu - end in - do_bucket rest accu in - let d = h.data in - let accu = ref init in - for i = 0 to Array.length d - 1 do - accu := do_bucket d.(i) !accu - done; - !accu - - let filter_map_inplace f h = - let rec do_bucket = function - | Empty -> - Empty - | Cons(hk, c, rest) -> - match H.get_key c, H.get_data c with - | None, _ | _, None -> - do_bucket rest - | Some k, Some d -> - match f k d with - | None -> - do_bucket rest - | Some new_d -> - H.set_key_data c k new_d; - Cons(hk, c, do_bucket rest) - in - let d = h.data in - for i = 0 to Array.length d - 1 do - d.(i) <- do_bucket d.(i) - done - let length h = h.size let rec bucket_length accu = function @@ -443,29 +363,6 @@ module GenHashTable = struct max_bucket_length = mbl; bucket_histogram = histo } - let to_seq tbl = - (* capture current array, so that even if the table is resized we - keep iterating on the same array *) - let tbl_data = tbl.data in - (* state: index * next bucket to traverse *) - let rec aux i buck () = match buck with - | Empty -> - if i = Array.length tbl_data - then Seq.Nil - else aux(i+1) tbl_data.(i) () - | Cons (_, c, next) -> - begin match H.get_key c, H.get_data c with - | None, _ | _, None -> aux i next () - | Some key, Some data -> - Seq.Cons ((key, data), aux i next) - end - in - aux 
0 Empty - - let to_seq_keys m = Seq.map fst (to_seq m) - - let to_seq_values m = Seq.map snd (to_seq m) - let add_seq tbl i = Seq.iter (fun (k,v) -> add tbl k v) i @@ -497,20 +394,12 @@ module K1 = struct let create () : ('k,'d) t = ObjEph.create 1 let get_key (t:('k,'d) t) : 'k option = obj_opt (ObjEph.get_key t 0) - let get_key_copy (t:('k,'d) t) : 'k option = obj_opt (ObjEph.get_key_copy t 0) let set_key (t:('k,'d) t) (k:'k) : unit = ObjEph.set_key t 0 (Obj.repr k) - let unset_key (t:('k,'d) t) : unit = ObjEph.unset_key t 0 let check_key (t:('k,'d) t) : bool = ObjEph.check_key t 0 - let blit_key (t1:('k,'d) t) (t2:('k,'d) t): unit = - ObjEph.blit_key t1 0 t2 0 1 - let get_data (t:('k,'d) t) : 'd option = obj_opt (ObjEph.get_data t) - let get_data_copy (t:('k,'d) t) : 'd option = obj_opt (ObjEph.get_data_copy t) let set_data (t:('k,'d) t) (d:'d) : unit = ObjEph.set_data t (Obj.repr d) let unset_data (t:('k,'d) t) : unit = ObjEph.unset_data t - let check_data (t:('k,'d) t) : bool = ObjEph.check_data t - let blit_data (t1:(_,'d) t) (t2:(_,'d) t) : unit = ObjEph.blit_data t1 t2 let make key data = let eph = create () in @@ -542,7 +431,6 @@ module K1 = struct | Some k' -> if H.equal k k' then GenHashTable.ETrue else GenHashTable.EFalse let get_data = get_data - let get_key = get_key let set_key_data c k d = unset_data c; set_key c k; @@ -603,37 +491,19 @@ module K2 = struct let create () : ('k1,'k2,'d) t = ObjEph.create 2 let get_key1 (t:('k1,'k2,'d) t) : 'k1 option = obj_opt (ObjEph.get_key t 0) - let get_key1_copy (t:('k1,'k2,'d) t) : 'k1 option = - obj_opt (ObjEph.get_key_copy t 0) let set_key1 (t:('k1,'k2,'d) t) (k:'k1) : unit = ObjEph.set_key t 0 (Obj.repr k) - let unset_key1 (t:('k1,'k2,'d) t) : unit = ObjEph.unset_key t 0 let check_key1 (t:('k1,'k2,'d) t) : bool = ObjEph.check_key t 0 let get_key2 (t:('k1,'k2,'d) t) : 'k2 option = obj_opt (ObjEph.get_key t 1) - let get_key2_copy (t:('k1,'k2,'d) t) : 'k2 option = - obj_opt (ObjEph.get_key_copy t 1) let set_key2 
(t:('k1,'k2,'d) t) (k:'k2) : unit = ObjEph.set_key t 1 (Obj.repr k) - let unset_key2 (t:('k1,'k2,'d) t) : unit = ObjEph.unset_key t 1 let check_key2 (t:('k1,'k2,'d) t) : bool = ObjEph.check_key t 1 - - let blit_key1 (t1:('k1,_,_) t) (t2:('k1,_,_) t) : unit = - ObjEph.blit_key t1 0 t2 0 1 - let blit_key2 (t1:(_,'k2,_) t) (t2:(_,'k2,_) t) : unit = - ObjEph.blit_key t1 1 t2 1 1 - let blit_key12 (t1:('k1,'k2,_) t) (t2:('k1,'k2,_) t) : unit = - ObjEph.blit_key t1 0 t2 0 2 - let get_data (t:('k1,'k2,'d) t) : 'd option = obj_opt (ObjEph.get_data t) - let get_data_copy (t:('k1,'k2,'d) t) : 'd option = - obj_opt (ObjEph.get_data_copy t) let set_data (t:('k1,'k2,'d) t) (d:'d) : unit = ObjEph.set_data t (Obj.repr d) let unset_data (t:('k1,'k2,'d) t) : unit = ObjEph.unset_data t - let check_data (t:('k1,'k2,'d) t) : bool = ObjEph.check_data t - let blit_data (t1:(_,_,'d) t) (t2:(_,_,'d) t) : unit = ObjEph.blit_data t1 t2 let make key1 key2 data = let eph = create () in @@ -674,10 +544,6 @@ module K2 = struct if H1.equal k1 k1' && H2.equal k2 k2' then GenHashTable.ETrue else GenHashTable.EFalse let get_data = get_data - let get_key c = - match get_key1 c, get_key2 c with - | None, _ | _ , None -> None - | Some k1', Some k2' -> Some (k1', k2') let set_key_data c (k1,k2) d = unset_data c; set_key1 c k1; set_key2 c k2; @@ -746,22 +612,13 @@ module Kn = struct let length (k:('k,'d) t) : int = ObjEph.length k let get_key (t:('k,'d) t) (n:int) : 'k option = obj_opt (ObjEph.get_key t n) - let get_key_copy (t:('k,'d) t) (n:int) : 'k option = - obj_opt (ObjEph.get_key_copy t n) let set_key (t:('k,'d) t) (n:int) (k:'k) : unit = ObjEph.set_key t n (Obj.repr k) - let unset_key (t:('k,'d) t) (n:int) : unit = ObjEph.unset_key t n let check_key (t:('k,'d) t) (n:int) : bool = ObjEph.check_key t n - let blit_key (t1:('k,'d) t) (o1:int) (t2:('k,'d) t) (o2:int) (l:int) : unit = - ObjEph.blit_key t1 o1 t2 o2 l - let get_data (t:('k,'d) t) : 'd option = obj_opt (ObjEph.get_data t) - let 
get_data_copy (t:('k,'d) t) : 'd option = obj_opt (ObjEph.get_data_copy t) let set_data (t:('k,'d) t) (d:'d) : unit = ObjEph.set_data t (Obj.repr d) let unset_data (t:('k,'d) t) : unit = ObjEph.unset_data t - let check_data (t:('k,'d) t) : bool = ObjEph.check_data t - let blit_data (t1:(_,'d) t) (t2:(_,'d) t) : unit = ObjEph.blit_data t1 t2 let make keys data = let l = Array.length keys in @@ -817,24 +674,6 @@ module Kn = struct in equal_array k c (len-1) let get_data = get_data - let get_key c = - let len = length c in - if len = 0 then Some [||] - else - match get_key c 0 with - | None -> None - | Some k0 -> - let rec fill a i = - if i < 1 then Some a - else - match get_key c i with - | None -> None - | Some ki -> - a.(i) <- ki; - fill a (i-1) - in - let a = Array.make len k0 in - fill a (len-1) let set_key_data c k d = unset_data c; for i=0 to Array.length k -1 do diff --git a/stdlib/ephemeron.mli b/stdlib/ephemeron.mli index 4d57aadc650d..f62ef34755fc 100644 --- a/stdlib/ephemeron.mli +++ b/stdlib/ephemeron.mli @@ -70,9 +70,6 @@ module type S = sig the bindings are weak, even if [mem h k] is true, a subsequent [find h k] may raise [Not_found] because the garbage collector can run between the two. - - Moreover, the table shouldn't be modified during a call to [iter]. - Use [filter_map_inplace] in this case. 
*) type key @@ -88,20 +85,8 @@ module type S = sig val find_all : 'a t -> key -> 'a list val replace : 'a t -> key -> 'a -> unit val mem : 'a t -> key -> bool - val iter : (key -> 'a -> unit) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val filter_map_inplace : (key -> 'a -> 'a option) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val fold : (key -> 'a -> 'b -> 'b) -> 'a t -> 'b -> 'b - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val length : 'a t -> int val stats : 'a t -> Hashtbl.statistics - val to_seq : 'a t -> (key * 'a) Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_keys : _ t -> key Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_values : 'a t -> 'a Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val add_seq : 'a t -> (key * 'a) Seq.t -> unit val replace_seq : 'a t -> (key * 'a) Seq.t -> unit val of_seq : (key * 'a) Seq.t -> 'a t @@ -133,20 +118,8 @@ module type SeededS = sig val find_all : 'a t -> key -> 'a list val replace : 'a t -> key -> 'a -> unit val mem : 'a t -> key -> bool - val iter : (key -> 'a -> unit) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val filter_map_inplace : (key -> 'a -> 'a option) -> 'a t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val fold : (key -> 'a -> 'b -> 'b) -> 'a t -> 'b -> 'b - [@@alert old_ephemeron_api "This function won't be available in 5.0"] val length : 'a t -> int val stats : 'a t -> Hashtbl.statistics - val to_seq : 'a t -> (key * 'a) Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_keys : _ t -> key Seq.t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - val to_seq_values : 'a t -> 'a Seq.t - [@@alert old_ephemeron_api "This function won't 
be available in 5.0"] val add_seq : 'a t -> (key * 'a) Seq.t -> unit val replace_seq : 'a t -> (key * 'a) Seq.t -> unit val of_seq : (key * 'a) Seq.t -> 'a t @@ -163,96 +136,6 @@ end module K1 : sig type ('k,'d) t (** an ephemeron with one key *) - val create: unit -> ('k,'d) t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.create ()] creates an ephemeron with one key. The - data and the key are empty *) - - val get_key: ('k,'d) t -> 'k option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.get_key eph] returns [None] if the key of [eph] is - empty, [Some x] (where [x] is the key) if it is full. *) - - val get_key_copy: ('k,'d) t -> 'k option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.get_key_copy eph] returns [None] if the key of [eph] is - empty, [Some x] (where [x] is a (shallow) copy of the key) if - it is full. This function has the same GC friendliness as {!Weak.get_copy} - - If the element is a custom block it is not copied. - *) - - val set_key: ('k,'d) t -> 'k -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.set_key eph el] sets the key of [eph] to be a - (full) key to [el] - *) - - val unset_key: ('k,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.unset_key eph el] sets the key of [eph] to be an - empty key. Since there is only one key, the ephemeron starts - behaving like a reference on the data. *) - - val check_key: ('k,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.check_key eph] returns [true] if the key of the [eph] - is full, [false] if it is empty. Note that even if - [Ephemeron.K1.check_key eph] returns [true], a subsequent - {!Ephemeron.K1.get_key}[eph] can return [None]. 
- *) - - - val blit_key : ('k,_) t -> ('k,_) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.blit_key eph1 eph2] sets the key of [eph2] with - the key of [eph1]. Contrary to using {!Ephemeron.K1.get_key} - followed by {!Ephemeron.K1.set_key} or {!Ephemeron.K1.unset_key} - this function does not prevent the incremental GC from erasing - the value in its current cycle. *) - - val get_data: ('k,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.get_data eph] returns [None] if the data of [eph] is - empty, [Some x] (where [x] is the data) if it is full. *) - - val get_data_copy: ('k,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.get_data_copy eph] returns [None] if the data of [eph] is - empty, [Some x] (where [x] is a (shallow) copy of the data) if - it is full. This function has the same GC friendliness as {!Weak.get_copy} - - If the element is a custom block it is not copied. - *) - - val set_data: ('k,'d) t -> 'd -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.set_data eph el] sets the data of [eph] to be a - (full) data to [el] - *) - - val unset_data: ('k,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.unset_data eph el] sets the key of [eph] to be an - empty key. The ephemeron starts behaving like a weak pointer. - *) - - val check_data: ('k,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.check_data eph] returns [true] if the data of the [eph] - is full, [false] if it is empty. Note that even if - [Ephemeron.K1.check_data eph] returns [true], a subsequent - {!Ephemeron.K1.get_data}[eph] can return [None]. 
- *) - - val blit_data : (_,'d) t -> (_,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** [Ephemeron.K1.blit_data eph1 eph2] sets the data of [eph2] with - the data of [eph1]. Contrary to using {!Ephemeron.K1.get_data} - followed by {!Ephemeron.K1.set_data} or {!Ephemeron.K1.unset_data} - this function does not prevent the incremental GC from erasing - the value in its current cycle. *) - val make : 'k -> 'd -> ('k,'d) t (** [Ephemeron.K1.make k d] creates an ephemeron with key [k] and data [d]. *) @@ -302,86 +185,6 @@ end module K2 : sig type ('k1,'k2,'d) t (** an ephemeron with two keys *) - val create: unit -> ('k1,'k2,'d) t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.create} *) - - val get_key1: ('k1,'k2,'d) t -> 'k1 option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key} *) - - val get_key1_copy: ('k1,'k2,'d) t -> 'k1 option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key_copy} *) - - val set_key1: ('k1,'k2,'d) t -> 'k1 -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.set_key} *) - - val unset_key1: ('k1,'k2,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.unset_key} *) - - val check_key1: ('k1,'k2,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.check_key} *) - - val get_key2: ('k1,'k2,'d) t -> 'k2 option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key} *) - - val get_key2_copy: ('k1,'k2,'d) t -> 'k2 option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key_copy} *) - - val set_key2: ('k1,'k2,'d) t -> 'k2 -> unit - [@@alert 
old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.set_key} *) - - val unset_key2: ('k1,'k2,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.unset_key} *) - - val check_key2: ('k1,'k2,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.check_key} *) - - val blit_key1: ('k1,_,_) t -> ('k1,_,_) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_key} *) - - val blit_key2: (_,'k2,_) t -> (_,'k2,_) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_key} *) - - val blit_key12: ('k1,'k2,_) t -> ('k1,'k2,_) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_key} *) - - val get_data: ('k1,'k2,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_data} *) - - val get_data_copy: ('k1,'k2,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_data_copy} *) - - val set_data: ('k1,'k2,'d) t -> 'd -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.set_data} *) - - val unset_data: ('k1,'k2,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.unset_data} *) - - val check_data: ('k1,'k2,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.check_data} *) - - val blit_data: ('k1,'k2,'d) t -> ('k1,'k2,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_data} *) - val make : 'k1 -> 'k2 -> 'd -> ('k1,'k2,'d) t (** Same as {!Ephemeron.K1.make} *) @@ 
-436,58 +239,6 @@ module Kn : sig type ('k,'d) t (** an ephemeron with an arbitrary number of keys of the same type *) - val create: int -> ('k,'d) t - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.create} *) - - val get_key: ('k,'d) t -> int -> 'k option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key} *) - - val get_key_copy: ('k,'d) t -> int -> 'k option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_key_copy} *) - - val set_key: ('k,'d) t -> int -> 'k -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.set_key} *) - - val unset_key: ('k,'d) t -> int -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.unset_key} *) - - val check_key: ('k,'d) t -> int -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.check_key} *) - - val blit_key: ('k,_) t -> int -> ('k,_) t -> int -> int -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_key} *) - - val get_data: ('k,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_data} *) - - val get_data_copy: ('k,'d) t -> 'd option - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.get_data_copy} *) - - val set_data: ('k,'d) t -> 'd -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.set_data} *) - - val unset_data: ('k,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.unset_data} *) - - val check_data: ('k,'d) t -> bool - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** 
Same as {!Ephemeron.K1.check_data} *) - - val blit_data: ('k,'d) t -> ('k,'d) t -> unit - [@@alert old_ephemeron_api "This function won't be available in 5.0"] - (** Same as {!Ephemeron.K1.blit_data} *) - val make : 'k array -> 'd -> ('k,'d) t (** Same as {!Ephemeron.K1.make} *) @@ -535,54 +286,3 @@ module Kn : sig end (** Ephemerons with arbitrary number of keys of the same type. *) - -module GenHashTable: sig - (** Define a hash table on generic containers which have a notion of - "death" and aliveness. If a binding is dead the hash table can - automatically remove it. *) - - [@@@alert old_ephemeron_api "This module won't be available in 5.0"] - - type equal = - | ETrue - | EFalse - | EDead (** the container is dead *) - - module MakeSeeded(H: - sig - type t - (** keys *) - - type 'a container - (** contains keys and the associated data *) - - val hash: int -> t -> int - (** same as {!Hashtbl.SeededHashedType} *) - - val equal: 'a container -> t -> equal - (** equality predicate used to compare a key with the one in a - container. Can return [EDead] if the keys in the container are - dead *) - - val create: t -> 'a -> 'a container - (** [create key data] creates a container from - some initials keys and one data *) - - val get_key: 'a container -> t option - (** [get_key cont] returns the keys if they are all alive *) - - val get_data: 'a container -> 'a option - (** [get_data cont] returns the data if it is alive *) - - val set_key_data: 'a container -> t -> 'a -> unit - (** [set_key_data cont] modifies the key and data *) - - val check_key: 'a container -> bool - (** [check_key cont] checks if all the keys contained in the data - are alive *) - end) : SeededS with type key = H.t - (** Functor building an implementation of an hash table that use the container - for keeping the information given *) - -end -(** Hash tables on generic containers with notion of death and aliveness. 
*) diff --git a/stdlib/filename.ml b/stdlib/filename.ml index a6f5692a8411..09410192ffc6 100644 --- a/stdlib/filename.ml +++ b/stdlib/filename.ml @@ -100,7 +100,9 @@ module Unix : SYSDEPS = struct && (String.length n < 2 || String.sub n 0 2 <> "./") && (String.length n < 3 || String.sub n 0 3 <> "../") let check_suffix name suff = - String.ends_with ~suffix:suff name + String.length name >= String.length suff && + String.sub name (String.length name - String.length suff) + (String.length suff) = suff let chop_suffix_opt ~suffix filename = let len_s = String.length suffix and len_f = String.length filename in @@ -326,13 +328,13 @@ let remove_extension name = external open_desc: string -> open_flag list -> int -> int = "caml_sys_open" external close_desc: int -> unit = "caml_sys_close" -let prng = lazy(Random.State.make_self_init ()) +let prng_key = Domain.DLS.new_key Random.State.make_self_init let temp_file_name temp_dir prefix suffix = - let rnd = (Random.State.bits (Lazy.force prng)) land 0xFFFFFF in + let random_state = Domain.DLS.get prng_key in + let rnd = (Random.State.bits random_state) land 0xFFFFFF in concat temp_dir (Printf.sprintf "%s%06x%s" prefix rnd suffix) - let current_temp_dir_name = ref temp_dir_name let set_temp_dir_name s = current_temp_dir_name := s diff --git a/stdlib/format.ml b/stdlib/format.ml index 78fc01e59ce8..31314d3e995c 100644 --- a/stdlib/format.ml +++ b/stdlib/format.ml @@ -1022,6 +1022,56 @@ let std_formatter = formatter_of_out_channel Stdlib.stdout and err_formatter = formatter_of_out_channel Stdlib.stderr and str_formatter = formatter_of_buffer stdbuf +(* Initialise domain local state *) +module DLS = Domain.DLS + +let stdbuf_key = DLS.new_key pp_make_buffer +let _ = DLS.set stdbuf_key stdbuf + +let str_formatter_key = DLS.new_key (fun () -> + formatter_of_buffer (DLS.get stdbuf_key)) +let _ = DLS.set str_formatter_key str_formatter + +let buffered_out_string key str ofs len = + Buffer.add_substring (Domain.DLS.get key) str ofs 
len + +let buffered_out_flush oc key () = + let buf = Domain.DLS.get key in + let len = Buffer.length buf in + let str = Buffer.contents buf in + output_substring oc str 0 len ; + Stdlib.flush oc; + Buffer.clear buf + +let std_buf_key = Domain.DLS.new_key (fun () -> Buffer.create pp_buffer_size) +let err_buf_key = Domain.DLS.new_key (fun () -> Buffer.create pp_buffer_size) + +let std_formatter_key = DLS.new_key (fun () -> + let ppf = + pp_make_formatter (buffered_out_string std_buf_key) + (buffered_out_flush Stdlib.stdout std_buf_key) ignore ignore ignore + in + ppf.pp_out_newline <- display_newline ppf; + ppf.pp_out_spaces <- display_blanks ppf; + ppf.pp_out_indent <- display_indent ppf; + ppf) +let _ = DLS.set std_formatter_key std_formatter + +let err_formatter_key = DLS.new_key (fun () -> + let ppf = + pp_make_formatter (buffered_out_string err_buf_key) + (buffered_out_flush Stdlib.stderr err_buf_key) ignore ignore ignore + in + ppf.pp_out_newline <- display_newline ppf; + ppf.pp_out_spaces <- display_blanks ppf; + ppf.pp_out_indent <- display_indent ppf; + ppf) +let _ = DLS.set err_formatter_key err_formatter + +let get_std_formatter () = DLS.get std_formatter_key +let get_err_formatter () = DLS.get err_formatter_key +let get_str_formatter () = DLS.get str_formatter_key +let get_stdbuf () = DLS.get stdbuf_key (* [flush_buffer_formatter buf ppf] flushes formatter [ppf], then returns the contents of buffer [buf] that is reset. @@ -1033,9 +1083,25 @@ let flush_buffer_formatter buf ppf = Buffer.reset buf; s - (* Flush [str_formatter] and get the contents of [stdbuf]. 
*) -let flush_str_formatter () = flush_buffer_formatter stdbuf str_formatter +let flush_str_formatter () = + let stdbuf = DLS.get stdbuf_key in + let str_formatter = DLS.get str_formatter_key in + flush_buffer_formatter stdbuf str_formatter + +let make_synchronized_formatter output flush = + DLS.new_key (fun () -> + let buf = Buffer.create pp_buffer_size in + let output' = Buffer.add_substring buf in + let flush' () = + output (Buffer.contents buf) 0 (Buffer.length buf); + Buffer.clear buf; + flush () + in + make_formatter output' flush') + +let synchronized_formatter_of_out_channel oc = + make_synchronized_formatter (output_substring oc) (fun () -> flush oc) (* Symbolic pretty-printing @@ -1104,83 +1170,85 @@ let formatter_of_symbolic_output_buffer sob = *) -let open_hbox = pp_open_hbox std_formatter -and open_vbox = pp_open_vbox std_formatter -and open_hvbox = pp_open_hvbox std_formatter -and open_hovbox = pp_open_hovbox std_formatter -and open_box = pp_open_box std_formatter -and close_box = pp_close_box std_formatter -and open_tag = pp_open_tag std_formatter -and close_tag = pp_close_tag std_formatter -and open_stag = pp_open_stag std_formatter -and close_stag = pp_close_stag std_formatter -and print_as = pp_print_as std_formatter -and print_string = pp_print_string std_formatter -and print_bytes = pp_print_bytes std_formatter -and print_int = pp_print_int std_formatter -and print_float = pp_print_float std_formatter -and print_char = pp_print_char std_formatter -and print_bool = pp_print_bool std_formatter -and print_break = pp_print_break std_formatter -and print_cut = pp_print_cut std_formatter -and print_space = pp_print_space std_formatter -and force_newline = pp_force_newline std_formatter -and print_flush = pp_print_flush std_formatter -and print_newline = pp_print_newline std_formatter -and print_if_newline = pp_print_if_newline std_formatter - -and open_tbox = pp_open_tbox std_formatter -and close_tbox = pp_close_tbox std_formatter -and print_tbreak = 
pp_print_tbreak std_formatter - -and set_tab = pp_set_tab std_formatter -and print_tab = pp_print_tab std_formatter - -and set_margin = pp_set_margin std_formatter -and get_margin = pp_get_margin std_formatter - -and set_max_indent = pp_set_max_indent std_formatter -and get_max_indent = pp_get_max_indent std_formatter - -and set_geometry = pp_set_geometry std_formatter -and safe_set_geometry = pp_safe_set_geometry std_formatter -and get_geometry = pp_get_geometry std_formatter -and update_geometry = pp_update_geometry std_formatter - -and set_max_boxes = pp_set_max_boxes std_formatter -and get_max_boxes = pp_get_max_boxes std_formatter -and over_max_boxes = pp_over_max_boxes std_formatter - -and set_ellipsis_text = pp_set_ellipsis_text std_formatter -and get_ellipsis_text = pp_get_ellipsis_text std_formatter - -and set_formatter_out_channel = - pp_set_formatter_out_channel std_formatter - -and set_formatter_out_functions = - pp_set_formatter_out_functions std_formatter -and get_formatter_out_functions = - pp_get_formatter_out_functions std_formatter - -and set_formatter_output_functions = - pp_set_formatter_output_functions std_formatter -and get_formatter_output_functions = - pp_get_formatter_output_functions std_formatter - -and set_formatter_stag_functions = - pp_set_formatter_stag_functions std_formatter -and get_formatter_stag_functions = - pp_get_formatter_stag_functions std_formatter -and set_print_tags = - pp_set_print_tags std_formatter -and get_print_tags = - pp_get_print_tags std_formatter -and set_mark_tags = - pp_set_mark_tags std_formatter -and get_mark_tags = - pp_get_mark_tags std_formatter -and set_tags = - pp_set_tags std_formatter +let open_hbox v = pp_open_hbox (DLS.get std_formatter_key) v +and open_vbox v = pp_open_vbox (DLS.get std_formatter_key) v +and open_hvbox v = pp_open_hvbox (DLS.get std_formatter_key) v +and open_hovbox v = pp_open_hovbox (DLS.get std_formatter_key) v +and open_box v = pp_open_box (DLS.get std_formatter_key) v +and 
close_box v = pp_close_box (DLS.get std_formatter_key) v +and open_tag v = pp_open_tag (DLS.get std_formatter_key) v +and close_tag v = pp_close_tag (DLS.get std_formatter_key) v +and open_stag v = pp_open_stag (DLS.get std_formatter_key) v +and close_stag v = pp_close_stag (DLS.get std_formatter_key) v +and print_as v w = pp_print_as (DLS.get std_formatter_key) v w +and print_string v = pp_print_string (DLS.get std_formatter_key) v +and print_bytes v = pp_print_bytes (DLS.get std_formatter_key) v +and print_int v = pp_print_int (DLS.get std_formatter_key) v +and print_float v = pp_print_float (DLS.get std_formatter_key) v +and print_char v = pp_print_char (DLS.get std_formatter_key) v +and print_bool v = pp_print_bool (DLS.get std_formatter_key) v +and print_break v w = pp_print_break (DLS.get std_formatter_key) v w +and print_cut v = pp_print_cut (DLS.get std_formatter_key) v +and print_space v = pp_print_space (DLS.get std_formatter_key) v +and force_newline v = pp_force_newline (DLS.get std_formatter_key) v +and print_flush v = pp_print_flush (DLS.get std_formatter_key) v +and print_newline v = pp_print_newline (DLS.get std_formatter_key) v +and print_if_newline v = pp_print_if_newline (DLS.get std_formatter_key) v + +and open_tbox v = pp_open_tbox (DLS.get std_formatter_key) v +and close_tbox v = pp_close_tbox (DLS.get std_formatter_key) v +and print_tbreak v w = pp_print_tbreak (DLS.get std_formatter_key) v w + +and set_tab v = pp_set_tab (DLS.get std_formatter_key) v +and print_tab v = pp_print_tab (DLS.get std_formatter_key) v + +and set_margin v = pp_set_margin (DLS.get std_formatter_key) v +and get_margin v = pp_get_margin (DLS.get std_formatter_key) v + +and set_max_indent v = pp_set_max_indent (DLS.get std_formatter_key) v +and get_max_indent v = pp_get_max_indent (DLS.get std_formatter_key) v + +and set_geometry ~max_indent ~margin = + pp_set_geometry (DLS.get std_formatter_key) ~max_indent ~margin +and safe_set_geometry ~max_indent ~margin = + 
pp_safe_set_geometry (DLS.get std_formatter_key) ~max_indent ~margin +and get_geometry v = pp_get_geometry (DLS.get std_formatter_key) v +and update_geometry v = pp_update_geometry (DLS.get std_formatter_key) v + +and set_max_boxes v = pp_set_max_boxes (DLS.get std_formatter_key) v +and get_max_boxes v = pp_get_max_boxes (DLS.get std_formatter_key) v +and over_max_boxes v = pp_over_max_boxes (DLS.get std_formatter_key) v + +and set_ellipsis_text v = pp_set_ellipsis_text (DLS.get std_formatter_key) v +and get_ellipsis_text v = pp_get_ellipsis_text (DLS.get std_formatter_key) v + +and set_formatter_out_channel v = + pp_set_formatter_out_channel (DLS.get std_formatter_key) v + +and set_formatter_out_functions v = + pp_set_formatter_out_functions (DLS.get std_formatter_key) v +and get_formatter_out_functions v = + pp_get_formatter_out_functions (DLS.get std_formatter_key) v + +and set_formatter_output_functions v w = + pp_set_formatter_output_functions (DLS.get std_formatter_key) v w +and get_formatter_output_functions v = + pp_get_formatter_output_functions (DLS.get std_formatter_key) v + +and set_formatter_stag_functions v = + pp_set_formatter_stag_functions (DLS.get std_formatter_key) v +and get_formatter_stag_functions v = + pp_get_formatter_stag_functions (DLS.get std_formatter_key) v +and set_print_tags v = + pp_set_print_tags (DLS.get std_formatter_key) v +and get_print_tags v = + pp_get_print_tags (DLS.get std_formatter_key) v +and set_mark_tags v = + pp_set_mark_tags (DLS.get std_formatter_key) v +and get_mark_tags v = + pp_get_mark_tags (DLS.get std_formatter_key) v +and set_tags v = + pp_set_tags (DLS.get std_formatter_key) v (* Convenience functions *) @@ -1363,8 +1431,16 @@ let ifprintf _ppf (Format (fmt, _)) = make_iprintf ignore () fmt let fprintf ppf = kfprintf ignore ppf -let printf fmt = fprintf std_formatter fmt -let eprintf fmt = fprintf err_formatter fmt + +let printf (Format (fmt, _)) = + make_printf + (fun acc -> output_acc (DLS.get 
std_formatter_key) acc) + End_of_acc fmt + +let eprintf (Format (fmt, _)) = + make_printf + (fun acc -> output_acc (DLS.get err_formatter_key) acc) + End_of_acc fmt let kdprintf k (Format (fmt, _)) = make_printf @@ -1398,11 +1474,26 @@ let asprintf fmt = kasprintf id fmt (* Flushing standard formatters at end of execution. *) let flush_standard_formatters () = - pp_print_flush std_formatter (); - pp_print_flush err_formatter () + pp_print_flush (DLS.get std_formatter_key) (); + pp_print_flush (DLS.get err_formatter_key) () let () = at_exit flush_standard_formatters +let () = Domain.at_first_spawn (fun () -> + flush_standard_formatters (); + + let fs = pp_get_formatter_out_functions std_formatter () in + pp_set_formatter_out_functions std_formatter + {fs with out_string = buffered_out_string std_buf_key; + out_flush = buffered_out_flush Stdlib.stdout std_buf_key}; + + let fs = pp_get_formatter_out_functions err_formatter () in + pp_set_formatter_out_functions err_formatter + {fs with out_string = buffered_out_string err_buf_key; + out_flush = buffered_out_flush Stdlib.stderr err_buf_key}; + + Domain.at_exit flush_standard_formatters) + (* Deprecated stuff. @@ -1423,13 +1514,13 @@ let pp_get_all_formatter_output_functions state () = (* Deprecated : subsumed by set_formatter_out_functions *) -let set_all_formatter_output_functions = - pp_set_all_formatter_output_functions std_formatter +let set_all_formatter_output_functions ~out = + pp_set_all_formatter_output_functions (DLS.get std_formatter_key) ~out (* Deprecated : subsumed by get_formatter_out_functions *) -let get_all_formatter_output_functions = - pp_get_all_formatter_output_functions std_formatter +let get_all_formatter_output_functions () = + pp_get_all_formatter_output_functions (DLS.get std_formatter_key) () (* Deprecated : error prone function, do not use it. 
@@ -1480,7 +1571,7 @@ let pp_get_formatter_tag_functions fmt () = let print_close_tag s = funs.print_close_stag (String_tag s) in {mark_open_tag; mark_close_tag; print_open_tag; print_close_tag} -let set_formatter_tag_functions = - pp_set_formatter_tag_functions std_formatter -and get_formatter_tag_functions = - pp_get_formatter_tag_functions std_formatter +let set_formatter_tag_functions v = + pp_set_formatter_tag_functions (DLS.get std_formatter_key) v +and get_formatter_tag_functions () = + pp_get_formatter_tag_functions (DLS.get std_formatter_key) () diff --git a/stdlib/format.mli b/stdlib/format.mli index ae8a381968ee..09b11cbe2859 100644 --- a/stdlib/format.mli +++ b/stdlib/format.mli @@ -30,15 +30,27 @@ - {!std_formatter} outputs to {{!Stdlib.stdout}stdout} - {!err_formatter} outputs to {{!Stdlib.stderr}stderr} - Most functions in the {!Format} module come in two variants: - a short version that operates on {!std_formatter} and the - generic version prefixed by [pp_] that takes a formatter - as its first argument. + Most functions in the {!Format} module come in two variants: a short version + that operates on the current domain's standard formatter as obtained using + {!get_std_formatter} and the generic version prefixed by [pp_] that takes a + formatter as its first argument. For the version that operates on the + current domain's standard formatter, the call to {!get_std_formatter} is + delayed until the last argument is received. More formatters can be created with {!formatter_of_out_channel}, - {!formatter_of_buffer}, {!formatter_of_symbolic_output_buffer} - or using {{!section:formatter}custom formatters}. + {!formatter_of_buffer}, {!formatter_of_symbolic_output_buffer} or using + {{!section:formatter}custom formatters}. + Warning: Since {{!section:formatter}formatters} contain mutable state, it is + not thread-safe to use the same formatter on multiple domains in parallel + without synchronization. 
+ + If multiple domains write to the same output channel using the + predefined formatters (as obtained by {!get_std_formatter} or + {!get_err_formatter}), the output from the domains will be interleaved with + each other at points where the formatters are flushed, such as with + {!print_flush}. This synchronization is not performed by formatters obtained + from {!formatter_of_out_channel} (on the standard out channels or others). *) (** {1 Introduction} @@ -944,21 +956,43 @@ val get_formatter_stag_functions : unit -> formatter_stag_functions val formatter_of_out_channel : out_channel -> formatter (** [formatter_of_out_channel oc] returns a new formatter writing - to the corresponding output channel [oc]. + to the corresponding output channel [oc]. +*) + +val synchronized_formatter_of_out_channel : + out_channel -> formatter Domain.DLS.key +(** [synchronized_formatter_of_out_channel oc] returns the key to the + domain-local state that holds the domain-local formatter for writing to the + corresponding output channel [oc]. + + When the formatter is used with multiple domains, the output from the + domains will be interleaved with each other at points where the formatter + is flushed, such as with {!print_flush}. *) + val std_formatter : formatter -(** The standard formatter to write to standard output. +(** The initial domain's standard formatter to write to standard output. It is defined as {!formatter_of_out_channel} {!Stdlib.stdout}. *) +val get_std_formatter : unit -> formatter +(** [get_std_formatter ()] returns the current domain's standard formatter used + to write to standard output. +*) + val err_formatter : formatter -(** A formatter to write to standard error. +(** The initial domain's formatter to write to standard error. It is defined as {!formatter_of_out_channel} {!Stdlib.stderr}. *) +val get_err_formatter : unit -> formatter +(* [get_err_formatter ()] returns the current domain's formatter used to write + to standard error. 
+*) + val formatter_of_buffer : Buffer.t -> formatter (** [formatter_of_buffer b] returns a new formatter writing to buffer [b]. At the end of pretty-printing, the formatter must be flushed @@ -967,17 +1001,26 @@ val formatter_of_buffer : Buffer.t -> formatter *) val stdbuf : Buffer.t -(** The string buffer in which [str_formatter] writes. *) +(** The initial domain's string buffer in which [str_formatter] writes. *) + +val get_stdbuf : unit -> Buffer.t +(** [get_stdbuf ()] returns the current domain's string buffer in which the + current domain's string formatter writes. *) val str_formatter : formatter -(** A formatter to output to the {!stdbuf} string buffer. +(** The initial domain's formatter to output to the {!stdbuf} string buffer. [str_formatter] is defined as {!formatter_of_buffer} {!stdbuf}. *) +val get_str_formatter : unit -> formatter +(** The current domain's formatter to output to the current domains string + buffer. +*) + val flush_str_formatter : unit -> string -(** Returns the material printed with [str_formatter], flushes - the formatter and resets the corresponding buffer. +(** Returns the material printed with [str_formatter] of the current domain, + flushes the formatter and resets the corresponding buffer. *) val make_formatter : @@ -992,6 +1035,17 @@ val make_formatter : returns a formatter to the {!Stdlib.out_channel} [oc]. *) +val make_synchronized_formatter : + (string -> int -> int -> unit) -> (unit -> unit) -> formatter Domain.DLS.key +(** [make_synchronized_formatter out flush] returns the key to the domain-local + state that holds the domain-local formatter that outputs with function + [out], and flushes with function [flush]. + + When the formatter is used with multiple domains, the output from the + domains will be interleaved with each other at points where the formatter + is flushed, such as with {!print_flush}. 
+*) + val formatter_of_out_functions : formatter_out_functions -> formatter (** [formatter_of_out_functions out_funs] returns a new formatter that writes @@ -1003,6 +1057,8 @@ val formatter_of_out_functions : @since 4.06.0 *) + + (** {2:symbolic Symbolic pretty-printing} *) (** @@ -1237,10 +1293,24 @@ val fprintf : formatter -> ('a, formatter, unit) format -> 'a *) val printf : ('a, formatter, unit) format -> 'a -(** Same as [fprintf] above, but output on [std_formatter]. *) +(** Same as [fprintf] above, but output on [get_std_formatter ()]. + + It is defined similarly to [fun fmt -> fprintf (get_std_formatter ()) fmt] + but delays calling [get_std_formatter] until after the final argument + required by the [format] is received. When used with multiple domains, the + output from the domains will be interleaved with each other at points where + the formatter is flushed, such as with {!print_flush}. +*) val eprintf : ('a, formatter, unit) format -> 'a -(** Same as [fprintf] above, but output on [err_formatter]. *) +(** Same as [fprintf] above, but output on [get_err_formatter ()]. + + It is defined similarly to [fun fmt -> fprintf (get_err_formatter ()) fmt] + but delays calling [get_err_formatter] until after the final argument + required by the [format] is received. When used with multiple domains, the + output from the domains will be interleaved with each other at points where + the formatter is flushed, such as with {!print_flush}. 
+*) val sprintf : ('a, unit, string) format -> 'a (** Same as [printf] above, but instead of printing on a formatter, diff --git a/stdlib/gc.ml b/stdlib/gc.ml index b4fc555b7fcb..64b1d8aff64b 100644 --- a/stdlib/gc.ml +++ b/stdlib/gc.ml @@ -61,9 +61,6 @@ external major : unit -> unit = "caml_gc_major" external full_major : unit -> unit = "caml_gc_full_major" external compact : unit -> unit = "caml_gc_compaction" external get_minor_free : unit -> int = "caml_get_minor_free" -external get_bucket : int -> int = "caml_get_major_bucket" [@@noalloc] -external get_credit : unit -> int = "caml_get_major_credit" [@@noalloc] -external huge_fallback_count : unit -> int = "caml_gc_huge_fallback_count" external eventlog_pause : unit -> unit = "caml_eventlog_pause" external eventlog_resume : unit -> unit = "caml_eventlog_resume" diff --git a/stdlib/gc.mli b/stdlib/gc.mli index c073fcc2aed2..22acefd78b13 100644 --- a/stdlib/gc.mli +++ b/stdlib/gc.mli @@ -91,7 +91,8 @@ type stat = (** Number of forced full major collections completed since the program was started. @since 4.12.0 *) } -(** The memory management counters are returned in a [stat] record. +(** The memory management counters are returned in a [stat] record. These + counters give values for the whole program. The total amount of memory allocated by the program since it was started is (in words) [minor_words + major_words - promoted_words]. Multiply by @@ -103,7 +104,9 @@ type control = { mutable minor_heap_size : int; [@ocaml.deprecated_mutable "Use {(Gc.get()) with Gc.minor_heap_size = ...}"] (** The size (in words) of the minor heap. Changing - this parameter will trigger a minor collection. Default: 256k. *) + this parameter will trigger a minor collection. The total size of the + minor heap used by this program will be this number multiplied by the + number of active domains. Default: 256k. 
*) mutable major_heap_increment : int; [@ocaml.deprecated_mutable @@ -156,9 +159,8 @@ type control = mutable stack_limit : int; [@ocaml.deprecated_mutable "Use {(Gc.get()) with Gc.stack_limit = ...}"] - (** The maximum size of the stack (in words). This is only - relevant to the byte-code runtime, as the native code runtime - uses the operating system's stack. Default: 1024k. *) + (** The maximum size of the fiber stacks (in words). + Default: 1024k. *) mutable allocation_policy : int; [@ocaml.deprecated_mutable @@ -245,24 +247,27 @@ type control = external stat : unit -> stat = "caml_gc_stat" (** Return the current values of the memory management counters in a - [stat] record. This function examines every heap block to get the - statistics. *) + [stat] record that represent the program's total memory stats. + This function causes a full major collection. *) external quick_stat : unit -> stat = "caml_gc_quick_stat" (** Same as [stat] except that [live_words], [live_blocks], [free_words], - [free_blocks], [largest_free], and [fragments] are set to 0. This - function is much faster than [stat] because it does not need to go - through the heap. *) + [free_blocks], [largest_free], and [fragments] are set to 0. Due to + per-domain buffers it may only represent the state of the program's + total memory usage since the last minor collection. This function is + much faster than [stat] because it does not need to trigger a full + major collection. *) external counters : unit -> float * float * float = "caml_gc_counters" -(** Return [(minor_words, promoted_words, major_words)]. This function - is as fast as [quick_stat]. *) +(** Return [(minor_words, promoted_words, major_words)] for the current + domain or potentially previous domains. This function is as fast as + [quick_stat]. *) external minor_words : unit -> (float [@unboxed]) = "caml_gc_minor_words" "caml_gc_minor_words_unboxed" -(** Number of words allocated in the minor heap since the program was - started. 
This number is accurate in byte-code programs, but only an - approximation in programs compiled to native code. +(** Number of words allocated in the minor heap by this domain or potentially + previous domains. This number is accurate in byte-code programs, but + only an approximation in programs compiled to native code. In native code this function does not allocate. @@ -300,38 +305,19 @@ external compact : unit -> unit = "caml_gc_compaction" val print_stat : out_channel -> unit (** Print the current values of the memory management counters (in - human-readable form) into the channel argument. *) + human-readable form) of the total program into the channel argument. *) val allocated_bytes : unit -> float -(** Return the total number of bytes allocated since the program was - started. It is returned as a [float] to avoid overflow problems +(** Return the number of bytes allocated by this domain and potentially + a previous domain. It is returned as a [float] to avoid overflow problems with [int] on 32-bit machines. *) external get_minor_free : unit -> int = "caml_get_minor_free" -(** Return the current size of the free space inside the minor heap. +(** Return the current size of the free space inside the minor heap of this + domain. @since 4.03.0 *) -external get_bucket : int -> int = "caml_get_major_bucket" [@@noalloc] -(** [get_bucket n] returns the current size of the [n]-th future bucket - of the GC smoothing system. The unit is one millionth of a full GC. - @raise Invalid_argument if [n] is negative, return 0 if n is larger - than the smoothing window. - - @since 4.03.0 *) - -external get_credit : unit -> int = "caml_get_major_credit" [@@noalloc] -(** [get_credit ()] returns the current size of the "work done in advance" - counter of the GC smoothing system. The unit is one millionth of a - full GC. 
- - @since 4.03.0 *) - -external huge_fallback_count : unit -> int = "caml_gc_huge_fallback_count" -(** Return the number of times we tried to map huge pages and had to fall - back to small pages. This is always 0 if [OCAMLRUNPARAM] contains [H=1]. - @since 4.03.0 *) - val finalise : ('a -> unit) -> 'a -> unit (** [finalise f v] registers [f] as a finalisation function for [v]. [v] must be heap-allocated. [f] will be called with [v] as diff --git a/stdlib/hashtbl.ml b/stdlib/hashtbl.ml index e37936b2f0ff..7006bc4a4b69 100644 --- a/stdlib/hashtbl.ml +++ b/stdlib/hashtbl.ml @@ -57,7 +57,7 @@ let randomized = ref randomized_default let randomize () = randomized := true let is_randomized () = !randomized -let prng = lazy (Random.State.make_self_init()) +let prng_key = Domain.DLS.new_key Random.State.make_self_init (* Functions which appear before the functorial interface must either be independent of the hash function or take it as a parameter (see #2202 and @@ -72,7 +72,9 @@ let rec power_2_above x n = let create ?(random = !randomized) initial_size = let s = power_2_above 16 initial_size in - let seed = if random then Random.State.bits (Lazy.force prng) else 0 in + let seed = + if random then Random.State.bits (Domain.DLS.get prng_key) else 0 + in { initial_size = s; size = 0; seed = seed; data = Array.make s Empty } let clear h = @@ -617,7 +619,7 @@ let of_seq i = let rebuild ?(random = !randomized) h = let s = power_2_above 16 (Array.length h.data) in let seed = - if random then Random.State.bits (Lazy.force prng) + if random then Random.State.bits (Domain.DLS.get prng_key) else if Obj.size (Obj.repr h) >= 4 then h.seed else 0 in let h' = { diff --git a/stdlib/lazy.ml b/stdlib/lazy.ml index 4bfba47e9a7c..ebc979b4cb10 100644 --- a/stdlib/lazy.ml +++ b/stdlib/lazy.ml @@ -50,13 +50,10 @@ type 'a t = 'a CamlinternalLazy.t exception Undefined = CamlinternalLazy.Undefined - external make_forward : 'a -> 'a lazy_t = "caml_lazy_make_forward" - external force : 'a t -> 'a = 
"%lazy_force" - -let force_val = CamlinternalLazy.force_val +let force_val l = CamlinternalLazy.force_gen ~only_val:true l let from_fun (f : unit -> 'arg) = let x = Obj.new_block Obj.lazy_tag 1 in @@ -65,13 +62,13 @@ let from_fun (f : unit -> 'arg) = let from_val (v : 'arg) = let t = Obj.tag (Obj.repr v) in - if t = Obj.forward_tag || t = Obj.lazy_tag || t = Obj.double_tag then begin + if t = Obj.forward_tag || t = Obj.lazy_tag || + t = Obj.forcing_tag || t = Obj.double_tag then begin make_forward v end else begin (Obj.magic v : 'arg t) end - let is_val (l : 'arg t) = Obj.tag (Obj.repr l) <> Obj.lazy_tag let lazy_from_fun = from_fun @@ -80,7 +77,6 @@ let lazy_from_val = from_val let lazy_is_val = is_val - let map f x = lazy (f (force x)) diff --git a/stdlib/lazy.mli b/stdlib/lazy.mli index 9ac748b2d84e..1639472d443b 100644 --- a/stdlib/lazy.mli +++ b/stdlib/lazy.mli @@ -16,54 +16,59 @@ (** Deferred computations. *) type 'a t = 'a CamlinternalLazy.t -(** A value of type ['a Lazy.t] is a deferred computation, called - a suspension, that has a result of type ['a]. The special - expression syntax [lazy (expr)] makes a suspension of the - computation of [expr], without computing [expr] itself yet. - "Forcing" the suspension will then compute [expr] and return its - result. Matching a suspension with the special pattern syntax - [lazy(pattern)] also computes the underlying expression and - tries to bind it to [pattern]: - - {[ - let lazy_option_map f x = - match x with - | lazy (Some x) -> Some (Lazy.force f x) - | _ -> None - ]} - - Note: If lazy patterns appear in multiple cases in a pattern-matching, - lazy expressions may be forced even outside of the case ultimately selected - by the pattern matching. In the example above, the suspension [x] is always - computed. - - - Note: [lazy_t] is the built-in type constructor used by the compiler - for the [lazy] keyword. You should not use it directly. Always use - [Lazy.t] instead. - - Note: [Lazy.force] is not thread-safe. 
If you use this module in - a multi-threaded program, you will need to add some locks. - - Note: if the program is compiled with the [-rectypes] option, - ill-founded recursive definitions of the form [let rec x = lazy x] - or [let rec x = lazy(lazy(...(lazy x)))] are accepted by the type-checker - and lead, when forced, to ill-formed values that trigger infinite - loops in the garbage collector and other parts of the run-time system. - Without the [-rectypes] option, such ill-founded recursive definitions - are rejected by the type-checker. +(** A value of type ['a Lazy.t] is a deferred computation, called a suspension, + that has a result of type ['a]. The special expression syntax [lazy (expr)] + makes a suspension of the computation of [expr], without computing + [expr] itself yet. "Forcing" the suspension will then compute [expr] and + return its result. Matching a suspension with the special pattern syntax + [lazy(pattern)] also computes the underlying expression and tries to bind + it to [pattern]: + + {[ + let lazy_option_map f x = + match x with + | lazy (Some x) -> Some (Lazy.force f x) + | _ -> None + ]} + + Note: If lazy patterns appear in multiple cases in a pattern-matching, lazy + expressions may be forced even outside of the case ultimately selected by + the pattern matching. In the example above, the suspension [x] is always + computed. + + Note: [lazy_t] is the built-in type constructor used by the compiler for the + [lazy] keyword. You should not use it directly. Always use [Lazy.t] + instead. + + Note: [Lazy.force] is not concurrency-safe. If you use this module with + multiple fibers, systhreads or domains, then you will need to add some + locks. The module however ensures memory-safety, and hence, concurrently + accessing this module will not lead to a crash but the behaviour is + unspecified. 
+ + Note: if the program is compiled with the [-rectypes] option, + ill-founded recursive definitions of the form [let rec x = lazy x] + or [let rec x = lazy(lazy(...(lazy x)))] are accepted by the type-checker + and lead, when forced, to ill-formed values that trigger infinite + loops in the garbage collector and other parts of the run-time system. + Without the [-rectypes] option, such ill-founded recursive definitions + are rejected by the type-checker. *) exception Undefined +(** Raised when forcing a suspension concurrently from multiple fibers, + systhreads or domains, or when the suspension tries to force itself + recursively. +*) external force : 'a t -> 'a = "%lazy_force" -(** [force x] forces the suspension [x] and returns its result. - If [x] has already been forced, [Lazy.force x] returns the - same value again without recomputing it. If it raised an exception, - the same exception is raised again. - @raise Undefined if the forcing of [x] tries to force [x] itself - recursively. +(** [force x] forces the suspension [x] and returns its result. If [x] has + already been forced, [Lazy.force x] returns the same value again without + recomputing it. If it raised an exception, the same exception is raised + again. + + @raise Undefined (see {!Undefined}). *) (** {1 Iterators} *) @@ -123,16 +128,17 @@ val from_fun : (unit -> 'a) -> 'a t @since 4.00.0 *) val force_val : 'a t -> 'a -(** [force_val x] forces the suspension [x] and returns its - result. If [x] has already been forced, [force_val x] - returns the same value again without recomputing it. +(** [force_val x] forces the suspension [x] and returns its result. If [x] + has already been forced, [force_val x] returns the same value again + without recomputing it. If the computation of [x] raises an exception, it is unspecified whether [force_val x] raises the same exception or {!Undefined}. @raise Undefined if the forcing of [x] tries to force [x] itself recursively. 
-*) + @raise Undefined (see {!Undefined}). +*) (** {1 Deprecated} *) diff --git a/otherlibs/systhreads/mutex.ml b/stdlib/mutex.ml similarity index 84% rename from otherlibs/systhreads/mutex.ml rename to stdlib/mutex.ml index 836109e761bb..01661b63313f 100644 --- a/otherlibs/systhreads/mutex.ml +++ b/stdlib/mutex.ml @@ -14,7 +14,7 @@ (**************************************************************************) type t -external create: unit -> t = "caml_mutex_new" -external lock: t -> unit = "caml_mutex_lock" -external try_lock: t -> bool = "caml_mutex_try_lock" -external unlock: t -> unit = "caml_mutex_unlock" +external create: unit -> t = "caml_ml_mutex_new" +external lock: t -> unit = "caml_ml_mutex_lock" +external try_lock: t -> bool = "caml_ml_mutex_try_lock" +external unlock: t -> unit = "caml_ml_mutex_unlock" diff --git a/otherlibs/systhreads/mutex.mli b/stdlib/mutex.mli similarity index 100% rename from otherlibs/systhreads/mutex.mli rename to stdlib/mutex.mli diff --git a/stdlib/obj.ml b/stdlib/obj.ml index ee72b5759432..e4477d19ee2d 100644 --- a/stdlib/obj.ml +++ b/stdlib/obj.ml @@ -30,6 +30,9 @@ external size : t -> int = "%obj_size" external reachable_words : t -> int = "caml_obj_reachable_words" external field : t -> int -> t = "%obj_field" external set_field : t -> int -> t -> unit = "%obj_set_field" +external compare_and_swap_field : t -> int -> t -> t -> bool + = "caml_obj_compare_and_swap" +external is_shared : t -> bool = "caml_obj_is_shared" external floatarray_get : floatarray -> int -> float = "caml_floatarray_get" external floatarray_set : floatarray -> int -> float -> unit = "caml_floatarray_set" @@ -47,8 +50,10 @@ external add_offset : t -> Int32.t -> t = "caml_obj_add_offset" external with_tag : int -> t -> t = "caml_obj_with_tag" let first_non_constant_constructor_tag = 0 -let last_non_constant_constructor_tag = 245 +let last_non_constant_constructor_tag = 243 +let forcing_tag = 244 +let cont_tag = 245 let lazy_tag = 246 let closure_tag = 247 
let object_tag = 248 diff --git a/stdlib/obj.mli b/stdlib/obj.mli index 7a5260b845e3..afb08e5da767 100644 --- a/stdlib/obj.mli +++ b/stdlib/obj.mli @@ -57,6 +57,9 @@ external field : t -> int -> t = "%obj_field" be propagated. *) external set_field : t -> int -> t -> unit = "%obj_set_field" +external compare_and_swap_field : t -> int -> t -> t -> bool + = "caml_obj_compare_and_swap" +external is_shared : t -> bool = "caml_obj_is_shared" external set_tag : t -> int -> unit = "caml_obj_set_tag" [@@ocaml.deprecated "Use with_tag instead."] @@ -82,6 +85,8 @@ external with_tag : int -> t -> t = "caml_obj_with_tag" val first_non_constant_constructor_tag : int val last_non_constant_constructor_tag : int +val forcing_tag : int +val cont_tag : int val lazy_tag : int val closure_tag : int val object_tag : int diff --git a/stdlib/random.ml b/stdlib/random.ml index 708403eed928..791ef968b9a8 100644 --- a/stdlib/random.ml +++ b/stdlib/random.ml @@ -196,7 +196,7 @@ end (* This is the state you get with [init 27182818] and then applying the "land 0x3FFFFFFF" filter to them. See #5575, #5793, #5977. 
*) -let default = { +let mk_default () = { State.st = [| 0x3ae2522b; 0x1d8d4634; 0x15b4fad0; 0x18b14ace; 0x12f8a3c4; 0x3b086c47; 0x16d467d6; 0x101d91c7; 0x321df177; 0x0176c193; 0x1ff72bf1; 0x1e889109; @@ -212,26 +212,28 @@ let default = { State.idx = 0; } -let bits () = State.bits default -let int bound = State.int default bound -let full_int bound = State.full_int default bound -let int32 bound = State.int32 default bound -let nativeint bound = State.nativeint default bound -let int64 bound = State.int64 default bound -let float scale = State.float default scale -let bool () = State.bool default -let bits32 () = State.bits32 default -let bits64 () = State.bits64 default -let nativebits () = State.nativebits default - -let full_init seed = State.full_init default seed -let init seed = State.full_init default [| seed |] +let random_key = Domain.DLS.new_key mk_default + +let bits () = State.bits (Domain.DLS.get random_key) +let int bound = State.int (Domain.DLS.get random_key) bound +let full_int bound = State.full_int (Domain.DLS.get random_key) bound +let int32 bound = State.int32 (Domain.DLS.get random_key) bound +let nativeint bound = State.nativeint (Domain.DLS.get random_key) bound +let int64 bound = State.int64 (Domain.DLS.get random_key) bound +let float scale = State.float (Domain.DLS.get random_key) scale +let bool () = State.bool (Domain.DLS.get random_key) +let bits32 () = State.bits32 (Domain.DLS.get random_key) +let bits64 () = State.bits64 (Domain.DLS.get random_key) +let nativebits () = State.nativebits (Domain.DLS.get random_key) + +let full_init seed = State.full_init (Domain.DLS.get random_key) seed +let init seed = State.full_init (Domain.DLS.get random_key) [| seed |] let self_init () = full_init (random_seed()) (* Manipulating the current state. 
*) -let get_state () = State.copy default -let set_state s = State.assign default s +let get_state () = State.copy (Domain.DLS.get random_key) +let set_state s = State.assign (Domain.DLS.get random_key) s (******************** diff --git a/stdlib/random.mli b/stdlib/random.mli index 208e44dda856..bc2213abb244 100644 --- a/stdlib/random.mli +++ b/stdlib/random.mli @@ -13,23 +13,31 @@ (* *) (**************************************************************************) -(** Pseudo-random number generators (PRNG). *) +(** Pseudo-random number generators (PRNG). + + With multiple domains, by default, each domain is initialised with the same + random seed. Hence, each domain will yield the same sequence of numbers. If + you wish to have uncorrelated sequences of random numbers on each domain, + one way to achieve this is to call {!self_init} on each domain before + requesting random numbers. + +*) (** {1 Basic functions} *) val init : int -> unit -(** Initialize the generator, using the argument as a seed. - The same seed will always yield the same sequence of numbers. *) +(** Initialize the domain-local generator, using the argument as a seed. + The same seed will always yield the same sequence of numbers. *) val full_init : int array -> unit (** Same as {!Random.init} but takes more data as seed. *) val self_init : unit -> unit -(** Initialize the generator with a random seed chosen - in a system-dependent way. If [/dev/urandom] is available on - the host machine, it is used to provide a highly random initial - seed. Otherwise, a less random seed is computed from system - parameters (current time, process IDs). *) +(** Initialize the domain-local generator with a random seed chosen + in a system-dependent way. If [/dev/urandom] is available on the host + machine, it is used to provide a highly random initial seed. Otherwise, a + less random seed is computed from system parameters (current time, process + IDs, domain-local state). 
*) val bits : unit -> int (** Return 30 random bits in a nonnegative integer. @@ -131,7 +139,8 @@ end val get_state : unit -> State.t -(** Return the current state of the generator used by the basic functions. *) +(** Return the current domain-local state of the generator used by the basic + functions. *) val set_state : State.t -> unit -(** Set the state of the generator used by the basic functions. *) +(** Set the domain-local state of the generator used by the basic functions. *) diff --git a/otherlibs/systhreads/semaphore.ml b/stdlib/semaphore.ml similarity index 100% rename from otherlibs/systhreads/semaphore.ml rename to stdlib/semaphore.ml diff --git a/otherlibs/systhreads/semaphore.mli b/stdlib/semaphore.mli similarity index 100% rename from otherlibs/systhreads/semaphore.mli rename to stdlib/semaphore.mli diff --git a/stdlib/stdlib.ml b/stdlib/stdlib.ml index 6268f3c59276..1e77c3eca3ac 100644 --- a/stdlib/stdlib.ml +++ b/stdlib/stdlib.ml @@ -571,66 +571,66 @@ let exit retcode = let _ = register_named_value "Pervasives.do_at_exit" do_at_exit -external major : unit -> unit = "caml_gc_major" -external naked_pointers_checked : unit -> bool - = "caml_sys_const_naked_pointers_checked" -let () = if naked_pointers_checked () then at_exit major - (*MODULE_ALIASES*) -module Arg = Arg -module Array = Array -module ArrayLabels = ArrayLabels -module Atomic = Atomic -module Bigarray = Bigarray -module Bool = Bool -module Buffer = Buffer -module Bytes = Bytes -module BytesLabels = BytesLabels -module Callback = Callback -module Char = Char -module Complex = Complex -module Digest = Digest -module Either = Either -module Ephemeron = Ephemeron -module Filename = Filename -module Float = Float -module Format = Format -module Fun = Fun -module Gc = Gc -module Genlex = Genlex -module Hashtbl = Hashtbl -module In_channel = In_channel -module Int = Int -module Int32 = Int32 -module Int64 = Int64 -module Lazy = Lazy -module Lexing = Lexing -module List = List -module ListLabels 
= ListLabels -module Map = Map -module Marshal = Marshal -module MoreLabels = MoreLabels -module Nativeint = Nativeint -module Obj = Obj -module Oo = Oo -module Option = Option -module Out_channel = Out_channel -module Parsing = Parsing -module Pervasives = Pervasives -module Printexc = Printexc -module Printf = Printf -module Queue = Queue -module Random = Random -module Result = Result -module Scanf = Scanf -module Seq = Seq -module Set = Set -module Stack = Stack -module StdLabels = StdLabels -module Stream = Stream -module String = String -module StringLabels = StringLabels -module Sys = Sys -module Uchar = Uchar -module Unit = Unit -module Weak = Weak +module Arg = Arg +module Array = Array +module ArrayLabels = ArrayLabels +module Atomic = Atomic +module Bigarray = Bigarray +module Bool = Bool +module Buffer = Buffer +module Bytes = Bytes +module BytesLabels = BytesLabels +module Callback = Callback +module Char = Char +module Complex = Complex +module Condition = Condition +module Digest = Digest +module Domain = Domain +module EffectHandlers = EffectHandlers +module Either = Either +module Ephemeron = Ephemeron +module Filename = Filename +module Float = Float +module Format = Format +module Fun = Fun +module Gc = Gc +module Genlex = Genlex +module Hashtbl = Hashtbl +module In_channel = In_channel +module Int = Int +module Int32 = Int32 +module Int64 = Int64 +module Lazy = Lazy +module Lexing = Lexing +module List = List +module ListLabels = ListLabels +module Map = Map +module Marshal = Marshal +module MoreLabels = MoreLabels +module Mutex = Mutex +module Nativeint = Nativeint +module Obj = Obj +module Oo = Oo +module Option = Option +module Out_channel = Out_channel +module Parsing = Parsing +module Pervasives = Pervasives +module Printexc = Printexc +module Printf = Printf +module Queue = Queue +module Random = Random +module Result = Result +module Scanf = Scanf +module Semaphore = Semaphore +module Seq = Seq +module Set = Set +module Stack = Stack 
+module StdLabels = StdLabels +module Stream = Stream +module String = String +module StringLabels = StringLabels +module Sys = Sys +module Uchar = Uchar +module Unit = Unit +module Weak = Weak diff --git a/stdlib/stdlib.mli b/stdlib/stdlib.mli index 0844af88b93b..9e6610374456 100644 --- a/stdlib/stdlib.mli +++ b/stdlib/stdlib.mli @@ -1376,66 +1376,71 @@ val do_at_exit : unit -> unit (** {1:modules Standard library modules } *) (*MODULE_ALIASES*) -module Arg = Arg -module Array = Array -module ArrayLabels = ArrayLabels -module Atomic = Atomic -module Bigarray = Bigarray -module Bool = Bool -module Buffer = Buffer -module Bytes = Bytes -module BytesLabels = BytesLabels -module Callback = Callback -module Char = Char -module Complex = Complex -module Digest = Digest -module Either = Either -module Ephemeron = Ephemeron -module Filename = Filename -module Float = Float -module Format = Format -module Fun = Fun -module Gc = Gc -module Genlex = Genlex +module Arg = Arg +module Array = Array +module ArrayLabels = ArrayLabels +module Atomic = Atomic +module Bigarray = Bigarray +module Bool = Bool +module Buffer = Buffer +module Bytes = Bytes +module BytesLabels = BytesLabels +module Callback = Callback +module Char = Char +module Complex = Complex +module Condition = Condition +module Digest = Digest +module Domain = Domain +module EffectHandlers = EffectHandlers +module Either = Either +module Ephemeron = Ephemeron +module Filename = Filename +module Float = Float +module Format = Format +module Fun = Fun +module Gc = Gc +module Genlex = Genlex [@@deprecated "Use the camlp-streams library instead."] -module Hashtbl = Hashtbl -module In_channel = In_channel -module Int = Int -module Int32 = Int32 -module Int64 = Int64 -module Lazy = Lazy -module Lexing = Lexing -module List = List -module ListLabels = ListLabels -module Map = Map -module Marshal = Marshal -module MoreLabels = MoreLabels -module Nativeint = Nativeint -module Obj = Obj -module Oo = Oo -module Option = 
Option -module Out_channel = Out_channel -module Parsing = Parsing -module Pervasives = Pervasives +module Hashtbl = Hashtbl +module In_channel = In_channel +module Int = Int +module Int32 = Int32 +module Int64 = Int64 +module Lazy = Lazy +module Lexing = Lexing +module List = List +module ListLabels = ListLabels +module Map = Map +module Marshal = Marshal +module MoreLabels = MoreLabels +module Mutex = Mutex +module Nativeint = Nativeint +module Obj = Obj +module Oo = Oo +module Option = Option +module Out_channel = Out_channel +module Parsing = Parsing +module Pervasives = Pervasives [@@deprecated "Use Stdlib instead.\n\ \n\ If you need to stay compatible with OCaml < 4.07, you can use the \n\ stdlib-shims library: https://github.com/ocaml/stdlib-shims"] -module Printexc = Printexc -module Printf = Printf -module Queue = Queue -module Random = Random -module Result = Result -module Scanf = Scanf -module Seq = Seq -module Set = Set -module Stack = Stack -module StdLabels = StdLabels -module Stream = Stream +module Printexc = Printexc +module Printf = Printf +module Queue = Queue +module Random = Random +module Result = Result +module Scanf = Scanf +module Semaphore = Semaphore +module Seq = Seq +module Set = Set +module Stack = Stack +module StdLabels = StdLabels +module Stream = Stream [@@deprecated "Use the camlp-streams library instead."] -module String = String -module StringLabels = StringLabels -module Sys = Sys -module Uchar = Uchar -module Unit = Unit -module Weak = Weak +module String = String +module StringLabels = StringLabels +module Sys = Sys +module Uchar = Uchar +module Unit = Unit +module Weak = Weak diff --git a/stdlib/weak.ml b/stdlib/weak.ml index 8844a9863ea7..396431829773 100644 --- a/stdlib/weak.ml +++ b/stdlib/weak.ml @@ -19,13 +19,16 @@ type !'a t external create : int -> 'a t = "caml_weak_create" +(** number of additional values in a weak pointer + * - Link + * - Data *) +let additional_values = 2 (* CAML_EPHE_FIRST_KEY in weak.h *) + let 
create l = if not (0 <= l && l <= Obj.Ephemeron.max_ephe_length) then invalid_arg("Weak.create"); create l -(** number of additional values in a weak pointer *) -let additional_values = 2 let length x = Obj.size(Obj.repr x) - additional_values diff --git a/testsuite/Makefile b/testsuite/Makefile index fa5378cd069a..0d1c5bf76ebb 100644 --- a/testsuite/Makefile +++ b/testsuite/Makefile @@ -111,10 +111,20 @@ endif TIMEOUT ?= 600 # 10 minutes +# SHOW_TIMINGS should be set by the user (to a non-empty value) if they want +# the timings for each test file to be included in the log +SHOW_TIMINGS ?= +ifeq "$(SHOW_TIMINGS)" "" + OCAMLTEST_SHOW_TIMINGS_FLAG := +else + OCAMLTEST_SHOW_TIMINGS_FLAG := -show-timings +endif + OCAMLTESTFLAGS := \ -timeout $(TIMEOUT) \ $(OCAMLTEST_PROMOTE_FLAG) \ - $(OCAMLTEST_KEEP_TEST_DIR_ON_SUCCESS_FLAG) + $(OCAMLTEST_KEEP_TEST_DIR_ON_SUCCESS_FLAG) \ + $(OCAMLTEST_SHOW_TIMINGS_FLAG) # Make sure USE_RUNTIME is defined USE_RUNTIME ?= diff --git a/testsuite/summarize.awk b/testsuite/summarize.awk index adf24c9ad122..dc1682721bc0 100644 --- a/testsuite/summarize.awk +++ b/testsuite/summarize.awk @@ -109,6 +109,15 @@ function record_unexp() { in_test = 1; } +/^Wall clock:/ { + match($0, /: .* took /); + curfile = substr($0, RSTART+2, RLENGTH-8); + match($0, / took .*s/); + duration = substr($0, RSTART+6, RLENGTH-7); + if (duration + 0.0 > 10.0) + slow[slowcount++] = sprintf("%s: %s", curfile, duration); +} + /=> passed/ { record_pass(); } @@ -205,6 +214,10 @@ END { if (nresults != passed + skipped + ignored + failed + unexped){ printf (" (totals don't add up??)"); } + if (slowcount != 0){ + printf("\n\nTests taking longer than 10s:\n"); + for (i=0; i < slowcount; i++) printf(" %s\n", slow[i]); + } printf ("\n"); if (reran != 0){ printf(" %3d test dir re-runs\n", reran); diff --git a/testsuite/tests/asmcomp/polling.c b/testsuite/tests/asmcomp/polling.c index c28baab048bf..74b464667964 100644 --- a/testsuite/tests/asmcomp/polling.c +++ 
b/testsuite/tests/asmcomp/polling.c @@ -1,14 +1,19 @@ #define CAML_NAME_SPACE #define CAML_INTERNALS +#include #include #include CAMLprim value request_minor_gc(value v) { Caml_state->requested_minor_gc = 1; Caml_state->requested_major_slice = 1; - caml_something_to_do = 1; - Caml_state->young_limit = Caml_state->young_alloc_end; + /* + This is massively unsafe in multicore but the polling + tests are only run in a single domain, so we're probably + good. + */ + Caml_state->young_limit = (uintnat)Caml_state->young_end; return Val_unit; } diff --git a/testsuite/tests/asmcomp/run.ml b/testsuite/tests/asmcomp/run.ml new file mode 100644 index 000000000000..feb4bf7902f9 --- /dev/null +++ b/testsuite/tests/asmcomp/run.ml @@ -0,0 +1,9 @@ +external run_prog : int -> int -> int -> unit = "run_prog" + +let arg n = + if n < Array.length Sys.argv then + int_of_string Sys.argv.(n) + else + 0 + +let () = run_prog (arg 1) (arg 2) (arg 3) diff --git a/testsuite/tests/asmgen/main.c b/testsuite/tests/asmgen/main.c index 975b54833e82..05e59171f169 100644 --- a/testsuite/tests/asmgen/main.c +++ b/testsuite/tests/asmgen/main.c @@ -24,6 +24,10 @@ void caml_call_gc() { } +void caml_call_realloc_stack() +{ + +}; #endif void caml_ml_array_bound_error(void) diff --git a/testsuite/tests/backtrace/backtrace2.reference b/testsuite/tests/backtrace/backtrace2.reference index 0d97fde595db..20ef708fe001 100644 --- a/testsuite/tests/backtrace/backtrace2.reference +++ b/testsuite/tests/backtrace/backtrace2.reference @@ -35,7 +35,7 @@ Uncaught exception Invalid_argument("index out of bounds") Raised by primitive operation at Backtrace2.run in file "backtrace2.ml", line 62, characters 14-22 test_Not_found Uncaught exception Not_found -Raised at Stdlib__Hashtbl.find in file "hashtbl.ml", line 539, characters 13-28 +Raised at Stdlib__Hashtbl.find in file "hashtbl.ml", line 541, characters 13-28 Called from Backtrace2.test_Not_found in file "backtrace2.ml", line 43, characters 9-42 Re-raised at 
Backtrace2.test_Not_found in file "backtrace2.ml", line 43, characters 61-70 Called from Backtrace2.run in file "backtrace2.ml", line 62, characters 11-23 @@ -46,13 +46,13 @@ Called from Backtrace2.test_lazy.aux in file "backtrace2.ml", line 47, character Called from Backtrace2.test_lazy.aux in file "backtrace2.ml", line 47, characters 43-52 Called from Backtrace2.test_lazy.aux in file "backtrace2.ml", line 47, characters 43-52 Called from Backtrace2.test_lazy.aux in file "backtrace2.ml", line 47, characters 43-52 -Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 31, characters 17-27 -Re-raised at CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 36, characters 4-11 +Called from CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 49, characters 17-27 +Re-raised at CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 56, characters 4-11 Called from Backtrace2.run in file "backtrace2.ml", line 62, characters 11-23 Uncaught exception Not_found -Raised at Stdlib__Hashtbl.find in file "hashtbl.ml", line 539, characters 13-28 +Raised at Stdlib__Hashtbl.find in file "hashtbl.ml", line 541, characters 13-28 Called from Backtrace2.test_lazy.exception_raised_internally in file "backtrace2.ml", line 50, characters 8-41 -Re-raised at CamlinternalLazy.force_lazy_block.(fun) in file "camlinternalLazy.ml", line 35, characters 56-63 -Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 31, characters 17-27 -Re-raised at CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 36, characters 4-11 +Re-raised at CamlinternalLazy.do_force_block.(fun) in file "camlinternalLazy.ml", line 54, characters 43-50 +Called from CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 49, characters 17-27 +Re-raised at CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 56, characters 4-11 Called from Backtrace2.run in file "backtrace2.ml", 
line 62, characters 11-23 diff --git a/testsuite/tests/backtrace/backtrace_c_exn.ml b/testsuite/tests/backtrace/backtrace_c_exn.ml new file mode 100644 index 000000000000..9b0077d50990 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_c_exn.ml @@ -0,0 +1,20 @@ +(* TEST + modules = "backtrace_c_exn_.c" + flags = "-g" + ocamlrunparam += ",b=1" +*) + +(* https://github.com/ocaml-multicore/ocaml-multicore/issues/498 *) +external stubbed_raise : unit -> unit = "caml_498_raise" + +let raise_exn () = failwith "exn" + +let () = Callback.register "test_raise_exn" raise_exn + +let () = + try + stubbed_raise () + with + | exn -> + Printexc.to_string exn |> print_endline; + Printexc.print_backtrace stdout diff --git a/testsuite/tests/backtrace/backtrace_c_exn.reference b/testsuite/tests/backtrace/backtrace_c_exn.reference new file mode 100644 index 000000000000..2fc09e98b5b1 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_c_exn.reference @@ -0,0 +1,2 @@ +Failure("exn") +Raised by primitive operation at Backtrace_c_exn in file "backtrace_c_exn.ml", line 16, characters 4-20 diff --git a/testsuite/tests/backtrace/backtrace_c_exn_.c b/testsuite/tests/backtrace/backtrace_c_exn_.c new file mode 100644 index 000000000000..a9a603e879f8 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_c_exn_.c @@ -0,0 +1,14 @@ +#include +#include + +void caml_498_raise(void) { + CAMLparam0 (); + const value *cl; + + cl = caml_named_value("test_raise_exn"); + + if (cl != NULL) + caml_callback(*cl, Val_unit); + + CAMLreturn0; +} diff --git a/testsuite/tests/backtrace/backtrace_dynlink.ml b/testsuite/tests/backtrace/backtrace_dynlink.ml new file mode 100644 index 000000000000..c7109ae5623f --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_dynlink.ml @@ -0,0 +1,42 @@ +(* TEST + +include dynlink + +readonly_files = "backtrace_dynlink_plugin.ml" + +libraries = "" + +* shared-libraries +** native-dynlink +*** setup-ocamlopt.byte-build-env +**** ocamlopt.byte +module = 
"backtrace_dynlink.ml" +flags = "-g" +**** ocamlopt.byte +program = "backtrace_dynlink_plugin.cmxs" +flags = "-shared -g" +all_modules = "backtrace_dynlink_plugin.ml" +**** ocamlopt.byte +program = "${test_build_directory}/main.exe" +libraries = "dynlink" +all_modules = "backtrace_dynlink.cmx" +***** run +ocamlrunparam += ",b=1" +****** check-program-output +*) + +(* test for backtrace and stack unwinding with dynlink. *) +(* https://github.com/ocaml-multicore/ocaml-multicore/issues/440 *) +(* https://github.com/ocaml-multicore/ocaml-multicore/pull/499 *) + +let () = + Dynlink.allow_unsafe_modules true; + try + Dynlink.loadfile "backtrace_dynlink_plugin.cmxs" + with + | Dynlink.Error err -> + print_endline @@ Dynlink.error_message err; + Printexc.print_backtrace stdout; + | exn -> + Printexc.to_string exn |> print_endline; + print_endline "ERROR" diff --git a/testsuite/tests/backtrace/backtrace_dynlink.reference b/testsuite/tests/backtrace/backtrace_dynlink.reference new file mode 100644 index 000000000000..12390de6243b --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_dynlink.reference @@ -0,0 +1,18 @@ +Raised by primitive operation at Backtrace_dynlink_plugin in file "backtrace_dynlink_plugin.ml", line 6, characters 13-38 +Called from Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 85, characters 12-29 +Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 +Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 350, characters 13-44 +Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 +Called from Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 348, characters 8-240 +Called from Dynlink_common.Make.loadfile in file "otherlibs/dynlink/dynlink_common.ml" (inlined), line 360, characters 26-45 +Called from Backtrace_dynlink in file "backtrace_dynlink.ml", line 35, characters 4-52 +execution of module initializers in the 
shared library failed: Failure("SUCCESS") +Raised by primitive operation at Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 85, characters 12-29 +Re-raised at Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 87, characters 10-149 +Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 +Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 350, characters 13-44 +Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 +Called from Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 348, characters 8-240 +Re-raised at Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 358, characters 8-17 +Called from Dynlink_common.Make.loadfile in file "otherlibs/dynlink/dynlink_common.ml" (inlined), line 360, characters 26-45 +Called from Backtrace_dynlink in file "backtrace_dynlink.ml", line 35, characters 4-52 diff --git a/testsuite/tests/backtrace/backtrace_dynlink_plugin.ml b/testsuite/tests/backtrace/backtrace_dynlink_plugin.ml new file mode 100644 index 000000000000..86a36a020b2c --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_dynlink_plugin.ml @@ -0,0 +1,8 @@ +let () = + try + failwith "SUCCESS" + with + | e -> + let c = Printexc.get_callstack 10 in + Printexc.print_raw_backtrace stdout c; + raise e diff --git a/testsuite/tests/backtrace/backtrace_effects.ml b/testsuite/tests/backtrace/backtrace_effects.ml new file mode 100644 index 000000000000..68ffcd420c45 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_effects.ml @@ -0,0 +1,41 @@ +(* TEST + flags = "-g" + ocamlrunparam += ",b=1" + exit_status = "2" +*) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let bar i = + if i < 0 then begin + print_endline "(** raise **)"; + raise Exit + end else begin + print_endline "(** get_callstack **)"; + let bt = Printexc.get_callstack 100 in + 
print_string @@ Printexc.raw_backtrace_to_string bt; + perform E; + 20 + end + +let foo i = + ignore @@ bar i; + bar (-1) + +let baz () = + match_with foo 10 + { retc = (fun x -> ()); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun (k : (a, _) continuation) -> + print_endline "(** get_continuation_callstack **)"; + let bt = Deep.get_callstack k 100 in + print_string @@ Printexc.raw_backtrace_to_string bt; + continue k ()) + | _ -> None } + +let _ = baz () diff --git a/testsuite/tests/backtrace/backtrace_effects.reference b/testsuite/tests/backtrace/backtrace_effects.reference new file mode 100644 index 000000000000..06acda2bc8bd --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_effects.reference @@ -0,0 +1,11 @@ +(** get_callstack **) +Raised by primitive operation at Backtrace_effects.bar in file "backtrace_effects.ml", line 18, characters 13-39 +Called from Backtrace_effects.foo in file "backtrace_effects.ml", line 25, characters 12-17 +Called from Backtrace_effects in file "backtrace_effects.ml", line 41, characters 8-14 +(** get_continuation_callstack **) +Raised by primitive operation at Backtrace_effects.bar in file "backtrace_effects.ml", line 20, characters 4-13 +Called from Backtrace_effects.foo in file "backtrace_effects.ml", line 25, characters 12-17 +(** raise **) +Fatal error: exception Stdlib.Exit +Raised at Backtrace_effects.baz.(fun) in file "backtrace_effects.ml", line 31, characters 21-28 +Called from Backtrace_effects in file "backtrace_effects.ml", line 41, characters 8-14 diff --git a/testsuite/tests/backtrace/backtrace_effects_nested.ml b/testsuite/tests/backtrace/backtrace_effects_nested.ml new file mode 100644 index 000000000000..c5e41cfd0a29 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_effects_nested.ml @@ -0,0 +1,37 @@ +(* TEST + flags = "-g" +*) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + | Inc : unit eff + +let blorp () = + perform 
Inc; + perform E; + 42 + +let baz () = + try_with blorp () + { effc = fun (type a) (e : a eff) -> + match e with + | Inc -> Some (fun (k : (a, _) continuation) -> + 1 + continue k ()) + | _ -> None } + +let f () = + match_with baz () + { retc = (fun x -> Printf.printf "%d\n" x); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun (k : (a, _) continuation) -> + Deep.get_callstack k 100 |> + Printexc.raw_backtrace_to_string |> + print_string; + continue k ()) + | _ -> None } + +let () = f () diff --git a/testsuite/tests/backtrace/backtrace_effects_nested.reference b/testsuite/tests/backtrace/backtrace_effects_nested.reference new file mode 100644 index 000000000000..a8cc88ed7ff8 --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_effects_nested.reference @@ -0,0 +1,3 @@ +Raised by primitive operation at Backtrace_effects_nested.blorp in file "backtrace_effects_nested.ml", line 13, characters 2-11 +Called from Backtrace_effects_nested.baz.(fun) in file "backtrace_effects_nested.ml", line 21, characters 16-29 +43 diff --git a/testsuite/tests/backtrace/backtrace_systhreads.ml b/testsuite/tests/backtrace/backtrace_systhreads.ml new file mode 100644 index 000000000000..1b6b158ceffb --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_systhreads.ml @@ -0,0 +1,40 @@ +(* TEST +flags = "-g" +ocamlrunparam += ",b=1" +* hassysthreads +include systhreads +** bytecode +** native +*) + +let throw_exn msg = failwith msg [@@inline never] + +let thread_func delay = + Thread.yield (); + try throw_exn (string_of_int delay) with + | exn -> + Thread.delay (float_of_int delay); + Gc.minor (); + raise exn + +let thread_backtrace (cond, mut) = + Thread.yield (); + try throw_exn "backtrace" with + | exn -> + Mutex.lock mut; + Condition.wait cond mut; + raise exn + +let () = + Random.self_init (); + let mut = Mutex.create () in + let cond = Condition.create () in + let backtrace_thread = Thread.create thread_backtrace (cond, mut) in + let 
threads = + List.init 4 begin fun i -> + Thread.create thread_func i + end + in + List.iter Thread.join threads; + Condition.signal cond; + Thread.join backtrace_thread diff --git a/testsuite/tests/backtrace/backtrace_systhreads.reference b/testsuite/tests/backtrace/backtrace_systhreads.reference new file mode 100644 index 000000000000..c1b45f1b63ce --- /dev/null +++ b/testsuite/tests/backtrace/backtrace_systhreads.reference @@ -0,0 +1,25 @@ +Thread 2 killed on uncaught exception Failure("0") +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 14, characters 6-37 +Re-raised at Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 18, characters 5-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 +Thread 3 killed on uncaught exception Failure("1") +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 14, characters 6-37 +Re-raised at Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 18, characters 5-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 +Thread 4 killed on uncaught exception Failure("2") +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 14, characters 6-37 +Re-raised at Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 18, characters 5-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 +Thread 5 killed on uncaught exception Failure("3") +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace_systhreads.thread_func in file "backtrace_systhreads.ml", line 14, characters 6-37 +Re-raised at Backtrace_systhreads.thread_func in file 
"backtrace_systhreads.ml", line 18, characters 5-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 +Thread 1 killed on uncaught exception Failure("backtrace") +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace_systhreads.thread_backtrace in file "backtrace_systhreads.ml", line 22, characters 6-27 +Re-raised at Backtrace_systhreads.thread_backtrace in file "backtrace_systhreads.ml", line 26, characters 5-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 diff --git a/testsuite/tests/backtrace/callstack.reference b/testsuite/tests/backtrace/callstack.reference index 04e04c540f38..2243abe4fdf3 100644 --- a/testsuite/tests/backtrace/callstack.reference +++ b/testsuite/tests/backtrace/callstack.reference @@ -12,4 +12,4 @@ Raised by primitive operation at Callstack.f0 in file "callstack.ml", line 11, c Called from Callstack.f1 in file "callstack.ml", line 12, characters 27-32 Called from Callstack.f2 in file "callstack.ml", line 13, characters 27-32 Called from Callstack.f3 in file "callstack.ml", line 14, characters 27-32 -Called from Thread.create.(fun) in file "thread.ml", line 47, characters 8-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 diff --git a/testsuite/tests/backtrace/lazy.reference b/testsuite/tests/backtrace/lazy.reference index ccb2a21e6c5f..5e8f53482f2d 100644 --- a/testsuite/tests/backtrace/lazy.reference +++ b/testsuite/tests/backtrace/lazy.reference @@ -1,12 +1,14 @@ Uncaught exception Not_found Raised at Lazy.l1 in file "lazy.ml", line 7, characters 28-45 -Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 31, characters 17-27 -Re-raised at CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 36, characters 4-11 +Called from CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 49, characters 17-27 +Re-raised at CamlinternalLazy.do_force_block in 
file "camlinternalLazy.ml", line 56, characters 4-11 +Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml" (inlined), line 78, characters 27-67 Called from Lazy.test1 in file "lazy.ml", line 10, characters 11-24 Called from Lazy.run in file "lazy.ml", line 19, characters 4-11 Uncaught exception Not_found Raised at Lazy.l2 in file "lazy.ml", line 12, characters 28-45 -Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 31, characters 17-27 -Re-raised at CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml", line 36, characters 4-11 +Called from CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 49, characters 17-27 +Re-raised at CamlinternalLazy.do_force_block in file "camlinternalLazy.ml", line 56, characters 4-11 +Called from CamlinternalLazy.force_lazy_block in file "camlinternalLazy.ml" (inlined), line 78, characters 27-67 Called from Lazy.test2 in file "lazy.ml", line 15, characters 6-15 Called from Lazy.run in file "lazy.ml", line 19, characters 4-11 diff --git a/testsuite/tests/basic-modules/anonymous.ocamlc.reference b/testsuite/tests/basic-modules/anonymous.ocamlc.reference index 755ef658d6ef..d811f7f0db24 100644 --- a/testsuite/tests/basic-modules/anonymous.ocamlc.reference +++ b/testsuite/tests/basic-modules/anonymous.ocamlc.reference @@ -2,15 +2,15 @@ (seq (ignore (let (x = [0: 13 37]) (makeblock 0 x))) (let (A = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 25 6] - [0: [0]]) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 25 6] [0: [0]]) B = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 35 6] - [0: [0]])) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 35 6] [0: [0]])) (seq (ignore (let (x = [0: 4 2]) (makeblock 0 x))) - (apply (field 1 (global CamlinternalMod!)) [0: [0]] A + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] A (module-defn(A) Anonymous anonymous.ml(23):567-608 A)) - (apply 
(field 1 (global CamlinternalMod!)) [0: [0]] B + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] B (module-defn(B) Anonymous anonymous.ml(33):703-773 (let (x = [0: "foo" "bar"]) (makeblock 0)))) (let (f = (function param : int 0) s = (makemutable 0 "")) @@ -20,5 +20,5 @@ (makeblock 0))) (let (drop = (function param : int 0) - *match* = (apply drop (field 0 s))) + *match* = (apply drop (field_mut 0 s))) (makeblock 0 A B f s drop)))))))) diff --git a/testsuite/tests/basic-modules/anonymous.ocamlopt.flambda.reference b/testsuite/tests/basic-modules/anonymous.ocamlopt.flambda.reference index 80d7322a6176..3afce1b43453 100644 --- a/testsuite/tests/basic-modules/anonymous.ocamlopt.flambda.reference +++ b/testsuite/tests/basic-modules/anonymous.ocamlopt.flambda.reference @@ -1,16 +1,16 @@ (seq (ignore (let (x = [0: 13 37]) (makeblock 0 x))) (let (A = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 25 6] - [0: [0]]) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 25 6] [0: [0]]) B = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 35 6] - [0: [0]])) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 35 6] [0: [0]])) (seq (ignore (let (x = [0: 4 2]) (makeblock 0 x))) - (apply (field 1 (global CamlinternalMod!)) [0: [0]] A - (module-defn(A) Anonymous anonymous.ml(23):567-608 A)) - (apply (field 1 (global CamlinternalMod!)) [0: [0]] B - (module-defn(B) Anonymous anonymous.ml(33):703-773 + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] A + (module-defn(A) anonymous.ml(23):567-608 A)) + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] B + (module-defn(B) anonymous.ml(33):703-773 (let (x = [0: "foo" "bar"]) (makeblock 0)))) (let (f = (function param : int 0) s = (makemutable 0 "")) (seq diff --git a/testsuite/tests/basic-modules/anonymous.ocamlopt.reference b/testsuite/tests/basic-modules/anonymous.ocamlopt.reference index 8d2bab93989d..a1e2891c303a 100644 --- 
a/testsuite/tests/basic-modules/anonymous.ocamlopt.reference +++ b/testsuite/tests/basic-modules/anonymous.ocamlopt.reference @@ -1,14 +1,14 @@ (seq (ignore (let (x = [0: 13 37]) (makeblock 0 x))) (let (A = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 25 6] - [0: [0]]) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 25 6] [0: [0]]) B = - (apply (field 0 (global CamlinternalMod!)) [0: "anonymous.ml" 35 6] - [0: [0]])) + (apply (field_imm 0 (global CamlinternalMod!)) + [0: "anonymous.ml" 35 6] [0: [0]])) (seq (ignore (let (x = [0: 4 2]) (makeblock 0 x))) - (apply (field 1 (global CamlinternalMod!)) [0: [0]] A A) - (apply (field 1 (global CamlinternalMod!)) [0: [0]] B + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] A A) + (apply (field_imm 1 (global CamlinternalMod!)) [0: [0]] B (let (x = [0: "foo" "bar"]) (makeblock 0))) (setfield_ptr(root-init) 0 (global Anonymous!) A) (setfield_ptr(root-init) 1 (global Anonymous!) B) @@ -19,13 +19,14 @@ (ignore (let (*match* = - (setfield_ptr 0 (field 3 (global Anonymous!)) "Hello World!")) + (setfield_ptr 0 (field_imm 3 (global Anonymous!)) + "Hello World!")) (makeblock 0))) (let (drop = (function param : int 0)) (setfield_ptr(root-init) 4 (global Anonymous!) 
drop)) (let (*match* = - (apply (field 4 (global Anonymous!)) - (field 0 (field 3 (global Anonymous!))))) + (apply (field_imm 4 (global Anonymous!)) + (field_mut 0 (field_imm 3 (global Anonymous!))))) 0) 0))) diff --git a/testsuite/tests/basic/arrays.ml b/testsuite/tests/basic/arrays.ml index 83cc796cae98..a094f155dd5f 100644 --- a/testsuite/tests/basic/arrays.ml +++ b/testsuite/tests/basic/arrays.ml @@ -127,6 +127,12 @@ let test8 () = ignore (Array.sub [|3;4|] (-1) 1); print_string "Test 8.4: failed\n" with Invalid_argument _ -> ()) +let test9 () = + try + (* test exception raised from C (caml_array_bound_error) to OCaml *) + ignore ([| |].(0)); print_string "Test 9: failed\n" + with Invalid_argument _ -> () + let _ = test1(); test2(); @@ -136,4 +142,5 @@ let _ = test6(); test7(); test8(); + test9(); exit 0 diff --git a/testsuite/tests/basic/patmatch_for_multiple.ml b/testsuite/tests/basic/patmatch_for_multiple.ml index 82897808498d..7b45d08c1912 100644 --- a/testsuite/tests/basic/patmatch_for_multiple.ml +++ b/testsuite/tests/basic/patmatch_for_multiple.ml @@ -26,15 +26,15 @@ match (3, 2, 1) with | _ -> false ;; [%%expect{| -(let (*match*/274 = 3 *match*/275 = 2 *match*/276 = 1) +(let (*match*/279 = 3 *match*/280 = 2 *match*/281 = 1) (catch (catch - (catch (if (!= *match*/275 3) (exit 3) (exit 1)) with (3) - (if (!= *match*/274 1) (exit 2) (exit 1))) + (catch (if (!= *match*/280 3) (exit 3) (exit 1)) with (3) + (if (!= *match*/279 1) (exit 2) (exit 1))) with (2) 0) with (1) 1)) -(let (*match*/274 = 3 *match*/275 = 2 *match*/276 = 1) - (catch (if (!= *match*/275 3) (if (!= *match*/274 1) 0 (exit 1)) (exit 1)) +(let (*match*/279 = 3 *match*/280 = 2 *match*/281 = 1) + (catch (if (!= *match*/280 3) (if (!= *match*/279 1) 0 (exit 1)) (exit 1)) with (1) 1)) - : bool = false |}];; @@ -47,26 +47,26 @@ match (3, 2, 1) with | _ -> false ;; [%%expect{| -(let (*match*/279 = 3 *match*/280 = 2 *match*/281 = 1) +(let (*match*/284 = 3 *match*/285 = 2 *match*/286 = 1) (catch 
(catch (catch - (if (!= *match*/280 3) (exit 6) - (let (x/283 =a (makeblock 0 *match*/279 *match*/280 *match*/281)) - (exit 4 x/283))) + (if (!= *match*/285 3) (exit 6) + (let (x/288 =a (makeblock 0 *match*/284 *match*/285 *match*/286)) + (exit 4 x/288))) with (6) - (if (!= *match*/279 1) (exit 5) - (let (x/282 =a (makeblock 0 *match*/279 *match*/280 *match*/281)) - (exit 4 x/282)))) + (if (!= *match*/284 1) (exit 5) + (let (x/287 =a (makeblock 0 *match*/284 *match*/285 *match*/286)) + (exit 4 x/287)))) with (5) 0) - with (4 x/277) (seq (ignore x/277) 1))) -(let (*match*/279 = 3 *match*/280 = 2 *match*/281 = 1) + with (4 x/282) (seq (ignore x/282) 1))) +(let (*match*/284 = 3 *match*/285 = 2 *match*/286 = 1) (catch - (if (!= *match*/280 3) - (if (!= *match*/279 1) 0 - (exit 4 (makeblock 0 *match*/279 *match*/280 *match*/281))) - (exit 4 (makeblock 0 *match*/279 *match*/280 *match*/281))) - with (4 x/277) (seq (ignore x/277) 1))) + (if (!= *match*/285 3) + (if (!= *match*/284 1) 0 + (exit 4 (makeblock 0 *match*/284 *match*/285 *match*/286))) + (exit 4 (makeblock 0 *match*/284 *match*/285 *match*/286))) + with (4 x/282) (seq (ignore x/282) 1))) - : bool = false |}];; @@ -76,8 +76,8 @@ let _ = fun a b -> | ((true, _) as _g) | ((false, _) as _g) -> () [%%expect{| -(function a/284[int] b/285 : int 0) -(function a/284[int] b/285 : int 0) +(function a/289[int] b/290 : int 0) +(function a/289[int] b/290 : int 0) - : bool -> 'a -> unit = |}];; @@ -96,8 +96,8 @@ let _ = fun a b -> match a, b with | (false, _) as p -> p (* outside, trivial *) [%%expect {| -(function a/288[int] b/289 (let (p/290 =a (makeblock 0 a/288 b/289)) p/290)) -(function a/288[int] b/289 (makeblock 0 a/288 b/289)) +(function a/293[int] b/294 (let (p/295 =a (makeblock 0 a/293 b/294)) p/295)) +(function a/293[int] b/294 (makeblock 0 a/293 b/294)) - : bool -> 'a -> bool * 'a = |}] @@ -106,8 +106,8 @@ let _ = fun a b -> match a, b with | ((false, _) as p) -> p (* inside, trivial *) [%%expect{| -(function 
a/292[int] b/293 (let (p/294 =a (makeblock 0 a/292 b/293)) p/294)) -(function a/292[int] b/293 (makeblock 0 a/292 b/293)) +(function a/297[int] b/298 (let (p/299 =a (makeblock 0 a/297 b/298)) p/299)) +(function a/297[int] b/298 (makeblock 0 a/297 b/298)) - : bool -> 'a -> bool * 'a = |}];; @@ -116,11 +116,11 @@ let _ = fun a b -> match a, b with | (false as x, _) as p -> x, p (* outside, simple *) [%%expect {| -(function a/298[int] b/299 - (let (x/300 =a[int] a/298 p/301 =a (makeblock 0 a/298 b/299)) - (makeblock 0 (int,*) x/300 p/301))) -(function a/298[int] b/299 - (makeblock 0 (int,*) a/298 (makeblock 0 a/298 b/299))) +(function a/303[int] b/304 + (let (x/305 =a[int] a/303 p/306 =a (makeblock 0 a/303 b/304)) + (makeblock 0 (int,*) x/305 p/306))) +(function a/303[int] b/304 + (makeblock 0 (int,*) a/303 (makeblock 0 a/303 b/304))) - : bool -> 'a -> bool * (bool * 'a) = |}] @@ -129,11 +129,11 @@ let _ = fun a b -> match a, b with | ((false as x, _) as p) -> x, p (* inside, simple *) [%%expect {| -(function a/304[int] b/305 - (let (x/306 =a[int] a/304 p/307 =a (makeblock 0 a/304 b/305)) - (makeblock 0 (int,*) x/306 p/307))) -(function a/304[int] b/305 - (makeblock 0 (int,*) a/304 (makeblock 0 a/304 b/305))) +(function a/309[int] b/310 + (let (x/311 =a[int] a/309 p/312 =a (makeblock 0 a/309 b/310)) + (makeblock 0 (int,*) x/311 p/312))) +(function a/309[int] b/310 + (makeblock 0 (int,*) a/309 (makeblock 0 a/309 b/310))) - : bool -> 'a -> bool * (bool * 'a) = |}] @@ -142,15 +142,15 @@ let _ = fun a b -> match a, b with | (false, x) as p -> x, p (* outside, complex *) [%%expect{| -(function a/314[int] b/315[int] - (if a/314 - (let (x/316 =a[int] a/314 p/317 =a (makeblock 0 a/314 b/315)) - (makeblock 0 (int,*) x/316 p/317)) - (let (x/318 =a b/315 p/319 =a (makeblock 0 a/314 b/315)) - (makeblock 0 (int,*) x/318 p/319)))) -(function a/314[int] b/315[int] - (if a/314 (makeblock 0 (int,*) a/314 (makeblock 0 a/314 b/315)) - (makeblock 0 (int,*) b/315 (makeblock 0 a/314 
b/315)))) +(function a/319[int] b/320[int] + (if a/319 + (let (x/321 =a[int] a/319 p/322 =a (makeblock 0 a/319 b/320)) + (makeblock 0 (int,*) x/321 p/322)) + (let (x/323 =a b/320 p/324 =a (makeblock 0 a/319 b/320)) + (makeblock 0 (int,*) x/323 p/324)))) +(function a/319[int] b/320[int] + (if a/319 (makeblock 0 (int,*) a/319 (makeblock 0 a/319 b/320)) + (makeblock 0 (int,*) b/320 (makeblock 0 a/319 b/320)))) - : bool -> bool -> bool * (bool * bool) = |}] @@ -160,19 +160,19 @@ let _ = fun a b -> match a, b with -> x, p (* inside, complex *) [%%expect{| -(function a/320[int] b/321[int] +(function a/325[int] b/326[int] (catch - (if a/320 - (let (x/328 =a[int] a/320 p/329 =a (makeblock 0 a/320 b/321)) - (exit 10 x/328 p/329)) - (let (x/326 =a b/321 p/327 =a (makeblock 0 a/320 b/321)) - (exit 10 x/326 p/327))) - with (10 x/322[int] p/323) (makeblock 0 (int,*) x/322 p/323))) -(function a/320[int] b/321[int] + (if a/325 + (let (x/333 =a[int] a/325 p/334 =a (makeblock 0 a/325 b/326)) + (exit 10 x/333 p/334)) + (let (x/331 =a b/326 p/332 =a (makeblock 0 a/325 b/326)) + (exit 10 x/331 p/332))) + with (10 x/327[int] p/328) (makeblock 0 (int,*) x/327 p/328))) +(function a/325[int] b/326[int] (catch - (if a/320 (exit 10 a/320 (makeblock 0 a/320 b/321)) - (exit 10 b/321 (makeblock 0 a/320 b/321))) - with (10 x/322[int] p/323) (makeblock 0 (int,*) x/322 p/323))) + (if a/325 (exit 10 a/325 (makeblock 0 a/325 b/326)) + (exit 10 b/326 (makeblock 0 a/325 b/326))) + with (10 x/327[int] p/328) (makeblock 0 (int,*) x/327 p/328))) - : bool -> bool -> bool * (bool * bool) = |}] @@ -185,15 +185,15 @@ let _ = fun a b -> match a, b with | (false as x, _) as p -> x, p (* outside, onecase *) [%%expect {| -(function a/330[int] b/331[int] - (if a/330 - (let (x/332 =a[int] a/330 _p/333 =a (makeblock 0 a/330 b/331)) - (makeblock 0 (int,*) x/332 [0: 1 1])) - (let (x/334 =a[int] a/330 p/335 =a (makeblock 0 a/330 b/331)) - (makeblock 0 (int,*) x/334 p/335)))) -(function a/330[int] b/331[int] - (if 
a/330 (makeblock 0 (int,*) a/330 [0: 1 1]) - (makeblock 0 (int,*) a/330 (makeblock 0 a/330 b/331)))) +(function a/335[int] b/336[int] + (if a/335 + (let (x/337 =a[int] a/335 _p/338 =a (makeblock 0 a/335 b/336)) + (makeblock 0 (int,*) x/337 [0: 1 1])) + (let (x/339 =a[int] a/335 p/340 =a (makeblock 0 a/335 b/336)) + (makeblock 0 (int,*) x/339 p/340)))) +(function a/335[int] b/336[int] + (if a/335 (makeblock 0 (int,*) a/335 [0: 1 1]) + (makeblock 0 (int,*) a/335 (makeblock 0 a/335 b/336)))) - : bool -> bool -> bool * (bool * bool) = |}] @@ -202,11 +202,11 @@ let _ = fun a b -> match a, b with | ((false as x, _) as p) -> x, p (* inside, onecase *) [%%expect{| -(function a/336[int] b/337 - (let (x/338 =a[int] a/336 p/339 =a (makeblock 0 a/336 b/337)) - (makeblock 0 (int,*) x/338 p/339))) -(function a/336[int] b/337 - (makeblock 0 (int,*) a/336 (makeblock 0 a/336 b/337))) +(function a/341[int] b/342 + (let (x/343 =a[int] a/341 p/344 =a (makeblock 0 a/341 b/342)) + (makeblock 0 (int,*) x/343 p/344))) +(function a/341[int] b/342 + (makeblock 0 (int,*) a/341 (makeblock 0 a/341 b/342))) - : bool -> 'a -> bool * (bool * 'a) = |}] @@ -223,14 +223,14 @@ let _ =fun a b -> match a, b with | (_, _) as p -> p (* outside, tuplist *) [%%expect {| -(function a/349[int] b/350 +(function a/354[int] b/355 (catch - (if a/349 (if b/350 (let (p/351 =a (field 0 b/350)) p/351) (exit 12)) + (if a/354 (if b/355 (let (p/356 =a (field_imm 0 b/355)) p/356) (exit 12)) (exit 12)) - with (12) (let (p/352 =a (makeblock 0 a/349 b/350)) p/352))) -(function a/349[int] b/350 - (catch (if a/349 (if b/350 (field 0 b/350) (exit 12)) (exit 12)) with (12) - (makeblock 0 a/349 b/350))) + with (12) (let (p/357 =a (makeblock 0 a/354 b/355)) p/357))) +(function a/354[int] b/355 + (catch (if a/354 (if b/355 (field_imm 0 b/355) (exit 12)) (exit 12)) + with (12) (makeblock 0 a/354 b/355))) - : bool -> bool tuplist -> bool * bool tuplist = |}] @@ -239,19 +239,20 @@ let _ = fun a b -> match a, b with | ((_, _) as p) 
-> p (* inside, tuplist *) [%%expect{| -(function a/353[int] b/354 +(function a/358[int] b/359 (catch (catch - (if a/353 - (if b/354 (let (p/358 =a (field 0 b/354)) (exit 13 p/358)) (exit 14)) + (if a/358 + (if b/359 (let (p/363 =a (field_imm 0 b/359)) (exit 13 p/363)) + (exit 14)) (exit 14)) - with (14) (let (p/357 =a (makeblock 0 a/353 b/354)) (exit 13 p/357))) - with (13 p/355) p/355)) -(function a/353[int] b/354 + with (14) (let (p/362 =a (makeblock 0 a/358 b/359)) (exit 13 p/362))) + with (13 p/360) p/360)) +(function a/358[int] b/359 (catch (catch - (if a/353 (if b/354 (exit 13 (field 0 b/354)) (exit 14)) (exit 14)) - with (14) (exit 13 (makeblock 0 a/353 b/354))) - with (13 p/355) p/355)) + (if a/358 (if b/359 (exit 13 (field_imm 0 b/359)) (exit 14)) (exit 14)) + with (14) (exit 13 (makeblock 0 a/358 b/359))) + with (13 p/360) p/360)) - : bool -> bool tuplist -> bool * bool tuplist = |}] diff --git a/testsuite/tests/basic/patmatch_split_no_or.ml b/testsuite/tests/basic/patmatch_split_no_or.ml index dcad4f66b398..57427d95245d 100644 --- a/testsuite/tests/basic/patmatch_split_no_or.ml +++ b/testsuite/tests/basic/patmatch_split_no_or.ml @@ -18,10 +18,10 @@ let last_is_anys = function (last_is_anys/11 = (function param/13 : int (catch - (if (field 0 param/13) (if (field 1 param/13) (exit 1) 1) - (if (field 1 param/13) (exit 1) 2)) + (if (field_imm 0 param/13) (if (field_imm 1 param/13) (exit 1) 1) + (if (field_imm 1 param/13) (exit 1) 2)) with (1) 3))) - (apply (field 1 (global Toploop!)) "last_is_anys" last_is_anys/11)) + (apply (field_mut 1 (global Toploop!)) "last_is_anys" last_is_anys/11)) val last_is_anys : bool * bool -> int = |}] @@ -35,10 +35,10 @@ let last_is_vars = function (last_is_vars/18 = (function param/22 : int (catch - (if (field 0 param/22) (if (field 1 param/22) (exit 3) 1) - (if (field 1 param/22) (exit 3) 2)) + (if (field_imm 0 param/22) (if (field_imm 1 param/22) (exit 3) 1) + (if (field_imm 1 param/22) (exit 3) 2)) with (3) 3))) - (apply 
(field 1 (global Toploop!)) "last_is_vars" last_is_vars/18)) + (apply (field_mut 1 (global Toploop!)) "last_is_vars" last_is_vars/18)) val last_is_vars : bool * bool -> int = |}] @@ -55,9 +55,9 @@ type t = .. (A/26 = (makeblock 248 "A" (caml_fresh_oo_id 0)) B/27 = (makeblock 248 "B" (caml_fresh_oo_id 0)) C/28 = (makeblock 248 "C" (caml_fresh_oo_id 0))) - (seq (apply (field 1 (global Toploop!)) "A/26" A/26) - (apply (field 1 (global Toploop!)) "B/27" B/27) - (apply (field 1 (global Toploop!)) "C/28" C/28))) + (seq (apply (field_mut 1 (global Toploop!)) "A/26" A/26) + (apply (field_mut 1 (global Toploop!)) "B/27" B/27) + (apply (field_mut 1 (global Toploop!)) "C/28" C/28))) type t += A | B of unit | C of bool * int |}] @@ -71,20 +71,20 @@ let f = function ;; [%%expect{| (let - (C/28 = (apply (field 0 (global Toploop!)) "C/28") - B/27 = (apply (field 0 (global Toploop!)) "B/27") - A/26 = (apply (field 0 (global Toploop!)) "A/26") + (C/28 = (apply (field_mut 0 (global Toploop!)) "C/28") + B/27 = (apply (field_mut 0 (global Toploop!)) "B/27") + A/26 = (apply (field_mut 0 (global Toploop!)) "A/26") f/29 = (function param/31 : int - (let (*match*/32 =a (field 0 param/31)) + (let (*match*/32 =a (field_imm 0 param/31)) (catch - (if (== *match*/32 A/26) (if (field 1 param/31) 1 (exit 8)) + (if (== *match*/32 A/26) (if (field_imm 1 param/31) 1 (exit 8)) (exit 8)) with (8) - (if (field 1 param/31) - (if (== (field 0 *match*/32) B/27) 2 - (if (== (field 0 *match*/32) C/28) 3 4)) - (if (field 2 param/31) 12 11)))))) - (apply (field 1 (global Toploop!)) "f" f/29)) + (if (field_imm 1 param/31) + (if (== (field_imm 0 *match*/32) B/27) 2 + (if (== (field_imm 0 *match*/32) C/28) 3 4)) + (if (field_imm 2 param/31) 12 11)))))) + (apply (field_mut 1 (global Toploop!)) "f" f/29)) val f : t * bool * bool -> int = |}] diff --git a/testsuite/tests/c-api/alloc_async.ml b/testsuite/tests/c-api/alloc_async.ml index 0ed35acf163e..b8c99a4b079f 100644 --- a/testsuite/tests/c-api/alloc_async.ml 
+++ b/testsuite/tests/c-api/alloc_async.ml @@ -1,5 +1,7 @@ (* TEST modules = "alloc_async_stubs.c" + * skip + reason = "alloc async changes: https://github.com/ocaml/ocaml/pull/8897" *) external test : int ref -> unit = "stub" diff --git a/testsuite/tests/c-api/alloc_async_stubs.c b/testsuite/tests/c-api/alloc_async_stubs.c index 7dec51eaa707..7dfbeb4caae3 100644 --- a/testsuite/tests/c-api/alloc_async_stubs.c +++ b/testsuite/tests/c-api/alloc_async_stubs.c @@ -39,7 +39,7 @@ value stub(value ref) /* Large allocations */ caml_alloc(1000, 0); - caml_alloc_shr(1000, 0); + caml_alloc_shr(1000, String_tag); caml_alloc_tuple(1000); caml_alloc_float_array(1000); caml_alloc_string(10000); diff --git a/testsuite/tests/callback/minor_named.ml b/testsuite/tests/callback/minor_named.ml new file mode 100644 index 000000000000..9bc01b77d5e2 --- /dev/null +++ b/testsuite/tests/callback/minor_named.ml @@ -0,0 +1,19 @@ +(* TEST + include unix + modules = "minor_named_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests Callback.register and caml_named_value on a young object *) + +external incr_ref : unit -> unit = "incr_ref" + +let () = + let r = ref 40 in + Callback.register "incr_ref" r; + incr_ref (); + Gc.minor (); + incr_ref (); + Printf.printf "%d\n" !r diff --git a/testsuite/tests/callback/minor_named.mli b/testsuite/tests/callback/minor_named.mli new file mode 100644 index 000000000000..b47995c5c177 --- /dev/null +++ b/testsuite/tests/callback/minor_named.mli @@ -0,0 +1 @@ +external incr_ref : unit -> unit = "incr_ref" diff --git a/testsuite/tests/callback/minor_named.reference b/testsuite/tests/callback/minor_named.reference new file mode 100644 index 000000000000..d81cc0710eb6 --- /dev/null +++ b/testsuite/tests/callback/minor_named.reference @@ -0,0 +1 @@ +42 diff --git a/testsuite/tests/callback/minor_named_.c b/testsuite/tests/callback/minor_named_.c new file mode 100644 index 000000000000..2d8ad2d3846b --- /dev/null +++ b/testsuite/tests/callback/minor_named_.c 
@@ -0,0 +1,10 @@ +#include +#include +#include + +value incr_ref(value unit) { + static const value* v; + if (!v) v = caml_named_value("incr_ref"); + caml_modify(&Field(*v, 0), Val_int(Int_val(Field(*v, 0)) + 1)); + return Val_unit; +} diff --git a/testsuite/tests/callback/nested_fiber.ml b/testsuite/tests/callback/nested_fiber.ml new file mode 100644 index 000000000000..d7e792729aa9 --- /dev/null +++ b/testsuite/tests/callback/nested_fiber.ml @@ -0,0 +1,53 @@ +(* TEST + include unix + modules = "nested_fiber_.c" + * libunix + ** bytecode + ** native +*) + +external caml_to_c : (unit -> 'a) -> 'a = "caml_to_c" + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +type 'a tree = Empty | Node of 'a tree * 'a tree + +let rec make d = + match d with + | 0 -> Node(Empty, Empty) + | _ -> let d = d - 1 in Node(make d, make d) + +let rec check = function Empty -> 0 | Node(l, r) -> 1 + check l + check r + +let g () = + caml_to_c (fun () -> + Gc.full_major (); + let x = make 10 in + Printf.printf "g() check %d\n%!" (check x)) + +let f () = + let x = make 3 in + let z = ref 1 in + match_with g () + { retc = (fun () -> Printf.printf "g() returned: %d\n%!" !z); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun (k : (a, _) continuation) -> assert false) + | _ -> None }; + Printf.printf "f() check: %d\n%!" (check x) + +let () = + let x = make 3 in + let z = ref 2 in + match_with f () + { retc = (fun () -> Printf.printf "f() returned: %d\n%!" !z); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> assert false) + | _ -> None }; + Printf.printf "() check: %d\n%!" 
(check x) diff --git a/testsuite/tests/callback/nested_fiber.mli b/testsuite/tests/callback/nested_fiber.mli new file mode 100644 index 000000000000..51fa3954b4fa --- /dev/null +++ b/testsuite/tests/callback/nested_fiber.mli @@ -0,0 +1 @@ +external caml_to_c : (unit -> 'a) -> 'a = "caml_to_c" diff --git a/testsuite/tests/callback/nested_fiber.reference b/testsuite/tests/callback/nested_fiber.reference new file mode 100644 index 000000000000..12358fbb2c41 --- /dev/null +++ b/testsuite/tests/callback/nested_fiber.reference @@ -0,0 +1,5 @@ +g() check 2047 +g() returned: 1 +f() check: 15 +f() returned: 2 +() check: 15 diff --git a/testsuite/tests/callback/nested_fiber_.c b/testsuite/tests/callback/nested_fiber_.c new file mode 100644 index 000000000000..0978ac9b20c7 --- /dev/null +++ b/testsuite/tests/callback/nested_fiber_.c @@ -0,0 +1,6 @@ +#include +#include + +value caml_to_c (value f) { + return caml_callback(f, Val_unit); +} diff --git a/testsuite/tests/callback/stack_overflow.ml b/testsuite/tests/callback/stack_overflow.ml new file mode 100644 index 000000000000..df0e4f5c4d62 --- /dev/null +++ b/testsuite/tests/callback/stack_overflow.ml @@ -0,0 +1,31 @@ +(* TEST + include unix + modules = "stack_overflow_.c" + * libunix + ** bytecode + ** native +*) + +external caml_to_c : (unit -> 'a) -> 'a = "caml_to_c" + +let rec deep = function + | 0 -> + ref 42 + | n -> + caml_to_c (fun () -> deep (n-1)) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let () = + Printf.printf "%d\n%d\n%!" 
+ (!(deep 1000)) + (match_with deep 1000 + { retc = (fun x -> !x); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> assert false) + | _ -> None }) diff --git a/testsuite/tests/callback/stack_overflow.mli b/testsuite/tests/callback/stack_overflow.mli new file mode 100644 index 000000000000..51fa3954b4fa --- /dev/null +++ b/testsuite/tests/callback/stack_overflow.mli @@ -0,0 +1 @@ +external caml_to_c : (unit -> 'a) -> 'a = "caml_to_c" diff --git a/testsuite/tests/callback/stack_overflow.reference b/testsuite/tests/callback/stack_overflow.reference new file mode 100644 index 000000000000..daaac9e30302 --- /dev/null +++ b/testsuite/tests/callback/stack_overflow.reference @@ -0,0 +1,2 @@ +42 +42 diff --git a/testsuite/tests/callback/stack_overflow_.c b/testsuite/tests/callback/stack_overflow_.c new file mode 100644 index 000000000000..0978ac9b20c7 --- /dev/null +++ b/testsuite/tests/callback/stack_overflow_.c @@ -0,0 +1,6 @@ +#include +#include + +value caml_to_c (value f) { + return caml_callback(f, Val_unit); +} diff --git a/testsuite/tests/callback/test1.ml b/testsuite/tests/callback/test1.ml new file mode 100644 index 000000000000..7c45945a8be5 --- /dev/null +++ b/testsuite/tests/callback/test1.ml @@ -0,0 +1,52 @@ +(* TEST + include unix + modules = "test1_.c" + * libunix + ** bytecode + ** native +*) + +(**************************************************************************) + +external mycallback1 : ('a -> 'b) -> 'a -> 'b = "mycallback1" +external mycallback2 : ('a -> 'b -> 'c) -> 'a -> 'b -> 'c = "mycallback2" +external mycallback3 : ('a -> 'b -> 'c -> 'd) -> 'a -> 'b -> 'c -> 'd + = "mycallback3" +external mycallback4 : + ('a -> 'b -> 'c -> 'd -> 'e) -> 'a -> 'b -> 'c -> 'd -> 'e = "mycallback4" + +let rec tak (x, y, z as _tuple) = + if x > y then tak(tak (x-1, y, z), tak (y-1, z, x), tak (z-1, x, y)) + else z + +let tak2 x (y, z) = tak (x, y, z) + +let tak3 x y z = tak (x, y, z) + +let tak4 x y z u = 
tak (x, y, z + u) + +let raise_exit () = (raise Exit : unit) + +let trapexit () = + begin try + mycallback1 raise_exit () + with Exit -> + () + end; + tak (18, 12, 6) + +external mypushroot : 'a -> ('b -> 'c) -> 'b -> 'a = "mypushroot" +external mycamlparam : 'a -> ('b -> 'c) -> 'b -> 'a = "mycamlparam" + +let tripwire f = + let s = String.make 5 'a' in + f s trapexit () + +let _ = + print_int(mycallback1 tak (18, 12, 6)); print_newline(); + print_int(mycallback2 tak2 18 (12, 6)); print_newline(); + print_int(mycallback3 tak3 18 12 6); print_newline(); + print_int(mycallback4 tak4 18 12 3 3); print_newline(); + print_int(trapexit ()); print_newline(); + print_string(tripwire mypushroot); print_newline(); + print_string(tripwire mycamlparam); print_newline(); diff --git a/testsuite/tests/callback/test1.mli b/testsuite/tests/callback/test1.mli new file mode 100644 index 000000000000..9242c1c6c6d4 --- /dev/null +++ b/testsuite/tests/callback/test1.mli @@ -0,0 +1,15 @@ +external mycallback1 : ('a -> 'b) -> 'a -> 'b = "mycallback1" +external mycallback2 : ('a -> 'b -> 'c) -> 'a -> 'b -> 'c = "mycallback2" +external mycallback3 : ('a -> 'b -> 'c -> 'd) -> 'a -> 'b -> 'c -> 'd + = "mycallback3" +external mycallback4 : + ('a -> 'b -> 'c -> 'd -> 'e) -> 'a -> 'b -> 'c -> 'd -> 'e = "mycallback4" +val tak : int * int * int -> int +val tak2 : int -> int * int -> int +val tak3 : int -> int -> int -> int +val tak4 : int -> int -> int -> int -> int +val raise_exit : unit -> unit +val trapexit : unit -> int +external mypushroot : 'a -> ('b -> 'c) -> 'b -> 'a = "mypushroot" +external mycamlparam : 'a -> ('b -> 'c) -> 'b -> 'a = "mycamlparam" +val tripwire : (string -> (unit -> int) -> unit -> 'a) -> 'a diff --git a/testsuite/tests/callback/test1.reference b/testsuite/tests/callback/test1.reference new file mode 100644 index 000000000000..8b2262dc5c0e --- /dev/null +++ b/testsuite/tests/callback/test1.reference @@ -0,0 +1,7 @@ +7 +7 +7 +7 +7 +aaaaa +aaaaa diff --git 
a/testsuite/tests/callback/test1_.c b/testsuite/tests/callback/test1_.c new file mode 100644 index 000000000000..45879a01917c --- /dev/null +++ b/testsuite/tests/callback/test1_.c @@ -0,0 +1,69 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ +/* */ +/* Copyright 1995 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#include "caml/mlvalues.h" +#include "caml/memory.h" +#include "caml/callback.h" + +value mycallback1(value fun, value arg) +{ + value res; + res = caml_callback(fun, arg); + return res; +} + +value mycallback2(value fun, value arg1, value arg2) +{ + value res; + res = caml_callback2(fun, arg1, arg2); + return res; +} + +value mycallback3(value fun, value arg1, value arg2, value arg3) +{ + value res; + res = caml_callback3(fun, arg1, arg2, arg3); + return res; +} + +value mycallback4(value fun, value arg1, value arg2, value arg3, value arg4) +{ + value args[4]; + value res; + args[0] = arg1; + args[1] = arg2; + args[2] = arg3; + args[3] = arg4; + res = caml_callbackN(fun, 4, args); + return res; +} + +value mypushroot(value v, value fun, value arg) +{ + Begin_root(v) + caml_callback(fun, arg); + End_roots(); + return v; +} + +value mycamlparam (value v, value fun, value arg) +{ + CAMLparam3 (v, fun, arg); + CAMLlocal2 (x, y); + x = v; + y = caml_callback (fun, arg); + v = x; + CAMLreturn (v); +} diff --git a/testsuite/tests/callback/test2.ml b/testsuite/tests/callback/test2.ml new file mode 100644 index 000000000000..87314864f29d --- /dev/null +++ b/testsuite/tests/callback/test2.ml @@ -0,0 +1,25 @@ +(* TEST + include 
unix + modules = "test2_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) to C (printf functions). *) + +let printf = Printf.printf + +let c_to_caml () = + printf "[Caml] Enter c_to_caml\n%!"; + printf "[Caml] Leave c_to_caml\n%!" + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : unit -> unit = "caml_to_c" + +let _ = + printf "[Caml] Call caml_to_c\n%!"; + caml_to_c (); + printf "[Caml] Return from caml_to_c\n%!" diff --git a/testsuite/tests/callback/test2.mli b/testsuite/tests/callback/test2.mli new file mode 100644 index 000000000000..6aedf9522bf0 --- /dev/null +++ b/testsuite/tests/callback/test2.mli @@ -0,0 +1 @@ +external caml_to_c : unit -> unit = "caml_to_c" diff --git a/testsuite/tests/callback/test2.reference b/testsuite/tests/callback/test2.reference new file mode 100644 index 000000000000..4b166b2223e4 --- /dev/null +++ b/testsuite/tests/callback/test2.reference @@ -0,0 +1,8 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +[Caml] Leave c_to_caml +[Caml] Return from caml_to_c +[C] Enter caml_to_c +[C] Call c_to_caml +[C] Return from c_to_caml +[C] Leave caml_to_c diff --git a/testsuite/tests/callback/test2_.c b/testsuite/tests/callback/test2_.c new file mode 100644 index 000000000000..f16fe30c0e47 --- /dev/null +++ b/testsuite/tests/callback/test2_.c @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +value caml_to_c (value unit) { + CAMLparam1 (unit); + printf ("[C] Enter caml_to_c\n"); + + static const value* c_to_caml_closure = NULL; + + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + printf ("[C] Call c_to_caml\n"); + caml_callback(*c_to_caml_closure, Val_unit); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} diff --git a/testsuite/tests/callback/test3.ml b/testsuite/tests/callback/test3.ml new file mode 100644 index 
000000000000..efa92d00de01 --- /dev/null +++ b/testsuite/tests/callback/test3.ml @@ -0,0 +1,37 @@ +(* TEST + include unix + modules = "test3_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) to C (printf functions). A stack overflow and a heap + * overflow are triggered in c_to_caml. *) + +let printf = Printf.printf + +let rec mk_list length acc = + if length < 1 then acc + else mk_list (length-1) ((length-1)::acc) + +let rec sum n = if n = 0 then 0 else n + sum (n-1) + +let c_to_caml () = + printf "[Caml] Enter c_to_caml\n%!"; + (* Heap overflow *) + let l = mk_list 1000 [] in + Printf.printf "%d\n" (List.hd l); + (* Stack overflow *) + Printf.printf "%d\n" (sum 100000); + printf "[Caml] Leave c_to_caml\n%!" + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : unit -> unit = "caml_to_c" + +let _ = + printf "[Caml] Call caml_to_c\n%!"; + caml_to_c (); + printf "[Caml] Return from caml_to_c\n%!" 
diff --git a/testsuite/tests/callback/test3.mli b/testsuite/tests/callback/test3.mli new file mode 100644 index 000000000000..6aedf9522bf0 --- /dev/null +++ b/testsuite/tests/callback/test3.mli @@ -0,0 +1 @@ +external caml_to_c : unit -> unit = "caml_to_c" diff --git a/testsuite/tests/callback/test3.reference b/testsuite/tests/callback/test3.reference new file mode 100644 index 000000000000..eb917de6dcfb --- /dev/null +++ b/testsuite/tests/callback/test3.reference @@ -0,0 +1,10 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +0 +5000050000 +[Caml] Leave c_to_caml +[Caml] Return from caml_to_c +[C] Enter caml_to_c +[C] Call c_to_caml +[C] Return from c_to_caml +[C] Leave caml_to_c diff --git a/testsuite/tests/callback/test3_.c b/testsuite/tests/callback/test3_.c new file mode 100644 index 000000000000..f16fe30c0e47 --- /dev/null +++ b/testsuite/tests/callback/test3_.c @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +value caml_to_c (value unit) { + CAMLparam1 (unit); + printf ("[C] Enter caml_to_c\n"); + + static const value* c_to_caml_closure = NULL; + + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + printf ("[C] Call c_to_caml\n"); + caml_callback(*c_to_caml_closure, Val_unit); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} diff --git a/testsuite/tests/callback/test4.ml b/testsuite/tests/callback/test4.ml new file mode 100644 index 000000000000..3c42317d7e17 --- /dev/null +++ b/testsuite/tests/callback/test4.ml @@ -0,0 +1,31 @@ +(* TEST + include unix + modules = "test4_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) to C (printf functions). Exception is raised in a + * Callback, which unwinds the C stack and gets caught in OCaml. 
*) + +exception E + +let printf = Printf.printf + +let c_to_caml () = + printf "[Caml] Enter c_to_caml\n%!"; + printf "[Caml] c_to_caml: raise exception\n%!"; + raise E + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : unit -> unit = "caml_to_c" + +let _ = + try + printf "[Caml] Call caml_to_c\n%!"; + caml_to_c (); + printf "[Caml] Return from caml_to_c\n%!" + with E -> printf "[Caml] Caught exception\n%!" diff --git a/testsuite/tests/callback/test4.mli b/testsuite/tests/callback/test4.mli new file mode 100644 index 000000000000..6aedf9522bf0 --- /dev/null +++ b/testsuite/tests/callback/test4.mli @@ -0,0 +1 @@ +external caml_to_c : unit -> unit = "caml_to_c" diff --git a/testsuite/tests/callback/test4.reference b/testsuite/tests/callback/test4.reference new file mode 100644 index 000000000000..4aa50ab9ecf2 --- /dev/null +++ b/testsuite/tests/callback/test4.reference @@ -0,0 +1,6 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +[Caml] c_to_caml: raise exception +[Caml] Caught exception +[C] Enter caml_to_c +[C] Call c_to_caml diff --git a/testsuite/tests/callback/test4_.c b/testsuite/tests/callback/test4_.c new file mode 100644 index 000000000000..f16fe30c0e47 --- /dev/null +++ b/testsuite/tests/callback/test4_.c @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +value caml_to_c (value unit) { + CAMLparam1 (unit); + printf ("[C] Enter caml_to_c\n"); + + static const value* c_to_caml_closure = NULL; + + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + printf ("[C] Call c_to_caml\n"); + caml_callback(*c_to_caml_closure, Val_unit); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} diff --git a/testsuite/tests/callback/test5.ml b/testsuite/tests/callback/test5.ml new file mode 100644 index 000000000000..62fd8b9b0ece --- /dev/null +++ b/testsuite/tests/callback/test5.ml @@ -0,0 +1,29 @@ +(* TEST + include unix + modules = "test5_.c" + * libunix + 
** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) to C (printf functions). Test calls with arguments passed + * on the stack from C to OCaml and OCaml to C. *) + +let printf = Printf.printf + +let c_to_caml n = + printf "[Caml] Enter c_to_caml\n%!"; + printf "c_to_caml: n=%d\n" n; + printf "[Caml] Leave c_to_caml\n%!" + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : int -> int -> int -> int -> int + -> int -> int -> int -> int -> int + -> int -> unit = "caml_to_c_bytecode" "caml_to_c_native" + +let _ = + printf "[Caml] Call caml_to_c\n%!"; + caml_to_c 1 2 3 4 5 6 7 8 9 10 11; + printf "[Caml] Return from caml_to_c\n%!" diff --git a/testsuite/tests/callback/test5.mli b/testsuite/tests/callback/test5.mli new file mode 100644 index 000000000000..8c0a90422e58 --- /dev/null +++ b/testsuite/tests/callback/test5.mli @@ -0,0 +1,3 @@ +external caml_to_c : int -> int -> int -> int -> int + -> int -> int -> int -> int -> int + -> int -> unit = "caml_to_c_bytecode" "caml_to_c_native" diff --git a/testsuite/tests/callback/test5.reference b/testsuite/tests/callback/test5.reference new file mode 100644 index 000000000000..d930c4d1ffac --- /dev/null +++ b/testsuite/tests/callback/test5.reference @@ -0,0 +1,9 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +c_to_caml: n=66 +[Caml] Leave c_to_caml +[Caml] Return from caml_to_c +[C] Enter caml_to_c +[C] Call c_to_caml +[C] Return from c_to_caml +[C] Leave caml_to_c diff --git a/testsuite/tests/callback/test5_.c b/testsuite/tests/callback/test5_.c new file mode 100644 index 000000000000..5e77080e7835 --- /dev/null +++ b/testsuite/tests/callback/test5_.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include + +value caml_to_c_native (value a1, value a2, value a3, value a4, value a5, + value a6, value a7, value a8, value a9, value a10, + value a11) +{ + CAMLparam0 (); + long l; + + printf ("[C] Enter caml_to_c\n"); + + 
static const value* c_to_caml_closure = NULL; + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + l = Long_val (a1) + Long_val (a2) + Long_val (a3) + Long_val (a4) + + Long_val (a5) + Long_val (a6) + Long_val (a7) + Long_val (a8) + + Long_val (a9) + Long_val (a10) + Long_val (a11); + + printf ("[C] Call c_to_caml\n"); + caml_callback(*c_to_caml_closure, Val_long(l)); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} + +value caml_to_c_bytecode (value * argv, int argn) { + return caml_to_c_native (argv[0], argv[1], argv[2], argv[3], argv[4], + argv[5], argv[6], argv[7], argv[8], argv[9], + argv[10]); +} diff --git a/testsuite/tests/callback/test6.ml b/testsuite/tests/callback/test6.ml new file mode 100644 index 000000000000..e3ff477f5b6d --- /dev/null +++ b/testsuite/tests/callback/test6.ml @@ -0,0 +1,34 @@ +(* TEST + include unix + modules = "test6_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) to C (printf functions). Exception is raised in a + * Callback, which unwinds the C stack and gets caught in OCaml. *) + +exception E + +let printf = Printf.printf + +let c_to_caml () = + printf "[Caml] Enter c_to_caml\n%!"; + printf "[Caml] c_to_caml: raise exception\n%!"; + raise E + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : unit -> unit = "caml_to_c" + +let _ = + try + printf "[Caml] Call caml_to_c\n%!"; + caml_to_c (); + printf "[Caml] Return from caml_to_c\n%!" 
+ with E -> + (printf "[Caml] Caught exception\n%!"; + try caml_to_c() with E -> printf "[Caml] Caught exception again\n%!"; + printf "[Caml] Done\n%!") diff --git a/testsuite/tests/callback/test6.mli b/testsuite/tests/callback/test6.mli new file mode 100644 index 000000000000..6aedf9522bf0 --- /dev/null +++ b/testsuite/tests/callback/test6.mli @@ -0,0 +1 @@ +external caml_to_c : unit -> unit = "caml_to_c" diff --git a/testsuite/tests/callback/test6.reference b/testsuite/tests/callback/test6.reference new file mode 100644 index 000000000000..266679b3194f --- /dev/null +++ b/testsuite/tests/callback/test6.reference @@ -0,0 +1,12 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +[Caml] c_to_caml: raise exception +[Caml] Caught exception +[Caml] Enter c_to_caml +[Caml] c_to_caml: raise exception +[Caml] Caught exception again +[Caml] Done +[C] Enter caml_to_c +[C] Call c_to_caml +[C] Enter caml_to_c +[C] Call c_to_caml diff --git a/testsuite/tests/callback/test6_.c b/testsuite/tests/callback/test6_.c new file mode 100644 index 000000000000..f16fe30c0e47 --- /dev/null +++ b/testsuite/tests/callback/test6_.c @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +value caml_to_c (value unit) { + CAMLparam1 (unit); + printf ("[C] Enter caml_to_c\n"); + + static const value* c_to_caml_closure = NULL; + + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + printf ("[C] Call c_to_caml\n"); + caml_callback(*c_to_caml_closure, Val_unit); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} diff --git a/testsuite/tests/callback/test7.ml b/testsuite/tests/callback/test7.ml new file mode 100644 index 000000000000..00aca09afe0c --- /dev/null +++ b/testsuite/tests/callback/test7.ml @@ -0,0 +1,42 @@ +(* TEST + include unix + modules = "test7_.c" + * libunix + ** bytecode + ** native +*) + +(* Tests nested calls from C (main C) to OCaml (main OCaml) to C (caml_to_c) to + * OCaml (c_to_caml) 
to C (printf functions). Effect E is performed in the + * callback, which does not have a handler. *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let printf = Printf.printf + +let c_to_caml () = + printf "[Caml] Enter c_to_caml\n%!"; + printf "[Caml] c_to_caml: perform effect\n%!"; + perform E + +let _ = Callback.register "c_to_caml" c_to_caml + +external caml_to_c : unit -> unit = "caml_to_c" + +let _ = + try_with (fun () -> + printf "[Caml] Call caml_to_c\n%!"; + begin try + caml_to_c () + with Unhandled -> + (printf "[Caml] Caught Unhandled, perform effect\n%!"; + perform E) + end; + printf "[Caml] Return from caml_to_c\n%!") () + { effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> printf "[Caml] Caught effect\n%!") + | _ -> None } diff --git a/testsuite/tests/callback/test7.mli b/testsuite/tests/callback/test7.mli new file mode 100644 index 000000000000..6aedf9522bf0 --- /dev/null +++ b/testsuite/tests/callback/test7.mli @@ -0,0 +1 @@ +external caml_to_c : unit -> unit = "caml_to_c" diff --git a/testsuite/tests/callback/test7.reference b/testsuite/tests/callback/test7.reference new file mode 100644 index 000000000000..4b346e5fcd99 --- /dev/null +++ b/testsuite/tests/callback/test7.reference @@ -0,0 +1,7 @@ +[Caml] Call caml_to_c +[Caml] Enter c_to_caml +[Caml] c_to_caml: perform effect +[Caml] Caught Unhandled, perform effect +[Caml] Caught effect +[C] Enter caml_to_c +[C] Call c_to_caml diff --git a/testsuite/tests/callback/test7_.c b/testsuite/tests/callback/test7_.c new file mode 100644 index 000000000000..f16fe30c0e47 --- /dev/null +++ b/testsuite/tests/callback/test7_.c @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +value caml_to_c (value unit) { + CAMLparam1 (unit); + printf ("[C] Enter caml_to_c\n"); + + static const value* c_to_caml_closure = NULL; + + if (!c_to_caml_closure) + c_to_caml_closure = caml_named_value("c_to_caml"); + + printf ("[C] Call c_to_caml\n"); + 
caml_callback(*c_to_caml_closure, Val_unit); + printf ("[C] Return from c_to_caml\n"); + + printf ("[C] Leave caml_to_c\n"); + CAMLreturn (Val_unit); +} diff --git a/testsuite/tests/callback/test_finaliser_gc.ml b/testsuite/tests/callback/test_finaliser_gc.ml new file mode 100644 index 000000000000..c9c51cabb6a6 --- /dev/null +++ b/testsuite/tests/callback/test_finaliser_gc.ml @@ -0,0 +1,31 @@ +(* TEST +*) + +let z = ref (0, 1, 2, 3, 4, 5, 6, 7) +let finaliser_pending = ref true + +let force_gc_fn _ = + print_string "Finaliser has run!"; print_newline(); + finaliser_pending := false; + Gc.full_major () + +let trigger_finaliser () = + (* Construct finaliser which when run will force + a major cycle *) + Gc.finalise force_gc_fn (ref 0); + (* Allocate a block in the minor heap *) + let s = String.make 5 'b' in + (* Spin on the minor heap allocating but keep [s] in a + register and force a major cycle such that the + finaliser runs. + NB: we quit after ~8B words allocated should something + be broken with finalisers *) + let x = ref 0 in + while (!x < 1_000_000_000) && !finaliser_pending do + z := (!x, !x, !x, !x, !x, !x, !x, !x); + incr x; + done; + s + +let _ = + print_string (trigger_finaliser ()); print_newline(); diff --git a/testsuite/tests/callback/test_finaliser_gc.reference b/testsuite/tests/callback/test_finaliser_gc.reference new file mode 100644 index 000000000000..8afdc9181bb0 --- /dev/null +++ b/testsuite/tests/callback/test_finaliser_gc.reference @@ -0,0 +1,2 @@ +Finaliser has run! 
+bbbbb diff --git a/testsuite/tests/callback/tcallback.ml b/testsuite/tests/callback/test_signalhandler.ml similarity index 86% rename from testsuite/tests/callback/tcallback.ml rename to testsuite/tests/callback/test_signalhandler.ml index cf9568a8f2e4..9d0ecaac1339 100644 --- a/testsuite/tests/callback/tcallback.ml +++ b/testsuite/tests/callback/test_signalhandler.ml @@ -1,6 +1,6 @@ (* TEST include unix - modules = "callbackprim.c" + modules = "test_signalhandler_.c" * libunix ** bytecode ** native @@ -52,14 +52,17 @@ let sighandler signo = (* Thoroughly wipe the minor heap *) ignore (tak (18, 12, 6)) -external raise_sigusr1 : unit -> unit = "raise_sigusr1" [@@noalloc] +external unix_getpid : unit -> int = "unix_getpid" [@@noalloc] +external unix_kill : int -> int -> unit = "unix_kill" [@@noalloc] let callbacksig () = + let pid = unix_getpid() in (* Allocate a block in the minor heap *) let s = String.make 5 'b' in (* Send a signal to self. We want s to remain in a register and - not be spilled on the stack, hence we use a [@@noalloc] stub *) - raise_sigusr1 (); + not be spilled on the stack, hence we declare unix_kill + [@@noalloc]. 
*) + unix_kill pid Sys.sigusr1; (* Allocate some more so that the signal will be tested *) let u = (s, s) in fst u diff --git a/testsuite/tests/callback/test_signalhandler.mli b/testsuite/tests/callback/test_signalhandler.mli new file mode 100644 index 000000000000..8559ebe245c8 --- /dev/null +++ b/testsuite/tests/callback/test_signalhandler.mli @@ -0,0 +1,19 @@ +external mycallback1 : ('a -> 'b) -> 'a -> 'b = "mycallback1" +external mycallback2 : ('a -> 'b -> 'c) -> 'a -> 'b -> 'c = "mycallback2" +external mycallback3 : ('a -> 'b -> 'c -> 'd) -> 'a -> 'b -> 'c -> 'd + = "mycallback3" +external mycallback4 : + ('a -> 'b -> 'c -> 'd -> 'e) -> 'a -> 'b -> 'c -> 'd -> 'e = "mycallback4" +val tak : int * int * int -> int +val tak2 : int -> int * int -> int +val tak3 : int -> int -> int -> int +val tak4 : int -> int -> int -> int -> int +val raise_exit : unit -> unit +val trapexit : unit -> int +external mypushroot : 'a -> ('b -> 'c) -> 'b -> 'a = "mypushroot" +external mycamlparam : 'a -> ('b -> 'c) -> 'b -> 'a = "mycamlparam" +val tripwire : (string -> (unit -> int) -> unit -> 'a) -> 'a +val sighandler : 'a -> unit +external unix_getpid : unit -> int = "unix_getpid" [@@noalloc] +external unix_kill : int -> int -> unit = "unix_kill" [@@noalloc] +val callbacksig : unit -> string diff --git a/testsuite/tests/callback/test_signalhandler.reference b/testsuite/tests/callback/test_signalhandler.reference new file mode 100644 index 000000000000..b35993aa2c0a --- /dev/null +++ b/testsuite/tests/callback/test_signalhandler.reference @@ -0,0 +1,8 @@ +7 +7 +7 +7 +7 +aaaaa +aaaaa +bbbbb diff --git a/testsuite/tests/callback/test_signalhandler_.c b/testsuite/tests/callback/test_signalhandler_.c new file mode 100644 index 000000000000..45879a01917c --- /dev/null +++ b/testsuite/tests/callback/test_signalhandler_.c @@ -0,0 +1,69 @@ +/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy, projet Cristal, INRIA 
Rocquencourt */ +/* */ +/* Copyright 1995 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#include "caml/mlvalues.h" +#include "caml/memory.h" +#include "caml/callback.h" + +value mycallback1(value fun, value arg) +{ + value res; + res = caml_callback(fun, arg); + return res; +} + +value mycallback2(value fun, value arg1, value arg2) +{ + value res; + res = caml_callback2(fun, arg1, arg2); + return res; +} + +value mycallback3(value fun, value arg1, value arg2, value arg3) +{ + value res; + res = caml_callback3(fun, arg1, arg2, arg3); + return res; +} + +value mycallback4(value fun, value arg1, value arg2, value arg3, value arg4) +{ + value args[4]; + value res; + args[0] = arg1; + args[1] = arg2; + args[2] = arg3; + args[3] = arg4; + res = caml_callbackN(fun, 4, args); + return res; +} + +value mypushroot(value v, value fun, value arg) +{ + Begin_root(v) + caml_callback(fun, arg); + End_roots(); + return v; +} + +value mycamlparam (value v, value fun, value arg) +{ + CAMLparam3 (v, fun, arg); + CAMLlocal2 (x, y); + x = v; + y = caml_callback (fun, arg); + v = x; + CAMLreturn (v); +} diff --git a/testsuite/tests/effects/backtrace.ml b/testsuite/tests/effects/backtrace.ml new file mode 100644 index 000000000000..9e8813c2a1e8 --- /dev/null +++ b/testsuite/tests/effects/backtrace.ml @@ -0,0 +1,54 @@ +(* TEST + flags = "-g" + ocamlrunparam += ",b=1" +*) + +open EffectHandlers +open EffectHandlers.Deep + +let rec foo i = + if i = 0 then () + else begin + ignore (failwith "exn"); + foo i + end + [@@inline never] + +let rec bar i = + if i = 0 then () + else begin + foo i; + bar i + end + [@@inline never] + +type _ eff += Wait : unit eff + +let task1 
() = + try + bar 42; None + with e -> + Some (e, Printexc.get_raw_backtrace ()) + +let rec task2 i = + if i = 0 then () + else begin + perform Wait; + task2 i + end + [@@inline never] + +let main () = + let (x, bt) = Option.get (task1 ()) in + match_with task2 42 + { retc = Fun.id; + exnc = (fun e -> + let open Printexc in + print_raw_backtrace stdout (get_raw_backtrace ())); + effc = fun (type a) (e : a eff) -> + match e with + | Wait -> Some (fun (k : (a, _) continuation) -> + discontinue_with_backtrace k x bt) + | _ -> None } + +let _ = main () diff --git a/testsuite/tests/effects/backtrace.reference b/testsuite/tests/effects/backtrace.reference new file mode 100644 index 000000000000..09b78b0d2f9a --- /dev/null +++ b/testsuite/tests/effects/backtrace.reference @@ -0,0 +1,6 @@ +Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 +Called from Backtrace.foo in file "backtrace.ml", line 12, characters 11-27 +Called from Backtrace.bar in file "backtrace.ml", line 20, characters 4-9 +Called from Backtrace.task1 in file "backtrace.ml", line 29, characters 4-10 +Re-raised at Stdlib__EffectHandlers.Deep.discontinue_with_backtrace.(fun) in file "effectHandlers.ml", line 41, characters 4-38 +Called from Backtrace.task2 in file "backtrace.ml", line 36, characters 4-16 diff --git a/testsuite/tests/effects/cmphash.ml b/testsuite/tests/effects/cmphash.ml new file mode 100644 index 000000000000..49f2cd3285c1 --- /dev/null +++ b/testsuite/tests/effects/cmphash.ml @@ -0,0 +1,21 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let () = + try_with perform E + { effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> + begin match k = k with + | _ -> assert false + | exception (Invalid_argument _) -> print_endline "ok" + end; + begin match Hashtbl.hash k with + | _ -> print_endline "ok" + end) + | e -> None } diff --git a/testsuite/tests/effects/cmphash.reference 
b/testsuite/tests/effects/cmphash.reference new file mode 100644 index 000000000000..79ebd0860f49 --- /dev/null +++ b/testsuite/tests/effects/cmphash.reference @@ -0,0 +1,2 @@ +ok +ok diff --git a/testsuite/tests/effects/evenodd.ml b/testsuite/tests/effects/evenodd.ml new file mode 100644 index 000000000000..0a0b735988f3 --- /dev/null +++ b/testsuite/tests/effects/evenodd.ml @@ -0,0 +1,22 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let rec even n = + if n = 0 then true + else try_with odd (n-1) + { effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> assert false) + | _ -> None } +and odd n = + if n = 0 then false + else even (n-1) + +let _ = + let n = 1_000_000 in + Printf.printf "even %d is %B\n%!" n (even n) diff --git a/testsuite/tests/effects/evenodd.reference b/testsuite/tests/effects/evenodd.reference new file mode 100644 index 000000000000..f2d595966ee2 --- /dev/null +++ b/testsuite/tests/effects/evenodd.reference @@ -0,0 +1 @@ +even 1000000 is true diff --git a/testsuite/tests/effects/issue479.compilers.reference b/testsuite/tests/effects/issue479.compilers.reference new file mode 100644 index 000000000000..e973425fe75f --- /dev/null +++ b/testsuite/tests/effects/issue479.compilers.reference @@ -0,0 +1,13 @@ +- : unit = () +type ('a, 'container) iterator = ('a -> unit) -> 'container -> unit +type 'a generator = unit -> 'a option +type ('a, 'container) iter2gen = + ('a, 'container) iterator -> 'container -> 'a generator +type _ Stdlib.EffectHandlers.eff += Hold : unit EffectHandlers.eff +val iter2gen : (int, 'a) iter2gen = +val f : unit -> unit = +Hold 1 +1 +Hold 2 +Exception: Continuation_already_taken. 
+ diff --git a/testsuite/tests/effects/issue479.ml b/testsuite/tests/effects/issue479.ml new file mode 100644 index 000000000000..8d526f8639ea --- /dev/null +++ b/testsuite/tests/effects/issue479.ml @@ -0,0 +1,56 @@ +(* TEST + * toplevel +*) + +(* https://github.com/ocaml-multicore/ocaml-multicore/issues/479 *) + +open EffectHandlers +open EffectHandlers.Deep + +[@@@warning "-5-26"];; + +Printexc.record_backtrace false;; + +type ('a, 'container) iterator = ('a -> unit) -> 'container -> unit;; +type 'a generator = unit -> 'a option;; + +type ('a,'container) iter2gen = + ('a, 'container) iterator (* List.iter *) + -> 'container + -> 'a generator;; + +type _ eff += Hold: unit eff + +let iter2gen : _ iter2gen = fun iter c -> + let r = ref None in + let suspending_f x = + r:=Some x; + perform Hold + in + let next = + match_with (iter suspending_f) c + { retc = (fun _ -> fun () -> None); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | Hold -> Some (fun (k : (a,_) continuation) -> + fun () -> + let x = !r in + Printf.printf "Hold %s\n%!" ( + match x with + | None -> "?" + | Some x->string_of_int x); + continue k (); + x) + | e -> None } + in + fun () -> next();; + +let f () = + let gen = iter2gen List.iter in + let gen = gen [1;2;3] in + let gen() = match gen() with None->"?" | Some x-> string_of_int x in + Printf.printf "%s\n%!" (gen()); + Printf.printf "%s\n%!" 
(gen());; + +f ();; diff --git a/testsuite/tests/effects/manylive.ml b/testsuite/tests/effects/manylive.ml new file mode 100644 index 000000000000..96e25e23d8b0 --- /dev/null +++ b/testsuite/tests/effects/manylive.ml @@ -0,0 +1,27 @@ +(* TEST + *) + +let f x = + let a0 = ref 1 in + let a1 = ref 1 in + let a2 = ref 1 in + let a3 = ref 1 in + let a4 = ref 1 in + let a5 = ref 1 in + let a6 = ref 1 in + let a7 = ref 1 in + let a8 = ref 1 in + let a9 = ref 1 in + let a10 = ref 1 in + let a11 = ref 1 in + let a12 = ref 1 in + if x then raise Not_found; + [| a0; a1; a2; a3; a4; a5; a6; a7; a8; a9; a10; a11; a12 |] + +let () = + for i = 1 to 50000 do + let rs = Sys.opaque_identity f false in + assert (Array.for_all (fun x -> !x = 1) rs); + let _ = Array.make (Random.int 30) 'a' in () + done; + print_string "ok\n" diff --git a/testsuite/tests/effects/manylive.reference b/testsuite/tests/effects/manylive.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/effects/manylive.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/effects/overflow.ml b/testsuite/tests/effects/overflow.ml new file mode 100644 index 000000000000..28325d47e2ce --- /dev/null +++ b/testsuite/tests/effects/overflow.ml @@ -0,0 +1,40 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let f a b c d e f g h = + let bb = b + b in + let bbb = bb + b in + let cc = c + c in + let ccc = cc + c in + let dd = d + d in + let ddd = dd + d in + let ee = e + e in + let eee = ee + e in + let ff = f + f in + let fff = ff + f in + let gg = g + g in + let ggg = gg + g in + let hh = h + h in + let hhh = hh + h in + min 20 a + + b + bb + bbb + + c + cc + ccc + + d + dd + ddd + + e + ee + eee + + f + ff + fff + + g + gg + ggg + + h + hh + hhh + +let () = + match_with (fun _ -> f 1 2 3 4 5 6 7 8) () + { retc = (fun n -> Printf.printf "%d\n" n); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | 
E -> Some (fun k -> assert false) + | _ -> None } diff --git a/testsuite/tests/effects/overflow.reference b/testsuite/tests/effects/overflow.reference new file mode 100644 index 000000000000..dba40afcf755 --- /dev/null +++ b/testsuite/tests/effects/overflow.reference @@ -0,0 +1 @@ +211 diff --git a/testsuite/tests/effects/partial.ml b/testsuite/tests/effects/partial.ml new file mode 100644 index 000000000000..440ce14d6f92 --- /dev/null +++ b/testsuite/tests/effects/partial.ml @@ -0,0 +1,28 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff +exception Done + +let handle_partial f = + try_with f () + { effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> assert false) + | _ -> None } + +let f () x = perform E + +let () = + match_with (handle_partial f) () + { retc = (fun x -> assert false); + exnc = (function + | Done -> print_string "ok\n" + | e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun (k : (a, _) continuation) -> discontinue k Done) + | _ -> None } diff --git a/testsuite/tests/effects/partial.reference b/testsuite/tests/effects/partial.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/effects/partial.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/effects/reperform.ml b/testsuite/tests/effects/reperform.ml new file mode 100644 index 000000000000..759c1e4804f8 --- /dev/null +++ b/testsuite/tests/effects/reperform.ml @@ -0,0 +1,37 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : int -> int eff + | F : unit eff + +let rec nest = function + | 0 -> perform (E 42) + | n -> + match_with (fun _ -> Printf.printf "[%d\n" n; nest (n - 1)) () + { retc = (fun x -> Printf.printf " %d]\n" n; x); + exnc = (fun e -> Printf.printf " !%d]\n" n; raise e); + effc = fun (type a) (e : a eff) -> + match e with + | F -> Some (fun k -> assert false) + | _ -> None } + +let () = + 
match_with nest 5 + { retc = (fun x -> Printf.printf "= %d\n" x); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E n -> Some (fun (k : (a, _) continuation) -> continue k (n + 100)) + | _ -> None } + +let () = + match_with nest 5 + { retc = (fun x -> assert false); + exnc = (fun e -> Printf.printf "%s\n" (Printexc.to_string e)); + effc = fun (type a) (e : a eff) -> + match e with + | F -> Some (fun k -> assert false) + | _ -> None } diff --git a/testsuite/tests/effects/reperform.reference b/testsuite/tests/effects/reperform.reference new file mode 100644 index 000000000000..4c69a7f94c8e --- /dev/null +++ b/testsuite/tests/effects/reperform.reference @@ -0,0 +1,22 @@ +[5 +[4 +[3 +[2 +[1 + 1] + 2] + 3] + 4] + 5] += 142 +[5 +[4 +[3 +[2 +[1 + !1] + !2] + !3] + !4] + !5] +Unhandled diff --git a/testsuite/tests/effects/sched.ml b/testsuite/tests/effects/sched.ml new file mode 100644 index 000000000000..b78014ecb19c --- /dev/null +++ b/testsuite/tests/effects/sched.ml @@ -0,0 +1,65 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +exception E +type _ eff += Yield : unit eff + | Fork : (unit -> string) -> unit eff + | Ping : unit eff +exception Pong + +let say = print_string + +let run main = + let run_q = Queue.create () in + let enqueue k = Queue.push k run_q in + let rec dequeue () = + if Queue.is_empty run_q then `Finished + else continue (Queue.pop run_q) () + in + let rec spawn f = + match_with f () + { retc = (function + | "ok" -> say "."; dequeue () + | s -> failwith ("Unexpected result: " ^ s)); + exnc = (function + | E -> say "!"; dequeue () + | e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | Yield -> Some (fun (k : (a, _) continuation) -> + say ","; enqueue k; dequeue ()) + | Fork f -> Some (fun (k : (a, _) continuation) -> + say "+"; enqueue k; spawn f) + | Ping -> Some (fun (k : (a, _) continuation) -> + say "["; discontinue k Pong) + | _ -> None } + in + spawn main + +let test () = 
+ say "A"; + perform (Fork (fun () -> + perform Yield; say "C"; perform Yield; + begin match_with (fun () -> perform Ping; failwith "no pong?") () + { retc = (fun x -> x); + exnc = (function + | Pong -> say "]" + | e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | Yield -> Some (fun (k : (a,_) continuation) -> failwith "what?") + | _ -> None } + end; + raise E)); + perform (Fork (fun () -> say "B"; "ok")); + say "D"; + perform Yield; + say "E"; + "ok" + +let () = + let `Finished = run test in + say "\n" diff --git a/testsuite/tests/effects/sched.reference b/testsuite/tests/effects/sched.reference new file mode 100644 index 000000000000..47294f1ef70f --- /dev/null +++ b/testsuite/tests/effects/sched.reference @@ -0,0 +1 @@ +A+,+B.C,D,[]!E. diff --git a/testsuite/tests/effects/shallow_state.ml b/testsuite/tests/effects/shallow_state.ml new file mode 100644 index 000000000000..7d74a8e1b861 --- /dev/null +++ b/testsuite/tests/effects/shallow_state.ml @@ -0,0 +1,48 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Shallow + +(* +let handle_state init f x = + let rec loop state k x = + continue k x with + | result -> result, state + | effect Get, k -> loop state k state + | effect Set new_state, k -> loop new_state k () + in + loop init (fiber f) x +*) + +type _ eff += Get : int eff + | Set : int -> unit eff + +let handle_state init f x = + let rec loop : type a r. 
int -> (a, r) continuation -> a -> r * int = + fun state k x -> + continue_with k x + { retc = (fun result -> result, state); + exnc = (fun e -> raise e); + effc = (fun (type b) (eff : b eff) -> + match eff with + | Get -> Some (fun (k : (b,r) continuation) -> + loop state k state) + | Set new_state -> Some (fun (k : (b,r) continuation) -> + loop new_state k ()) + | e -> None) } + in + loop init (fiber f) x + + +let comp () = + Printf.printf "Initial state: %d\n" (perform Get); + perform (Set 42); + Printf.printf "Updated state: %d\n" (perform Get); + perform (Set 43) + +let main () = + let (), i = handle_state 0 comp () in + Printf.printf "Final state: %d\n" i + +let _ = main () diff --git a/testsuite/tests/effects/shallow_state.reference b/testsuite/tests/effects/shallow_state.reference new file mode 100644 index 000000000000..6cb73dd1e206 --- /dev/null +++ b/testsuite/tests/effects/shallow_state.reference @@ -0,0 +1,3 @@ +Initial state: 0 +Updated state: 42 +Final state: 43 diff --git a/testsuite/tests/effects/shallow_state_io.ml b/testsuite/tests/effects/shallow_state_io.ml new file mode 100644 index 000000000000..ecf431db732d --- /dev/null +++ b/testsuite/tests/effects/shallow_state_io.ml @@ -0,0 +1,51 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Shallow + +type _ eff += Get : int eff + | Set : int -> unit eff + | Print : string -> unit eff + +let handle_state init f x = + let rec loop : type a r. int -> (a, r) continuation -> a -> r * int = + fun state k x -> + continue_with k x + { retc = (fun result -> result, state); + exnc = (fun e -> raise e); + effc = (fun (type b) (eff : b eff) -> + match eff with + | Get -> Some (fun (k : (b,r) continuation) -> + loop state k state) + | Set new_state -> Some (fun (k : (b,r) continuation) -> + loop new_state k ()) + | e -> None) } + in + loop init (fiber f) x + +let handle_print f = + let rec loop : type r. 
(unit, r) continuation -> r = + fun k -> + continue_with k () + { retc = (fun x -> x); + exnc = (fun e -> raise e); + effc = (fun (type a) (eff : a eff) -> + match eff with + | Print s -> Some (fun (k : (a,r) continuation) -> + print_string s; loop k) + | e -> None) } + in + loop (fiber f) + +let comp () = + perform (Print (Printf.sprintf "Initial state: %d\n" (perform Get))); + perform (Set 42); + perform (Print (Printf.sprintf "Updated state: %d\n" (perform Get))); + perform (Set 43) + +let main () = + let (), i = handle_print (handle_state 0 comp) in + Printf.printf "Final state: %d\n" i + +let _ = main () diff --git a/testsuite/tests/effects/shallow_state_io.reference b/testsuite/tests/effects/shallow_state_io.reference new file mode 100644 index 000000000000..6cb73dd1e206 --- /dev/null +++ b/testsuite/tests/effects/shallow_state_io.reference @@ -0,0 +1,3 @@ +Initial state: 0 +Updated state: 42 +Final state: 43 diff --git a/testsuite/tests/effects/test1.ml b/testsuite/tests/effects/test1.ml new file mode 100644 index 000000000000..2654f113dd83 --- /dev/null +++ b/testsuite/tests/effects/test1.ml @@ -0,0 +1,15 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let () = + Printf.printf "%d\n%!" 
@@ + try_with (fun x -> x) 10 + { effc = (fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> 11) + | e -> None) } diff --git a/testsuite/tests/effects/test1.reference b/testsuite/tests/effects/test1.reference new file mode 100644 index 000000000000..f599e28b8ab0 --- /dev/null +++ b/testsuite/tests/effects/test1.reference @@ -0,0 +1 @@ +10 diff --git a/testsuite/tests/effects/test10.ml b/testsuite/tests/effects/test10.ml new file mode 100644 index 000000000000..612692ac9dcb --- /dev/null +++ b/testsuite/tests/effects/test10.ml @@ -0,0 +1,34 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += Peek : int eff +type _ eff += Poke : unit eff + +let rec a i = perform Peek + Random.int i +let rec b i = a i + Random.int i +let rec c i = b i + Random.int i + +let rec d i = + Random.int i + + try_with c i + { effc = fun (type a) (e : a eff) -> + match e with + | Poke -> Some (fun (k : (a,_) continuation) -> continue k ()) + | _ -> None } + +let rec e i = + Random.int i + + try_with d i + { effc = fun (type a) (e : a eff) -> + match e with + | Peek -> Some (fun (k : (a,_) continuation) -> + ignore (Deep.get_callstack k 100); + continue k 42) + | _ -> None } + +let _ = + ignore (e 1); + print_string "ok\n" diff --git a/testsuite/tests/effects/test10.reference b/testsuite/tests/effects/test10.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/effects/test10.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/effects/test11.ml b/testsuite/tests/effects/test11.ml new file mode 100644 index 000000000000..32b706a58010 --- /dev/null +++ b/testsuite/tests/effects/test11.ml @@ -0,0 +1,22 @@ +(* TEST +*) + +(* Tests RESUMETERM with extra_args != 0 in bytecode, + by calling a handler with a tail-continue that returns a function *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : int eff + +let handle comp = + try_with comp () + { effc = fun (type a) (e : a eff) -> + 
match e with + | E -> Some (fun (k : (a,_) continuation) -> continue k 10) + | _ -> None } + +let () = + handle (fun () -> + Printf.printf "%d\n" (perform E); + Printf.printf "%d\n") 42 diff --git a/testsuite/tests/effects/test11.reference b/testsuite/tests/effects/test11.reference new file mode 100644 index 000000000000..5c8f9eaff135 --- /dev/null +++ b/testsuite/tests/effects/test11.reference @@ -0,0 +1,2 @@ +10 +42 diff --git a/testsuite/tests/effects/test2.ml b/testsuite/tests/effects/test2.ml new file mode 100644 index 000000000000..c7cd9d51bc35 --- /dev/null +++ b/testsuite/tests/effects/test2.ml @@ -0,0 +1,30 @@ +(* TEST + *) + +open Printf +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : int -> int eff + +let f () = + printf "perform effect (E 0)\n%!"; + let v = perform (E 0) in + printf "perform returns %d\n%!" v; + v + 1 + +let h : type a. a eff -> ((a, 'b) continuation -> 'b) option = function + | E v -> Some (fun k -> + printf "caught effect (E %d). continuing..\n%!" v; + let v = continue k (v + 1) in + printf "continue returns %d\n%!" v; + v + 1) + | e -> None + +let v = + match_with f () + { retc = (fun v -> printf "done %d\n%!" v; v + 1); + exnc = (fun e -> raise e); + effc = h } + +let () = printf "result=%d\n%!" v diff --git a/testsuite/tests/effects/test2.reference b/testsuite/tests/effects/test2.reference new file mode 100644 index 000000000000..951cd4f387ca --- /dev/null +++ b/testsuite/tests/effects/test2.reference @@ -0,0 +1,6 @@ +perform effect (E 0) +caught effect (E 0). continuing.. +perform returns 1 +done 2 +continue returns 3 +result=4 diff --git a/testsuite/tests/effects/test3.ml b/testsuite/tests/effects/test3.ml new file mode 100644 index 000000000000..9d413920ded6 --- /dev/null +++ b/testsuite/tests/effects/test3.ml @@ -0,0 +1,22 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff +exception X + +let () = + Printf.printf "%d\n%!" 
@@ + match_with (fun () -> + Printf.printf "in handler. raising X\n%!"; + raise X) () + { retc = (fun v -> v); + exnc = (function + | X -> 10 + | e -> raise e); + effc = (fun (type a) (e : a eff) -> + match e with + | E -> Some (fun k -> 11) + | e -> None) } diff --git a/testsuite/tests/effects/test3.reference b/testsuite/tests/effects/test3.reference new file mode 100644 index 000000000000..78ea20d6e8cd --- /dev/null +++ b/testsuite/tests/effects/test3.reference @@ -0,0 +1,2 @@ +in handler. raising X +10 diff --git a/testsuite/tests/effects/test4.ml b/testsuite/tests/effects/test4.ml new file mode 100644 index 000000000000..0169bab8400b --- /dev/null +++ b/testsuite/tests/effects/test4.ml @@ -0,0 +1,21 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += Foo : int -> int eff + +let r = + try_with perform (Foo 3) + { effc = fun (type a) (e : a eff) -> + match e with + | Foo i -> Some (fun (k : (a,_) continuation) -> + try_with (continue k) (i+1) + { effc = fun (type a) (e : a eff) -> + match e with + | Foo i -> Some (fun k -> failwith "NO") + | e -> None }) + | e -> None } + +let () = Printf.printf "%d\n" r diff --git a/testsuite/tests/effects/test4.reference b/testsuite/tests/effects/test4.reference new file mode 100644 index 000000000000..b8626c4cff28 --- /dev/null +++ b/testsuite/tests/effects/test4.reference @@ -0,0 +1 @@ +4 diff --git a/testsuite/tests/effects/test5.ml b/testsuite/tests/effects/test5.ml new file mode 100644 index 000000000000..4529e4b6df8a --- /dev/null +++ b/testsuite/tests/effects/test5.ml @@ -0,0 +1,24 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += Foo : int -> int eff + +let f () = (perform (Foo 3)) (* 3 + 1 *) + + (perform (Foo 3)) (* 3 + 1 *) + +let r = + try_with f () + { effc = fun (type a) (e : a eff) -> + match e with + | Foo i -> Some (fun (k : (a, _) continuation) -> + try_with (continue k) (i + 1) + { effc = fun (type a) (e : a eff) -> + match e with + | Foo i -> 
Some (fun k -> failwith "NO") + | _ -> None }) + | e -> None } + +let () = Printf.printf "%d\n" r diff --git a/testsuite/tests/effects/test5.reference b/testsuite/tests/effects/test5.reference new file mode 100644 index 000000000000..45a4fb75db86 --- /dev/null +++ b/testsuite/tests/effects/test5.reference @@ -0,0 +1 @@ +8 diff --git a/testsuite/tests/effects/test6.ml b/testsuite/tests/effects/test6.ml new file mode 100644 index 000000000000..c6c9991a3895 --- /dev/null +++ b/testsuite/tests/effects/test6.ml @@ -0,0 +1,22 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + | F : unit eff + +let () = + let ok1 = ref false and ok2 = ref false in + let f r = + try perform E with Unhandled -> r := true in + f ok1; + Printf.printf "%b\n%!" !ok1; + try_with f ok2 { + effc = fun (type a) (e : a eff) -> + match e with + | F -> Some (fun k -> assert false) + | _ -> None + }; + Printf.printf "%b\n%!" !ok2 diff --git a/testsuite/tests/effects/test6.reference b/testsuite/tests/effects/test6.reference new file mode 100644 index 000000000000..bb101b641b9b --- /dev/null +++ b/testsuite/tests/effects/test6.reference @@ -0,0 +1,2 @@ +true +true diff --git a/testsuite/tests/effects/test_lazy.ml b/testsuite/tests/effects/test_lazy.ml new file mode 100644 index 000000000000..697fddd2d1e9 --- /dev/null +++ b/testsuite/tests/effects/test_lazy.ml @@ -0,0 +1,49 @@ +(* TEST *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += Stop : unit eff + +let f count = + let r = ref 0 in + for i = 1 to count do + incr r; + if i = count / 2 then perform Stop + done; + !r + +let _ = + let l = lazy (f 1_000) in + let v1 = + try_with Lazy.force l + { effc = fun (type a) (e : a eff) -> + match e with + | Stop -> Some (fun (k : (a, _) continuation) -> continue k ()) + | _ -> None } + in + Printf.printf "%d\n" v1; + let l2 = lazy (f 2_000) in + let v2 = + try_with Lazy.force l2 + { effc = fun (type a) (e : a eff) -> + match e with + | Stop -> Some 
(fun (k : (a, _) continuation) -> + let d = Domain.spawn(fun () -> continue k ()) in + Domain.join d) + | _ -> None } + in + Printf.printf "%d\n" v2; + let l3 = lazy (f 3_000) in + let _ = + try_with Lazy.force l3 + { effc = fun (type a) (e : a eff) -> + match e with + | Stop -> Some (fun _ -> + try + let d = Domain.spawn(fun () -> Lazy.force l3) in + Domain.join d + with CamlinternalLazy.Undefined -> Printf.printf "Undefined\n"; 0) + | _ -> None } + in + () diff --git a/testsuite/tests/effects/test_lazy.reference b/testsuite/tests/effects/test_lazy.reference new file mode 100644 index 000000000000..3e572fff4a64 --- /dev/null +++ b/testsuite/tests/effects/test_lazy.reference @@ -0,0 +1,3 @@ +1000 +2000 +Undefined diff --git a/testsuite/tests/effects/used_cont.ml b/testsuite/tests/effects/used_cont.ml new file mode 100644 index 000000000000..e302d7231852 --- /dev/null +++ b/testsuite/tests/effects/used_cont.ml @@ -0,0 +1,21 @@ +(* TEST + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += E : unit eff + +let r = ref None +let () = + match_with (fun _ -> perform E; 42) () + { retc = (fun n -> assert (n = 42)); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | E -> Some (fun (k : (a,_) continuation) -> + continue k (); + r := Some (k : (unit, unit) continuation); + Gc.full_major (); + print_string "ok\n") + | _ -> None } diff --git a/testsuite/tests/effects/used_cont.reference b/testsuite/tests/effects/used_cont.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/effects/used_cont.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/ephe-c-api/test.ml b/testsuite/tests/ephe-c-api/test.ml index a29cd8516b1e..ff1646835a1f 100644 --- a/testsuite/tests/ephe-c-api/test.ml +++ b/testsuite/tests/ephe-c-api/test.ml @@ -1,5 +1,7 @@ (* TEST modules = "stubs.c" + * skip + reason = "port the new Ephemeron C-api to multicore : https://github.com/ocaml/ocaml/pull/676" *) (* C 
version of ephetest.ml *) diff --git a/testsuite/tests/functors/functors.compilers.reference b/testsuite/tests/functors/functors.compilers.reference index a3ae39de5078..17bc253eac72 100644 --- a/testsuite/tests/functors/functors.compilers.reference +++ b/testsuite/tests/functors/functors.compilers.reference @@ -4,7 +4,7 @@ (module-defn(O) Functors functors.ml(12):184-279 (function X is_a_functor always_inline (let - (cow = (function x[int] : int (apply (field 0 X) x)) + (cow = (function x[int] : int (apply (field_imm 0 X) x)) sheep = (function x[int] : int (+ 1 (apply cow x)))) (makeblock 0 cow sheep)))) F = @@ -13,7 +13,7 @@ (let (cow = (function x[int] : int - (apply (field 0 Y) (apply (field 0 X) x))) + (apply (field_imm 0 Y) (apply (field_imm 0 X) x))) sheep = (function x[int] : int (+ 1 (apply cow x)))) (makeblock 0 cow sheep)))) F1 = @@ -22,17 +22,17 @@ (let (sheep = (function x[int] : int - (+ 1 (apply (field 0 Y) (apply (field 0 X) x))))) + (+ 1 (apply (field_imm 0 Y) (apply (field_imm 0 X) x))))) (makeblock 0 sheep)))) F2 = (module-defn(F2) Functors functors.ml(36):634-784 (function X Y is_a_functor always_inline (let - (X =a (makeblock 0 (field 1 X)) - Y =a (makeblock 0 (field 1 Y)) + (X =a (makeblock 0 (field_mut 1 X)) + Y =a (makeblock 0 (field_mut 1 Y)) sheep = (function x[int] : int - (+ 1 (apply (field 0 Y) (apply (field 0 X) x))))) + (+ 1 (apply (field_imm 0 Y) (apply (field_imm 0 X) x))))) (makeblock 0 sheep)))) M = (module-defn(M) Functors functors.ml(41):786-970 @@ -43,14 +43,14 @@ (let (cow = (function x[int] : int - (apply (field 0 Y) (apply (field 0 X) x))) + (apply (field_imm 0 Y) (apply (field_imm 0 X) x))) sheep = (function x[int] : int (+ 1 (apply cow x)))) (makeblock 0 cow sheep))))) (makeblock 0 (function funarg funarg is_a_functor stub (let (let = - (apply F (makeblock 0 (field 1 funarg)) - (makeblock 0 (field 1 funarg)))) - (makeblock 0 (field 1 let)))))))) + (apply F (makeblock 0 (field_mut 1 funarg)) + (makeblock 0 (field_mut 1 
funarg)))) + (makeblock 0 (field_mut 1 let)))))))) (makeblock 0 O F F1 F2 M))) diff --git a/testsuite/tests/gc-roots/globroots.ml b/testsuite/tests/gc-roots/globroots.ml index 3363eba3b435..b11aff1f67a1 100644 --- a/testsuite/tests/gc-roots/globroots.ml +++ b/testsuite/tests/gc-roots/globroots.ml @@ -1,5 +1,4 @@ (* TEST - flags += " -w -a " modules = "globrootsprim.c" *) @@ -27,10 +26,12 @@ module Generational : GLOBREF = struct external remove: t -> unit = "gb_generational_remove" end -module Test(G: GLOBREF) = struct +module Test(G: GLOBREF) () = struct let size = 1024 + let random_state = Domain.DLS.new_key Random.State.make_self_init + let vals = Array.init size Int.to_string let a = Array.init size (fun i -> G.register (Int.to_string i)) @@ -44,49 +45,35 @@ module Test(G: GLOBREF) = struct done let change () = - match Random.int 37 with + match Random.State.int (Domain.DLS.get random_state) 37 with | 0 -> Gc.full_major() | 1|2|3|4 -> Gc.minor() | 5|6|7|8|9|10|11|12 -> (* update with young value *) - let i = Random.int size in + let i = Random.State.int (Domain.DLS.get random_state) size in G.set a.(i) (Int.to_string i) | 13|14|15|16|17|18|19|20 -> (* update with old value *) - let i = Random.int size in + let i = Random.State.int (Domain.DLS.get random_state) size in G.set a.(i) vals.(i) | 21|22|23|24|25|26|27|28 -> (* re-register young value *) - let i = Random.int size in + let i = Random.State.int (Domain.DLS.get random_state) size in G.remove a.(i); a.(i) <- G.register (Int.to_string i) | (*29|30|31|32|33|34|35|36*) _ -> (* re-register old value *) - let i = Random.int size in + let i = Random.State.int (Domain.DLS.get random_state) size in G.remove a.(i); a.(i) <- G.register vals.(i) let test n = for i = 1 to n do - change(); - print_string "."; flush stdout + change(); check(); done end -module TestClassic = Test(Classic) -module TestGenerational = Test(Generational) +module TestClassic = Test(Classic) () +module TestGenerational = Test(Generational) () 
external young2old : unit -> unit = "gb_young2old" -let _ = young2old (); Gc.full_major () external static2young : int * int -> (unit -> unit) -> int = "gb_static2young" -let _ = - assert (static2young (1, 1) Gc.full_major == 0x42) - -let _ = - let n = - if Array.length Sys.argv < 2 then 10000 else int_of_string Sys.argv.(1) in - print_string "Non-generational API\n"; - TestClassic.test n; - print_newline(); - print_string "Generational API\n"; - TestGenerational.test n; - print_newline() diff --git a/testsuite/tests/gc-roots/globroots_parallel.ml b/testsuite/tests/gc-roots/globroots_parallel.ml new file mode 100644 index 000000000000..4c3e36d14087 --- /dev/null +++ b/testsuite/tests/gc-roots/globroots_parallel.ml @@ -0,0 +1,23 @@ +(* TEST + flags += " -w a " + modules = "globrootsprim.c globroots.ml" +*) + +open Globroots + +let num_domains = 8 +let n = 125 + +let _ = + let domains = Array.init (num_domains - 1) (fun _ -> + Domain.spawn(fun () -> + let module TestClassic = Test(Classic) () in + let module TestGenerational = Test(Generational) () in + TestClassic.test n; + TestGenerational.test n)) in + young2old (); Gc.full_major (); + assert (static2young (1, 1) Gc.full_major == 0x42); + TestClassic.test n; + TestGenerational.test n; + Array.iter Domain.join domains; + print_string "ok\n" diff --git a/testsuite/tests/gc-roots/globroots_parallel.reference b/testsuite/tests/gc-roots/globroots_parallel.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/gc-roots/globroots_parallel.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.ml b/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.ml new file mode 100644 index 000000000000..bf392c13b028 --- /dev/null +++ b/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.ml @@ -0,0 +1,51 @@ +(* TEST + flags += " -w a " + modules = "globrootsprim.c globroots.ml" +*) + +open Globroots + +module TestClassic = Test(Classic) 
+module TestGenerational = Test(Generational) + +let n = 10 + +let _ = + for _ = 1 to 20 do + let burn = fun () -> + let module TestClassic = Test(Classic) () in + let module TestGenerational = Test(Generational) () in + TestClassic.test n; + TestGenerational.test n in + let d = Array.init 4 (fun _ -> Domain.spawn burn) in + Array.iter Domain.join d + done; + let n = 128 in + + let arr_classic = + Array.init n (fun i -> Classic.register (Int.to_string i)) in + let d_classic_set = Array.init 4 (fun i -> Domain.spawn(fun () -> + for j = i * (n / 4) to ((i + 1) * (n / 4) - 1) do + Classic.set arr_classic.(j) (Int.to_string (j * 4)) + done)) in + Array.iter Domain.join d_classic_set; + let d_classic_remove = Array.init 4 (fun i -> Domain.spawn(fun () -> + for j = i * (n / 4) to ((i + 1) * (n / 4) - 1) do + Classic.remove arr_classic.(j) + done)) in + Array.iter Domain.join d_classic_remove; + + let arr_generational = + Array.init 128 (fun i -> Generational.register (Int.to_string (i+1))) in + let d_generational_set = Array.init 4 (fun i -> Domain.spawn(fun () -> + for j = i * (n / 4) to ((i + 1) * (n / 4) - 1) do + Generational.set arr_generational.(j) (Int.to_string (j * 4)) + done)) in + Array.iter Domain.join d_generational_set; + let d_generational_remove = Array.init 4 (fun i -> Domain.spawn(fun () -> + for j = i * (n / 4) to ((i + 1) * (n / 4) - 1) do + Generational.remove arr_generational.(j) + done)) in + Array.iter Domain.join d_generational_remove; + + print_string "ok\n"; diff --git a/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.reference b/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/gc-roots/globroots_parallel_spawn_burn.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/gc-roots/globroots_sequential.ml b/testsuite/tests/gc-roots/globroots_sequential.ml new file mode 100644 index 000000000000..6bf995bf96b5 --- /dev/null +++ 
b/testsuite/tests/gc-roots/globroots_sequential.ml @@ -0,0 +1,67 @@ +(* TEST + flags += " -w a " + modules = "globrootsprim.c globroots.ml" +*) +open Globroots + +module Test(G: GLOBREF) = struct + + let size = 1024 + + let vals = Array.init size Int.to_string + + let a = Array.init size (fun i -> G.register (Int.to_string i)) + + let check () = + for i = 0 to size - 1 do + if G.get a.(i) <> vals.(i) then begin + print_string "Error on "; print_int i; print_string ": "; + print_string (String.escaped (G.get a.(i))); print_newline() + end + done + + let change () = + match Random.int 37 with + | 0 -> + Gc.full_major() + | 1|2|3|4 -> + Gc.minor() + | 5|6|7|8|9|10|11|12 -> (* update with young value *) + let i = Random.int size in + G.set a.(i) (Int.to_string i) + | 13|14|15|16|17|18|19|20 -> (* update with old value *) + let i = Random.int size in + G.set a.(i) vals.(i) + | 21|22|23|24|25|26|27|28 -> (* re-register young value *) + let i = Random.int size in + G.remove a.(i); + a.(i) <- G.register (Int.to_string i) + | (*29|30|31|32|33|34|35|36*) _ -> (* re-register old value *) + let i = Random.int size in + G.remove a.(i); + a.(i) <- G.register vals.(i) + + let test n = + for i = 1 to n do + change(); check (); + print_string "."; flush stdout + done +end + +module TestClassic = Test(Classic) +module TestGenerational = Test(Generational) + +let _ = young2old (); Gc.full_major () + +let _ = + assert (static2young (1, 1) Gc.full_major == 0x42) + +let _ = + let n = + if Array.length Sys.argv < 2 then 10000 else int_of_string Sys.argv.(1) in + print_string "Non-generational API\n"; + TestClassic.test n; + print_newline(); + print_string "Generational API\n"; + TestGenerational.test n; + print_newline() diff --git a/testsuite/tests/gc-roots/globroots.reference b/testsuite/tests/gc-roots/globroots_sequential.reference similarity index 100% rename from testsuite/tests/gc-roots/globroots.reference rename to testsuite/tests/gc-roots/globroots_sequential.reference diff --git 
a/testsuite/tests/gc-roots/globrootsprim.c b/testsuite/tests/gc-roots/globrootsprim.c index 0eb777b0a5b2..28d6fb941863 100644 --- a/testsuite/tests/gc-roots/globrootsprim.c +++ b/testsuite/tests/gc-roots/globrootsprim.c @@ -13,10 +13,13 @@ /* For testing global root registration */ +#define CAML_INTERNALS + #include "caml/mlvalues.h" #include "caml/memory.h" #include "caml/alloc.h" #include "caml/gc.h" +#include "caml/shared_heap.h" #include "caml/callback.h" struct block { value header; value v; }; @@ -32,7 +35,7 @@ value gb_get(value vblock) value gb_classic_register(value v) { struct block * b = caml_stat_alloc(sizeof(struct block)); - b->header = Make_header(1, 0, Caml_black); + b->header = Make_header(1, 0, NOT_MARKABLE); b->v = v; caml_register_global_root(&(b->v)); return Val_block(b); @@ -53,7 +56,7 @@ value gb_classic_remove(value vblock) value gb_generational_register(value v) { struct block * b = caml_stat_alloc(sizeof(struct block)); - b->header = Make_header(1, 0, Caml_black); + b->header = Make_header(1, 0, NOT_MARKABLE); b->v = v; caml_register_generational_global_root(&(b->v)); return Val_block(b); diff --git a/testsuite/tests/generalized-open/gpr1506.ml b/testsuite/tests/generalized-open/gpr1506.ml index bd72ed302fb6..dd55e349afc1 100644 --- a/testsuite/tests/generalized-open/gpr1506.ml +++ b/testsuite/tests/generalized-open/gpr1506.ml @@ -103,9 +103,9 @@ include struct open struct type t = T end let x = T end Line 1, characters 15-41: 1 | include struct open struct type t = T end let x = T end ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Error: The type t/337 introduced by this open appears in the signature +Error: The type t/342 introduced by this open appears in the signature Line 1, characters 46-47: - The value x has no valid type if t/337 is hidden + The value x has no valid type if t/342 is hidden |}];; module A = struct @@ -123,9 +123,9 @@ Lines 3-6, characters 4-7: 4 | type t = T 5 | let x = T 6 | end -Error: The type t/342 introduced by this open appears 
in the signature +Error: The type t/347 introduced by this open appears in the signature Line 7, characters 8-9: - The value y has no valid type if t/342 is hidden + The value y has no valid type if t/347 is hidden |}];; module A = struct @@ -142,9 +142,9 @@ Lines 3-5, characters 4-7: 3 | ....open struct 4 | type t = T 5 | end -Error: The type t/347 introduced by this open appears in the signature +Error: The type t/352 introduced by this open appears in the signature Line 6, characters 8-9: - The value y has no valid type if t/347 is hidden + The value y has no valid type if t/352 is hidden |}] (* It was decided to not allow this anymore. *) diff --git a/testsuite/tests/instrumented-runtime/main.ml b/testsuite/tests/instrumented-runtime/main.ml index 94200ec67f45..084ceb03cabd 100644 --- a/testsuite/tests/instrumented-runtime/main.ml +++ b/testsuite/tests/instrumented-runtime/main.ml @@ -1,5 +1,7 @@ (* TEST * instrumented-runtime + * skip + reason = "instrumented runtime test is not very useful and broken on multicore. 
(#9413)" ** native flags = "-runtime-variant=i" *) diff --git a/testsuite/tests/lazy/lazy2.ml b/testsuite/tests/lazy/lazy2.ml new file mode 100644 index 000000000000..cccbd96d7145 --- /dev/null +++ b/testsuite/tests/lazy/lazy2.ml @@ -0,0 +1,10 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +open Domain + +let () = + let l = lazy (print_string "Lazy Forced\n") in + let d = spawn (fun () -> Lazy.force l) in + join d diff --git a/testsuite/tests/lazy/lazy2.reference b/testsuite/tests/lazy/lazy2.reference new file mode 100644 index 000000000000..9a0db54156f1 --- /dev/null +++ b/testsuite/tests/lazy/lazy2.reference @@ -0,0 +1 @@ +Lazy Forced diff --git a/testsuite/tests/lazy/lazy3.ml b/testsuite/tests/lazy/lazy3.ml new file mode 100644 index 000000000000..a22a0893b99e --- /dev/null +++ b/testsuite/tests/lazy/lazy3.ml @@ -0,0 +1,27 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +let f count = + let _n = (Domain.self ():> int) in + let r = ref 0 in + for i = 1 to count do + incr r; + done; + !r + +let main () = + let l = lazy (f 1_000_000_000) in + let d1 = + Domain.spawn (fun () -> + let _n = (Domain.self ():> int) in + Lazy.force l) + in + let n2 = Lazy.force l in + let n1 = Domain.join d1 in + (n1, n2) + +let _ = + match main () with + | (n1, n2) -> Printf.printf "n1=%d n2=%d\n" n1 n2 + | exception Lazy.Undefined -> print_endline "Undefined" diff --git a/testsuite/tests/lazy/lazy3.reference b/testsuite/tests/lazy/lazy3.reference new file mode 100644 index 000000000000..bcd790260a9e --- /dev/null +++ b/testsuite/tests/lazy/lazy3.reference @@ -0,0 +1 @@ +Undefined diff --git a/testsuite/tests/lazy/lazy4.ml b/testsuite/tests/lazy/lazy4.ml new file mode 100644 index 000000000000..3988318c79ed --- /dev/null +++ b/testsuite/tests/lazy/lazy4.ml @@ -0,0 +1,15 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) +let r = ref None + +let f () = + match !r with + | Some l -> Lazy.force l + | None -> () + +let l = Lazy.from_fun f +let _ = r := Some l +let _ = + try Lazy.force l + with 
Lazy.Undefined -> print_endline "Undefined" diff --git a/testsuite/tests/lazy/lazy4.reference b/testsuite/tests/lazy/lazy4.reference new file mode 100644 index 000000000000..bcd790260a9e --- /dev/null +++ b/testsuite/tests/lazy/lazy4.reference @@ -0,0 +1 @@ +Undefined diff --git a/testsuite/tests/lazy/lazy5.ml b/testsuite/tests/lazy/lazy5.ml new file mode 100644 index 000000000000..217b84175ef9 --- /dev/null +++ b/testsuite/tests/lazy/lazy5.ml @@ -0,0 +1,26 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) +let rec safe_force l = + try Lazy.force l with + | Lazy.Undefined -> + Domain.cpu_relax (); + safe_force l + +let f count = + let _n = (Domain.self ():> int) in + let r = ref 0 in + for i = 1 to count do + incr r; + done; + !r + +let l = lazy (f 1_000_000_000) +let d1 = + Domain.spawn (fun () -> + let _n = (Domain.self ():> int) in + safe_force l) +let n2 = safe_force l +let n1 = Domain.join d1 + +let _ = Printf.printf "n1=%d n2=%d\n" n1 n2 diff --git a/testsuite/tests/lazy/lazy5.reference b/testsuite/tests/lazy/lazy5.reference new file mode 100644 index 000000000000..b6706a70abb9 --- /dev/null +++ b/testsuite/tests/lazy/lazy5.reference @@ -0,0 +1 @@ +n1=1000000000 n2=1000000000 diff --git a/testsuite/tests/lazy/lazy6.ml b/testsuite/tests/lazy/lazy6.ml new file mode 100644 index 000000000000..098848769a9d --- /dev/null +++ b/testsuite/tests/lazy/lazy6.ml @@ -0,0 +1,32 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +let flag1 = Atomic.make false +let flag2 = Atomic.make false + +let rec wait_for_flag f = + if Atomic.get f then () + else (Domain.cpu_relax (); wait_for_flag f) + +let l1 = Lazy.from_fun (fun () -> + Atomic.set flag1 true; + wait_for_flag flag2) + +let first_domain () = + Lazy.force l1 + +let second_domain () = + wait_for_flag flag1; + let l2 = Lazy.from_fun (fun () -> Lazy.force l1) in + let rec loop () = + try Lazy.force l2 with + | Lazy.Undefined -> Atomic.set flag2 true + in + loop () + +let _ = + let d = Domain.spawn first_domain in + second_domain (); 
+ Domain.join d; + print_endline "OK" diff --git a/testsuite/tests/lazy/lazy6.reference b/testsuite/tests/lazy/lazy6.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lazy/lazy6.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lazy/lazy7.ml b/testsuite/tests/lazy/lazy7.ml new file mode 100644 index 000000000000..6c96f32907d0 --- /dev/null +++ b/testsuite/tests/lazy/lazy7.ml @@ -0,0 +1,37 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +let num_domains = 4 + +let rec safe_force l = + try Lazy.force l with + | Lazy.Undefined -> + Domain.cpu_relax (); + safe_force l + +let f count = + let _n = (Domain.self ():> int) in + let r = ref 0 in + for _ = 1 to count do + incr r; + done; + !r + +let go = Atomic.make false + +let l = lazy (f 1_000_000_000) +let d1 = Array.init (num_domains - 1) (fun _-> + Domain.spawn (fun () -> + let rec wait () = + if Atomic.get go then () + else wait () + in + wait (); + let _n = (Domain.self ():> int) in + safe_force l)) +let _ = Atomic.set go true +let n2 = safe_force l +let n1 = Array.map Domain.join d1 + +let _ = Printf.printf "n1=%d n2=%d\n" n1.(0) n2 diff --git a/testsuite/tests/lazy/lazy7.reference b/testsuite/tests/lazy/lazy7.reference new file mode 100644 index 000000000000..b6706a70abb9 --- /dev/null +++ b/testsuite/tests/lazy/lazy7.reference @@ -0,0 +1 @@ +n1=1000000000 n2=1000000000 diff --git a/testsuite/tests/lazy/lazy8.ml b/testsuite/tests/lazy/lazy8.ml new file mode 100644 index 000000000000..c9b5781617e5 --- /dev/null +++ b/testsuite/tests/lazy/lazy8.ml @@ -0,0 +1,26 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +exception E + +let main () = + let l = lazy (raise E) in + + begin try Lazy.force_val l with + | E -> () + end; + + begin try Lazy.force_val l with + | Lazy.Undefined -> () + end; + + let d = Domain.spawn (fun () -> + begin try Lazy.force_val l with + | Lazy.Undefined -> () + end) + in + Domain.join d; + print_endline "OK" + +let _ = main () diff --git 
a/testsuite/tests/lazy/lazy8.reference b/testsuite/tests/lazy/lazy8.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lazy/lazy8.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lazy/minor_major_force.ml b/testsuite/tests/lazy/minor_major_force.ml new file mode 100644 index 000000000000..b18fe44a9b98 --- /dev/null +++ b/testsuite/tests/lazy/minor_major_force.ml @@ -0,0 +1,73 @@ +(* TEST + ocamlopt_flags += " -O3 " +*) + +(* + - create a record with a mutable field that has a lazy value in it + - force a minor_gc to make sure that record is on the heap + - update the lazy value to be a minor heap value + - force the lazy value to be a forward to an item in the minor heap + - call minor_gc and watch it fail the assert which makes sure that all remembered set items have been forwarded +*) + +type test_record = { + mutable lzy_str: string Lazy.t; + mutable lzy_int: int Lazy.t; +} + +let is_shared x = Obj.is_shared (Obj.repr x) + +let glbl_int = ref 0 +let glbl_string = ref "init" + +let get_random_int () = + Random.int 256 + +let get_random_string () = + Printf.sprintf "%f" (Random.float 1.) + + +let get_lazy_status fmt_str x = + if Lazy.is_val x then + Printf.sprintf fmt_str (Lazy.force x) + else + "" + +let get_lazy_int_status x = get_lazy_status "%d" x +let get_lazy_string_status x = get_lazy_status "%s" x + +let dump_record_status x = + Printf.printf "x.lzy_string=%s [shared=%b]\n" (get_lazy_string_status x.lzy_str) (is_shared x.lzy_str); + Printf.printf "x.lzy_int=%s [shared=%b]\n" (get_lazy_int_status x.lzy_int) (is_shared x.lzy_int) + +let force_lazy_vals x = + let v = Lazy.force x.lzy_str in + Printf.printf "forcing x.lzy_str [%s] %b %d\n%!" v (is_shared x.lzy_str) (Obj.tag (Obj.repr x.lzy_str)); + let v = Lazy.force x.lzy_int in + Printf.printf "forcing x.lzy_int [%d] %b %d\n%!" 
v (is_shared x.lzy_int) (Obj.tag (Obj.repr x.lzy_int)) + +let do_minor_gc () = + Printf.printf "Gc.minor ()\n%!"; + Gc.minor () + +let () = + Random.init 34; + let x = { + lzy_str = lazy (glbl_string := get_random_string (); !glbl_string); + lzy_int = lazy (glbl_int := get_random_int (); !glbl_int); + } in + + do_minor_gc (); + (* x should now be on the heap *) + dump_record_status x; + Printf.printf "x is setup on major heap\n\n%!"; + + Printf.printf "updating fields in x\n\n%!"; + x.lzy_str <- lazy (glbl_string := get_random_string (); !glbl_string); + x.lzy_int <- lazy (glbl_int := get_random_int (); !glbl_int); + dump_record_status x; + + force_lazy_vals x; + dump_record_status x; + do_minor_gc (); + dump_record_status x diff --git a/testsuite/tests/lazy/minor_major_force.reference b/testsuite/tests/lazy/minor_major_force.reference new file mode 100644 index 000000000000..af75cd6e639f --- /dev/null +++ b/testsuite/tests/lazy/minor_major_force.reference @@ -0,0 +1,16 @@ +Gc.minor () +x.lzy_string= [shared=true] +x.lzy_int= [shared=true] +x is setup on major heap + +updating fields in x + +x.lzy_string= [shared=false] +x.lzy_int= [shared=false] +forcing x.lzy_str [0.152944] false 250 +forcing x.lzy_int [175] false 250 +x.lzy_string=0.152944 [shared=false] +x.lzy_int=175 [shared=false] +Gc.minor () +x.lzy_string=0.152944 [shared=true] +x.lzy_int=175 [shared=true] diff --git a/testsuite/tests/letrec-compilation/modrec.ml b/testsuite/tests/letrec-compilation/modrec.ml new file mode 100644 index 000000000000..b467039fa78f --- /dev/null +++ b/testsuite/tests/letrec-compilation/modrec.ml @@ -0,0 +1,8 @@ +module rec M : sig + val f : unit -> unit + val g : unit -> unit +end = struct + let rec f () = () and g () = () +end + +let () = print_endline "ok" diff --git a/testsuite/tests/letrec-compilation/modrec.reference b/testsuite/tests/letrec-compilation/modrec.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ 
b/testsuite/tests/letrec-compilation/modrec.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/lf_skiplist/stubs.c b/testsuite/tests/lf_skiplist/stubs.c new file mode 100644 index 000000000000..5a20d6d99c61 --- /dev/null +++ b/testsuite/tests/lf_skiplist/stubs.c @@ -0,0 +1,164 @@ +#define CAML_INTERNALS + +#include "caml/lf_skiplist.h" +#include "caml/memory.h" +#include +#define FMT ARCH_INTNAT_PRINTF_FORMAT + +CAMLextern value test_skiplist_serial(value val) { + CAMLparam0(); + + struct lf_skiplist list; + + caml_lf_skiplist_init(&list); + + for (int c = 1; c < 10000; c++) { + assert(caml_lf_skiplist_insert(&list, c, c)); + } + + for (int c = 1; c < 10000; c++) { + uintnat j = 0; + caml_lf_skiplist_find(&list, c, &j); + assert(j == c); + } + + for (int c = 1; c < 10000; c++) { + assert(caml_lf_skiplist_remove(&list, c)); + } + + for (int c = 1; c < 10000; c++) { + uintnat j = 0; + assert(caml_lf_skiplist_insert(&list, c, c)); + caml_lf_skiplist_find(&list, c, &j); + assert(j == c); + assert(caml_lf_skiplist_remove(&list, c)); + } + + CAMLreturn(Val_unit); +} + +static struct lf_skiplist the_list; + +CAMLextern value init_skiplist(value val) { + CAMLparam0(); + + caml_lf_skiplist_init(&the_list); + + CAMLreturn(Val_unit); +} + +CAMLextern value cardinal_skiplist(value val) { + CAMLparam0(); + uintnat r = 0; + FOREACH_LF_SKIPLIST_ELEMENT(p,&the_list,r++); + CAMLreturn(Val_long(r)); +} + +static int get_len(struct lf_skipcell *p, struct lf_skipcell *end) { + int len = 0 ; + for ( ; p != end; p = atomic_load(&p->garbage_next)) len++ ; + return len ; +} + + +static uintnat count_marks(struct lf_skiplist *sk) { + uintnat r = 0; + struct lf_skipcell *p = sk->head; + uintptr_t succ; + + while (p) { + for (int k = p->top_level; k >= 0; k--) { + succ = + (uintptr_t)atomic_load_explicit(&p->forward[k],memory_order_relaxed); + if (LF_SK_IS_MARKED(succ)) r++ ; + } + p = LF_SK_UNMARK(succ); + } + return r; +} + +CAMLextern value clean_skiplist(value val) { + 
CAMLparam1(val); + intnat v = Long_val(val) ; + + assert (count_marks(&the_list) == 0) ; + { + int len = get_len(atomic_load(&the_list.garbage_head),the_list.head) ; + if (v >= 0) { + if (len != v) { + fprintf(stderr,"len=%d, and v=%" FMT "d differ, space leak detected\n", + len,v); + } + } + } + caml_lf_skiplist_free_garbage(&the_list); + assert(get_len(atomic_load(&the_list.garbage_head),the_list.head) == 0) ; + CAMLreturn(Val_unit); +} + +CAMLextern value hammer_skiplist(value domain_id_val) { + CAMLparam1(domain_id_val); + + uintnat domain_id = Long_val(domain_id_val); + + for (int i = 0; i < 100; i++) { + for (int c = 10000 * domain_id + 1; c < 10000 * (domain_id + 1); c++) { + assert(caml_lf_skiplist_insert(&the_list, c, c)); + } + + for (int c = 10000 * domain_id + 1; c < 10000 * (domain_id + 1); c++) { + uintnat j = 0; + caml_lf_skiplist_find(&the_list, c, &j); + assert(j == c); + } + + for (int c = 10000 * domain_id + 1; c < 10000 * (domain_id + 1); c++) { + assert(caml_lf_skiplist_remove(&the_list, c)); + } + + for (int c = 10000 * domain_id + 1; c < 10000 * (domain_id + 1); c++) { + uintnat j = 0; + assert(caml_lf_skiplist_insert(&the_list, c, c)); + caml_lf_skiplist_find(&the_list, c, &j); + assert(j == c); + assert(caml_lf_skiplist_remove(&the_list, c)); + } + } + + CAMLreturn(Val_unit); +} + +inline static uintnat calc_value(uintnat id) { return id; } +inline static uintnat calc_key(uintnat id,uintnat turn) { return 1024*id+turn+1; } +inline static uintnat calc_right(uintnat id,uintnat turn,uintnat ndoms) { return (id+turn) % ndoms; } + +CAMLextern value insert_skiplist(value turn_val,value ndoms_val,value domain_id_val) { + CAMLparam3(turn_val,ndoms_val,domain_id_val); + uintnat domain_id = Long_val(domain_id_val); + uintnat ndoms = Long_val(ndoms_val); + uintnat turn = Long_val(turn_val); + uintnat k = calc_key(domain_id,turn) ; + uintnat v = calc_value(domain_id) ; + // fprintf(stderr,"I: %" FMT "u -> %" FMT "u\n",k,v); + int r = 
caml_lf_skiplist_insert(&the_list, k, v) ; + assert(r); + CAMLreturn(Val_unit); +} + +CAMLextern value find_skiplist(value turn_val,value ndoms_val,value domain_id_val) { + CAMLparam3(turn_val,ndoms_val,domain_id_val); + uintnat domain_id = Long_val(domain_id_val); + uintnat ndoms = Long_val(ndoms_val); + uintnat turn = Long_val(turn_val); + uintnat right = calc_right(domain_id,turn,ndoms) ; // neighbour on the right + uintnat k = calc_key(right,turn); + uintnat w = 0 ; + int r = caml_lf_skiplist_find(&the_list, k, &w); + if (r) { + assert(w == calc_value(right)); + assert(caml_lf_skiplist_remove(&the_list, k)); + assert(!caml_lf_skiplist_remove(&the_list, k)); + // fprintf(stderr,"R: %lu -> %lu\n",k,w); + } + CAMLreturn(Val_bool(r)); +} diff --git a/testsuite/tests/lf_skiplist/test.ml b/testsuite/tests/lf_skiplist/test.ml new file mode 100644 index 000000000000..f3b11bf4112a --- /dev/null +++ b/testsuite/tests/lf_skiplist/test.ml @@ -0,0 +1,30 @@ +(* TEST + modules = "stubs.c" +*) + +external test_skiplist_serial : unit -> unit = "test_skiplist_serial" + +let () = test_skiplist_serial () + +external init_skiplist : unit -> unit = "init_skiplist" +external insert_skiplist : int -> int -> int -> unit = "insert_skiplist" +external find_skiplist : int -> int -> int -> bool = "find_skiplist" +external clean_skiplist : int -> unit = "clean_skiplist" +external cardinal_skiplist : unit -> int = "cardinal_skiplist" + +let () = + let nturns = 128 + and nseq = 4 in + assert (nturns < 1024); (* See calc_key in stubs.c *) + init_skiplist (); + for i=1 to nseq do + for k = 1 to nturns do + insert_skiplist k 1 0 + done ; + assert(cardinal_skiplist () = nturns) ; + for k = 1 to nturns do + assert(find_skiplist k 1 0) + done ; + assert(cardinal_skiplist () = 0) ; + clean_skiplist nturns + done diff --git a/testsuite/tests/lf_skiplist/test_parallel.ml b/testsuite/tests/lf_skiplist/test_parallel.ml new file mode 100644 index 000000000000..ac02717889cd --- /dev/null +++ 
b/testsuite/tests/lf_skiplist/test_parallel.ml @@ -0,0 +1,102 @@ +(* TEST + modules = "stubs.c" +*) + +external init_skiplist : unit -> unit = "init_skiplist" +external hammer_skiplist : int -> unit = "hammer_skiplist" + +let () = + init_skiplist (); + let domains_list = List.init 4 (fun i -> Domain.spawn (fun () -> hammer_skiplist i)) in + ignore(List.iter Domain.join domains_list) + +(* Concurrent versions of the memory test in tests.ml, see there first *) +external insert_skiplist : int -> int -> int -> unit = "insert_skiplist" +external find_skiplist : int -> int -> int -> bool = "find_skiplist" +external clean_skiplist : int -> unit = "clean_skiplist" +external cardinal_skiplist : unit -> int = "cardinal_skiplist" + +let () = + (* Clean garbage list *) + clean_skiplist (-1); + (* Check cleaning *) + clean_skiplist 0; + init_skiplist (); + assert (cardinal_skiplist() = 0) ; + let nturns = 128 and npar = 4 and nseq = 4 in + assert (nturns < 1024); (* See calc_key in stubs.c *) + (* Fill skip list and then empty it *) + for k = 1 to nseq do + let d_list = + List.init npar + (fun i -> + Domain.spawn + (fun () -> + for k = 1 to nturns do insert_skiplist k npar i done)) in + ignore (List.iter Domain.join d_list) ; + assert (cardinal_skiplist() = npar*nturns) ; + let d_list = + List.init npar + (fun i -> + Domain.spawn + (fun () -> + for k = 1 to nturns do assert(find_skiplist k npar i) done)) in + ignore (List.iter Domain.join d_list) ; + assert (cardinal_skiplist() = 0) ; + clean_skiplist (npar*nturns) ; + done ; + (* Fill and empty skiplist concurrently *) + for k = 1 to nseq do + let d_list = + List.init (npar*2) + (fun i -> + Domain.spawn + (fun () -> + let j = i/2 in + if i mod 2 = 0 then + for k = 1 to nturns do insert_skiplist k npar j done + else + for k = 1 to nturns do + while not (find_skiplist k npar j) do + Domain.cpu_relax () + done + done)) in + ignore (List.iter Domain.join d_list) ; + assert (cardinal_skiplist() = 0) ; + clean_skiplist (npar*nturns) 
; + done ; + (* Fill and empty skiplist concurrently, checking list consistency *) + for k = 1 to nseq do + let d_list = + List.init (npar*2) + (fun i -> + Domain.spawn + (fun () -> + let j = i/2 in + if i mod 2 = 0 then + for k = 1 to nturns do insert_skiplist k npar j done + else if j mod 2 = 0 then + for k = 1 to nturns do + while not (find_skiplist k npar j) do + Domain.cpu_relax () + done + done)) in + ignore (List.iter Domain.join d_list) ; + assert (cardinal_skiplist() = nturns*(npar-(npar+1)/2)) ; + clean_skiplist (nturns*((npar+1)/2)) ; + let d_list = + List.init npar + (fun i -> + Domain.spawn + (fun () -> + if i mod 2 = 1 then + for k = 1 to nturns do + while not (find_skiplist k npar i) do + Domain.cpu_relax () + done + done)) in + ignore (List.iter Domain.join d_list) ; + assert (cardinal_skiplist() = 0); + clean_skiplist (nturns*(npar/2)) ; + done ; + () diff --git a/testsuite/tests/lib-dynlink-initializers/test10_main.byte.reference b/testsuite/tests/lib-dynlink-initializers/test10_main.byte.reference index c8424f5a75ae..8f70ff73cd2c 100755 --- a/testsuite/tests/lib-dynlink-initializers/test10_main.byte.reference +++ b/testsuite/tests/lib-dynlink-initializers/test10_main.byte.reference @@ -5,8 +5,8 @@ Called from Test10_plugin.f in file "test10_plugin.ml", line 6, characters 2-6 Called from Test10_plugin in file "test10_plugin.ml", line 10, characters 2-6 Called from Dynlink.Bytecode.run in file "otherlibs/dynlink/dynlink.ml", line 137, characters 16-25 Re-raised at Dynlink.Bytecode.run in file "otherlibs/dynlink/dynlink.ml", line 139, characters 6-137 -Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 337, characters 13-44 +Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 350, characters 13-44 Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 -Called from Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 
335, characters 8-240 -Re-raised at Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 345, characters 8-17 +Called from Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 348, characters 8-240 +Re-raised at Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 358, characters 8-17 Called from Test10_main in file "test10_main.ml", line 51, characters 13-69 diff --git a/testsuite/tests/lib-dynlink-initializers/test10_main.native.reference b/testsuite/tests/lib-dynlink-initializers/test10_main.native.reference index 42d2b971ffaf..6496b8870fa4 100755 --- a/testsuite/tests/lib-dynlink-initializers/test10_main.native.reference +++ b/testsuite/tests/lib-dynlink-initializers/test10_main.native.reference @@ -1,14 +1,10 @@ Error: Failure("Plugin error") -Raised at Stdlib.failwith in file "stdlib.ml", line 29, characters 17-33 -Called from Test10_plugin.g in file "test10_plugin.ml", line 2, characters 15-38 -Called from Test10_plugin in file "test10_plugin.ml", line 10, characters 2-6 -Called from Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 85, characters 12-29 -Called from Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 85, characters 12-29 +Raised by primitive operation at Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 85, characters 12-29 Re-raised at Dynlink.Native.run.(fun) in file "otherlibs/dynlink/native/dynlink.ml", line 87, characters 10-149 Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 -Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 337, characters 13-44 +Called from Dynlink_common.Make.load.(fun) in file "otherlibs/dynlink/dynlink_common.ml", line 350, characters 13-44 Called from Stdlib__List.iter in file "list.ml", line 110, characters 12-15 -Called from Dynlink_common.Make.load in file 
"otherlibs/dynlink/dynlink_common.ml", line 335, characters 8-240 -Re-raised at Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 345, characters 8-17 -Called from Dynlink_common.Make.loadfile in file "otherlibs/dynlink/dynlink_common.ml" (inlined), line 347, characters 26-45 +Called from Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 348, characters 8-240 +Re-raised at Dynlink_common.Make.load in file "otherlibs/dynlink/dynlink_common.ml", line 358, characters 8-17 +Called from Dynlink_common.Make.loadfile in file "otherlibs/dynlink/dynlink_common.ml" (inlined), line 360, characters 26-45 Called from Test10_main in file "test10_main.ml", line 49, characters 30-87 diff --git a/testsuite/tests/lib-format/mc_pr586_par.ml b/testsuite/tests/lib-format/mc_pr586_par.ml new file mode 100644 index 000000000000..456a306ce70e --- /dev/null +++ b/testsuite/tests/lib-format/mc_pr586_par.ml @@ -0,0 +1,12 @@ +(* TEST *) + +let () = + let domains = Array.init 7 (fun i -> + Domain.spawn (fun () -> + for j = 1 to 10000000 do () done; + for j = 1 to 100 do + Format.printf "21. +. 21. is %f@." (21. +. 21.); + done + ) + ) in + Array.iter Domain.join domains diff --git a/testsuite/tests/lib-format/mc_pr586_par.reference b/testsuite/tests/lib-format/mc_pr586_par.reference new file mode 100644 index 000000000000..479916521741 --- /dev/null +++ b/testsuite/tests/lib-format/mc_pr586_par.reference @@ -0,0 +1,700 @@ +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 diff --git a/testsuite/tests/lib-format/mc_pr586_par2.ml b/testsuite/tests/lib-format/mc_pr586_par2.ml new file mode 100644 index 000000000000..0ead3627b580 --- /dev/null +++ b/testsuite/tests/lib-format/mc_pr586_par2.ml @@ -0,0 +1,14 @@ +(* TEST *) + +let () = + let fmt_key = Format.synchronized_formatter_of_out_channel stdout in + let domains = Array.init 7 (fun i -> + Domain.spawn (fun () -> + let fmt = Domain.DLS.get fmt_key in + for j = 1 to 10000000 do () done; + for j = 1 to 100 do + Format.fprintf fmt "21. +. 21. is %f@." (21. +. 21.); + done + ) + ) in + Array.iter Domain.join domains diff --git a/testsuite/tests/lib-format/mc_pr586_par2.reference b/testsuite/tests/lib-format/mc_pr586_par2.reference new file mode 100644 index 000000000000..479916521741 --- /dev/null +++ b/testsuite/tests/lib-format/mc_pr586_par2.reference @@ -0,0 +1,700 @@ +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. is 42.000000 +21. +. 21. 
is 42.000000 diff --git a/testsuite/tests/lib-hashtbl/htbl.ml b/testsuite/tests/lib-hashtbl/htbl.ml index fe3d18e13b80..15141b97a083 100644 --- a/testsuite/tests/lib-hashtbl/htbl.ml +++ b/testsuite/tests/lib-hashtbl/htbl.ml @@ -175,14 +175,6 @@ module TI2 = Test(HI2)(MI) module TSP = Test(HSP)(MSP) module TSL = Test(HSL)(MSL) -(* These work with the old ephemeron API *) -[@@@alert "-old_ephemeron_api"] -module TWS = Test(WS)(MS) -module TWSP1 = Test(WSP1)(MSP) -module TWSP2 = Test(WSP2)(MSP) -module TWSL = Test(WSL)(MSL) -module TWSA = Test(WSA)(MSA) - (* Data set: strings from a file, associated with their line number *) let file_data filename = @@ -257,20 +249,7 @@ let _ = printf "-- Pairs of strings\n%!"; TSP.test (pair_data d); printf "-- Lists of strings\n%!"; - TSL.test (list_data d); - (* weak *) - let d = - try file_data "../../LICENSE" with Sys_error _ -> string_data in - printf "-- Weak K1 -- Strings, functorial interface\n%!"; - TWS.test d; - printf "-- Weak K1 -- Pairs of strings\n%!"; - TWSP1.test (pair_data d); - printf "-- Weak K2 -- Pairs of strings\n%!"; - TWSP2.test (pair_data d); - printf "-- Weak K1 -- Lists of strings\n%!"; - TWSL.test (list_data d); - printf "-- Weak Kn -- Arrays of strings\n%!"; - TWSA.test (Array.map (fun (l,i) -> (Array.of_list l,i)) (list_data d)) + TSL.test (list_data d) let () = diff --git a/testsuite/tests/lib-hashtbl/htbl.reference b/testsuite/tests/lib-hashtbl/htbl.reference index 653fbc561c50..45f56290837d 100644 --- a/testsuite/tests/lib-hashtbl/htbl.reference +++ b/testsuite/tests/lib-hashtbl/htbl.reference @@ -22,26 +22,6 @@ Removal: passed Insertion: passed Insertion: passed Removal: passed --- Weak K1 -- Strings, functorial interface -Insertion: passed -Insertion: passed -Removal: passed --- Weak K1 -- Pairs of strings -Insertion: passed -Insertion: passed -Removal: passed --- Weak K2 -- Pairs of strings -Insertion: passed -Insertion: passed -Removal: passed --- Weak K1 -- Lists of strings -Insertion: passed 
-Insertion: passed -Removal: passed --- Weak Kn -- Arrays of strings -Insertion: passed -Insertion: passed -Removal: passed 1000 elements 100,2 200,4 diff --git a/testsuite/tests/lib-marshal/intern_final.ml b/testsuite/tests/lib-marshal/intern_final.ml index d50fb97832c8..ad886c573fec 100644 --- a/testsuite/tests/lib-marshal/intern_final.ml +++ b/testsuite/tests/lib-marshal/intern_final.ml @@ -14,13 +14,16 @@ let rec fill_minor accu = function | 0 -> accu | n -> fill_minor (n::accu) (n-1) +let rec callback c0 () = + seek_in c0 0; + let a = Marshal.from_channel c0 in + Array.iter (fun v -> assert (v = 0)) a + let () = let c0 = open_in_bin "data0" in let c42 = open_in_bin "data42" in - ignore (Gc.create_alarm (fun () -> - seek_in c0 0; - ignore (Marshal.from_channel c0))); + ignore (Gc.create_alarm (callback c0)); for i = 0 to 100000 do seek_in c42 0; diff --git a/testsuite/tests/lib-marshal/intext_par.ml b/testsuite/tests/lib-marshal/intext_par.ml new file mode 100644 index 000000000000..2efb84666559 --- /dev/null +++ b/testsuite/tests/lib-marshal/intext_par.ml @@ -0,0 +1,598 @@ +(* TEST + modules = "intextaux_par.c" +*) + +(* Test for output_value / input_value *) + +let max_data_depth = 500000 + +type t = A | B of int | C of float | D of string | E of char + | F of t | G of t * t | H of int * t | I of t * float | J + +let longstring = +"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" +let verylongstring = +"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ + 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + +let bigint = 
Int64.to_int 0x123456789ABCDEF0L + +let rec fib n = + if n < 2 then 1 else fib(n-1) + fib(n-2) + +let test_out filename = + let oc = open_out_bin filename in + output_value oc 1; + output_value oc (-1); + output_value oc 258; + output_value oc 20000; + output_value oc 0x12345678; + output_value oc bigint; + output_value oc "foobargeebuz"; + output_value oc longstring; + output_value oc verylongstring; + output_value oc 3.141592654; + output_value oc (); + output_value oc A; + output_value oc (B 1); + output_value oc (C 2.718); + output_value oc (D "hello, world!"); + output_value oc (E 'l'); + output_value oc (F(B 1)); + output_value oc (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))); + output_value oc (H(1, A)); + output_value oc (I(B 2, 1e-6)); + let x = D "sharing" in + let y = G(x, x) in + let z = G(y, G(x, y)) in + output_value oc z; + output_value oc [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|]; + let rec big n = if n <= 0 then A else H(n, big(n-1)) in + output_value oc (big 1000); + Marshal.to_channel oc y [Marshal.No_sharing]; + Marshal.to_channel oc fib [Marshal.Closures]; + output_value oc (Int32.of_string "0"); + output_value oc (Int32.of_string "123456"); + output_value oc (Int32.of_string "-123456"); + output_value oc (Int64.of_string "0"); + output_value oc (Int64.of_string "123456789123456"); + output_value oc (Int64.of_string "-123456789123456"); + output_value oc (Nativeint.of_string "0"); + output_value oc (Nativeint.of_string "123456"); + output_value oc (Nativeint.of_string "-123456"); + output_value oc (Nativeint.shift_left (Nativeint.of_string "123456789") 32); + output_value oc (Nativeint.shift_left (Nativeint.of_string "-123456789") 32); + let i = Int64.of_string "123456789123456" in output_value oc (i,i); + close_out oc + + +let test n b = + if not b then begin + print_string "Test "; + print_int n; + print_string " FAILED.\n"; + flush stderr + end + +let test_in filename = + let ic = open_in_bin filename in + test 1 (input_value ic = 1); + 
test 2 (input_value ic = (-1)); + test 3 (input_value ic = 258); + test 4 (input_value ic = 20000); + test 5 (input_value ic = 0x12345678); + test 6 (input_value ic = bigint); + test 7 (input_value ic = "foobargeebuz"); + test 8 (input_value ic = longstring); + test 9 (input_value ic = verylongstring); + test 10 (input_value ic = 3.141592654); + test 11 (input_value ic = ()); + test 12 (match input_value ic with + A -> true + | _ -> false); + test 13 (match input_value ic with + (B 1) -> true + | _ -> false); + test 14 (match input_value ic with + (C f) -> f = 2.718 + | _ -> false); + test 15 (match input_value ic with + (D "hello, world!") -> true + | _ -> false); + test 16 (match input_value ic with + (E 'l') -> true + | _ -> false); + test 17 (match input_value ic with + (F(B 1)) -> true + | _ -> false); + test 18 (match input_value ic with + (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) -> true + | _ -> false); + test 19 (match input_value ic with + (H(1, A)) -> true + | _ -> false); + test 20 (match input_value ic with + (I(B 2, 1e-6)) -> true + | _ -> false); + test 21 (match input_value ic with + G((G((D "sharing" as t1), t2) as t3), G(t4, t5)) -> + t1 == t2 && t3 == t5 && t4 == t1 + | _ -> false); + test 22 (input_value ic = [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|]); + let rec check_big n t = + if n <= 0 then + test 23 (match t with A -> true | _ -> false) + else + match t with H(m, s) -> if m = n then check_big (n-1) s + else test 23 false + | _ -> test 23 false + in + check_big 1000 (input_value ic); + test 24 (match input_value ic with + G((D "sharing" as t1), (D "sharing" as t2)) -> t1 != t2 + | _ -> false); + test 25 (let fib = (input_value ic : int -> int) in fib 5 = 8 && fib 10 = 89); + test 26 (input_value ic = Int32.of_string "0"); + test 27 (input_value ic = Int32.of_string "123456"); + test 28 (input_value ic = Int32.of_string "-123456"); + test 29 (input_value ic = Int64.of_string "0"); + test 30 (input_value ic = Int64.of_string 
"123456789123456"); + test 31 (input_value ic = Int64.of_string "-123456789123456"); + test 32 (input_value ic = Nativeint.of_string "0"); + test 33 (input_value ic = Nativeint.of_string "123456"); + test 34 (input_value ic = Nativeint.of_string "-123456"); + test 35 (input_value ic = + Nativeint.shift_left (Nativeint.of_string "123456789") 32); + test 36 (input_value ic = + Nativeint.shift_left (Nativeint.of_string "-123456789") 32); + let ((i, j) : int64 * int64) = input_value ic in + test 37 (i = Int64.of_string "123456789123456"); + test 38 (j = Int64.of_string "123456789123456"); + test 39 (i == j); + close_in ic + +let test_string () = + let s = Marshal.to_string 1 [] in + test 101 (Marshal.from_string s 0 = 1); + let s = Marshal.to_string (-1) [] in + test 102 (Marshal.from_string s 0 = (-1)); + let s = Marshal.to_string 258 [] in + test 103 (Marshal.from_string s 0 = 258); + let s = Marshal.to_string 20000 [] in + test 104 (Marshal.from_string s 0 = 20000); + let s = Marshal.to_string 0x12345678 [] in + test 105 (Marshal.from_string s 0 = 0x12345678); + let s = Marshal.to_string bigint [] in + test 106 (Marshal.from_string s 0 = bigint); + let s = Marshal.to_string "foobargeebuz" [] in + test 107 (Marshal.from_string s 0 = "foobargeebuz"); + let s = Marshal.to_string longstring [] in + test 108 (Marshal.from_string s 0 = longstring); + let s = Marshal.to_string verylongstring [] in + test 109 (Marshal.from_string s 0 = verylongstring); + let s = Marshal.to_string 3.141592654 [] in + test 110 (Marshal.from_string s 0 = 3.141592654); + let s = Marshal.to_string () [] in + test 111 (Marshal.from_string s 0 = ()); + let s = Marshal.to_string A [] in + test 112 (match Marshal.from_string s 0 with + A -> true + | _ -> false); + let s = Marshal.to_string (B 1) [] in + test 113 (match Marshal.from_string s 0 with + (B 1) -> true + | _ -> false); + let s = Marshal.to_string (C 2.718) [] in + test 114 (match Marshal.from_string s 0 with + (C f) -> f = 2.718 + | _ -> 
false); + let s = Marshal.to_string (D "hello, world!") [] in + test 115 (match Marshal.from_string s 0 with + (D "hello, world!") -> true + | _ -> false); + let s = Marshal.to_string (E 'l') [] in + test 116 (match Marshal.from_string s 0 with + (E 'l') -> true + | _ -> false); + let s = Marshal.to_string (F(B 1)) [] in + test 117 (match Marshal.from_string s 0 with + (F(B 1)) -> true + | _ -> false); + let s = Marshal.to_string (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) [] in + test 118 (match Marshal.from_string s 0 with + (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) -> true + | _ -> false); + let s = Marshal.to_string (H(1, A)) [] in + test 119 (match Marshal.from_string s 0 with + (H(1, A)) -> true + | _ -> false); + let s = Marshal.to_string (I(B 2, 1e-6)) [] in + test 120 (match Marshal.from_string s 0 with + (I(B 2, 1e-6)) -> true + | _ -> false); + let x = D "sharing" in + let y = G(x, x) in + let z = G(y, G(x, y)) in + let s = Marshal.to_string z [] in + test 121 (match Marshal.from_string s 0 with + G((G((D "sharing" as t1), t2) as t3), G(t4, t5)) -> + t1 == t2 && t3 == t5 && t4 == t1 + | _ -> false); + let s = Marshal.to_string [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|] [] in + test 122 + (Marshal.from_string s 0 = [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|]); + let rec big n = if n <= 0 then A else H(n, big(n-1)) in + let s = Marshal.to_string (big 1000) [] in + let rec check_big n t = + if n <= 0 then + test 123 (match t with A -> true | _ -> false) + else + match t with H(m, s) -> if m = n then check_big (n-1) s + else test 123 false + | _ -> test 123 false + in + check_big 1000 (Marshal.from_string s 0) + +let marshal_to_buffer s start len v flags = + ignore (Marshal.to_buffer s start len v flags) +;; + +let test_buffer () = + let s = Bytes.create 512 in + marshal_to_buffer s 0 512 1 []; + test 201 (Marshal.from_bytes s 0 = 1); + marshal_to_buffer s 0 512 (-1) []; + test 202 (Marshal.from_bytes s 0 = (-1)); + marshal_to_buffer s 0 512 258 []; + 
test 203 (Marshal.from_bytes s 0 = 258); + marshal_to_buffer s 0 512 20000 []; + test 204 (Marshal.from_bytes s 0 = 20000); + marshal_to_buffer s 0 512 0x12345678 []; + test 205 (Marshal.from_bytes s 0 = 0x12345678); + marshal_to_buffer s 0 512 bigint []; + test 206 (Marshal.from_bytes s 0 = bigint); + marshal_to_buffer s 0 512 "foobargeebuz" []; + test 207 (Marshal.from_bytes s 0 = "foobargeebuz"); + marshal_to_buffer s 0 512 longstring []; + test 208 (Marshal.from_bytes s 0 = longstring); + test 209 + (try marshal_to_buffer s 0 512 verylongstring []; false + with Failure s when s = "Marshal.to_buffer: buffer overflow" -> true); + marshal_to_buffer s 0 512 3.141592654 []; + test 210 (Marshal.from_bytes s 0 = 3.141592654); + marshal_to_buffer s 0 512 () []; + test 211 (Marshal.from_bytes s 0 = ()); + marshal_to_buffer s 0 512 A []; + test 212 (match Marshal.from_bytes s 0 with + A -> true + | _ -> false); + marshal_to_buffer s 0 512 (B 1) []; + test 213 (match Marshal.from_bytes s 0 with + (B 1) -> true + | _ -> false); + marshal_to_buffer s 0 512 (C 2.718) []; + test 214 (match Marshal.from_bytes s 0 with + (C f) -> f = 2.718 + | _ -> false); + marshal_to_buffer s 0 512 (D "hello, world!") []; + test 215 (match Marshal.from_bytes s 0 with + (D "hello, world!") -> true + | _ -> false); + marshal_to_buffer s 0 512 (E 'l') []; + test 216 (match Marshal.from_bytes s 0 with + (E 'l') -> true + | _ -> false); + marshal_to_buffer s 0 512 (F(B 1)) []; + test 217 (match Marshal.from_bytes s 0 with + (F(B 1)) -> true + | _ -> false); + marshal_to_buffer s 0 512 (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) []; + test 218 (match Marshal.from_bytes s 0 with + (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) -> true + | _ -> false); + marshal_to_buffer s 0 512 (H(1, A)) []; + test 219 (match Marshal.from_bytes s 0 with + (H(1, A)) -> true + | _ -> false); + marshal_to_buffer s 0 512 (I(B 2, 1e-6)) []; + test 220 (match Marshal.from_bytes s 0 with + (I(B 2, 1e-6)) -> true + | _ -> 
false); + let x = D "sharing" in + let y = G(x, x) in + let z = G(y, G(x, y)) in + marshal_to_buffer s 0 512 z []; + test 221 (match Marshal.from_bytes s 0 with + G((G((D "sharing" as t1), t2) as t3), G(t4, t5)) -> + t1 == t2 && t3 == t5 && t4 == t1 + | _ -> false); + marshal_to_buffer s 0 512 [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|] []; + test 222 + (Marshal.from_bytes s 0 = [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|]); + let rec big n = if n <= 0 then A else H(n, big(n-1)) in + test 223 + (try marshal_to_buffer s 0 512 (big 1000) []; false + with Failure s when s = "Marshal.to_buffer: buffer overflow" -> true) + +let test_size() = + let s = Marshal.to_bytes (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) [] in + test 300 (Marshal.header_size + Marshal.data_size s 0 = Bytes.length s) + +external marshal_to_block : int -> 'a -> Marshal.extern_flags list -> unit + = "marshal_to_block" +external marshal_from_block : int -> 'a = "marshal_from_block" + +let test_block () = + marshal_to_block 512 1 []; + test 401 (marshal_from_block 512 = 1); + marshal_to_block 512 (-1) []; + test 402 (marshal_from_block 512 = (-1)); + marshal_to_block 512 258 []; + test 403 (marshal_from_block 512 = 258); + marshal_to_block 512 20000 []; + test 404 (marshal_from_block 512 = 20000); + marshal_to_block 512 0x12345678 []; + test 405 (marshal_from_block 512 = 0x12345678); + marshal_to_block 512 bigint []; + test 406 (marshal_from_block 512 = bigint); + marshal_to_block 512 "foobargeebuz" []; + test 407 (marshal_from_block 512 = "foobargeebuz"); + marshal_to_block 512 longstring []; + test 408 (marshal_from_block 512 = longstring); + test 409 + (try marshal_to_block 512 verylongstring []; false + with Failure s when s = "Marshal.to_buffer: buffer overflow" -> true); + marshal_to_block 512 3.141592654 []; + test 410 (marshal_from_block 512 = 3.141592654); + marshal_to_block 512 () []; + test 411 (marshal_from_block 512 = ()); + marshal_to_block 512 A []; + test 412 (match marshal_from_block 
512 with + A -> true + | _ -> false); + marshal_to_block 512 (B 1) []; + test 413 (match marshal_from_block 512 with + (B 1) -> true + | _ -> false); + marshal_to_block 512 (C 2.718) []; + test 414 (match marshal_from_block 512 with + (C f) -> f = 2.718 + | _ -> false); + marshal_to_block 512 (D "hello, world!") []; + test 415 (match marshal_from_block 512 with + (D "hello, world!") -> true + | _ -> false); + marshal_to_block 512 (E 'l') []; + test 416 (match marshal_from_block 512 with + (E 'l') -> true + | _ -> false); + marshal_to_block 512 (F(B 1)) []; + test 417 (match marshal_from_block 512 with + (F(B 1)) -> true + | _ -> false); + marshal_to_block 512 (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) []; + test 418 (match marshal_from_block 512 with + (G(A, G(B 2, G(C 3.14, G(D "glop", E 'e'))))) -> true + | _ -> false); + marshal_to_block 512 (H(1, A)) []; + test 419 (match marshal_from_block 512 with + (H(1, A)) -> true + | _ -> false); + marshal_to_block 512 (I(B 2, 1e-6)) []; + test 420 (match marshal_from_block 512 with + (I(B 2, 1e-6)) -> true + | _ -> false); + let x = D "sharing" in + let y = G(x, x) in + let z = G(y, G(x, y)) in + marshal_to_block 512 z []; + test 421 (match marshal_from_block 512 with + G((G((D "sharing" as t1), t2) as t3), G(t4, t5)) -> + t1 == t2 && t3 == t5 && t4 == t1 + | _ -> false); + marshal_to_block 512 [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|] []; + test 422 (marshal_from_block 512 = + [|1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16|]); + let rec big n = if n <= 0 then A else H(n, big(n-1)) in + test 423 + (try marshal_to_block 512 (big 1000) []; false + with Failure _ -> true); + test 424 + (try marshal_to_block 512 "Hello, world!" 
[]; + ignore (marshal_from_block 8); + false + with Failure _ -> true) + +(* Test for really big objects *) + +(* Test for really deep data structures *) +let test_deep () = + (* Right-leaning *) + let rec loop acc i = + if i < max_data_depth + then loop (i :: acc) (i+1) + else acc in + let x = loop [] 0 in + let s = Marshal.to_string x [] in + test 425 (Marshal.from_string s 0 = x); + (* Left-leaning *) + let rec loop acc i = + if i < max_data_depth + then loop (G(acc, B i)) (i+1) + else acc in + let x = loop A 0 in + let s = Marshal.to_string x [] in + test 426 (Marshal.from_string s 0 = x) + +(* Test for objects *) +class foo = object (self : 'self) + val data1 = "foo" + val data2 = "bar" + val data3 = 42L + method test1 = data1 ^ data2 + method test2 = false + method test3 = self#test1 + method test4 = data3 +end + +class bar = object (self : 'self) + inherit foo as super + val! data2 = "test5" + val data4 = "test3" + val data5 = "test4" + method test1 = + data1 + ^ data2 + ^ data4 + ^ data5 + ^ Int64.to_string self#test4 +end + +class foobar = object (self : 'self) + inherit foo as super + inherit! 
bar +end + +(* Test for objects *) +let test_objects () = + let x = new foo in + let s = Marshal.to_string x [Marshal.Closures] in + let x = Marshal.from_string s 0 in + test 500 (x#test1 = "foobar"); + test 501 (x#test2 = false); + test 502 (x#test3 = "foobar"); + test 503 (x#test4 = 42L); + let x = new bar in + let s = Marshal.to_string x [Marshal.Closures] in + let x = Marshal.from_string s 0 in + test 504 (x#test1 = "footest5test3test442"); + test 505 (x#test2 = false); + test 506 (x#test3 = "footest5test3test442"); + test 507 (x#test4 = 42L); + let x0 = new foobar in + let s = Marshal.to_string x0 [Marshal.Closures] in + let x = Marshal.from_string s 0 in + test 508 (x#test1 = "footest5test3test442"); + test 509 (x#test2 = false); + test 510 (x#test3 = "footest5test3test442"); + test 511 (x#test4 = 42L); + test 512 (Oo.id x = Oo.id x0 + 1) (* PR#5610 *) + +(* Test for infix pointers *) +let test_infix () = + let t = true and + f = false in + let rec odd n = + if n = 0 + then f + else even (n-1) + and even n = + if n = 0 + then t + else odd (n-1) + in + let s = Marshal.to_string (odd, even) [Marshal.Closures] in + let (odd', even': (int -> bool) * (int -> bool)) = Marshal.from_string s 0 in + test 600 (odd' 41 = true); + test 601 (odd' 41 = odd 41); + test 602 (odd' 142 = false); + test 603 (odd' 142 = odd 142); + test 604 (even' 41 = false); + test 605 (even' 41 = even 41); + test 606 (even' 142 = true); + test 607 (even' 142 = even 142) + + +let test_mutual_rec_regression () = + (* this regression was reported by Cedric Pasteur in PR#5772 *) + let rec test_one q x = x > 3 + and test_list q = List.for_all (test_one q) q in + let g () = () in + let f q = if test_list q then g () in + + test 700 (try ignore (Marshal.to_string f [Marshal.Closures]); true + with _ -> false) + +let test_end_of_file_regression file = + (* See PR#7142 *) + let write oc n = + for k = 0 to n - 1 do + Marshal.to_channel oc k [] + done + in + let read ic n = + let k = ref 0 in + try + 
while true do + if Marshal.from_channel ic != !k then + failwith "unexpected integer"; + incr k + done + with + | End_of_file when !k != n -> failwith "missing integer" + | End_of_file -> () + in + test 800 ( + try + let n = 100 in + let oc = open_out_bin file in + write oc n; + close_out oc; + + let ic = open_in_bin file in + try + read ic n; + close_in ic; + true + with _ -> + close_in ic; + false + with _ -> false + ) + +external init_buggy_custom_ops : unit -> unit = + "init_buggy_custom_ops" +let () = init_buggy_custom_ops () +type buggy +external value_with_buggy_serialiser : unit -> buggy = + "value_with_buggy_serialiser" +let test_buggy_serialisers () = + let x = value_with_buggy_serialiser () in + let s = Marshal.to_string x [] in + match Marshal.from_string s 0 with + | exception (Failure _) -> () + | _ -> + failwith "Marshalling should not have succeeded with a bad serialiser!" + +let main_domain id = + let file = Format.sprintf "intext_%d.data" id in + test_out file; test_in file; + test_out file; test_in file; + test_string(); + test_buffer(); + test_size(); + test_block(); + test_deep(); + test_objects(); + test_infix (); + test_mutual_rec_regression (); + test_end_of_file_regression file; + test_buggy_serialisers (); + Sys.remove file; + () + +let main () = + let domains = Array.init 8 (fun id -> + Domain.spawn (fun () -> main_domain id)) + in + Array.iter Domain.join domains; + print_endline "OK"; + exit 0 + +let _ = main () diff --git a/testsuite/tests/lib-marshal/intext_par.reference b/testsuite/tests/lib-marshal/intext_par.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-marshal/intext_par.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lib-marshal/intextaux_par.c b/testsuite/tests/lib-marshal/intextaux_par.c new file mode 100644 index 000000000000..710015749630 --- /dev/null +++ b/testsuite/tests/lib-marshal/intextaux_par.c @@ -0,0 +1,69 @@ 
+/**************************************************************************/ +/* */ +/* OCaml */ +/* */ +/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ +/* */ +/* Copyright 2001 Institut National de Recherche en Informatique et */ +/* en Automatique. */ +/* */ +/* All rights reserved. This file is distributed under the terms of */ +/* the GNU Lesser General Public License version 2.1, with the */ +/* special exception on linking described in the file LICENSE. */ +/* */ +/**************************************************************************/ + +#include +#include +#include + +#define CAML_INTERNALS + +#define BLOCK_SIZE 512 +static __thread char marshal_block[BLOCK_SIZE]; + +value marshal_to_block(value vlen, value v, value vflags) +{ + CAMLassert(Long_val(vlen) <= BLOCK_SIZE); + caml_output_value_to_block(v, vflags, marshal_block, Long_val(vlen)); + return Val_unit; +} + +value marshal_from_block(value vlen) +{ + CAMLassert(Long_val(vlen) <= BLOCK_SIZE); + return caml_input_value_from_block(marshal_block, Long_val(vlen)); +} + +static void bad_serialize(value v, uintnat* sz_32, uintnat* sz_64) +{ + caml_serialize_int_4(42); + *sz_32 = *sz_64 = 100; +} + +static uintnat bad_deserialize(void* dst) +{ + return 10; +} + +static struct custom_operations buggy_ops = { + "foo", + custom_finalize_default, + custom_compare_default, + custom_hash_default, + bad_serialize, + bad_deserialize, + custom_compare_ext_default, + custom_fixed_length_default +}; + +value init_buggy_custom_ops() +{ + caml_register_custom_operations(&buggy_ops); + return Val_unit; +} + +value value_with_buggy_serialiser() +{ + return caml_alloc_custom(&buggy_ops, 20, 0, 1); +} diff --git a/testsuite/tests/lib-obj/reachable_words_bug.ml b/testsuite/tests/lib-obj/reachable_words_bug.ml new file mode 100644 index 000000000000..15969eb4210c --- /dev/null +++ b/testsuite/tests/lib-obj/reachable_words_bug.ml @@ -0,0 +1,9 @@ +(* TEST +*) + +let _ = + (* In 4.13 this causes Obj.reachable_words to 
segfault + because of a missing initialization in caml_obj_reachable_words *) + ignore (Marshal.(to_string 123 [No_sharing])); + let n = Obj.reachable_words (Obj.repr (Array.init 10 (fun i -> i))) in + assert (n = 11) diff --git a/testsuite/tests/lib-obj/reachable_words_np.ml b/testsuite/tests/lib-obj/reachable_words_np.ml deleted file mode 100644 index 8a50268d2060..000000000000 --- a/testsuite/tests/lib-obj/reachable_words_np.ml +++ /dev/null @@ -1,21 +0,0 @@ -(* TEST - * naked_pointers - ** bytecode - ** native -*) - -let native = - match Sys.backend_type with - | Sys.Native -> true - | Sys.Bytecode -> false - | Sys.Other s -> print_endline s; assert false - -let size x = Obj.reachable_words (Obj.repr x) - -let expect_size s x = - let i = size x in - if i <> s then - Printf.printf "size = %i; expected = %i\n%!" i s - -let () = - expect_size (if native then 0 else 3) (1, 2) diff --git a/testsuite/tests/lib-str/parallel.ml b/testsuite/tests/lib-str/parallel.ml new file mode 100644 index 000000000000..23f2a5f7deda --- /dev/null +++ b/testsuite/tests/lib-str/parallel.ml @@ -0,0 +1,39 @@ +(* TEST +* hasstr +include str +** bytecode +** native +*) + +let total = Atomic.make 0 + +let run str () = + let re = Str.regexp str in + let input = "The quick brown fox jumped over the lazy_t" in + match Str.search_forward re input 0 with + | exception Not_found -> Atomic.decr total + | _ -> + let s = Str.matched_group 0 input in + if not (String.equal s str) then + Atomic.decr total + else + Atomic.incr total + +let _ = + (* generate a set of cases matching the reference input or not. 
*) + let domain_params = List.init 7 + (fun i -> if i mod 2 == 0 then "the lazy_t" else "the lazy dog") + in + for i = 0 to 3 do + let domains = + List.map (fun param -> Domain.spawn (run param)) domain_params + in + (* domain 0 is an "odd" case, required to achieve a neutral total by the end *) + run "the lazy dog" (); + List.iter Domain.join domains + done; + let total' = Atomic.get total in + if total' != 0 then + Printf.eprintf "NOK: total is not 0: %d\n" total' + else + print_endline "OK" diff --git a/testsuite/tests/lib-str/parallel.reference b/testsuite/tests/lib-str/parallel.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-str/parallel.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lib-string/test_string.ml b/testsuite/tests/lib-string/test_string.ml index 003236f46c59..07bdd28c72c9 100644 --- a/testsuite/tests/lib-string/test_string.ml +++ b/testsuite/tests/lib-string/test_string.ml @@ -52,16 +52,4 @@ let () = while !sz <= 0 do push big l; sz += Sys.max_string_length done; try ignore (String.concat "" !l); assert false with Invalid_argument _ -> (); - assert(String.starts_with ~prefix:"foob" "foobarbaz"); - assert(String.starts_with ~prefix:"" "foobarbaz"); - assert(String.starts_with ~prefix:"" ""); - assert(not (String.starts_with ~prefix:"foobar" "bar")); - assert(not (String.starts_with ~prefix:"foo" "")); - assert(not (String.starts_with ~prefix:"fool" "foobar")); - assert(String.ends_with ~suffix:"baz" "foobarbaz"); - assert(String.ends_with ~suffix:"" "foobarbaz"); - assert(String.ends_with ~suffix:"" ""); - assert(not (String.ends_with ~suffix:"foobar" "bar")); - assert(not (String.ends_with ~suffix:"foo" "")); - assert(not (String.ends_with ~suffix:"obaz" "foobar")); end diff --git a/testsuite/tests/lib-sync/prodcons.ml b/testsuite/tests/lib-sync/prodcons.ml new file mode 100644 index 000000000000..2e2c09756cee --- /dev/null +++ b/testsuite/tests/lib-sync/prodcons.ml @@ -0,0 +1,65 @@ 
+(* TEST +*) + +(* Classic producer-consumer *) + +type 'a prodcons = + { buffer: 'a array; + lock: Mutex.t; + mutable readpos: int; + mutable writepos: int; + notempty: Condition.t; + notfull: Condition.t } + +let create size init = + { buffer = Array.make size init; + lock = Mutex.create(); + readpos = 0; + writepos = 0; + notempty = Condition.create(); + notfull = Condition.create() } + +let put p data = + Mutex.lock p.lock; + while (p.writepos + 1) mod Array.length p.buffer = p.readpos do + Condition.wait p.notfull p.lock + done; + p.buffer.(p.writepos) <- data; + p.writepos <- (p.writepos + 1) mod Array.length p.buffer; + Condition.signal p.notempty; + Mutex.unlock p.lock + +let get p = + Mutex.lock p.lock; + while p.writepos = p.readpos do + Condition.wait p.notempty p.lock + done; + let data = p.buffer.(p.readpos) in + p.readpos <- (p.readpos + 1) mod Array.length p.buffer; + Condition.signal p.notfull; + Mutex.unlock p.lock; + data + +(* Test *) + +let rec produce buff n max = + put buff n; + if n < max then produce buff (n+1) max + +let rec consume buff cur max = + let n = get buff in + if n <> cur then false + else if n = max then true + else consume buff (cur + 1) max + +let _ = + let buff1 = create 20 0 and buff2 = create 30 0 in + let ok1 = ref false and ok2 = ref false in + let _p1 = Domain.spawn (fun () -> produce buff1 0 10000) + and _p2 = Domain.spawn (fun () -> produce buff2 0 8000) + and c1 = Domain.spawn (fun () -> ok1 := consume buff1 0 10000) in + ok2 := consume buff2 0 8000; + Domain.join c1; + if !ok1 && !ok2 + then print_string "passed\n" + else print_string "FAILED\n" diff --git a/testsuite/tests/lib-sync/prodcons.reference b/testsuite/tests/lib-sync/prodcons.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/lib-sync/prodcons.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/lib-sync/trylock.ml b/testsuite/tests/lib-sync/trylock.ml new file mode 100644 index 
000000000000..314f1142caeb --- /dev/null +++ b/testsuite/tests/lib-sync/trylock.ml @@ -0,0 +1,13 @@ +(* TEST +*) + +(* Test Mutex.try_lock *) + +let () = + let m = Mutex.create () in + Mutex.lock m; + let res = Mutex.try_lock m in + if res = false then + print_endline "passed" + else + print_endline "FAILED (try_lock returned true)" diff --git a/testsuite/tests/lib-sync/trylock.reference b/testsuite/tests/lib-sync/trylock.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/lib-sync/trylock.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/lib-sync/trylock2.ml b/testsuite/tests/lib-sync/trylock2.ml new file mode 100644 index 000000000000..b31ace08b564 --- /dev/null +++ b/testsuite/tests/lib-sync/trylock2.ml @@ -0,0 +1,10 @@ +(* TEST +*) + +(* Test Mutex.try_lock *) + +let () = + let m = Mutex.create () in + assert (Mutex.try_lock m); + Mutex.unlock m; + print_endline "passed" diff --git a/testsuite/tests/lib-sync/trylock2.reference b/testsuite/tests/lib-sync/trylock2.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/lib-sync/trylock2.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/lib-systhreads/multicore_lifecycle.ml b/testsuite/tests/lib-systhreads/multicore_lifecycle.ml new file mode 100644 index 000000000000..f48146490696 --- /dev/null +++ b/testsuite/tests/lib-systhreads/multicore_lifecycle.ml @@ -0,0 +1,19 @@ +(* TEST +* hassysthreads +include systhreads +** bytecode +** native +*) + +let _ = + let t = ref (Thread.self ()) in + let d = Domain.spawn begin fun () -> + let thread_func () = Unix.sleep 5 in + let tt = Thread.create thread_func () in + t := tt; + () + end + in + Domain.join d; + Thread.join (!t); + Domain.join @@ Domain.spawn (fun () -> print_endline "ok") diff --git a/testsuite/tests/lib-systhreads/multicore_lifecycle.reference b/testsuite/tests/lib-systhreads/multicore_lifecycle.reference new file mode 100644 index 
000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/lib-systhreads/multicore_lifecycle.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/lib-systhreads/test_c_thread_register.ml b/testsuite/tests/lib-systhreads/test_c_thread_register.ml new file mode 100644 index 000000000000..a8ec98aa9a7d --- /dev/null +++ b/testsuite/tests/lib-systhreads/test_c_thread_register.ml @@ -0,0 +1,18 @@ +(* TEST + modules = "test_c_thread_register_cstubs.c" + * hassysthreads + include systhreads + ** not-bsd + *** bytecode + *** native +*) + +(* spins a external thread from C and register it to the OCaml runtime *) + +external spawn_thread : (unit -> unit) -> unit = "spawn_thread" + +let passed () = Printf.printf "passed\n" + +let _ = + spawn_thread (passed); + Thread.delay 0.5 diff --git a/testsuite/tests/lib-systhreads/test_c_thread_register.reference b/testsuite/tests/lib-systhreads/test_c_thread_register.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/lib-systhreads/test_c_thread_register.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/lib-systhreads/test_c_thread_register_cstubs.c b/testsuite/tests/lib-systhreads/test_c_thread_register_cstubs.c new file mode 100644 index 000000000000..9fd78589785d --- /dev/null +++ b/testsuite/tests/lib-systhreads/test_c_thread_register_cstubs.c @@ -0,0 +1,27 @@ +#include +#include +#include "caml/mlvalues.h" +#include "caml/gc.h" +#include "caml/memory.h" +#include "caml/callback.h" +#include "threads.h" + +void *thread_func(void *fn) { + caml_c_thread_register(); + caml_acquire_runtime_system(); + caml_callback((value) fn, Val_unit); + caml_release_runtime_system(); + caml_c_thread_unregister(); + return 0; +} + +value spawn_thread(value clos) +{ + pthread_t thr; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + pthread_create(&thr, &attr, thread_func, (void *) clos); + return Val_unit; +} diff 
--git a/testsuite/tests/lib-systhreads/testfork2.ml b/testsuite/tests/lib-systhreads/testfork2.ml new file mode 100644 index 000000000000..9f95024eee35 --- /dev/null +++ b/testsuite/tests/lib-systhreads/testfork2.ml @@ -0,0 +1,37 @@ +(* TEST + * hassysthreads + include systhreads + ** not-bsd + *** libunix + **** bytecode + **** native +*) + +(* POSIX threads and fork() *) + +let alloc_string () = ignore(String.make 2048 '0') + +let compute_thread () = + Thread.create begin fun () -> + alloc_string () + end () + +let fork () = + match Unix.fork() with + | 0 -> + alloc_string (); + print_string "passed"; + print_newline (); + Thread.delay 1.0; + exit 0 + | pid -> + Thread.delay 4.0; + exit 0 + +let main () = + ignore(compute_thread ()); + ignore(compute_thread ()); + ignore(compute_thread ()); + fork () + +let _ = main() diff --git a/testsuite/tests/lib-systhreads/testfork2.reference b/testsuite/tests/lib-systhreads/testfork2.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/lib-systhreads/testfork2.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/lib-threads/beat.ml b/testsuite/tests/lib-threads/beat.ml index 1bf88332d391..06120ab961e5 100644 --- a/testsuite/tests/lib-threads/beat.ml +++ b/testsuite/tests/lib-threads/beat.ml @@ -2,6 +2,8 @@ * hassysthreads include systhreads +* skip +reason = "off-by-one error on MacOS+Clang (#408)" ** bytecode ** native diff --git a/testsuite/tests/lib-threads/uncaught_exception_handler.reference b/testsuite/tests/lib-threads/uncaught_exception_handler.reference index 83d8bd06641d..34a8e2d7f6cc 100644 --- a/testsuite/tests/lib-threads/uncaught_exception_handler.reference +++ b/testsuite/tests/lib-threads/uncaught_exception_handler.reference @@ -1,12 +1,12 @@ Thread 1 killed on uncaught exception Uncaught_exception_handler.CallbackExn Raised at Uncaught_exception_handler.fn in file "uncaught_exception_handler.ml", line 28, characters 12-113 -Called from 
Thread.create.(fun) in file "thread.ml", line 47, characters 8-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 [thread 2] caught Uncaught_exception_handler.CallbackExn Raised at Uncaught_exception_handler.fn in file "uncaught_exception_handler.ml", line 28, characters 12-113 -Called from Thread.create.(fun) in file "thread.ml", line 47, characters 8-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 Thread 2 killed on uncaught exception Uncaught_exception_handler.CallbackExn Raised at Uncaught_exception_handler.fn in file "uncaught_exception_handler.ml", line 28, characters 12-113 -Called from Thread.create.(fun) in file "thread.ml", line 47, characters 8-14 +Called from Thread.create.(fun) in file "thread.ml", line 49, characters 8-14 Thread 2 uncaught exception handler raised Uncaught_exception_handler.UncaughtHandlerExn Raised at Uncaught_exception_handler.handler in file "uncaught_exception_handler.ml", line 26, characters 2-26 -Called from Thread.create.(fun) in file "thread.ml", line 53, characters 10-41 +Called from Thread.create.(fun) in file "thread.ml", line 55, characters 10-41 diff --git a/testsuite/tests/lib-unix/common/fork_cleanup.ml b/testsuite/tests/lib-unix/common/fork_cleanup.ml new file mode 100644 index 000000000000..eca8f9de9d35 --- /dev/null +++ b/testsuite/tests/lib-unix/common/fork_cleanup.ml @@ -0,0 +1,19 @@ +(* TEST +* hasunix +include unix +** not-windows +*** bytecode +*** native +*) + +(* this test checks that the domain lock is properly reinitialized + in the child process after fork. 
+ See: https://github.com/ocaml-multicore/ocaml-multicore/issues/471 *) + +let () = + let fd = Unix.dup Unix.stdout in + let ret = Unix.fork () in + if ret = 0 then + Unix.close fd + else + print_endline "OK" diff --git a/testsuite/tests/lib-unix/common/fork_cleanup.reference b/testsuite/tests/lib-unix/common/fork_cleanup.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-unix/common/fork_cleanup.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.ml b/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.ml new file mode 100644 index 000000000000..db0ebf4f6d92 --- /dev/null +++ b/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.ml @@ -0,0 +1,22 @@ +(* TEST +* hassysthreads +include systhreads +** not-windows +*** bytecode +*** native +*) + +(* this test checks that the domain lock is properly reinitialized + in the child process after fork. + See: https://github.com/ocaml-multicore/ocaml-multicore/issues/471 *) + +let () = + let th = Thread.create (fun () -> Thread.delay 0.5) () in + let fd = Unix.dup Unix.stdout in + match Unix.fork () with + | 0 -> + Unix.close fd; + print_endline "OK" + | _ -> + Unix.close fd; + Thread.join th diff --git a/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.reference b/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-unix/common/fork_cleanup_systhreads.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.ml b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.ml new file mode 100644 index 000000000000..9549f256ea53 --- /dev/null +++ b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.ml @@ -0,0 +1,22 @@ +(* TEST +include unix +* hasunix +** not-windows +*** bytecode +*** native +*) + +(* on Multicore, fork is not allowed is another domain is, 
and was running. *) +(* this test checks that we can't fork if a domain is currently running. *) + +let expect_exn ="Unix.fork may not be called while other domains were created" + +let () = + let _ = Domain.spawn (fun () -> Unix.sleep 1) in + match Unix.fork () with + | exception Failure msg -> + if String.equal msg expect_exn then + print_endline "OK" + else + Printf.printf "failed: expected Failure: %s, got %s\n" expect_exn msg + | _ -> print_endline "NOK" diff --git a/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.reference b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.ml b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.ml new file mode 100644 index 000000000000..929ed5ecf7e5 --- /dev/null +++ b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.ml @@ -0,0 +1,23 @@ +(* TEST +include unix +* hasunix +** not-windows +*** bytecode +*** native +*) + +(* on Multicore, fork is not allowed is another domain is, and was running. *) +(* this test checks that we can't fork if another domain ran before. 
*) + +let expect_exn ="Unix.fork may not be called while other domains were created" + +let () = + let d = Domain.spawn (fun () -> ()) in + Domain.join d; + match Unix.fork () with + | exception Failure msg -> + if String.equal msg expect_exn then + print_endline "OK" + else + Printf.printf "failed: expected Failure: %s, got %s\n" expect_exn msg + | _ -> print_endline "NOK" diff --git a/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.reference b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/lib-unix/common/multicore_fork_domain_alone2.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/misc/ephe_infix.ml b/testsuite/tests/misc/ephe_infix.ml deleted file mode 100644 index 5d8b5e9a548c..000000000000 --- a/testsuite/tests/misc/ephe_infix.ml +++ /dev/null @@ -1,29 +0,0 @@ -(* TEST *) - -(* Testing handling of infix_tag by ephemeron *) - -(* This test will have to be ported to the new ephemeron API *) -[@@@alert "-old_ephemeron_api"] - -let infix n = let rec f () = n and g () = f () in g - -(* Issue #9485 *) -let () = - let w = Weak.create 1 in - Weak.set w 0 (Some (infix 12)); - match Weak.get_copy w 0 with Some h -> ignore (h ()) | _ -> () - -(* Issue #7810 *) -let ephe x = - let open Ephemeron.K1 in - let e = create () in - set_key e x; - set_data e 42; - Gc.full_major (); - (x, get_data e) - -let () = - assert (ephe (ref 1000) = (ref 1000, Some 42)); - match ephe (infix 12) with - | (h, Some 42) -> () - | _ -> assert false diff --git a/testsuite/tests/misc/ephe_issue9391.ml b/testsuite/tests/misc/ephe_issue9391.ml deleted file mode 100644 index 00b5ca322bd8..000000000000 --- a/testsuite/tests/misc/ephe_issue9391.ml +++ /dev/null @@ -1,70 +0,0 @@ -(* TEST -*) - -(* This test is only relevant to the old ephemeron API *) -[@@@alert "-old_ephemeron_api"] - -let debug = false - -open Printf -open Ephemeron - -let empty = ref 0 -let make_ra 
~size = Array.init size (fun _ -> ref 1) [@@inline never] -let make_ephes ~size = Array.init size (fun _ -> Ephemeron.K1.create ()) [@@inline never] - -let test ~size ~slice = - let keys1 = make_ra ~size in - let keys2 = make_ra ~size in - let datas1 = make_ra ~size in - let datas2 = make_ra ~size in - let ephe1 = make_ephes ~size in - let ephe2 = make_ephes ~size in - if debug then Gc.set { (Gc.get ()) with Gc.verbose = 0x3 }; - (** Fill ephe.(i )from key.(i) to data.(i) *) - for i=0 to size-1 do Ephemeron.K1.set_key ephe1.(i) keys1.(i); done; - for i=0 to size-1 do Ephemeron.K1.set_data ephe1.(i) datas1.(i); done; - for i=0 to size-1 do Ephemeron.K1.set_key ephe2.(i) keys2.(i); done; - for i=0 to size-1 do Ephemeron.K1.set_data ephe2.(i) datas2.(i); done; - (** Push everything in the major heap *) - if debug then Printf.eprintf "Start minor major\n%!"; - Gc.minor (); - Gc.major (); - if debug then Printf.eprintf "start emptying\n%!"; - for i=0 to size-1 do keys1.(i) <- empty; done; - for i=0 to size-1 do datas1.(i) <- empty; done; - (** The emptying is done during a major so keys and data are kept alive by the - assignments. Restart a new major *) - Gc.major (); - if debug then Printf.eprintf "Start checking state\n%!"; - (** Fill the ephemeron with an alive key *) - if debug then Printf.eprintf "Start replacing dead key into alive one\n%!"; - (* Printf.eprintf "put in set (2) %i\n%!" (Gc.major_slice (10*4*slice*6)); *) - for i=0 to size-1 do - ignore (Gc.major_slice (4)); - if debug then Printf.eprintf "@%!"; - Ephemeron.K1.blit_data ephe1.(i) ephe2.(i); - if debug && 0 = i mod (size / 10) then Printf.eprintf "done %5i/%i\n%!" i size; - done; - if debug then Printf.eprintf "end\n%!"; - (** Finish all, assertion in clean phase should not find a dangling data *) - Gc.full_major (); - let r = ref 0 in - if debug then - for i=0 to size-1 do - if Ephemeron.K1.check_data ephe2.(size-1-i) then incr r; - if 0 = i mod (size / 10) then Printf.eprintf "done %5i/%i %i\n%!" 
i size !r; - done; - (* keep the arrays alive *) - assert (Array.length keys1 = size); - assert (Array.length keys2 = size); - assert (Array.length datas1 = size); - assert (Array.length datas2 = size); - assert (Array.length ephe1 = size); - assert (Array.length ephe2 = size) -[@@inline never] - -let () = - test ~size:1000 ~slice:5; - test ~size:1000 ~slice:10; - test ~size:1000 ~slice:15 diff --git a/testsuite/tests/misc/ephetest.ml b/testsuite/tests/misc/ephetest.ml deleted file mode 100644 index abb2873bc166..000000000000 --- a/testsuite/tests/misc/ephetest.ml +++ /dev/null @@ -1,174 +0,0 @@ -(* TEST -*) - -(* These tests will have to be ported to the new API *) -[@@@alert "-old_ephemeron_api"] - -let debug = false - -open Printf -open Ephemeron - -let is_true test s b = printf "%s %s: %s\n" test s (if b then "OK" else "FAIL") -let is_false test s b = is_true test s (not b) - -let is_data_value test eph (v:int) = - match K1.get_data_copy eph with - | Some x -> - if !x = v - then printf "%s data set: OK\n" test - else printf "%s data set: FAIL(bad value %i)\n" test (!x) - | None -> printf "%s data set: FAIL\n" test - -let is_key_value test eph (v:int) = - match K1.get_key_copy eph with - | Some x -> - if !x = v - then printf "%s key set: OK\n" test - else printf "%s key set: FAIL(bad value %i)\n" test (!x) - | None -> printf "%s key unset: FAIL\n" test - -let is_key_unset test eph = - is_false test "key unset" (K1.check_key eph) - -let is_data_unset test eph = - is_false test "data unset" (K1.check_data eph) - -let make_ra () = ref (ref 1) [@@inline never] -let make_rb () = ref (ref (ref 2)) [@@inline never] -let ra = make_ra () -let rb = make_rb () - -(** test: key alive data dangling *) -let test1 () = - let test = "test1" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref) K1.t = K1.create () in - K1.set_key eph (!ra); - K1.set_data eph (ref 42); - is_key_value test eph 1; - is_data_value test eph 42; - Gc.minor (); - is_key_value test eph 1; 
- is_data_value test eph 42; - Gc.full_major (); - is_key_value test eph 1; - is_data_value test eph 42; - ra := ref 12; - Gc.full_major (); - is_key_unset test eph; - is_data_unset test eph -let () = (test1 [@inlined never]) () - -(** test: key dangling data dangling *) -let test2 () = - let test = "test2" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref) K1.t = K1.create () in - K1.set_key eph (ref 125); - K1.set_data eph (ref 42); - is_key_value test eph 125; - is_data_value test eph 42; - ra := ref 13; - Gc.minor (); - is_key_unset test eph; - is_data_unset test eph -let () = (test2 [@inlined never]) () - -(** test: key dangling data alive *) -let test3 () = - let test = "test3" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref) K1.t = K1.create () in - K1.set_key eph (ref 125); - K1.set_data eph (!ra); - is_key_value test eph 125; - is_data_value test eph 13; - ra := ref 14; - Gc.minor (); - is_key_unset test eph; - is_data_unset test eph -let () = (test3 [@inlined never]) () - -(** test: key alive but one away, data dangling *) -let test4 () = - let test = "test4" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref) K1.t = K1.create () in - rb := ref (ref 3); - K1.set_key eph (!(!rb)); - K1.set_data eph (ref 43); - is_key_value test eph 3; - is_data_value test eph 43; - Gc.minor (); - Gc.minor (); - is_key_value test eph 3; - is_data_value test eph 43 -let () = (test4 [@inlined never]) () - -(** test: key dangling but one away, data dangling *) -let test5 () = - let test = "test5" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref) K1.t = K1.create () in - rb := ref (ref 3); - K1.set_key eph (!(!rb)); - K1.set_data eph (ref 43); - is_key_value test eph 3; - is_data_value test eph 43; - !rb := ref 4; - Gc.minor (); - Gc.minor (); - is_key_unset test eph; - is_data_unset test eph -let () = (test5 [@inlined never]) () - -(** test: key accessible from data but all dangling *) -let test6 () = - 
let test = "test6" in - Gc.minor (); - Gc.full_major (); - let eph : (int ref, int ref ref) K1.t = K1.create () in - rb := ref (ref 3); - K1.set_key eph (!(!rb)); - K1.set_data eph (ref (!(!rb))); - Gc.minor (); - is_key_value test eph 3; - !rb := ref 4; - Gc.full_major (); - is_key_unset test eph; - is_data_unset test eph -let () = (test6 [@inlined never]) () - -(** test: ephemeron accessible from data but they are dangling *) -type t = - | No - | Ephe of (int ref, t) K1.t - -let rc = ref No - -let test7 () = - let test = "test7" in - Gc.minor (); - Gc.full_major (); - ra := ref 42; - let weak : t Weak.t = Weak.create 1 in - let eph : (int ref, t) K1.t ref = ref (K1.create ()) in - rc := Ephe !eph; - Weak.set weak 0 (Some !rc); - K1.set_key !eph !ra; - K1.set_data !eph !rc; - Gc.minor (); - is_true test "before" (Weak.check weak 0); - eph := K1.create (); - rc := No; - Gc.full_major (); - Gc.full_major (); - Gc.full_major (); - is_false test "after" (Weak.check weak 0) -let () = (test7 [@inlined never]) () diff --git a/testsuite/tests/misc/ephetest.reference b/testsuite/tests/misc/ephetest.reference deleted file mode 100644 index 2699fdf7f943..000000000000 --- a/testsuite/tests/misc/ephetest.reference +++ /dev/null @@ -1,29 +0,0 @@ -test1 key set: OK -test1 data set: OK -test1 key set: OK -test1 data set: OK -test1 key set: OK -test1 data set: OK -test1 key unset: OK -test1 data unset: OK -test2 key set: OK -test2 data set: OK -test2 key unset: OK -test2 data unset: OK -test3 key set: OK -test3 data set: OK -test3 key unset: OK -test3 data unset: OK -test4 key set: OK -test4 data set: OK -test4 key set: OK -test4 data set: OK -test5 key set: OK -test5 data set: OK -test5 key unset: OK -test5 data unset: OK -test6 key set: OK -test6 key unset: OK -test6 data unset: OK -test7 before: OK -test7 after: OK diff --git a/testsuite/tests/misc/ephetest2.ml b/testsuite/tests/misc/ephetest2.ml deleted file mode 100644 index 31cb327ca388..000000000000 --- 
a/testsuite/tests/misc/ephetest2.ml +++ /dev/null @@ -1,156 +0,0 @@ -(* TEST -*) - -(*** - This test evaluate boolean formula composed by conjunction and - disjunction using ephemeron: - - true == alive, false == garbage collected - - and == an n-ephemeron, or == many 1-ephemeron - -*) - -(* This will have to be ported to the new ephemeron API *) -[@@@alert "-old_ephemeron_api"] - -let nb_test = 4 -let max_level = 10 - (** probability that a branch is not linked to a previous one *) -let proba_no_shared = 0.2 -let arity_max = 4 - -let proba_new = proba_no_shared ** (1./.(float_of_int max_level)) - -open Format -open Ephemeron - -let is_true test s b = printf "%s %s: %s\n" test s (if b then "OK" else "FAIL") -let is_false test s b = is_true test s (not b) - -type varephe = int ref -type ephe = (varephe,varephe) Kn.t - -type formula = - | Constant of bool - | And of var array - | Or of var array - -and var = { - form: formula; - value: bool; - ephe: varephe Weak.t; -} - -let print_short_bool fmt b = - if b - then pp_print_string fmt "t" - else pp_print_string fmt "f" - -let rec pp_form fmt = function - | Constant b -> - fprintf fmt "%B" b - | And a -> - fprintf fmt "And[@[%a@]]" (fun fmt -> Array.iter (pp_var fmt)) a - | Or a -> - fprintf fmt "Or[@[%a@]]" (fun fmt -> Array.iter (pp_var fmt)) a - -and pp_var fmt v = - fprintf fmt "%a%a:%a;@ " - print_short_bool v.value - print_short_bool (Weak.check v.ephe 0) - pp_form v.form - -type env = { - (** resizable array for cheap *) - vars : (int,var) Hashtbl.t; - (** the ephemerons must be alive *) - ephes : ephe Stack.t; - (** keep alive the true constant *) - varephe_true : varephe Stack.t; -(** keep temporarily alive the false constant *) - varephe_false : varephe Stack.t; -} - -let new_env () = { - vars = Hashtbl.create 100; - ephes = Stack.create (); - varephe_true = Stack.create (); - varephe_false = Stack.create (); -} - -let evaluate = function - | Constant b -> b - | And a -> Array.fold_left (fun acc e -> acc && 
e.value) true a - | Or a -> Array.fold_left (fun acc e -> acc || e.value) false a - -let get_ephe v = - match Weak.get v.ephe 0 with - | None -> - invalid_arg "Error: weak dead but nothing have been released" - | Some r -> r - -(** create a variable and its definition in the boolean world and - ephemerons world *) -let rec create env rem_level (** remaining level *) = - let varephe = ref 1 in - let form = - if rem_level = 0 then (** Constant *) - if Random.bool () - then (Stack.push varephe env.varephe_true ; Constant true ) - else (Stack.push varephe env.varephe_false; Constant false) - else - let size = (Random.int (arity_max - 1)) + 2 in - let new_link _ = - if (Hashtbl.length env.vars) = 0 || Random.float 1. < proba_new - then create env (rem_level -1) - else Hashtbl.find env.vars (Random.int (Hashtbl.length env.vars)) - in - let args = Array.init size new_link in - if Random.bool () - then begin (** Or *) - Array.iter (fun v -> - let r = get_ephe v in - let e = Kn.create 1 in - Kn.set_key e 0 r; - Kn.set_data e varephe; - Stack.push e env.ephes - ) args; Or args - end - else begin (** And *) - let e = Kn.create (Array.length args) in - for i=0 to Array.length args - 1 do - Kn.set_key e i (get_ephe args.(i)); - done; - Kn.set_data e varephe; - Stack.push e env.ephes; - And args - end - in - let create_weak e = - let w = Weak.create 1 in - Weak.set w 0 (Some e); - w - in - let v = {form; value = evaluate form; - ephe = create_weak varephe; - } in - Hashtbl.add env.vars (Hashtbl.length env.vars) v; - v - - -let check_var v = v.value = Weak.check v.ephe 0 - -let run test init = - Random.init init; - let env = new_env () in - let _top = create env max_level in - (** release false ref *) - Stack.clear env.varephe_false; - Gc.full_major (); - let res = Hashtbl.fold (fun _ v acc -> acc && check_var v) env.vars true in - is_true test "check" res; - env (* Keep env.varephe_true alive. 
*) - -let () = - for i = 0 to nb_test do - ignore (run ("test"^(Int.to_string i)) i); - done diff --git a/testsuite/tests/misc/ephetest2_new.reference b/testsuite/tests/misc/ephetest2_new.reference deleted file mode 100644 index db17cd7aa6dc..000000000000 --- a/testsuite/tests/misc/ephetest2_new.reference +++ /dev/null @@ -1,5 +0,0 @@ -test0 check: OK -test1 check: OK -test2 check: OK -test3 check: OK -test4 check: OK diff --git a/testsuite/tests/parallel/atomics.ml b/testsuite/tests/parallel/atomics.ml new file mode 100644 index 000000000000..9c1d0a3cf2c9 --- /dev/null +++ b/testsuite/tests/parallel/atomics.ml @@ -0,0 +1,64 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +type u = U of unit +let () = + (* See https://github.com/ocaml-multicore/ocaml-multicore/issues/252 *) + let make_cell (x : unit) : u Atomic.t = + let cell = Atomic.make (U x) in + Atomic.set cell (U x) ; + cell in + (* the error shows up with an array of length 256 or larger *) + let a = Array.make 256 (make_cell ()) in + ignore (Sys.opaque_identity a) + + +let test_fetch_add () = + let ndoms = 4 in + let count = 10000 in + let arr = Array.make (ndoms * count) (-1) in + let step = 1493 in + let r = Atomic.make 0 in + (* step is relatively prime to Array.length arr *) + let loop () = + let self = (Domain.self () :> int) in + for i = 1 to count do + let n = Atomic.fetch_and_add r step mod Array.length arr in + assert (arr.(n) == (-1)); + arr.(n) <- self + done in + let _ = Array.init 4 (fun i -> + Domain.spawn loop) + |> Array.map Domain.join in + assert (Array.for_all (fun x -> x >= 0) arr) + +let () = + test_fetch_add (); + print_endline "ok" + + + + +let test v = + let open Atomic in + assert (get v = 42); + set v 10; + assert (get v = 10); + let b = compare_and_set v 11 20 in + assert (b = false); + assert (get v = 10); + let b = compare_and_set v 10 20 in + assert (b = true); + assert (get v = 20) + +let () = + let r = Atomic.make 42 in + test r; + Atomic.set r 42; + 
Gc.full_major (); + test r; + print_endline "ok" diff --git a/testsuite/tests/parallel/atomics.reference b/testsuite/tests/parallel/atomics.reference new file mode 100644 index 000000000000..79ebd0860f49 --- /dev/null +++ b/testsuite/tests/parallel/atomics.reference @@ -0,0 +1,2 @@ +ok +ok diff --git a/testsuite/tests/parallel/backup_thread.ml b/testsuite/tests/parallel/backup_thread.ml new file mode 100644 index 000000000000..05b06cc47caa --- /dev/null +++ b/testsuite/tests/parallel/backup_thread.ml @@ -0,0 +1,17 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + + +let _ = + (* start a dummy domain and shut it down to cause a domain reuse *) + let d = Domain.spawn (fun _ -> ()) in + Domain.join d; + let _d = Domain.spawn (fun _ -> + Unix.sleep 10; + print_endline "Should not reach here!") in + Gc.full_major (); + print_endline "OK" diff --git a/testsuite/tests/parallel/backup_thread.reference b/testsuite/tests/parallel/backup_thread.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/parallel/backup_thread.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/parallel/backup_thread_pipe.ml b/testsuite/tests/parallel/backup_thread_pipe.ml new file mode 100644 index 000000000000..41fd101c1f57 --- /dev/null +++ b/testsuite/tests/parallel/backup_thread_pipe.ml @@ -0,0 +1,22 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + + +let producer oc = output_string oc "passed\n"; close_out oc + +let consumer ic = print_endline @@ input_line ic; close_in ic + +let main () = + let (in_fd, out_fd) = Unix.pipe() in + let ipipe = Unix.in_channel_of_descr in_fd in + let opipe = Unix.out_channel_of_descr out_fd in + let prod = Domain.spawn (fun () -> Unix.sleep 3; Gc.full_major(); producer opipe) in + let cons = Domain.spawn (fun () -> consumer ipipe) in + Domain.join prod; + Domain.join cons + +let _ = Unix.handle_unix_error main (); exit 0 diff --git 
a/testsuite/tests/parallel/backup_thread_pipe.reference b/testsuite/tests/parallel/backup_thread_pipe.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/parallel/backup_thread_pipe.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/parallel/constpromote.ml b/testsuite/tests/parallel/constpromote.ml new file mode 100644 index 000000000000..fc9fd65ff61f --- /dev/null +++ b/testsuite/tests/parallel/constpromote.ml @@ -0,0 +1,19 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +(* when run with the bytecode debug runtime, this test + used to trigger a bug where the constant [13] + remained unpromoted *) + +let rec burn l = + if List.hd l > 14 then () + else burn (l @ l |> List.map (fun x -> x + 1)) + +let () = + ignore (Domain.spawn (fun () -> burn [13])); + burn [0]; + Printf.printf "all done\n%!" diff --git a/testsuite/tests/parallel/constpromote.reference b/testsuite/tests/parallel/constpromote.reference new file mode 100644 index 000000000000..de6391430555 --- /dev/null +++ b/testsuite/tests/parallel/constpromote.reference @@ -0,0 +1 @@ +all done diff --git a/testsuite/tests/parallel/deadcont.ml b/testsuite/tests/parallel/deadcont.ml new file mode 100644 index 000000000000..34e59f0e9383 --- /dev/null +++ b/testsuite/tests/parallel/deadcont.ml @@ -0,0 +1,56 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +(* + Test handling of continuations created by a domain that has since terminated. + Bug report and testcase by Ziga Luksic, see: + https://github.com/ocamllabs/ocaml-multicore/issues/175 + *) + +open EffectHandlers +open EffectHandlers.Deep + +type _ eff += Poke : unit eff + +type result = Done | Poking of (unit -> result) + +(* Debug help. *) +let print s = print_string s; Format.pp_print_flush Format.std_formatter () + +(* Just poke the handler n times. 
*) +let rec poke = function + | 0 -> () + | n -> perform Poke; poke (n-1) + +(* The handler inside the domain, that captures the continuation whenever + it gets poked. *) +let domain_handler f = + match_with f () + { retc = (fun () -> Done); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | Poke -> Some (fun (k : (a, _) continuation) -> + Poking (fun () -> + print "..."; + ignore (continue k ()); + print "success\n"; + Done)) + | _ -> None } + +(* Re-runs the poker that happened inside a domain. *) +let rerunner = function + | Poking f -> f () (*re-runs the function*) + | Done -> Done + +(* Test. *) +let test n = + (* Messy handler wrapping. *) + let inner () = domain_handler (fun () -> poke n) in + rerunner (Domain.join (Domain.spawn inner)) + +let _ = test 100 |> ignore; print_endline "done" diff --git a/testsuite/tests/parallel/deadcont.reference b/testsuite/tests/parallel/deadcont.reference new file mode 100644 index 000000000000..cb738d65240d --- /dev/null +++ b/testsuite/tests/parallel/deadcont.reference @@ -0,0 +1,2 @@ +...success +done diff --git a/testsuite/tests/parallel/domain_dls.ml b/testsuite/tests/parallel/domain_dls.ml new file mode 100644 index 000000000000..db4ebe0c5661 --- /dev/null +++ b/testsuite/tests/parallel/domain_dls.ml @@ -0,0 +1,39 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +let check_dls () = + let k1 = Domain.DLS.new_key (fun () -> 10) in + let k2 = Domain.DLS.new_key (fun () -> 1.0) in + Domain.DLS.set k1 100; + Domain.DLS.set k2 200.0; + let v1 = Domain.DLS.get k1 in + let v2 = Domain.DLS.get k2 in + assert (v1 = 100); + assert (v2 = 200.0); + Gc.major () + +let check_dls_domain_reuse () = + let k1 = Domain.DLS.new_key (fun () -> 100) in + let k2 = Domain.DLS.new_key (fun () -> 200) in + let domains = Array.init 4 (fun _ -> Domain.spawn(fun _ -> + Domain.DLS.set k1 31415; + Domain.DLS.set k2 27182; + assert (Domain.DLS.get k1 = 31415); + assert (Domain.DLS.get k2 = 27182))) 
in + Array.iter Domain.join domains; + Gc.full_major (); + let domains2 = Array.init 4 (fun _ -> Domain.spawn(fun _ -> + assert(Domain.DLS.get k1 = 100); + assert(Domain.DLS.get k2 = 200))) in + Array.iter Domain.join domains2 + +let _ = + let domains = Array.init 3 (fun _ -> Domain.spawn(check_dls)) in + check_dls (); + Array.iter Domain.join domains; + check_dls_domain_reuse (); + print_endline "ok" diff --git a/testsuite/tests/parallel/domain_dls.reference b/testsuite/tests/parallel/domain_dls.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/parallel/domain_dls.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/parallel/domain_dls2.ml b/testsuite/tests/parallel/domain_dls2.ml new file mode 100644 index 000000000000..6c79145ba75c --- /dev/null +++ b/testsuite/tests/parallel/domain_dls2.ml @@ -0,0 +1,17 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +let _ = + let key_array = + Array.init 128 (fun i -> Domain.DLS.new_key (fun _ -> i)) + in + assert (Domain.DLS.get (key_array.(42)) = 42); + let d = Domain.spawn (fun _ -> + assert (Domain.DLS.get (key_array.(63)) = 63)) + in + Domain.join d; + print_endline "OK" diff --git a/testsuite/tests/parallel/domain_dls2.reference b/testsuite/tests/parallel/domain_dls2.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/parallel/domain_dls2.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/parallel/domain_id.ml b/testsuite/tests/parallel/domain_id.ml new file mode 100644 index 000000000000..023d4253462f --- /dev/null +++ b/testsuite/tests/parallel/domain_id.ml @@ -0,0 +1,50 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +open Domain + +let id () = () + + +let newdom_id () = + let d = Domain.spawn id in + let n = Domain.get_id d in + join d; + (n :> int) + +let test_domain_reuse () = + (* checks that domain slots are getting reused quickly, + by checking that subsequent domain 
IDs are an arithmetic + progression (implies that you're getting the same domain + over and over, but its ID increases by Max_domains. + + this test has to run first, since it makes assumptions + about domain IDs *) + let first = newdom_id () in + let curr = ref (newdom_id ()) in + let delta = !curr - first in + assert (delta > 0); + for i = 1 to 10000 do + let next = newdom_id () in + assert (next - !curr = delta); + curr := next + done + + +let test_different_ids () = + let d1 = Domain.spawn id in + let d2 = Domain.spawn id in + assert (get_id d1 <> get_id d2); + join d1; join d2; + let d3 = Domain.spawn id in + assert (get_id d1 <> get_id d3) + + +let () = + test_domain_reuse (); + test_different_ids (); + print_endline "ok" diff --git a/testsuite/tests/parallel/domain_id.reference b/testsuite/tests/parallel/domain_id.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/parallel/domain_id.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/parallel/domain_parallel_spawn_burn.ml b/testsuite/tests/parallel/domain_parallel_spawn_burn.ml new file mode 100644 index 000000000000..8dd2398ac8f4 --- /dev/null +++ b/testsuite/tests/parallel/domain_parallel_spawn_burn.ml @@ -0,0 +1,43 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +open Domain + +(* This test looks to spawn domains while doing a bunch of explicit minor and major GC calls + from parallel domains *) + +let rec burn l = + if List.hd l > 14 then () + else + burn (l @ l |> List.map (fun x -> x + 1)) + +let test_parallel_spawn () = + for i = 1 to 20 do + let a = Array.init 25 (fun _ -> Domain.spawn (fun () -> burn [0])) in + for j = 0 to 24 do + join a.(j) + done + done + +let () = + let running = ref true in + let rec run_until_stop fn () = + while !running do + fn (); + done + in + + let domain_minor_gc = Domain.spawn (run_until_stop (fun () -> burn [8]; Gc.minor ())) in + let domain_major_gc = Domain.spawn (run_until_stop (fun () -> burn 
[8]; Gc.major ())) in + + test_parallel_spawn (); + + running := false; + join domain_minor_gc; + join domain_major_gc; + + print_endline "ok" diff --git a/testsuite/tests/parallel/domain_parallel_spawn_burn.reference b/testsuite/tests/parallel/domain_parallel_spawn_burn.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/parallel/domain_parallel_spawn_burn.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/parallel/domain_serial_spawn_burn.ml b/testsuite/tests/parallel/domain_serial_spawn_burn.ml new file mode 100644 index 000000000000..2d1961d9f130 --- /dev/null +++ b/testsuite/tests/parallel/domain_serial_spawn_burn.ml @@ -0,0 +1,49 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +open Domain + +(* This test looks to spawn domains while doing a bunch of explicit minor and major GC calls + from parallel domains *) + +let rec burn l = + if List.hd l > 14 then () + else + burn (l @ l |> List.map (fun x -> x + 1)) + +let test_serial_domain_spawn () = + for i = 1 to 250 do + let d = Domain.spawn (fun () -> burn [0]) in + join d + done + +let test_parallel_spawn () = + for i = 1 to 10 do + let a = Array.init 25 (fun _ -> Domain.spawn (fun () -> burn [0])) in + for j = 0 to 24 do + join a.(j) + done + done + +let () = + let running = ref true in + let rec run_until_stop fn () = + while !running do + fn (); + done + in + + let domain_minor_gc = Domain.spawn (run_until_stop (fun () -> burn [8]; Gc.minor ())) in + let domain_major_gc = Domain.spawn (run_until_stop (fun () -> burn [8]; Gc.major ())) in + + test_serial_domain_spawn (); + + running := false; + join domain_minor_gc; + join domain_major_gc; + + print_endline "ok" diff --git a/testsuite/tests/parallel/domain_serial_spawn_burn.reference b/testsuite/tests/parallel/domain_serial_spawn_burn.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/parallel/domain_serial_spawn_burn.reference @@ -0,0 +1 
@@ +ok diff --git a/testsuite/tests/parallel/fib_threads.ml b/testsuite/tests/parallel/fib_threads.ml new file mode 100644 index 000000000000..f6005729b72b --- /dev/null +++ b/testsuite/tests/parallel/fib_threads.ml @@ -0,0 +1,23 @@ +(* TEST +* hassysthreads +include systhreads +** bytecode +** native +*) + + +let n = 42 +let num_domains = try int_of_string Sys.argv.(1) with _ -> 4 + +let rec fib n = + if (n < 2) then n + else fib (n - 1) + fib (n - 2) + +let th_create () = + let t = Thread.create fib in + Thread.join (t n) + +let _ = + let domains = Array.init num_domains (fun _ -> Domain.spawn(th_create)) in + Array.iter Domain.join domains; + print_endline "done" diff --git a/testsuite/tests/parallel/fib_threads.reference b/testsuite/tests/parallel/fib_threads.reference new file mode 100644 index 000000000000..19f86f493ab1 --- /dev/null +++ b/testsuite/tests/parallel/fib_threads.reference @@ -0,0 +1 @@ +done diff --git a/testsuite/tests/parallel/join.ml b/testsuite/tests/parallel/join.ml new file mode 100644 index 000000000000..fba31533e350 --- /dev/null +++ b/testsuite/tests/parallel/join.ml @@ -0,0 +1,59 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +let main_join n = + let a = Array.init n (fun _ -> false) in + Array.init n (fun i -> Domain.spawn (fun () -> + a.(i) <- true; + Some i + )) |> + Array.iteri (fun i d -> let v = Domain.join d in assert (v = Some i)); + let ok = Array.to_list a |> List.for_all (fun x -> x) in + assert ok + +let rec other_join flags n domain = + if n > 0 then + Domain.spawn (fun () -> flags.(n-1) <- true; Domain.join domain) + |> other_join flags (n-1) + else + Domain.join domain + +exception Ex of string +let join_exn () = + match Domain.(join (spawn (fun () -> raise (Ex (String.make 5 '!'))))) with + | _ -> assert false + | exception (Ex "!!!!!") -> () + +let join_slow () = + let rec burn l = + if List.hd l > 14 then () + else + burn (l @ l |> List.map (fun x -> x + 1)) in + assert (Domain.(join (spawn (fun () 
-> burn [0]; 42))) = 42) + + +let join2 () = + let r = ref false in + let t = Domain.spawn (fun () -> r := true) in + Domain.join t; + assert !r; + try + Domain.join t; + assert false + with Invalid_argument _ -> + assert !r + +let () = + main_join 100; + let flags = Array.make 100 false in + other_join flags (Array.length flags) (Domain.spawn ignore); + assert (Array.for_all (fun x -> x) flags); + join2 (); + join_exn (); + join_slow (); + Gc.full_major (); + Gc.full_major () diff --git a/testsuite/tests/misc/finaliser.reference b/testsuite/tests/parallel/join.reference similarity index 100% rename from testsuite/tests/misc/finaliser.reference rename to testsuite/tests/parallel/join.reference diff --git a/testsuite/tests/parallel/major_gc_wait_backup.ml b/testsuite/tests/parallel/major_gc_wait_backup.ml new file mode 100644 index 000000000000..e418500774ae --- /dev/null +++ b/testsuite/tests/parallel/major_gc_wait_backup.ml @@ -0,0 +1,39 @@ +(* TEST +* hasunix +include unix +** native +** bytecode +*) + +type 'a tree = Empty | Node of 'a tree * 'a tree + +let rec make d = + if d = 0 then Node(Empty, Empty) + else let d = d - 1 in Node(make d, make d) + +(* you need to use Gc.quick_stat, because Gc.stat forces a major cycle *) +let major_collections () = + (Gc.quick_stat ()).major_collections + +(* test to force domain to do a full GC while another is waiting *) +let _ = + let sem = Semaphore.Binary.make false in + let d = Domain.spawn (fun _ -> Semaphore.Binary.acquire sem) in + Gc.full_major (); + let n = major_collections () in + ignore (make 22); + assert ((major_collections ()) > n); + Semaphore.Binary.release sem; + Domain.join d; + print_endline "wait OK" + +(* test to force domain to do a full GC while another is blocking *) +let _ = + let _ = Domain.spawn (fun _ -> + Unix.sleep 10000 + ) in + Gc.full_major (); + let n = major_collections () in + ignore (make 22); + assert ((major_collections ()) > n); + print_endline "sleep OK" diff --git 
a/testsuite/tests/parallel/major_gc_wait_backup.reference b/testsuite/tests/parallel/major_gc_wait_backup.reference new file mode 100644 index 000000000000..05f78790baa4 --- /dev/null +++ b/testsuite/tests/parallel/major_gc_wait_backup.reference @@ -0,0 +1,2 @@ +wait OK +sleep OK diff --git a/testsuite/tests/parallel/mctest.ml b/testsuite/tests/parallel/mctest.ml new file mode 100644 index 000000000000..96f34c941cda --- /dev/null +++ b/testsuite/tests/parallel/mctest.ml @@ -0,0 +1,244 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +(* + * Copyright (c) 2015, Theo Laurent + * Copyright (c) 2015, KC Sivaramakrishnan + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + *) + +(* Michael-Scott queue *) + +module type BS = sig + type t + val create : ?max:int -> unit -> t + val once : t -> unit + val reset : t -> unit +end + +module B : BS = struct + + type t = int * int ref + + let _ = Random.self_init () + + let create ?(max=32) () = (max, ref 1) + + let once (maxv, r) = + let t = Random.int (!r) in + r := min (2 * !r) maxv; + if t = 0 then () + else ignore (Unix.select [] [] [] (0.001 *. 
(float_of_int t))) + + let reset (_,r) = r := 1 + +end + +(* TODO KC: Replace with concurrent lock free bag -- + * http://dl.acm.org/citation.cfm?id=1989550 *) + + +module type QS = sig + type 'a t + val create : unit -> 'a t + val is_empty : 'a t -> bool + val push : 'a t -> 'a -> unit + val pop : 'a t -> 'a option + val clean_until : 'a t -> ('a -> bool) -> unit + + type 'a cursor + val snapshot : 'a t -> 'a cursor + val next : 'a cursor -> ('a * 'a cursor) option +end + +module Q : QS = struct + + type 'a node = + | Nil + | Next of 'a * 'a node Atomic.t + + type 'a t = + { head : 'a node Atomic.t ; + tail : 'a node Atomic.t } + + let create () = + let head = (Next (Obj.magic (), Atomic.make Nil)) in + { head = Atomic.make head ; tail = Atomic.make head } + + let is_empty q = + match Atomic.get q.head with + | Nil -> failwith "MSQueue.is_empty: impossible" + | Next (_,x) -> + ( match Atomic.get x with + | Nil -> true + | _ -> false ) + + let pop q = + let b = B.create () in + let rec loop () = + let s = Atomic.get q.head in + let nhead = match s with + | Nil -> failwith "MSQueue.pop: impossible" + | Next (_, x) -> Atomic.get x + in match nhead with + | Nil -> None + | Next (v, _) when Atomic.compare_and_set q.head s nhead -> Some v + | _ -> ( B.once b ; loop () ) + in loop () + + let push q v = + let rec find_tail_and_enq curr_end node = + if Atomic.compare_and_set curr_end Nil node then () + else match Atomic.get curr_end with + | Nil -> find_tail_and_enq curr_end node + | Next (_, n) -> find_tail_and_enq n node + in + let newnode = Next (v, Atomic.make Nil) in + let tail = Atomic.get q.tail in + match tail with + | Nil -> failwith "HW_MSQueue.push: impossible" + | Next (_, n) -> begin + find_tail_and_enq n newnode; + ignore (Atomic.compare_and_set q.tail tail newnode) + end + + let rec clean_until q f = + let b = B.create () in + let rec loop () = + let s = Atomic.get q.head in + let nhead = match s with + | Nil -> failwith "MSQueue.pop: impossible" + | Next 
(_, x) -> Atomic.get x + in match nhead with + | Nil -> () + | Next (v, _) -> + if not (f v) then + if Atomic.compare_and_set q.head s nhead + then (B.reset b; loop ()) + else (B.once b; loop ()) + else () + in loop () + + type 'a cursor = 'a node + + let snapshot q = + match Atomic.get q.head with + | Nil -> failwith "MSQueue.snapshot: impossible" + | Next (_, n) -> Atomic.get n + + let next c = + match c with + | Nil -> None + | Next (a, n) -> Some (a, Atomic.get n) + +end + +module Scheduler = +struct + open EffectHandlers + open EffectHandlers.Deep + + type 'a cont = ('a, unit) continuation + + type _ eff += Suspend : ('a cont -> 'a option) -> 'a eff + | Resume : ('a cont * 'a) -> unit eff + | GetTid : int eff + | Spawn : (unit -> unit) -> unit eff + | Yield : unit eff + + let suspend f = perform (Suspend f) + let resume t v = perform (Resume (t, v)) + let get_tid () = perform GetTid + let spawn f = perform (Spawn f) + let yield () = perform Yield + + let pqueue = Q.create () + + let get_free_pid () = Oo.id (object end) + + let enqueue k = Q.push pqueue k; Gc.minor () + + let rec dequeue () = + match Q.pop pqueue with + | Some k -> + continue k () + | None -> + ignore (Unix.select [] [] [] 0.01); + dequeue () + + let rec exec f = + let pid = get_free_pid () in + match_with f () + { retc = (fun () -> dequeue ()); + exnc = (fun e -> raise e); + effc = fun (type a) (e : a eff) -> + match e with + | Suspend f -> Some (fun (k : (a, _) continuation) -> + match f k with + | None -> dequeue () + | Some v -> continue k v) + | Resume (t, v) -> Some (fun (k : (a, _) continuation) -> + enqueue k; + continue t v) + | GetTid -> Some (fun (k : (a, _) continuation) -> + continue k pid) + | Spawn f -> Some (fun (k : (a, _) continuation) -> + enqueue k; + exec f) + | Yield -> Some (fun (k : (a, _) continuation) -> + enqueue k; + dequeue ()) + | _ -> None } + + let num_threads = 2 + + let start f = + for i = 1 to num_threads - 1 do + ignore (Domain.spawn dequeue) + done; + exec f 
+ + +end + +let _ = + let procs = 4 in + let counter = Atomic.make 0 in + let rec finish () = + let v = Atomic.get counter in + if not (Atomic.compare_and_set counter v (v+1)) then finish (); + if v + 1 = procs then exit 0 + in + let rec worker n = + let r = ref 0 in + for i = 1 to 10000 do + Scheduler.yield (); + for j = 1 to 10000 do + incr r + done + done; + print_string (Printf.sprintf "done %d\n" !r); flush stdout; + finish () + in + Scheduler.start + (fun () -> + for i = 1 to procs do + (*Scheduler.yield ();*) + Scheduler.spawn (fun () -> worker i) + done; + ) diff --git a/testsuite/tests/parallel/mctest.reference b/testsuite/tests/parallel/mctest.reference new file mode 100644 index 000000000000..dd265608afb7 --- /dev/null +++ b/testsuite/tests/parallel/mctest.reference @@ -0,0 +1,4 @@ +done 100000000 +done 100000000 +done 100000000 +done 100000000 diff --git a/testsuite/tests/parallel/multicore_systhreads.ml b/testsuite/tests/parallel/multicore_systhreads.ml new file mode 100644 index 000000000000..2f0fcf20b0de --- /dev/null +++ b/testsuite/tests/parallel/multicore_systhreads.ml @@ -0,0 +1,32 @@ +(* TEST +* hassysthreads +include systhreads +** bytecode +** native +*) + + +let producer oc = output_string oc "passed\n"; close_out oc + +let consumer ic = print_endline @@ input_line ic; close_in ic + +let main () = + let (in_fd, out_fd) = Unix.pipe() in + let ipipe = Unix.in_channel_of_descr in_fd in + let opipe = Unix.out_channel_of_descr out_fd in + let prod = Domain.spawn begin fun () -> + let t = Thread.create + (fun () -> Unix.sleep 3; Gc.full_major(); producer opipe) () + in + Thread.join t + end + in + let cons = Domain.spawn begin fun () -> + let t = Thread.create (fun () -> consumer ipipe) () in + Thread.join t + end + in + Domain.join prod; + Domain.join cons + +let _ = Unix.handle_unix_error main (); exit 0 diff --git a/testsuite/tests/parallel/multicore_systhreads.reference b/testsuite/tests/parallel/multicore_systhreads.reference new file mode 
100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/parallel/multicore_systhreads.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/parallel/pingpong.ml b/testsuite/tests/parallel/pingpong.ml new file mode 100644 index 000000000000..30fb9eee7117 --- /dev/null +++ b/testsuite/tests/parallel/pingpong.ml @@ -0,0 +1,45 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +let r = ref (Some 0) + +let () = Gc.minor () + + +let rec even lim put = + match !r with + | Some n when n = lim -> () + | (Some n) when n mod 2 == 0 -> + let next = Some (n + 1) in + put next; + even lim put + | _ -> let _ = [!r] in even lim put + +let rec odd lim put = + match !r with + | Some n when n = lim -> () + | (Some n) when n mod 2 == 1 -> + let next = Some (n + 1) in + put next; + odd lim put + | _ -> let _ = [!r] in odd lim put + + +let go n put = + r := Some 0; + let d = Domain.spawn (fun () -> even n put) in + odd n put; + (match !r with + | Some n -> + Printf.printf "%d\n%!" 
n + | None -> + assert false); + Domain.join d + + +let () = + go 100_000 (fun x -> r := x) diff --git a/testsuite/tests/parallel/pingpong.reference b/testsuite/tests/parallel/pingpong.reference new file mode 100644 index 000000000000..f7393e847d34 --- /dev/null +++ b/testsuite/tests/parallel/pingpong.reference @@ -0,0 +1 @@ +100000 diff --git a/testsuite/tests/parallel/poll.ml b/testsuite/tests/parallel/poll.ml new file mode 100644 index 000000000000..b6c06be35468 --- /dev/null +++ b/testsuite/tests/parallel/poll.ml @@ -0,0 +1,14 @@ +(* TEST +* hasunix +include unix +** bytecode +** native +*) + +let rec loop () = + loop () + +let _ = + ignore (Domain.spawn loop); + Gc.full_major(); + print_endline "OK" diff --git a/testsuite/tests/parallel/poll.reference b/testsuite/tests/parallel/poll.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/parallel/poll.reference @@ -0,0 +1 @@ +OK diff --git a/testsuite/tests/parallel/prodcons_domains.ml b/testsuite/tests/parallel/prodcons_domains.ml new file mode 100644 index 000000000000..67499d342ac2 --- /dev/null +++ b/testsuite/tests/parallel/prodcons_domains.ml @@ -0,0 +1,71 @@ +(* TEST + +* hassysthreads +include systhreads +** bytecode +** native + +*) + +(* Classic producer-consumer *) + +type 'a prodcons = + { buffer: 'a array; + lock: Mutex.t; + mutable readpos: int; + mutable writepos: int; + notempty: Condition.t; + notfull: Condition.t } + +let create size init = + { buffer = Array.make size init; + lock = Mutex.create(); + readpos = 0; + writepos = 0; + notempty = Condition.create(); + notfull = Condition.create() } + +let put p data = + Mutex.lock p.lock; + while (p.writepos + 1) mod Array.length p.buffer = p.readpos do + Condition.wait p.notfull p.lock + done; + p.buffer.(p.writepos) <- data; + p.writepos <- (p.writepos + 1) mod Array.length p.buffer; + Condition.signal p.notempty; + Mutex.unlock p.lock + +let get p = + Mutex.lock p.lock; + while p.writepos = p.readpos 
do + Condition.wait p.notempty p.lock + done; + let data = p.buffer.(p.readpos) in + p.readpos <- (p.readpos + 1) mod Array.length p.buffer; + Condition.signal p.notfull; + Mutex.unlock p.lock; + data + +(* Test *) + +let rec produce buff n max = + put buff n; + if n < max then produce buff (n+1) max + +let rec consume buff cur max = + let n = get buff in + if n <> cur then false + else if n = max then true + else consume buff (cur + 1) max + +let _ = + let buff1 = create 20 0 and buff2 = create 30 0 in + let ok1 = ref false and ok2 = ref false in + let _p1 = Domain.spawn (fun () -> produce buff1 0 10000) + and _p2 = Domain.spawn (fun () -> produce buff2 0 8000) + and c1 = Domain.spawn (fun () -> ok1 := consume buff1 0 10000) in + ok2 := consume buff2 0 8000; + Domain.join c1; + if !ok1 && !ok2 + then print_string "passed\n" + else print_string "FAILED\n" diff --git a/testsuite/tests/parallel/prodcons_domains.reference b/testsuite/tests/parallel/prodcons_domains.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/parallel/prodcons_domains.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/parallel/tak.ml b/testsuite/tests/parallel/tak.ml new file mode 100644 index 000000000000..a9b6874c4fe8 --- /dev/null +++ b/testsuite/tests/parallel/tak.ml @@ -0,0 +1,20 @@ +(* TEST +*) + +(* filling minor heaps in parallel to trigger + minor heap exhaustion codepath organically *) + +let rec tak (x, y, z as _tuple) = + if x > y then tak(tak (x-1, y, z), tak (y-1, z, x), tak (z-1, x, y)) + else z + +let work () = + for _ = 1 to 100 do + assert (7 = tak (18, 12, 6)); + done + +let _ = + let a = Array.init 4 (fun _ -> Domain.spawn work) in + work (); + Array.iter (fun d -> Domain.join d) a; + print_endline "OK" diff --git a/testsuite/tests/parallel/tak.reference b/testsuite/tests/parallel/tak.reference new file mode 100644 index 000000000000..d86bac9de59a --- /dev/null +++ b/testsuite/tests/parallel/tak.reference @@ -0,0 +1 
@@ +OK diff --git a/testsuite/tests/parallel/test_c_thread_register.ml b/testsuite/tests/parallel/test_c_thread_register.ml new file mode 100644 index 000000000000..352424a37713 --- /dev/null +++ b/testsuite/tests/parallel/test_c_thread_register.ml @@ -0,0 +1,24 @@ +(* TEST + modules = "test_c_thread_register_cstubs.c" + * hassysthreads + include systhreads + ** bytecode + ** native +*) + +(* spins a external thread from C and register it to the OCaml runtime *) + +external spawn_thread : (unit -> unit) -> unit = "spawn_thread" + +let passed () = Printf.printf "passed\n" + +let _ = + let d = + Domain.spawn begin fun () -> + spawn_thread passed; + Thread.delay 0.5 + end + in + let t = Thread.create (fun () -> Thread.delay 1.0) () in + Thread.join t; + Domain.join d diff --git a/testsuite/tests/parallel/test_c_thread_register.reference b/testsuite/tests/parallel/test_c_thread_register.reference new file mode 100644 index 000000000000..b0aad4deb5bb --- /dev/null +++ b/testsuite/tests/parallel/test_c_thread_register.reference @@ -0,0 +1 @@ +passed diff --git a/testsuite/tests/parallel/test_c_thread_register_cstubs.c b/testsuite/tests/parallel/test_c_thread_register_cstubs.c new file mode 100644 index 000000000000..9fd78589785d --- /dev/null +++ b/testsuite/tests/parallel/test_c_thread_register_cstubs.c @@ -0,0 +1,27 @@ +#include +#include +#include "caml/mlvalues.h" +#include "caml/gc.h" +#include "caml/memory.h" +#include "caml/callback.h" +#include "threads.h" + +void *thread_func(void *fn) { + caml_c_thread_register(); + caml_acquire_runtime_system(); + caml_callback((value) fn, Val_unit); + caml_release_runtime_system(); + caml_c_thread_unregister(); + return 0; +} + +value spawn_thread(value clos) +{ + pthread_t thr; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + pthread_create(&thr, &attr, thread_func, (void *) clos); + return Val_unit; +} diff --git 
a/testsuite/tests/regression/pr9326/gc_set.ml b/testsuite/tests/regression/pr9326/gc_set.ml index e9d7dbcd4c99..2570aec28eb1 100644 --- a/testsuite/tests/regression/pr9326/gc_set.ml +++ b/testsuite/tests/regression/pr9326/gc_set.ml @@ -4,32 +4,35 @@ open Gc let min_heap_sz = 524288 (* 512k *) -let maj_heap_inc = 4194304 (* 4M *) +let space_overhead = 70 +let stack_limit = 4194304 (* 4M *) +let custom_major_ratio = 40 +let custom_minor_ratio = 99 +let custom_minor_max_size = 4096 let _ = let g1 = Gc.get() in (* Do not use { g1 with ... }, so that the code will break if more fields are added to the Gc.control record type *) Gc.set { minor_heap_size = min_heap_sz; - major_heap_increment = maj_heap_inc; - space_overhead = g1.space_overhead; + major_heap_increment = g1.major_heap_increment; + space_overhead = space_overhead; verbose = g1.verbose; max_overhead = g1.max_overhead; - stack_limit = g1.stack_limit; + stack_limit = stack_limit; allocation_policy = g1.allocation_policy; window_size = g1.window_size; - custom_major_ratio = g1.custom_major_ratio; - custom_minor_ratio = g1.custom_minor_ratio; - custom_minor_max_size = g1.custom_minor_max_size }; + custom_major_ratio = custom_major_ratio; + custom_minor_ratio = custom_minor_ratio; + custom_minor_max_size = custom_minor_max_size }; let g2 = Gc.get() in assert (g2.minor_heap_size = min_heap_sz); - assert (g2.major_heap_increment = maj_heap_inc); - assert (g2.space_overhead = g1.space_overhead); + assert (g2.space_overhead = space_overhead); assert (g2.verbose = g1.verbose); assert (g2.max_overhead = g1.max_overhead); - assert (g2.stack_limit = g1.stack_limit); + assert (g2.stack_limit = stack_limit); assert (g2.allocation_policy = g1.allocation_policy); assert (g2.window_size = g1.window_size); - assert (g2.custom_major_ratio = g1.custom_major_ratio); - assert (g2.custom_minor_ratio = g1.custom_minor_ratio); - assert (g2.custom_minor_max_size = g1.custom_minor_max_size) + assert (g2.custom_major_ratio = 
custom_major_ratio); + assert (g2.custom_minor_ratio = custom_minor_ratio); + assert (g2.custom_minor_max_size = custom_minor_max_size) diff --git a/testsuite/tests/runtime-naked-pointers/cstubs.c b/testsuite/tests/runtime-naked-pointers/cstubs.c deleted file mode 100644 index e9315f3aea07..000000000000 --- a/testsuite/tests/runtime-naked-pointers/cstubs.c +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include "caml/mlvalues.h" -#include "caml/gc.h" -#include "caml/memory.h" - -static int colors[4] = { Caml_white, Caml_gray, Caml_blue, Caml_black }; - -value make_block(value header_size, value color, value size) -{ - intnat sz = Nativeint_val(size); - value * p = caml_stat_alloc((1 + sz) * sizeof(value)); - p[0] = Make_header(Nativeint_val(header_size), 0, colors[Int_val(color)]); - memset(p + 1, 0x80, sz * sizeof(value)); - return (value) (p + 1); -} - -value make_raw_pointer (value v) -{ - return (value) Nativeint_val(v); -} diff --git a/testsuite/tests/runtime-naked-pointers/np.ml b/testsuite/tests/runtime-naked-pointers/np.ml deleted file mode 100644 index 1738934ff0ea..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np.ml +++ /dev/null @@ -1,11 +0,0 @@ -type color = White | Gray | Blue | Black - -external make_block: nativeint -> color -> nativeint -> Obj.t - = "make_block" - -external make_raw_pointer: nativeint -> Obj.t - = "make_raw_pointer" - -let do_gc root = - Gc.compact(); (* full major + compaction *) - root diff --git a/testsuite/tests/runtime-naked-pointers/np1.ml b/testsuite/tests/runtime-naked-pointers/np1.ml deleted file mode 100644 index be4c677a2386..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np1.ml +++ /dev/null @@ -1,12 +0,0 @@ -(* TEST - modules = "cstubs.c np.ml" - * bytecode - * native -*) - -open Np - -(* Out-of-heap object with black header is accepted even in no-naked-pointers - mode. GC doesn't scan black objects. 
*) - -let x = do_gc [ make_block 100n Black 100n ] diff --git a/testsuite/tests/runtime-naked-pointers/np2.ml b/testsuite/tests/runtime-naked-pointers/np2.ml deleted file mode 100644 index f24c813c2b69..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np2.ml +++ /dev/null @@ -1,13 +0,0 @@ -(* TEST - modules = "cstubs.c np.ml" - * bytecode - * native -*) - -open Np - -(* Out-of-heap object with black header is accepted even in no-naked-pointers - mode. GC doesn't scan black objects. However, if the size in the - head is crazily big, the naked pointer detector will warn. *) - -let x = do_gc [ make_block (-1n) Black 100n ] diff --git a/testsuite/tests/runtime-naked-pointers/np2.run b/testsuite/tests/runtime-naked-pointers/np2.run deleted file mode 100755 index c03f6f688d8e..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np2.run +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -exec ${test_source_directory}/runtest.sh diff --git a/testsuite/tests/runtime-naked-pointers/np3.ml b/testsuite/tests/runtime-naked-pointers/np3.ml deleted file mode 100644 index d207279df162..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np3.ml +++ /dev/null @@ -1,15 +0,0 @@ -(* TEST - modules = "cstubs.c np.ml" - * naked_pointers - ** bytecode - ** native -*) - -open Np - -(* Out-of-heap object with non-black header is OK in naked pointers mode only *) -(* Note that the header size can be wrong as it should not be used by the GC *) - -let x = do_gc [ make_block 10000n White 10n; - make_block 1n Blue 0n; - make_block (-1n) Gray 5n ] diff --git a/testsuite/tests/runtime-naked-pointers/np3.run b/testsuite/tests/runtime-naked-pointers/np3.run deleted file mode 100755 index c03f6f688d8e..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np3.run +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -exec ${test_source_directory}/runtest.sh diff --git a/testsuite/tests/runtime-naked-pointers/np4.ml b/testsuite/tests/runtime-naked-pointers/np4.ml deleted file mode 100644 index 
98966ddffc8d..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np4.ml +++ /dev/null @@ -1,13 +0,0 @@ -(* TEST - modules = "cstubs.c np.ml" - * naked_pointers - ** bytecode - ** native -*) - -open Np - -(* Null pointers and bad pointers outside the heap are OK - in naked pointers mode only *) - -let x = do_gc [ make_raw_pointer 0n; make_raw_pointer 42n ] diff --git a/testsuite/tests/runtime-naked-pointers/np4.run b/testsuite/tests/runtime-naked-pointers/np4.run deleted file mode 100755 index c03f6f688d8e..000000000000 --- a/testsuite/tests/runtime-naked-pointers/np4.run +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -exec ${test_source_directory}/runtest.sh diff --git a/testsuite/tests/runtime-naked-pointers/runtest.sh b/testsuite/tests/runtime-naked-pointers/runtest.sh deleted file mode 100755 index f5d4df561c3c..000000000000 --- a/testsuite/tests/runtime-naked-pointers/runtest.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -if grep -q "#define NAKED_POINTERS_CHECKER" ${ocamlsrcdir}/runtime/caml/m.h \ -&& (echo ${program} | grep -q '\.opt') -then - (${program} > ${output}) 2>&1 | grep -q '^Out-of-heap ' - exit $? 
-else - exec ${program} > ${output} -fi diff --git a/testsuite/tests/shapes/comp_units.ml b/testsuite/tests/shapes/comp_units.ml index 8c81ea50427a..c3417b55b15a 100644 --- a/testsuite/tests/shapes/comp_units.ml +++ b/testsuite/tests/shapes/comp_units.ml @@ -25,7 +25,7 @@ module Mproj = Unit module F (X : sig type t end) = X [%%expect{| { - ("F", module) -> Abs<.4>(X/277, X/277<.3>); + ("F", module) -> Abs<.4>(X/282, X/282<.3>); } module F : functor (X : sig type t end) -> sig type t = X.t end |}] diff --git a/testsuite/tests/shapes/functors.ml b/testsuite/tests/shapes/functors.ml index e0b32367fb46..0fe6ef7e3ecd 100644 --- a/testsuite/tests/shapes/functors.ml +++ b/testsuite/tests/shapes/functors.ml @@ -17,7 +17,7 @@ module type S = sig type t val x : t end module Falias (X : S) = X [%%expect{| { - ("Falias", module) -> Abs<.4>(X/279, X/279<.3>); + ("Falias", module) -> Abs<.4>(X/284, X/284<.3>); } module Falias : functor (X : S) -> sig type t = X.t val x : t end |}] @@ -29,10 +29,10 @@ end { ("Finclude", module) -> Abs<.6> - (X/283, + (X/288, { - ("t", type) -> X/283<.5> . "t"[type]; - ("x", value) -> X/283<.5> . "x"[value]; + ("t", type) -> X/288<.5> . "t"[type]; + ("x", value) -> X/288<.5> . "x"[value]; }); } module Finclude : functor (X : S) -> sig type t = X.t val x : t end @@ -45,7 +45,7 @@ end [%%expect{| { ("Fredef", module) -> - Abs<.10>(X/290, { + Abs<.10>(X/295, { ("t", type) -> <.8>; ("x", value) -> <.9>; }); @@ -225,9 +225,9 @@ module Big_to_small1 : B2S = functor (X : Big) -> X { ("Big_to_small1", module) -> Abs<.40> - (shape-var/386, + (shape-var/391, {<> - ("t", type) -> shape-var/386<> . "t"[type]; + ("t", type) -> shape-var/391<> . "t"[type]; }); } module Big_to_small1 : B2S @@ -238,9 +238,9 @@ module Big_to_small2 : B2S = functor (X : Big) -> struct include X end { ("Big_to_small2", module) -> Abs<.42> - (shape-var/392, + (shape-var/397, { - ("t", type) -> (shape-var/392<> . "t"[type])<.41>; + ("t", type) -> (shape-var/397<> . 
"t"[type])<.41>; }); } module Big_to_small2 : B2S diff --git a/testsuite/tests/shapes/open_arg.ml b/testsuite/tests/shapes/open_arg.ml index 84123baca274..cc5823822299 100644 --- a/testsuite/tests/shapes/open_arg.ml +++ b/testsuite/tests/shapes/open_arg.ml @@ -22,7 +22,7 @@ end = struct end [%%expect{| { - ("Make", module) -> Abs<.3>(I/279, { + ("Make", module) -> Abs<.3>(I/284, { }); } module Make : functor (I : sig end) -> sig end diff --git a/testsuite/tests/shapes/recmodules.ml b/testsuite/tests/shapes/recmodules.ml index 305a80a97dfb..93a3fabacf3d 100644 --- a/testsuite/tests/shapes/recmodules.ml +++ b/testsuite/tests/shapes/recmodules.ml @@ -44,10 +44,10 @@ end = B [%%expect{| { ("A", module) -> {<.11> - ("t", type) -> A/302<.11> . "t"[type]; + ("t", type) -> A/307<.11> . "t"[type]; }; ("B", module) -> {<.12> - ("t", type) -> B/303<.12> . "t"[type]; + ("t", type) -> B/308<.12> . "t"[type]; }; } module rec A : sig type t = Leaf of B.t end @@ -86,13 +86,13 @@ end = Set.Make(A) ("ASet", module) -> { ("compare", value) -> - CU Stdlib . "Set"[module] . "Make"[module](A/324<.19>) . + CU Stdlib . "Set"[module] . "Make"[module](A/329<.19>) . "compare"[value]; ("elt", type) -> - CU Stdlib . "Set"[module] . "Make"[module](A/324<.19>) . + CU Stdlib . "Set"[module] . "Make"[module](A/329<.19>) . "elt"[type]; ("t", type) -> - CU Stdlib . "Set"[module] . "Make"[module](A/324<.19>) . "t"[type]; + CU Stdlib . "Set"[module] . "Make"[module](A/329<.19>) . 
"t"[type]; }; } module rec A : diff --git a/testsuite/tests/shapes/rotor_example.ml b/testsuite/tests/shapes/rotor_example.ml index b78757d0e359..465c63afbf8c 100644 --- a/testsuite/tests/shapes/rotor_example.ml +++ b/testsuite/tests/shapes/rotor_example.ml @@ -26,8 +26,8 @@ end { ("Pair", module) -> Abs<.9> - (X/279, - Abs(Y/280, { + (X/284, + Abs(Y/285, { ("t", type) -> <.5>; ("to_string", value) -> <.6>; })); diff --git a/testsuite/tests/statmemprof/alloc_counts.ml b/testsuite/tests/statmemprof/alloc_counts.ml index f8cbb5658dc9..de0d1e9e49dd 100644 --- a/testsuite/tests/statmemprof/alloc_counts.ml +++ b/testsuite/tests/statmemprof/alloc_counts.ml @@ -1,4 +1,7 @@ -(* TEST *) +(* TEST +* skip +reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" +*) module MP = Gc.Memprof let allocs_by_memprof f = diff --git a/testsuite/tests/statmemprof/arrays_in_major.ml b/testsuite/tests/statmemprof/arrays_in_major.ml index 78907a18e3dd..eb627324ba64 100644 --- a/testsuite/tests/statmemprof/arrays_in_major.ml +++ b/testsuite/tests/statmemprof/arrays_in_major.ml @@ -1,5 +1,7 @@ (* TEST flags = "-g" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/arrays_in_major.reference b/testsuite/tests/statmemprof/arrays_in_major.reference deleted file mode 100644 index 1f34ad8ec85a..000000000000 --- a/testsuite/tests/statmemprof/arrays_in_major.reference +++ /dev/null @@ -1,11 +0,0 @@ -check_nosample -check_counts_full_major -check_counts_full_major -check_no_nested -check_distrib 300 3000 3 0.000010 -check_distrib 300 3000 1 0.000100 -check_distrib 300 3000 1 0.010000 -check_distrib 300 3000 1 0.900000 -check_distrib 300 300 100000 0.100000 -check_distrib 300000 300000 30 0.100000 -OK ! 
diff --git a/testsuite/tests/statmemprof/arrays_in_minor.ml b/testsuite/tests/statmemprof/arrays_in_minor.ml index 432f8b1d0995..4359642d0e64 100644 --- a/testsuite/tests/statmemprof/arrays_in_minor.ml +++ b/testsuite/tests/statmemprof/arrays_in_minor.ml @@ -1,5 +1,7 @@ (* TEST flags = "-g" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/arrays_in_minor.reference b/testsuite/tests/statmemprof/arrays_in_minor.reference deleted file mode 100644 index 1dad91939c80..000000000000 --- a/testsuite/tests/statmemprof/arrays_in_minor.reference +++ /dev/null @@ -1,11 +0,0 @@ -check_nosample -check_counts_full_major -check_counts_full_major -check_no_nested -check_distrib 1 250 1000 0.000010 -check_distrib 1 250 1000 0.000100 -check_distrib 1 250 1000 0.010000 -check_distrib 1 250 1000 0.900000 -check_distrib 1 1 10000000 0.010000 -check_distrib 250 250 100000 0.100000 -OK ! diff --git a/testsuite/tests/statmemprof/blocking_in_callback.ml b/testsuite/tests/statmemprof/blocking_in_callback.ml index 00f49cfc74e9..e1e40f1fb6a5 100644 --- a/testsuite/tests/statmemprof/blocking_in_callback.ml +++ b/testsuite/tests/statmemprof/blocking_in_callback.ml @@ -1,6 +1,8 @@ (* TEST * hassysthreads include systhreads +* skip +reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" ** bytecode ** native *) diff --git a/testsuite/tests/statmemprof/callstacks.ml b/testsuite/tests/statmemprof/callstacks.ml index ec5a4199f0d3..e864a50019e4 100644 --- a/testsuite/tests/statmemprof/callstacks.ml +++ b/testsuite/tests/statmemprof/callstacks.ml @@ -3,11 +3,15 @@ * flat-float-array reference = "${test_source_directory}/callstacks.flat-float-array.reference" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" ** native ** bytecode * no-flat-float-array reference = "${test_source_directory}/callstacks.no-flat-float-array.reference" + * skip + reason = "port 
stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" ** native ** bytecode *) diff --git a/testsuite/tests/statmemprof/comballoc.ml b/testsuite/tests/statmemprof/comballoc.ml index 22b25471011a..60ca4a012332 100644 --- a/testsuite/tests/statmemprof/comballoc.ml +++ b/testsuite/tests/statmemprof/comballoc.ml @@ -1,8 +1,10 @@ (* TEST flags = "-g" - * bytecode + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" + ** bytecode reference = "${test_source_directory}/comballoc.byte.reference" - * native + ** native reference = "${test_source_directory}/comballoc.opt.reference" *) diff --git a/testsuite/tests/statmemprof/custom.ml b/testsuite/tests/statmemprof/custom.ml index f0ddfa7eedfc..41cf902ec892 100644 --- a/testsuite/tests/statmemprof/custom.ml +++ b/testsuite/tests/statmemprof/custom.ml @@ -1,4 +1,7 @@ -(* TEST *) +(* TEST +* skip +reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" +*) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/exception_callback.ml b/testsuite/tests/statmemprof/exception_callback.ml index e1589372abc9..f9f02f690f7d 100644 --- a/testsuite/tests/statmemprof/exception_callback.ml +++ b/testsuite/tests/statmemprof/exception_callback.ml @@ -1,5 +1,7 @@ (* TEST exit_status = "2" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/exception_callback_minor.ml b/testsuite/tests/statmemprof/exception_callback_minor.ml index f51412327643..0bb37782bea5 100644 --- a/testsuite/tests/statmemprof/exception_callback_minor.ml +++ b/testsuite/tests/statmemprof/exception_callback_minor.ml @@ -1,5 +1,7 @@ (* TEST exit_status = "2" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/intern.ml b/testsuite/tests/statmemprof/intern.ml index bce6f89c5cf5..bebcc3c8fff2 100644 --- a/testsuite/tests/statmemprof/intern.ml 
+++ b/testsuite/tests/statmemprof/intern.ml @@ -1,5 +1,7 @@ (* TEST flags = "-g" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/intern.reference b/testsuite/tests/statmemprof/intern.reference deleted file mode 100644 index d83e8d6d50e9..000000000000 --- a/testsuite/tests/statmemprof/intern.reference +++ /dev/null @@ -1,10 +0,0 @@ -check_nosample -check_counts_full_major -check_counts_full_major -check_no_nested -check_distrib 2 3000 3 0.000010 -check_distrib 2 3000 1 0.000100 -check_distrib 2 2000 1 0.010000 -check_distrib 2 2000 1 0.900000 -check_distrib 300000 300000 20 0.100000 -OK ! diff --git a/testsuite/tests/statmemprof/lists_in_minor.ml b/testsuite/tests/statmemprof/lists_in_minor.ml index ebd434857e4f..d08716c71b8f 100644 --- a/testsuite/tests/statmemprof/lists_in_minor.ml +++ b/testsuite/tests/statmemprof/lists_in_minor.ml @@ -1,5 +1,7 @@ (* TEST flags = "-g" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/lists_in_minor.reference b/testsuite/tests/statmemprof/lists_in_minor.reference deleted file mode 100644 index 11cfe0ca1283..000000000000 --- a/testsuite/tests/statmemprof/lists_in_minor.reference +++ /dev/null @@ -1,8 +0,0 @@ -check_distrib 10 1000000 0.010000 -check_distrib 1000000 10 0.000010 -check_distrib 1000000 10 0.000100 -check_distrib 1000000 10 0.001000 -check_distrib 1000000 10 0.010000 -check_distrib 100000 10 0.100000 -check_distrib 100000 10 0.900000 -OK ! 
diff --git a/testsuite/tests/statmemprof/minor_no_postpone.ml b/testsuite/tests/statmemprof/minor_no_postpone.ml index fcb94cf81db2..21c8bfb2447e 100644 --- a/testsuite/tests/statmemprof/minor_no_postpone.ml +++ b/testsuite/tests/statmemprof/minor_no_postpone.ml @@ -1,5 +1,7 @@ (* TEST modules = "minor_no_postpone_stub.c" + * skip + reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" *) open Gc.Memprof diff --git a/testsuite/tests/statmemprof/moved_while_blocking.ml b/testsuite/tests/statmemprof/moved_while_blocking.ml index 8efc172aeae0..bb8dfc9848e0 100644 --- a/testsuite/tests/statmemprof/moved_while_blocking.ml +++ b/testsuite/tests/statmemprof/moved_while_blocking.ml @@ -1,6 +1,8 @@ (* TEST * hassysthreads include systhreads +* skip +reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" ** bytecode ** native *) diff --git a/testsuite/tests/statmemprof/thread_exit_in_callback.ml b/testsuite/tests/statmemprof/thread_exit_in_callback.ml index 753f7726f531..28e2aabb8b96 100644 --- a/testsuite/tests/statmemprof/thread_exit_in_callback.ml +++ b/testsuite/tests/statmemprof/thread_exit_in_callback.ml @@ -1,6 +1,8 @@ (* TEST * hassysthreads include systhreads +* skip +reason = "port stat-mem-prof : https://github.com/ocaml/ocaml/pull/8634" ** bytecode ** native *) diff --git a/testsuite/tests/tmc/readable_output.ml b/testsuite/tests/tmc/readable_output.ml index 9f452a07842c..1e931d7720df 100644 --- a/testsuite/tests/tmc/readable_output.ml +++ b/testsuite/tests/tmc/readable_output.ml @@ -12,19 +12,19 @@ let[@tail_mod_cons] rec map f = function (map (function f param tail_mod_cons (if param - (let (block = (makemutable 0 (apply f (field 0 param)) 24029)) - (seq (apply map_dps block 1 f (field 1 param)) block)) + (let (block = (makemutable 0 (apply f (field_imm 0 param)) 24029)) + (seq (apply map_dps block 1 f (field_imm 1 param)) block)) 0)) map_dps (function dst offset[int] f param tail_mod_cons (if param (let - (block0_arg0 = 
(apply f (field 0 param)) + (block0_arg0 = (apply f (field_imm 0 param)) block = (makemutable 0 block0_arg0 24029)) (seq (setfield_ptr(heap-init)_computed dst offset block) - (apply map_dps block 1 f (field 1 param) tailcall))) + (apply map_dps block 1 f (field_imm 1 param) tailcall))) (setfield_ptr(heap-init)_computed dst offset 0)))) - (apply (field 1 (global Toploop!)) "map" map)) + (apply (field_mut 1 (global Toploop!)) "map" map)) val map : ('a -> 'b) -> 'a list -> 'b list = |}] @@ -46,24 +46,26 @@ let[@tail_mod_cons] rec rec_map f = function (rec_map (function f param tail_mod_cons (if param - (let (*match* =a (field 0 param)) + (let (*match* =a (field_imm 0 param)) (makeblock 0 - (let (block = (makemutable 0 (apply f (field 0 *match*)) 24029)) - (seq (apply rec_map_dps block 1 f (field 1 *match*)) block)))) + (let + (block = (makemutable 0 (apply f (field_imm 0 *match*)) 24029)) + (seq (apply rec_map_dps block 1 f (field_imm 1 *match*)) + block)))) 0)) rec_map_dps (function dst offset[int] f param tail_mod_cons (if param (let - (*match* =a (field 0 param) - block1_arg0 = (apply f (field 0 *match*)) + (*match* =a (field_imm 0 param) + block1_arg0 = (apply f (field_imm 0 *match*)) block = (makemutable 0 block1_arg0 24029)) (seq (setfield_ptr(heap-init)_computed dst offset (makeblock 0 block)) - (apply rec_map_dps block 1 f (field 1 *match*) tailcall))) + (apply rec_map_dps block 1 f (field_imm 1 *match*) tailcall))) (setfield_ptr(heap-init)_computed dst offset 0)))) - (apply (field 1 (global Toploop!)) "rec_map" rec_map)) + (apply (field_mut 1 (global Toploop!)) "rec_map" rec_map)) val rec_map : ('a -> 'b) -> 'a rec_list -> 'b rec_list = |}] @@ -79,17 +81,17 @@ let[@tail_mod_cons] rec trip = function (trip (function param tail_mod_cons (if param - (let (x =a (field 0 param)) + (let (x =a (field_imm 0 param)) (makeblock 0 (makeblock 0 (*,int) x 0) (makeblock 0 (makeblock 0 (*,int) x 1) (let (block = (makemutable 0 (makeblock 0 (*,int) x 2) 24029)) - (seq (apply 
trip_dps block 1 (field 1 param)) block))))) + (seq (apply trip_dps block 1 (field_imm 1 param)) block))))) 0)) trip_dps (function dst offset[int] param tail_mod_cons (if param (let - (x =a (field 0 param) + (x =a (field_imm 0 param) block0_arg0 = (makeblock 0 (*,int) x 0) block1_arg0 = (makeblock 0 (*,int) x 1) block2_arg0 = (makeblock 0 (*,int) x 2) @@ -97,9 +99,9 @@ let[@tail_mod_cons] rec trip = function (seq (setfield_ptr(heap-init)_computed dst offset (makeblock 0 block0_arg0 (makeblock 0 block1_arg0 block))) - (apply trip_dps block 1 (field 1 param) tailcall))) + (apply trip_dps block 1 (field_imm 1 param) tailcall))) (setfield_ptr(heap-init)_computed dst offset 0)))) - (apply (field 1 (global Toploop!)) "trip" trip)) + (apply (field_mut 1 (global Toploop!)) "trip" trip)) val trip : 'a list -> ('a * int) list = |}] @@ -115,25 +117,26 @@ let[@tail_mod_cons] rec effects f = function (effects (function f param tail_mod_cons (if param - (let (*match* =a (field 0 param)) - (makeblock 0 (apply f (field 0 *match*)) - (let (block = (makemutable 0 (apply f (field 1 *match*)) 24029)) - (seq (apply effects_dps block 1 f (field 1 param)) block)))) + (let (*match* =a (field_imm 0 param)) + (makeblock 0 (apply f (field_imm 0 *match*)) + (let + (block = (makemutable 0 (apply f (field_imm 1 *match*)) 24029)) + (seq (apply effects_dps block 1 f (field_imm 1 param)) block)))) 0)) effects_dps (function dst offset[int] f param tail_mod_cons (if param (let - (*match* =a (field 0 param) - block0_arg0 = (apply f (field 0 *match*)) - block1_arg0 = (apply f (field 1 *match*)) + (*match* =a (field_imm 0 param) + block0_arg0 = (apply f (field_imm 0 *match*)) + block1_arg0 = (apply f (field_imm 1 *match*)) block = (makemutable 0 block1_arg0 24029)) (seq (setfield_ptr(heap-init)_computed dst offset (makeblock 0 block0_arg0 block)) - (apply effects_dps block 1 f (field 1 param) tailcall))) + (apply effects_dps block 1 f (field_imm 1 param) tailcall))) (setfield_ptr(heap-init)_computed 
dst offset 0)))) - (apply (field 1 (global Toploop!)) "effects" effects)) + (apply (field_mut 1 (global Toploop!)) "effects" effects)) val effects : ('a -> 'b) -> ('a * 'a) list -> 'b list = |}] @@ -155,8 +158,9 @@ let[@tail_mod_cons] rec map_stutter f xs = (if xs (let (block = - (makemutable 0 (apply f (makeblock 0 (field 0 xs))) 24029)) - (seq (apply map_stutter_dps block 1 f (field 1 xs)) block)) + (makemutable 0 (apply f (makeblock 0 (field_imm 0 xs))) + 24029)) + (seq (apply map_stutter_dps block 1 f (field_imm 1 xs)) block)) 0))) map_stutter_dps (function dst offset[int] f xs tail_mod_cons @@ -166,12 +170,12 @@ let[@tail_mod_cons] rec map_stutter f xs = (seq (setfield_ptr(heap-init)_computed dst offset block) (if xs (let - (block0_arg0 = (apply f (makeblock 0 (field 0 xs))) + (block0_arg0 = (apply f (makeblock 0 (field_imm 0 xs))) block = (makemutable 0 block0_arg0 24029)) (seq (setfield_ptr(heap-init)_computed block 1 block) - (apply map_stutter_dps block 1 f (field 1 xs) tailcall))) + (apply map_stutter_dps block 1 f (field_imm 1 xs) tailcall))) (setfield_ptr(heap-init)_computed block 1 0)))))) - (apply (field 1 (global Toploop!)) "map_stutter" map_stutter)) + (apply (field_mut 1 (global Toploop!)) "map_stutter" map_stutter)) val map_stutter : ('a option -> 'b) -> 'a list -> 'b list = |}] @@ -197,10 +201,10 @@ type 'a stream = { hd : 'a; tl : unit -> 'a stream; } (if (== n 0) 0 (makeblock 0 (apply f 0) (let - (v = (apply f (makeblock 0 (field 0 xs))) + (v = (apply f (makeblock 0 (field_imm 0 xs))) block = (makemutable 0 v 24029)) (seq - (apply smap_stutter_dps block 1 f (apply (field 1 xs) 0) + (apply smap_stutter_dps block 1 f (apply (field_imm 1 xs) 0) (- n 1)) block))))) smap_stutter_dps @@ -208,13 +212,13 @@ type 'a stream = { hd : 'a; tl : unit -> 'a stream; } (if (== n 0) (setfield_ptr(heap-init)_computed dst offset 0) (let (block0_arg0 = (apply f 0) - v = (apply f (makeblock 0 (field 0 xs))) + v = (apply f (makeblock 0 (field_imm 0 xs))) block = 
(makemutable 0 v 24029)) (seq (setfield_ptr(heap-init)_computed dst offset (makeblock 0 block0_arg0 block)) - (apply smap_stutter_dps block 1 f (apply (field 1 xs) 0) + (apply smap_stutter_dps block 1 f (apply (field_imm 1 xs) 0) (- n 1) tailcall)))))) - (apply (field 1 (global Toploop!)) "smap_stutter" smap_stutter)) + (apply (field_mut 1 (global Toploop!)) "smap_stutter" smap_stutter)) val smap_stutter : ('a option -> 'b) -> 'a stream -> int -> 'b list = |}] diff --git a/testsuite/tests/tool-debugger/basic/debuggee.ml b/testsuite/tests/tool-debugger/basic/debuggee.ml index 91ad3c21dcd8..8604edf2391d 100644 --- a/testsuite/tests/tool-debugger/basic/debuggee.ml +++ b/testsuite/tests/tool-debugger/basic/debuggee.ml @@ -2,13 +2,15 @@ set foo = "bar" flags += " -g " ocamldebug_script = "${test_source_directory}/input_script" -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** ocamlc.byte -***** check-ocamlc.byte-output -****** ocamldebug -******* check-program-output +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env +***** ocamlc.byte +****** check-ocamlc.byte-output +******* ocamldebug +******** check-program-output *) print_endline Sys.argv.(1);; diff --git a/testsuite/tests/tool-debugger/dynlink/host.ml b/testsuite/tests/tool-debugger/dynlink/host.ml index f46ef8a022e8..c445c8acd3b2 100644 --- a/testsuite/tests/tool-debugger/dynlink/host.ml +++ b/testsuite/tests/tool-debugger/dynlink/host.ml @@ -7,27 +7,29 @@ libraries = "" flags += " -g " ocamldebug_script = "${test_source_directory}/input_script" -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** ocamlc.byte -module = "host.ml" +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env ***** ocamlc.byte -module = "plugin.ml" +module = "host.ml" ****** ocamlc.byte +module = "plugin.ml" +******* ocamlc.byte module = "" all_modules = "host.cmo" program = 
"${test_build_directory}/host.byte" libraries = "dynlink" -******* run +******** run output = "host.output" -******** check-program-output +********* check-program-output reference = "${test_source_directory}/host.reference" -******** ocamldebug +********* ocamldebug output = "host.debug.output" -********* check-program-output +********** check-program-output reference = "${test_source_directory}/host.debug.reference" *) diff --git a/testsuite/tests/tool-debugger/find-artifacts/debuggee.ml b/testsuite/tests/tool-debugger/find-artifacts/debuggee.ml index 70aee3ff0484..7c6f00c70f95 100644 --- a/testsuite/tests/tool-debugger/find-artifacts/debuggee.ml +++ b/testsuite/tests/tool-debugger/find-artifacts/debuggee.ml @@ -1,25 +1,27 @@ (* TEST ocamldebug_script = "${test_source_directory}/input_script" -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** script +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env +***** script script = "mkdir out" -***** ocamlc.byte +****** ocamlc.byte flags = "-g -c" all_modules = "${test_source_directory}/in/blah.ml" program = "out/blah.cmo" -****** ocamlc.byte +******* ocamlc.byte program = "out/foo.cmo" flags = "-I out -g -c" all_modules = "${test_source_directory}/in/foo.ml" -******* ocamlc.byte +******** ocamlc.byte all_modules = "out/blah.cmo out/foo.cmo" flags = " -g " program = "debuggee.exe" -******** check-ocamlc.byte-output -********* ocamldebug -********** check-program-output +********* check-ocamlc.byte-output +********** ocamldebug +*********** check-program-output *) (* This file only contains the specification of how to run the test *) diff --git a/testsuite/tests/tool-debugger/module_named_main/main.ml b/testsuite/tests/tool-debugger/module_named_main/main.ml index 25eebe7c0272..e31c16391e8a 100644 --- a/testsuite/tests/tool-debugger/module_named_main/main.ml +++ b/testsuite/tests/tool-debugger/module_named_main/main.ml @@ -1,13 +1,15 @@ (* 
TEST flags += " -g " ocamldebug_script = "${test_source_directory}/input_script" -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** ocamlc.byte -***** check-ocamlc.byte-output -****** ocamldebug -******* check-program-output +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env +***** ocamlc.byte +****** check-ocamlc.byte-output +******* ocamldebug +******** check-program-output *) module Submodule = struct diff --git a/testsuite/tests/tool-debugger/no_debug_event/noev.ml b/testsuite/tests/tool-debugger/no_debug_event/noev.ml index 4eb334ea845d..f06c14a11798 100644 --- a/testsuite/tests/tool-debugger/no_debug_event/noev.ml +++ b/testsuite/tests/tool-debugger/no_debug_event/noev.ml @@ -1,28 +1,30 @@ (* TEST readonly_files = "a.ml b.ml" ocamldebug_script = "${test_source_directory}/input_script" -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** ocamlc.byte +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env +***** ocamlc.byte module = "a.ml" flags = "-g -for-pack foo" -***** ocamlc.byte +****** ocamlc.byte module = "" all_modules = "a.cmo" program = "foo.cmo" flags = "-g -pack" -****** ocamlc.byte +******* ocamlc.byte module = "b.ml" flags = " -g " -******* ocamlc.byte +******** ocamlc.byte module = "" flags = " -g " all_modules = "foo.cmo b.cmo" program = "${test_build_directory}/noev.exe" -******** check-ocamlc.byte-output -********* ocamldebug -********** check-program-output +********* check-ocamlc.byte-output +********** ocamldebug +*********** check-program-output *) (* This file only contains the specification of how to run the test *) diff --git a/testsuite/tests/tool-debugger/printer/debuggee.ml b/testsuite/tests/tool-debugger/printer/debuggee.ml index f22a4bd50f60..b2d6e6e7c527 100644 --- a/testsuite/tests/tool-debugger/printer/debuggee.ml +++ 
b/testsuite/tests/tool-debugger/printer/debuggee.ml @@ -3,15 +3,17 @@ flags += " -g " ocamldebug_script = "${test_source_directory}/input_script" readonly_files = "printer.ml" include debugger -* debugger -** shared-libraries -*** setup-ocamlc.byte-build-env -**** ocamlc.byte +* skip +reason = "ocamldebug is broken (#34)" +** debugger +*** shared-libraries +**** setup-ocamlc.byte-build-env +***** ocamlc.byte module = "printer.ml" -**** ocamlc.byte -***** check-ocamlc.byte-output -****** ocamldebug -******* check-program-output +***** ocamlc.byte +****** check-ocamlc.byte-output +******* ocamldebug +******** check-program-output *) let f x = diff --git a/testsuite/tests/translprim/comparison_table.compilers.reference b/testsuite/tests/translprim/comparison_table.compilers.reference index 145d208084a9..8f02cf0cce8e 100644 --- a/testsuite/tests/translprim/comparison_table.compilers.reference +++ b/testsuite/tests/translprim/comparison_table.compilers.reference @@ -155,10 +155,12 @@ (function cmp eq ne lt gt le ge vec (let (uncurry = - (function f param (apply f (field 0 param) (field 1 param))) + (function f param + (apply f (field_imm 0 param) (field_imm 1 param))) map = (function f l - (apply (field 18 (global Stdlib__List!)) (apply uncurry f) l))) + (apply (field_imm 18 (global Stdlib__List!)) + (apply uncurry f) l))) (makeblock 0 (makeblock 0 (apply map gen_cmp vec) (apply map cmp vec)) (apply map @@ -193,10 +195,10 @@ (let (uncurry = (function f param - (apply f (field 0 param) (field 1 param))) + (apply f (field_imm 0 param) (field_imm 1 param))) map = (function f l - (apply (field 18 (global Stdlib__List!)) + (apply (field_imm 18 (global Stdlib__List!)) (apply uncurry f) l))) (makeblock 0 (makeblock 0 (apply map eta_gen_cmp vec) diff --git a/testsuite/tests/typing-poly/poly.ml b/testsuite/tests/typing-poly/poly.ml index 0b5abf7c674c..228dbcf375a6 100644 --- a/testsuite/tests/typing-poly/poly.ml +++ b/testsuite/tests/typing-poly/poly.ml @@ -1017,6 +1017,7 @@ 
type 'a t = < a : 'a > type u = 'a t as 'a |}];; + (* pass typetexp, but fails during Typedecl.check_recursion *) type ('a1, 'b1) ty1 = 'a1 -> unit constraint 'a1 = [> `V1 of ('a1, 'b1) ty2 as 'b1] and ('a2, 'b2) ty2 = 'b2 -> unit constraint 'b2 = [> `V2 of ('a2, 'b2) ty1 as 'a2];; diff --git a/testsuite/tests/typing-sigsubst/sigsubst.ml b/testsuite/tests/typing-sigsubst/sigsubst.ml index 5636e9abe258..0068e25ae52d 100644 --- a/testsuite/tests/typing-sigsubst/sigsubst.ml +++ b/testsuite/tests/typing-sigsubst/sigsubst.ml @@ -24,11 +24,11 @@ end Line 3, characters 2-36: 3 | include Comparable with type t = t ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Error: Illegal shadowing of included type t/284 by t/289 +Error: Illegal shadowing of included type t/289 by t/294 Line 2, characters 2-19: - Type t/284 came from this include + Type t/289 came from this include Line 3, characters 2-23: - The value print has no valid type if t/284 is shadowed + The value print has no valid type if t/289 is shadowed |}] module type Sunderscore = sig diff --git a/testsuite/tests/unwind/check-linker-version.sh b/testsuite/tests/unwind/check-linker-version.sh index f1120da762ad..fabd0ed9bc11 100755 --- a/testsuite/tests/unwind/check-linker-version.sh +++ b/testsuite/tests/unwind/check-linker-version.sh @@ -13,4 +13,4 @@ else test_result=${TEST_PASS}; fi -exit ${TEST_RESULT} +exit ${test_result} diff --git a/testsuite/tests/unwind/driver.ml b/testsuite/tests/unwind/driver.ml index 07aafc0c97d4..7df39daf0eaa 100644 --- a/testsuite/tests/unwind/driver.ml +++ b/testsuite/tests/unwind/driver.ml @@ -4,21 +4,29 @@ script = "sh ${test_source_directory}/check-linker-version.sh" readonly_files = "mylib.mli mylib.ml stack_walker.c" * macos -** arch_amd64 -*** script -**** setup-ocamlopt.byte-build-env -***** ocamlopt.byte +** script +*** setup-ocamlopt.byte-build-env +**** ocamlopt.byte flags = "-opaque" module = "mylib.mli" -****** ocamlopt.byte +***** ocamlopt.byte module = "" flags = "-cclib 
-Wl,-keep_dwarf_unwind" all_modules = "mylib.ml driver.ml stack_walker.c" program = "${test_build_directory}/unwind_test" -******* run +****** run +output = "${test_build_directory}/program-output" +stdout = "${output}" +stderr = "${output}" +******* check-program-output +reference = "${test_source_directory}/unwind_test.reference" *) let () = Mylib.foo1 Mylib.bar 1 2 3 4 5 6 7 8 9 10; Mylib.foo2 Mylib.baz 1 2 3 4 5 6 7 8 9 10 + +(* https://github.com/ocaml-multicore/ocaml-multicore/issues/274 *) +let () = + Mylib.foo1 Mylib.bob 1 2 3 4 5 6 7 8 9 10 diff --git a/testsuite/tests/unwind/mylib.ml b/testsuite/tests/unwind/mylib.ml index 318b537ad93a..22b7e9ef608e 100644 --- a/testsuite/tests/unwind/mylib.ml +++ b/testsuite/tests/unwind/mylib.ml @@ -18,3 +18,10 @@ let baz x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 = func_with_10_params x1 x2 x3 x4 x5 x6 x7 x8 x9 x10; func_with_10_params x1 x2 x3 x4 x5 x6 x7 x8 x9 x10; perform_stack_walk () + +(* https://github.com/ocaml-multicore/ocaml-multicore/issues/274 *) +external do_no_alloc: unit -> unit = "ml_do_no_alloc" [@@noalloc] + +let bob x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 = + func_with_10_params x1 x2 x3 x4 x5 x6 x7 x8 x9 x10; + do_no_alloc () diff --git a/testsuite/tests/unwind/mylib.mli b/testsuite/tests/unwind/mylib.mli index c1655228fc76..91bdcae59ebc 100644 --- a/testsuite/tests/unwind/mylib.mli +++ b/testsuite/tests/unwind/mylib.mli @@ -8,3 +8,5 @@ val bar: int -> int -> int -> int -> int -> int -> int -> int -> int -> int -> unit val baz: int -> int -> int -> int -> int -> int -> int -> int -> int -> int -> unit +val bob: + int -> int -> int -> int -> int -> int -> int -> int -> int -> int -> unit diff --git a/testsuite/tests/unwind/stack_walker.c b/testsuite/tests/unwind/stack_walker.c index 342eb932f393..55eb9505b5fb 100644 --- a/testsuite/tests/unwind/stack_walker.c +++ b/testsuite/tests/unwind/stack_walker.c @@ -3,6 +3,7 @@ #include #include #include +#define UNW_LOCAL_ONLY #include value ml_func_with_10_params_native(value 
x1, value x2, value x3, value x4, @@ -11,20 +12,21 @@ value ml_func_with_10_params_native(value x1, value x2, value x3, value x4, return Val_unit; } -int perform_stack_walk(int dbg) { +void error() { + printf("\n"); + exit(1); +} + +value ml_perform_stack_walk(value unused) { unw_context_t ctxt; unw_getcontext(&ctxt); unw_cursor_t cursor; { int result = unw_init_local(&cursor, &ctxt); - if (result != 0) { - if (dbg) printf("unw_init_local failed: %d\n", result); - return -1; - } + if (result != 0) error(); } - int reached_main = 0; for (;;) { { @@ -32,40 +34,33 @@ int perform_stack_walk(int dbg) { unw_word_t ip_offset; // IP - start_of_proc int result = unw_get_proc_name(&cursor, procname, sizeof(procname), &ip_offset); - if (result != 0) { - if (dbg) printf("unw_get_proc_name failed: %d\n", result); - return -1; + if (result != 0) error(); + if (strlen(procname) > 4 && + !memcmp(procname, "caml", 4) && + 'A' <= procname[4] && procname[4] <= 'Z' && + strstr(procname+4, "__")) { + /* mangled OCaml name, unmangle and print */ + const char* mangled = procname + 4; + const char* mod_end = strstr(mangled, "__"); + const char* id_begin = strchr(mod_end + 2, '_'); + if (!id_begin) id_begin = mangled + strlen(mangled); + printf("%.*s.%.*s\n", mod_end - mangled, mangled, id_begin - (mod_end + 2), mod_end + 2); + } else { + printf("%s\n", procname); } - - if (strcmp(procname, "main") == 0) - reached_main = 1; - if (dbg) printf("%s + %lld\n", procname, (long long int)ip_offset); + if (!strcmp(procname, "main")) break; } { int result = unw_step(&cursor); - if (result == 0) break; - if (result < 0) { - if (dbg) printf("unw_step failed: %d\n", result); - return -1; - } + if (result == 0) error(); /* didn't make it to main() */ + if (result < 0) error(); } } - if (dbg) printf("Reached end of stack.\n"); - if (!reached_main) { - if (dbg) printf("Failure: Did not reach main.\n"); - return -1; - } - return 0; + return Val_unit; } -value ml_perform_stack_walk() { - if 
(perform_stack_walk(0) != 0) { - printf("TEST FAILED\n"); - /* Re-run the test to produce a trace */ - perform_stack_walk(1); - exit(1); - } - return Val_unit; +value ml_do_no_alloc(value unused) { + return ml_perform_stack_walk(unused); } diff --git a/testsuite/tests/unwind/unwind_test.reference b/testsuite/tests/unwind/unwind_test.reference new file mode 100644 index 000000000000..b3804d00e51e --- /dev/null +++ b/testsuite/tests/unwind/unwind_test.reference @@ -0,0 +1,18 @@ +ml_perform_stack_walk +caml_c_call +Mylib.baz +Driver.entry +caml_program +caml_start_program +caml_startup_common +caml_main +main +ml_perform_stack_walk +ml_do_no_alloc +Mylib.bob +Driver.entry +caml_program +caml_start_program +caml_startup_common +caml_main +main diff --git a/testsuite/tests/misc/ephe_infix_new.ml b/testsuite/tests/weak-ephe-final/ephe_infix.ml similarity index 100% rename from testsuite/tests/misc/ephe_infix_new.ml rename to testsuite/tests/weak-ephe-final/ephe_infix.ml diff --git a/testsuite/tests/misc/ephetest_new.ml b/testsuite/tests/weak-ephe-final/ephetest.ml similarity index 98% rename from testsuite/tests/misc/ephetest_new.ml rename to testsuite/tests/weak-ephe-final/ephetest.ml index f5d7dedf63c7..2835ab8b7a0c 100644 --- a/testsuite/tests/misc/ephetest_new.ml +++ b/testsuite/tests/weak-ephe-final/ephetest.ml @@ -60,7 +60,7 @@ let test2 () = is_key_value test flags; is_data_value test flags; ra := ref 13; - Gc.minor (); + Gc.full_major (); is_key_unset test flags; is_data_unset test flags; ignore (Sys.opaque_identity eph) @@ -74,8 +74,8 @@ let test3 () = let (eph, flags) = create (ref 125) !ra in is_key_value test flags; is_data_value test flags; + Gc.full_major (); ra := ref 14; - Gc.minor (); is_key_unset test flags; is_data_value test flags; ignore (Sys.opaque_identity eph) @@ -107,8 +107,7 @@ let test5 () = is_key_value test flags; is_data_value test flags; !rb := ref 4; - Gc.minor (); - Gc.minor (); + Gc.full_major (); is_key_unset test flags; is_data_unset 
test flags; ignore (Sys.opaque_identity eph) diff --git a/testsuite/tests/misc/ephetest_new.reference b/testsuite/tests/weak-ephe-final/ephetest.reference similarity index 100% rename from testsuite/tests/misc/ephetest_new.reference rename to testsuite/tests/weak-ephe-final/ephetest.reference diff --git a/testsuite/tests/misc/ephetest2_new.ml b/testsuite/tests/weak-ephe-final/ephetest2.ml similarity index 100% rename from testsuite/tests/misc/ephetest2_new.ml rename to testsuite/tests/weak-ephe-final/ephetest2.ml diff --git a/testsuite/tests/misc/ephetest2.reference b/testsuite/tests/weak-ephe-final/ephetest2.reference similarity index 100% rename from testsuite/tests/misc/ephetest2.reference rename to testsuite/tests/weak-ephe-final/ephetest2.reference diff --git a/testsuite/tests/misc/ephetest3.ml b/testsuite/tests/weak-ephe-final/ephetest3.ml similarity index 99% rename from testsuite/tests/misc/ephetest3.ml rename to testsuite/tests/weak-ephe-final/ephetest3.ml index e9c2b0f170dd..541910c91a8c 100644 --- a/testsuite/tests/misc/ephetest3.ml +++ b/testsuite/tests/weak-ephe-final/ephetest3.ml @@ -27,7 +27,6 @@ let pp = Int64.to_string module HW = Ephemeron.K1.Make(S) module SW = Weak.Make(S) - let sw = SW.create n let hashcons x = SW.merge sw x diff --git a/testsuite/tests/misc/ephetest3.reference b/testsuite/tests/weak-ephe-final/ephetest3.reference similarity index 100% rename from testsuite/tests/misc/ephetest3.reference rename to testsuite/tests/weak-ephe-final/ephetest3.reference diff --git a/testsuite/tests/weak-ephe-final/ephetest_par.ml b/testsuite/tests/weak-ephe-final/ephetest_par.ml new file mode 100644 index 000000000000..744c4e844d47 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/ephetest_par.ml @@ -0,0 +1,158 @@ +(* TEST *) + +(* Due to GCs running at non-deterministic places, the output from these tests + * are unreliable except the bad value checks and as a check for catastrophic + * failures i.e) segfaults. 
*) + +let debug = false + +open Printf +open Ephemeron + +let dprintf x = + if debug then printf x + else ifprintf stdout x + +let is_true test s b = + if debug then + printf "%s %s: %s\n" test s (if b then "OK" else "FAIL") + +let is_false test s b = is_true test s (not b) + +let is_data_value test eph k (v:int) = + match K1.query eph k with + | Some x -> + if !x = v + then dprintf "%s data set: OK\n" test + else printf "%s data set: FAIL(bad value %i)\n" test (!x) + | None -> dprintf "%s data set: FAIL\n" test + +let is_data_unset test eph k = + is_true test "data unset" (Option.is_none (K1.query eph k)) + +module M () = struct + +let make_ra () = ref (ref 1) [@@inline never] +let make_rb () = ref (ref (ref 2)) [@@inline never] +let ra = make_ra () +let rb = make_rb () + +(** test: key alive data dangling *) +let test1 () = + let test = "test1" in + Gc.minor (); + Gc.full_major (); + let eph : (int ref, int ref) K1.t = K1.make !ra (ref 42) in + is_data_value test eph !ra 42; + Gc.minor (); + is_data_value test eph !ra 42; + Gc.full_major (); + is_data_value test eph !ra 42; + ra := ref 12; + Gc.full_major (); + is_data_unset test eph !ra + +(** test: key dangling data dangling *) +let test2 () = + let test = "test2" in + Gc.minor (); + Gc.full_major (); + let k = ref 125 in + let eph : (int ref, int ref) K1.t = K1.make k (ref 42) in + is_data_value test eph k 42; + Gc.minor (); + is_data_unset test eph (ref 42) + +(** test: key dangling data alive *) +let test3 () = + let test = "test3" in + Gc.minor (); + Gc.full_major (); + ra := ref 13; + let k = ref 125 in + let eph : (int ref, int ref) K1.t = K1.make k !ra in + is_data_value test eph k 13; + ra := ref 14; + Gc.minor (); + is_data_unset test eph (ref 125) + +(** test: key alive but one away, data dangling *) +let test4 () = + let test = "test4" in + Gc.minor (); + Gc.full_major (); + rb := ref (ref 3); + let eph : (int ref, int ref) K1.t = K1.make !(!rb) (ref 43) in + is_data_value test eph !(!rb) 43; + 
Gc.minor (); + Gc.minor (); + is_data_value test eph !(!rb) 43 + +(** test: key dangling but one away, data dangling *) +let test5 () = + let test = "test5" in + Gc.minor (); + Gc.full_major (); + rb := ref (ref 3); + let eph : (int ref, int ref) K1.t = K1.make !(!rb) (ref 43) in + is_data_value test eph !(!rb) 43; + !rb := ref 4; + Gc.minor (); + Gc.minor (); + is_data_unset test eph !(!rb) + +(** test: key accessible from data but all dangling *) +let test6 () = + let test = "test6" in + Gc.minor (); + Gc.full_major (); + rb := ref (ref 3); + let eph : (int ref, int ref ref) K1.t = K1.make !(!rb) (ref !(!rb)) in + Gc.minor (); + !rb := ref 4; + Gc.full_major (); + is_data_unset test eph !(!rb) + +(** test: ephemeron accessible from data but they are dangling *) +type t = + | No + | Ephe of (int ref, t) K1.t + +let rc = ref No + +let test7 () = + let test = "test7" in + Gc.minor (); + Gc.full_major (); + ra := ref 42; + let weak : t Weak.t = Weak.create 1 in + let eph : (int ref, t) K1.t ref = ref (K1.make !ra !rc) in + rc := Ephe !eph; + Weak.set weak 0 (Some !rc); + Gc.minor (); + is_true test "before" (Weak.check weak 0); + eph := K1.make (ref 0) No; + rc := No; + Gc.full_major (); + Gc.full_major (); + Gc.full_major (); + is_false test "after" (Weak.check weak 0) + +let run () = + (test1 [@inlined never]) (); + (test2 [@inlined never]) (); + (test3 [@inlined never]) (); + (test4 [@inlined never]) (); + (test5 [@inlined never]) (); + (test6 [@inlined never]) (); + (test7 [@inlined never]) (); + () +end + +let _ = + for _ = 1 to 5 do + let d = Array.init 3 (fun _ -> let module Mx = M () in Domain.spawn Mx.run) in + let module Mx = M() in + Mx.run (); + Array.iter Domain.join d + done diff --git a/testsuite/tests/misc/weaklifetime.reference b/testsuite/tests/weak-ephe-final/ephetest_par.reference similarity index 100% rename from testsuite/tests/misc/weaklifetime.reference rename to testsuite/tests/weak-ephe-final/ephetest_par.reference diff --git 
a/testsuite/tests/misc/finaliser.ml b/testsuite/tests/weak-ephe-final/finaliser.ml similarity index 100% rename from testsuite/tests/misc/finaliser.ml rename to testsuite/tests/weak-ephe-final/finaliser.ml diff --git a/testsuite/tests/weak-ephe-final/finaliser.reference b/testsuite/tests/weak-ephe-final/finaliser.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/testsuite/tests/weak-ephe-final/finaliser2.ml b/testsuite/tests/weak-ephe-final/finaliser2.ml new file mode 100644 index 000000000000..c12d2cfd6b72 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/finaliser2.ml @@ -0,0 +1,36 @@ +(* TEST *) + +let test1 = + let r' = ref 0 in + let rec foo () = + let r = ref 0 in + Gc.finalise_last (fun _ -> r' := 1; print_endline "test1") r + in + foo(); + Gc.minor(); + assert (!r' = 1) + +let test2 = + let r = ref 0 in + Gc.finalise (fun r -> assert (!r = 1); print_endline "test2: 1") r; + Gc.finalise (fun r -> assert (!r = 0); print_endline "test2: 2"; r := 1) r; + Gc.full_major() + +let test3 = + Gc.full_major (); + let rec foo () = + let r = ref 0 in + Gc.finalise (fun r -> print_endline "test3: parent.1") r; + in + foo (); + let d = Domain.spawn (fun _ -> + let r = ref 0 in + let r' = ref 0 in + Gc.full_major (); + Gc.finalise (fun r -> print_endline "test3: child.1") r; + Gc.finalise_last (fun r -> print_endline "test3: child.2") r') + in + Domain.join d; + print_endline "test3: joined"; + (* Now this domain takes over the finalisers from d *) + Gc.full_major() diff --git a/testsuite/tests/weak-ephe-final/finaliser2.reference b/testsuite/tests/weak-ephe-final/finaliser2.reference new file mode 100644 index 000000000000..0082f00781b1 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/finaliser2.reference @@ -0,0 +1,7 @@ +test1 +test2: 2 +test2: 1 +test3: joined +test3: parent.1 +test3: child.1 +test3: child.2 diff --git a/testsuite/tests/weak-ephe-final/finaliser_handover.ml b/testsuite/tests/weak-ephe-final/finaliser_handover.ml new file 
mode 100644 index 000000000000..fa2d2efc4364 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/finaliser_handover.ml @@ -0,0 +1,40 @@ +(* TEST *) + +(* ocaml-multicore issues 528 and 468 *) + +let tree_size = try int_of_string Sys.argv.(1) with _ -> 9 +let iterations = try int_of_string Sys.argv.(2) with _ -> 10 +let num_domains = try int_of_string Sys.argv.(3) with _ -> 4 + +type 'a tree = Empty | Node of 'a tree * 'a tree + +let rec make d = + if d = 0 then Node(Empty, Empty) + else let d = d - 1 in Node(make d, make d) + +let rec check = function Empty -> 0 | Node(l, r) -> (1 + check l + check r) + +let finalise_count = Atomic.make 0 +let work_count = Atomic.make 0 + +let work () = + for _ = 0 to 250 do + let v = make tree_size in + Gc.finalise (fun v -> + ignore @@ check v; + Atomic.incr finalise_count) + v; + Atomic.incr work_count + done + +let _ = + for _ = 0 to iterations do + let domains = Array.init (num_domains - 1) (fun _ -> Domain.spawn(work)) in + let v = make tree_size in + ignore @@ check v; + Array.iter Domain.join domains + done; + + Gc.full_major (); + assert((Atomic.get finalise_count) = (Atomic.get work_count)); + print_string "ok\n" diff --git a/testsuite/tests/weak-ephe-final/finaliser_handover.reference b/testsuite/tests/weak-ephe-final/finaliser_handover.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/finaliser_handover.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/misc/weaklifetime.ml b/testsuite/tests/weak-ephe-final/weaklifetime.ml similarity index 90% rename from testsuite/tests/misc/weaklifetime.ml rename to testsuite/tests/weak-ephe-final/weaklifetime.ml index 2e700290e642..cd6d6b782cd6 100644 --- a/testsuite/tests/misc/weaklifetime.ml +++ b/testsuite/tests/weak-ephe-final/weaklifetime.ml @@ -34,7 +34,9 @@ type change = No_change | Fill | Erase;; (* Check the correctness condition on the data at (i,j): 1. 
if the block is present, the weak pointer must be full 2. if the block was removed at GC n, and the weak pointer is still - full, then the current GC must be at most n+1. + full, then the current GC must be at most n+2. + (could have promotion from minor during n+1 which keeps alive in n+1, + so will die at n+2) Then modify the data in one of the following ways: 1. if the block and weak pointer are absent, fill them @@ -49,7 +51,7 @@ let check_and_change i j = and try to observe its collection. *) match data.(i).objs.(j), Weak.check data.(i).wp j with | Present x, false -> assert false - | Absent n, true -> assert (gc1 <= n+1); No_change + | Absent n, true -> assert (gc1 <= n+2); No_change | Absent _, false -> Fill | Present _, true -> if Random.int 10 = 0 then Erase else No_change diff --git a/testsuite/tests/weak-ephe-final/weaklifetime.reference b/testsuite/tests/weak-ephe-final/weaklifetime.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/testsuite/tests/misc/weaklifetime2.ml b/testsuite/tests/weak-ephe-final/weaklifetime2.ml similarity index 100% rename from testsuite/tests/misc/weaklifetime2.ml rename to testsuite/tests/weak-ephe-final/weaklifetime2.ml diff --git a/testsuite/tests/misc/weaklifetime2.reference b/testsuite/tests/weak-ephe-final/weaklifetime2.reference similarity index 100% rename from testsuite/tests/misc/weaklifetime2.reference rename to testsuite/tests/weak-ephe-final/weaklifetime2.reference diff --git a/testsuite/tests/weak-ephe-final/weaklifetime_par.ml b/testsuite/tests/weak-ephe-final/weaklifetime_par.ml new file mode 100644 index 000000000000..dede2971e3c0 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/weaklifetime_par.ml @@ -0,0 +1,100 @@ +(* TEST +*) + +let size = 1000;; +let num_domains = 4;; +let random_state = Domain.DLS.new_key Random.State.make_self_init + +let random_int = Random.State.int (Domain.DLS.get random_state) + +type block = int array;; + +type objdata = + | Present of block + | Absent 
of int (* GC count at time of erase *) +;; + +type bunch = { + objs : objdata array; + wp : block Weak.t; +};; + +let data = + Array.init size (fun i -> + let n = 1 + random_int size in + { + objs = Array.make n (Absent 0); + wp = Weak.create n; + } + ) +;; + +let gccount () = + let res = (Gc.quick_stat ()).Gc.major_collections in + res + +type change = No_change | Fill | Erase;; + +(* Check the correctness condition on the data at (i,j): + 1. if the block is present, the weak pointer must be full + 2. if the block was removed at GC n, and the weak pointer is still + full, then the current GC must be at most n+2. + (could have promotion from minor during n+1 which keeps alive in n+1, + so will die at n+2) + Then modify the data in one of the following ways: + 1. if the block and weak pointer are absent, fill them + 2. if the block and weak pointer are present, randomly erase the block +*) +let check_and_change data i j = + let gc1 = gccount () in + let change = + (* we only read data.(i).objs.(j) in this local binding to ensure + that it does not remain reachable on the bytecode stack + in the rest of the function below, when we overwrite the value + and try to observe its collection. 
*) + match data.(i).objs.(j), Weak.check data.(i).wp j with + | Present x, false -> assert false + | Absent n, true -> assert (gc1 <= n+2); No_change + | Absent _, false -> Fill + | Present _, true -> + if random_int 10 = 0 then Erase else No_change + in + match change with + | No_change -> () + | Fill -> + let x = Array.make (1 + random_int 10) 42 in + data.(i).objs.(j) <- Present x; + Weak.set data.(i).wp j (Some x); + | Erase -> + data.(i).objs.(j) <- Absent gc1; + let gc2 = gccount () in + if gc1 <> gc2 then data.(i).objs.(j) <- Absent gc2; +;; + +let dummy = ref [||];; + +let run index () = + let domain_data = Array.init 100 (fun i -> + let n = 1 + random_int 100 in + { + objs = Array.make n (Absent 0); + wp = Weak.create n; + } + ) in + while gccount () < 5 do + dummy := Array.make (random_int 300) 0; + let i = (random_int (size/num_domains)) + index * size/num_domains in + let j = random_int (Array.length data.(i).objs) in + check_and_change data i j; + let ix = random_int 100 in + let jx = random_int (Array.length domain_data.(ix).objs) in + check_and_change domain_data ix jx + done + +let _ = + for index = 0 to 4 do + let domains = Array.init (num_domains - 1) (fun i -> Domain.spawn(run ((i + index) mod 5))) in + run ((num_domains - 1 + index) mod 5) (); + Array.iter Domain.join domains + done; + print_endline "ok" diff --git a/testsuite/tests/weak-ephe-final/weaklifetime_par.reference b/testsuite/tests/weak-ephe-final/weaklifetime_par.reference new file mode 100644 index 000000000000..9766475a4185 --- /dev/null +++ b/testsuite/tests/weak-ephe-final/weaklifetime_par.reference @@ -0,0 +1 @@ +ok diff --git a/testsuite/tests/misc/weaktest.ml b/testsuite/tests/weak-ephe-final/weaktest.ml similarity index 67% rename from testsuite/tests/misc/weaktest.ml rename to testsuite/tests/weak-ephe-final/weaktest.ml index 292e76622c66..5b66088722e9 100644 --- a/testsuite/tests/misc/weaktest.ml +++ b/testsuite/tests/weak-ephe-final/weaktest.ml @@ -60,6 +60,17 @@ for j = 0 
to 99 do r := []; incr added; + (* Ephemeron / Weak array implementation in multicore OCaml differs + significantly from stock OCaml. In particular, ephemerons keys and data in + the minor heap are considered roots for the minor collection. Moreover, + When blit'ing ephemerons, the source keys and data are marked as live to + play nicely with the concurrent major GC. As a result, this test keeps the + previous bunches of strings alive longer than on stock. Hence, the test as + is fails on multicore. We add a [full_major] call that forces old bunches + to be collected and to confirm that ephemeron implementation on multicore + does work as intended. *) + Gc.full_major (); + for i = 1 to bunch do let c = random_string 7 in r := c :: !r; diff --git a/testsuite/tests/misc/weaktest.reference b/testsuite/tests/weak-ephe-final/weaktest.reference similarity index 100% rename from testsuite/tests/misc/weaktest.reference rename to testsuite/tests/weak-ephe-final/weaktest.reference diff --git a/testsuite/tools/asmgen_amd64.S b/testsuite/tools/asmgen_amd64.S index fb87307df082..38ca2be7821a 100644 --- a/testsuite/tools/asmgen_amd64.S +++ b/testsuite/tools/asmgen_amd64.S @@ -21,14 +21,8 @@ #ifdef SYS_macosx #define CALL_GEN_CODE _call_gen_code -#define CAML_C_CALL _caml_c_call -#define CAML_NEGF_MASK _caml_negf_mask -#define CAML_ABSF_MASK _caml_absf_mask #else #define CALL_GEN_CODE call_gen_code -#define CAML_C_CALL caml_c_call -#define CAML_NEGF_MASK caml_negf_mask -#define CAML_ABSF_MASK caml_absf_mask #endif .globl CALL_GEN_CODE @@ -53,31 +47,3 @@ CALL_GEN_CODE: popq %rbp popq %rbx ret - - .globl CAML_C_CALL - .align ALIGN -CAML_C_CALL: - jmp *%rax - -#ifdef SYS_macosx - .literal16 -#elif defined(SYS_mingw64) || defined(SYS_cygwin) - .section .rodata.cst8 -#else - .section .rodata.cst8,"aM",@progbits,8 -#endif - .globl CAML_NEGF_MASK - .align ALIGN -CAML_NEGF_MASK: - .quad 0x8000000000000000, 0 - .globl CAML_ABSF_MASK - .align ALIGN -CAML_ABSF_MASK: - .quad 
0x7FFFFFFFFFFFFFFF, 0 - - .comm young_limit, 8 - -#if defined(SYS_linux) - /* Mark stack as non-executable */ - .section .note.GNU-stack,"",%progbits -#endif diff --git a/testsuite/tools/lexcmm.mll b/testsuite/tools/lexcmm.mll index 026c2ed35ded..c31317888a57 100644 --- a/testsuite/tools/lexcmm.mll +++ b/testsuite/tools/lexcmm.mll @@ -64,8 +64,8 @@ let keyword_table = "mulh", MULH; "or", OR; "proj", PROJ; - "raise", RAISE Lambda.Raise_regular; - "reraise", RAISE Lambda.Raise_reraise; + "raise_regular", RAISE Lambda.Raise_regular; + "raise_reraise", RAISE Lambda.Raise_reraise; "raise_notrace", RAISE Lambda.Raise_notrace; "seq", SEQ; "signed", SIGNED; diff --git a/testsuite/tools/parsecmm.mly b/testsuite/tools/parsecmm.mly index ff0620cb550a..d9807e24d5f4 100644 --- a/testsuite/tools/parsecmm.mly +++ b/testsuite/tools/parsecmm.mly @@ -256,19 +256,27 @@ expr: { unbind_ident $5; Ctrywith($3, $5, $6, debuginfo ()) } | LPAREN VAL expr expr RPAREN { let open Asttypes in - Cop(Cload (Word_val, Mutable), [access_array $3 $4 Arch.size_addr], + Cop(Cload {memory_chunk=Word_val; + mutability=Mutable; + is_atomic=false}, [access_array $3 $4 Arch.size_addr], debuginfo ()) } | LPAREN ADDRAREF expr expr RPAREN { let open Asttypes in - Cop(Cload (Word_val, Mutable), [access_array $3 $4 Arch.size_addr], + Cop(Cload {memory_chunk=Word_val; + mutability=Mutable; + is_atomic=false}, [access_array $3 $4 Arch.size_addr], Debuginfo.none) } | LPAREN INTAREF expr expr RPAREN { let open Asttypes in - Cop(Cload (Word_int, Mutable), [access_array $3 $4 Arch.size_int], + Cop(Cload {memory_chunk=Word_int; + mutability=Mutable; + is_atomic=false}, [access_array $3 $4 Arch.size_int], Debuginfo.none) } | LPAREN FLOATAREF expr expr RPAREN { let open Asttypes in - Cop(Cload (Double, Mutable), [access_array $3 $4 Arch.size_float], + Cop(Cload {memory_chunk=Double; + mutability=Mutable; + is_atomic=false}, [access_array $3 $4 Arch.size_float], Debuginfo.none) } | LPAREN ADDRASET expr expr expr RPAREN { 
let open Lambda in @@ -324,7 +332,9 @@ chunk: | VAL { Word_val } ; unaryop: - LOAD chunk { Cload ($2, Asttypes.Mutable) } + LOAD chunk { Cload {memory_chunk=$2; + mutability=Asttypes.Mutable; + is_atomic=false} } | FLOATOFINT { Cfloatofint } | INTOFFLOAT { Cintoffloat } | RAISE { Craise $1 } diff --git a/tools/.depend b/tools/.depend index 93974de7745d..3fcdd5dd1a56 100644 --- a/tools/.depend +++ b/tools/.depend @@ -64,6 +64,8 @@ eqparsetree.cmx : \ ../parsing/longident.cmx \ ../parsing/location.cmx \ ../parsing/asttypes.cmi +gen_sizeclasses.cmo : +gen_sizeclasses.cmx : lintapidiff.cmo : \ ../typing/printtyp.cmi \ ../driver/pparse.cmi \ diff --git a/tools/ci/actions/runner.sh b/tools/ci/actions/runner.sh index d9114ac4a265..4331f9d790e8 100755 --- a/tools/ci/actions/runner.sh +++ b/tools/ci/actions/runner.sh @@ -21,8 +21,6 @@ PREFIX=~/local MAKE="make $MAKE_ARG" SHELL=dash -MAKE_WARN="$MAKE --warn-undefined-variables" - export PATH=$PREFIX/bin:$PATH Configure () { @@ -31,7 +29,6 @@ Configure () { ------------------------------------------------------------------------ This test builds the OCaml compiler distribution with your pull request and runs its testsuite. - Failing to build the compiler distribution, or testsuite failures are critical errors that must be understood and fixed before your pull request can be merged. 
@@ -40,9 +37,7 @@ EOF configure_flags="\ --prefix=$PREFIX \ - --enable-flambda-invariants \ - --enable-ocamltest \ - --disable-dependency-generation \ + --enable-debug-runtime \ $CONFIG_ARG" case $XARCH in @@ -65,17 +60,28 @@ EOF } Build () { - script --return --command "$MAKE_WARN world.opt" build.log + $MAKE world.opt echo Ensuring that all names are prefixed in the runtime ./tools/check-symbol-names runtime/*.a } Test () { - cd testsuite - echo Running the testsuite with the normal runtime - $MAKE all - echo Running the testsuite with the debug runtime - $MAKE USE_RUNTIME='d' OCAMLTESTDIR="$(pwd)/_ocamltestd" TESTLOG=_logd all + echo Running the testsuite + $MAKE -C testsuite all + cd .. +} + +TestLoop () { + echo Running testsuite for "$@" + rm -f to_test.txt + for test in "$@" + do + echo tests/$test >> to_test.txt + done + for it in {1..$2} + do + $MAKE -C testsuite one LIST=../to_test.txt || exit 1 + done || exit 1 cd .. } @@ -89,15 +95,6 @@ Install () { } Checks () { - set +x - STATUS=0 - if grep -Fq ' warning: undefined variable ' build.log; then - echo -e '\e[31mERROR\e[0m Undefined Makefile variables detected!' 
- grep -F ' warning: undefined variable ' build.log | sort | uniq - STATUS=1 - fi - rm build.log - set -x if fgrep 'SUPPORTS_SHARED_LIBRARIES=true' Makefile.config &>/dev/null ; then echo Check the code examples in the manual $MAKE manual-pregen @@ -119,7 +116,6 @@ Checks () { test -z "$(git status --porcelain)" # Check that there are no ignored files test -z "$(git ls-files --others -i --exclude-standard)" - exit $STATUS } CheckManual () { @@ -174,6 +170,7 @@ case $1 in configure) Configure;; build) Build;; test) Test;; +test_multicore) TestLoop "${@:3}";; api-docs) API_Docs;; install) Install;; manual) BuildManual;; diff --git a/tools/ci/appveyor/appveyor_build.sh b/tools/ci/appveyor/appveyor_build.sh index fa43cd3ae3c2..bad458899a9d 100644 --- a/tools/ci/appveyor/appveyor_build.sh +++ b/tools/ci/appveyor/appveyor_build.sh @@ -53,41 +53,47 @@ function set_configuration { case "$1" in cygwin*) dep='--disable-dependency-generation' + man='' ;; mingw32) build='--build=i686-pc-cygwin' host='--host=i686-w64-mingw32' dep='--disable-dependency-generation' + man='' ;; mingw64) build='--build=i686-pc-cygwin' host='--host=x86_64-w64-mingw32' dep='--disable-dependency-generation' + man='--disable-stdlib-manpages' ;; msvc32) build='--build=i686-pc-cygwin' host='--host=i686-pc-windows' dep='--disable-dependency-generation' + man='' ;; msvc64) build='--build=x86_64-pc-cygwin' host='--host=x86_64-pc-windows' # Explicitly test dependency generation on msvc64 dep='--enable-dependency-generation' + man='' ;; esac mkdir -p "$CACHE_DIRECTORY" ./configure --cache-file="$CACHE_DIRECTORY/config.cache-$1" \ - $dep $build $host --prefix="$2" --enable-ocamltest || ( \ + $dep $build $man $host --prefix="$2" --enable-ocamltest || ( \ rm -f "$CACHE_DIRECTORY/config.cache-$1" ; \ ./configure --cache-file="$CACHE_DIRECTORY/config.cache-$1" \ - $dep $build $host --prefix="$2" --enable-ocamltest ) + $dep $build $man $host --prefix="$2" --enable-ocamltest ) # FILE=$(pwd | cygpath -f - 
-m)/Makefile.config # run "Content of $FILE" cat Makefile.config } +PARALLEL_URL='https://ftpmirror.gnu.org/parallel/parallel-latest.tar.bz2' APPVEYOR_BUILD_FOLDER=$(echo "$APPVEYOR_BUILD_FOLDER" | cygpath -f -) FLEXDLLROOT="$PROGRAMFILES/flexdll" OCAMLROOT=$(echo "$OCAMLROOT" | cygpath -f - -m) @@ -99,8 +105,24 @@ if [[ $BOOTSTRAP_FLEXDLL = 'false' ]] ; then esac fi +# This is needed at all stages while winpthreads is in use for 5.00 +# This step can be moved back to the test phase (or removed entirely?) when +# winpthreads stops being used. +if [[ $PORT = 'mingw64' ]] ; then + export PATH="$PATH:/usr/x86_64-w64-mingw32/sys-root/mingw/bin" +elif [[ $PORT = 'mingw32' ]] ; then + export PATH="$PATH:/usr/i686-w64-mingw32/sys-root/mingw/bin" +fi + case "$1" in install) + pushd /tmp &>/dev/null + curl -Ls $PARALLEL_URL | bunzip2 - | tar x + cd parallel-* + ./configure + make + make install + popd &/dev/null if [[ $BOOTSTRAP_FLEXDLL = 'false' ]] ; then mkdir -p "$FLEXDLLROOT" cd "$APPVEYOR_BUILD_FOLDER/../flexdll" @@ -124,12 +146,9 @@ case "$1" in "$FULL_BUILD_PREFIX-$PORT/tools/check-symbol-names" \ $FULL_BUILD_PREFIX-$PORT/runtime/*.a fi - if [[ $PORT = 'mingw64' ]] ; then - export PATH="$PATH:/usr/x86_64-w64-mingw32/sys-root/mingw/bin" - elif [[ $PORT = 'mingw32' ]] ; then - export PATH="$PATH:/usr/i686-w64-mingw32/sys-root/mingw/bin" - fi - run "test $PORT" $MAKE -C "$FULL_BUILD_PREFIX-$PORT" tests + # FIXME At present, running the testsuite takes too long + #run "test $PORT" \ + # $MAKE -C "$FULL_BUILD_PREFIX-$PORT/testsuite" SHOW_TIMINGS=1 parallel run "install $PORT" $MAKE -C "$FULL_BUILD_PREFIX-$PORT" install if [[ $PORT = 'msvc64' ]] ; then run "$MAKE check_all_arches" \ diff --git a/tools/dumpobj.ml b/tools/dumpobj.ml index bb683afba527..7c7ee74fbfe5 100644 --- a/tools/dumpobj.ml +++ b/tools/dumpobj.ml @@ -395,6 +395,10 @@ let op_shapes = [ opUGEINT, Nothing; opBULTINT, Uint_Disp; opBUGEINT, Uint_Disp; + opPERFORM, Nothing; + opRESUME, Nothing; + opRESUMETERM, 
Uint; + opREPERFORMTERM, Uint; opSTOP, Nothing; opEVENT, Nothing; opBREAK, Nothing; diff --git a/tools/eventlog_metadata.in b/tools/eventlog_metadata.in index 2a2b50726ed1..b6a57f978ff5 100644 --- a/tools/eventlog_metadata.in +++ b/tools/eventlog_metadata.in @@ -51,9 +51,12 @@ stream { id = 0; event.header := struct { /* for each event */ tracing_clock_int_t timestamp; - uint32_t pid; uint32_t id; }; + event.context := struct { + uint32_t tid; + uint8_t is_backup_thread; + }; }; /* @@ -94,7 +97,43 @@ enum gc_phase : uint16_t { "minor/copy", "minor/update_weak", "minor/finalized", - "explicit/gc_major_slice" + "explicit/gc_major_slice", + "domain/spawn", + "domain/send_interrupt", + "domain/idle_wait", + "finalise/update_first", + "finalise/update_last", + "interrupt/gc", + "interrupt/remote", + "major/ephe_mark", + "major/ephe_sweep", + "major/finish_marking", + "major_gc/cycle_domains", + "major_gc/phase_change", + "major_gc/stw", + "major/mark_opportunistic", + "major/slice", + "minor/clear", + "minor/finalizers/oldify", + "minor/global_roots", + "minor/leave_barrier", + "stw/api_barrier", + "stw/handler", + "stw/leader", + "minor/clear", + "minor_finalized", + "minor_finalizers_oldify", + "minor_global_roots", + "minor_leave_barrier", + "minor_local_roots", + "minor_ref_tables", + "minor_update_weak", + "stw_api_barrier", + "stw_handler", + "stw_leader", + "major_finish_sweeping", + "minor_finalizers_admin", + "domain/condition_wait" }; /* diff --git a/tools/eventlog_to_latencies.py b/tools/eventlog_to_latencies.py new file mode 100644 index 000000000000..d2d6d5e7463f --- /dev/null +++ b/tools/eventlog_to_latencies.py @@ -0,0 +1,121 @@ +#************************************************************************** +#* * +#* OCaml * +#* * +#* KC Sivaramakrishnan, Indian Institute of Technology, Madras * +#* * +#* Copyright 2020 Indian Institute of Technology, Madras * +#* * +#* All rights reserved. 
This file is distributed under the terms of * +#* the GNU Lesser General Public License version 2.1, with the * +#* special exception on linking described in the file LICENSE. * +#* * +#************************************************************************** + +from intervaltree import IntervalTree, Interval +import sys +import subprocess +import json +import os + +if len(sys.argv) < 2: + print ("Usage: %s EVENTLOG\n" % sys.argv[0]) + print ("Generate GC latency distribution report from eventlog") + sys.exit(1) + +json_file = sys.argv[1] + +percentages = [10,20,30,40,50,60,70,80,90,95,99,99.9] + +def distribution(l): + to_indices = [] + for p in percentages: + i = int(round(float(len(l))*float(p)/100.0-1,0)) + to_indices.append(i) + i = 0 + distr = [] + while (i < len(percentages)): + if (to_indices[i] == -1): + distr.append(0) + else: + distr.append(l[to_indices[i]]) + i+=1 + + return distr + +def main(): + trees = {} + with open(json_file) as f: + data = json.load(f) + stacks = {} + for event in data["traceEvents"]: + if (event["ph"] == "B"): + key = str(event["pid"])+":"+str(event["tid"]) + ts = int(float(event["ts"])*1000.0) + name = event["name"] + if key in stacks: + stacks[key].append((name,ts,0)) + else: + stacks[key] = [(name,ts,0)] + elif (event["ph"] == "E"): + key = str(event["pid"])+":"+str(event["tid"]) + ts = int(float(event["ts"])*1000.0) + name = event["name"] + (nameStart, startTs, overhead) = stacks[key].pop() + assert (nameStart == name) + if not key in trees: + trees[key] = IntervalTree() + trees[key].addi(startTs, ts, {'name': name, 'overhead': overhead, 'tid': event["tid"]}) + elif (event["ph"] == "C" and event["name"] == "overhead#"): + key = str(event["pid"])+":"+str(event["tid"]) + overhead = int(event["args"]["value"]) + l = [] + for e in stacks[key]: + (name,ts,o) = e + l.append((name,ts,o+overhead)) + stacks[key] = l + + latencies = [] + intervals = [] + + for t in trees.values(): + domain_terminate_intervals = IntervalTree((i for i 
in t if i.data['name'].startswith("major_gc/finish_"))) + t.merge_overlaps((lambda acc,v: {'name': acc['name'], 'overhead': acc['overhead'] + v['overhead'], 'tid': acc['tid']})) + latencies = latencies + list(map(lambda x: x.end - x.begin - x.data['overhead'], sorted(t - domain_terminate_intervals))) + intervals.extend(t) + sorted_latencies = sorted(latencies) + + if (len(sorted_latencies) > 0): + max_latency = sorted_latencies[len(sorted_latencies) - 1] + avg_latency = sum(sorted_latencies)/len(sorted_latencies) + else: + max_latency = 0 + avg_latency = 0 + + distr = distribution(sorted_latencies) + + out = {} + print ("Mean latency = " + str(avg_latency) + " ns") + print ("Max latency = " + str(max_latency) + " ns") + print ("") + print ("## Latency distribution") + print ("") + print ("Percentile, Latency(ns)") + for (p,l) in zip(percentages,distr): + print(str(p) + "," + str(l)) + + sorted_intervals = sorted(intervals, key=lambda i: -(i.end - i.begin)) + + print ("") + print ("## Top slowest events") + print ("") + print ("Latency(ns), Start Timestamp(ns), End TimeStamp(ns), Event, Overhead, Domain ID") + for interval in sorted_intervals[0:32]: + print(str(interval.end - interval.begin) + ", " + + str(interval.begin) + ", " + + str(interval.end) + ", " + + interval.data['name'] + ", " + + str(interval.data['overhead']) + ", " + + str(interval.data['tid'])) + +main() diff --git a/tools/gdb_ocamlrun.py b/tools/gdb_ocamlrun.py new file mode 100644 index 000000000000..12f438bfb794 --- /dev/null +++ b/tools/gdb_ocamlrun.py @@ -0,0 +1,168 @@ +#************************************************************************** +#* * +#* OCaml * +#* * +#* Stephen Dolan, University of Cambridge * +#* * +#* Copyright 2016 Stephen Dolan. * +#* * +#* All rights reserved. This file is distributed under the terms of * +#* the GNU Lesser General Public License version 2.1, with the * +#* special exception on linking described in the file LICENSE. 
* +#* * +#************************************************************************** + +import gdb + +TAGS = { + 246: 'Lazy_tag', + 247: 'Closure_tag', + 248: 'Object_tag', + 249: 'Infix_tag', + 250: 'Forward_tag', + 251: 'Abstract_tag', + 252: 'String_tag', + 253: 'Double_tag', + 254: 'Double_array_tag', + 255: 'Custom_tag' +} + +No_scan_tag = 251 + + +debug_tags = { + 0x00: 'Debug_free_minor', + 0x01: 'Debug_free_major', + 0x03: 'Debug_free_shrink', + 0x04: 'Debug_free_truncat', + 0x10: 'Debug_uninit_minor', + 0x11: 'Debug_uninit_major', + 0x15: 'Debug_uninit_align', + 0x85: 'Debug_filler_align' +} + +class DoublePrinter: + def __init__(self, tag, length, p): + assert tag in ['Double_tag', 'Double_array_tag'] + self.tag = tag + self.length = length + self.p = p + + def children(self): + pass + + def to_string(self): + return '%s[%d]' % (self.tag, self.length) + +class ConstPrinter: + def __init__(self, rep): + self.rep = rep + def to_string(self): + return self.rep + +class BlockPrinter: + def __init__(self, val): + if val & 1 == 1: + self.tag = 1000 + self.tagname = 'I' + self.val = val + else: + self.p = val.cast(val.type.pointer()) + header = (self.p - 1).dereference() + self.length = int(header >> 10) + self.gc = int(header & (3 << 8)) + self.tag = int(header & 255) + self.tagname = TAGS.get(self.tag, 'Block') + + def children(self): +# if self.tag < No_scan_tag: +# fields = self.p.cast(gdb.lookup_type('value').pointer()) +# for i in range(self.length): +# yield '[%d]' % i, (fields + i).dereference() +# elif self.tagname == 'Double_array_tag': +# words_per_double = \ +# gdb.lookup_type('double').sizeof / gdb.lookup_type('value').sizeof +# fields = self.p.cast(gdb.lookup_type('double').pointer()) +# for i in range(int(self.length / words_per_double)): +# yield '[%d]' % i, (fields + i).dereference() +# + return [] + + def to_string(self): + if self.tag == 1000: + # it's an immediate value + if gdb.lookup_type('value').sizeof == 8: + debug_mask = 
0xff00ffffff00ffff + debug_val = 0xD700D7D7D700D6D7 + else: + debug_mask = 0xff00ffff + debug_val = 0xD700D6D7 + n = self.val + if (n & debug_mask) == debug_val: + tag = int((n >> 16) & 0xff) + return debug_tags.get(tag, + "Debug_tag(0x%x)" % int(tag)) + else: + return "I(%d)" % int(n >> 1) + + # otherwise, it's a block + + if self.tagname == 'Double_tag': + d = self.p.cast(gdb.lookup_type('double').pointer()).dereference() + s = '%f, wosize=1' % d + elif self.tagname == 'String_tag': + char = gdb.lookup_type('unsigned char') + val_size = gdb.lookup_type('value').sizeof + lastbyte = ((self.p + self.length - 1).cast(char.pointer()) + val_size - 1).dereference() + length_bytes = self.length * val_size - (lastbyte + 1) + string = (self.p.cast(char.array(length_bytes).pointer()).dereference()) + s = str(string).strip() + elif self.tagname == 'Infix_tag': + s = 'offset=%d' % (-self.length) + elif self.tagname == 'Custom_tag': + ops = self.p.dereference().cast(gdb.lookup_type('struct custom_operations').pointer()) + s = '%s, wosize=%d' % (str(ops), self.length) + elif self.tagname == 'Block': + s = '%d, wosize=%d' % (self.tag,self.length) + else: + s = 'wosize=%d' % self.length + + markbits = gdb.lookup_symbol("global")[0].value() + gc = { + int(markbits['MARKED']): 'MARKED', + int(markbits['UNMARKED']): 'UNMARKED', + int(markbits['GARBAGE']): 'GARBAGE', + (3 << 8): 'NOT_MARKABLE' + } + return '%s(%s, %s)' % (self.tagname, s, gc[self.gc]) + + + def display_hint (self): + return 'array' + + + +class Fields(gdb.Function): + def __init__ (self): + super (Fields, self).__init__ ("F") + + def invoke (self, val): + assert str(val.type) == 'value' + p = val.cast(val.type.pointer()) + header = (p - 1).dereference() + length = int(header >> 10) + + return p.cast(val.type.array(length - 1).pointer()).dereference() + + +Fields() + + +def value_printer(val): + if str(val.type) != 'value': + return None + + + return BlockPrinter(val) + +gdb.pretty_printers = [value_printer] diff 
--git a/tools/gen_sizeclasses.ml b/tools/gen_sizeclasses.ml new file mode 100644 index 000000000000..eb0a80d3bcd4 --- /dev/null +++ b/tools/gen_sizeclasses.ml @@ -0,0 +1,81 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Stephen Dolan, University of Cambridge *) +(* *) +(* Copyright 2014 Stephen Dolan. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +let overhead block slot obj = + 1. -. float_of_int((block / slot) * obj) /. float_of_int block + +let max_overhead = 0.10 + +let rec blocksizes block slot = function + | 0 -> [] + | obj -> + if overhead block slot obj > max_overhead + then + if overhead block obj obj < max_overhead then + obj :: blocksizes block obj (obj - 1) + else + failwith (Format.sprintf + "%d-word objects cannot fit in %d-word arena below %.1f%% overhead" + obj block (100. *. max_overhead)) + else blocksizes block slot (obj - 1) + +let rec findi_acc i p = function + | [] -> raise Not_found + | x :: xs -> if p x then i else findi_acc (i + 1) p xs +let findi = findi_acc 0 + +let arena = 4096 +let header_size = 4 +let max_slot = 128 +let avail_arena = arena - header_size +let sizes = List.rev (blocksizes avail_arena max_int max_slot) + +let rec size_slots n = + if n > max_slot then + [] + else + findi (fun x -> n <= x) sizes :: size_slots (n + 1) + +let rec wastage = + sizes |> List.map (fun s -> avail_arena mod s) + +open Format + +let rec print_overheads n = function + | [] -> () + | s :: ss when n > s -> print_overheads n ss + | (s :: _) as ss -> + printf "%3d/%-3d: %.1f%%\n" n s (100. *. 
overhead avail_arena s n); + print_overheads (n+1) ss + +(* let () = print_overheads 1 sizes *) + +let rec print_list ppf = function + | [] -> () + | [x] -> fprintf ppf "%d" x + | x :: xs -> fprintf ppf "%d,@ %a" x print_list xs + +let _ = + printf "/* This file is generated by tools/gen_sizeclasses.ml */\n"; + printf "#define POOL_WSIZE %d\n" arena; + printf "#define POOL_HEADER_WSIZE %d\n" header_size; + printf "#define SIZECLASS_MAX %d\n" max_slot; + printf "#define NUM_SIZECLASSES %d\n" (List.length sizes); + printf "static const unsigned int \ +wsize_sizeclass[NUM_SIZECLASSES] = @[<2>{ %a };@]\n" print_list sizes; + printf "static const unsigned char \ +wastage_sizeclass[NUM_SIZECLASSES] = @[<2>{ %a };@]\n" print_list wastage; + printf "static const unsigned char \ +sizeclass_wsize[SIZECLASS_MAX + 1] = @[<2>{ %a };@]\n" + print_list (255 :: size_slots 1); diff --git a/tools/list-globals b/tools/list-globals new file mode 100755 index 000000000000..7ac2805ecab6 --- /dev/null +++ b/tools/list-globals @@ -0,0 +1,24 @@ +#!/bin/bash + +#************************************************************************** +#* * +#* OCaml * +#* * +#* Stephen Dolan, University of Cambridge * +#* * +#* Copyright 2017 Stephen Dolan. * +#* * +#* All rights reserved. This file is distributed under the terms of * +#* the GNU Lesser General Public License version 2.1, with the * +#* special exception on linking described in the file LICENSE. 
* +#* * +#************************************************************************** + +[ -z "$@" ] && { echo "Usage: $0 " > /dev/stderr; exit 2; } + +nm -A -f sysv "$@" |\ + awk ' + BEGIN {FS = " *[|] *"} + NF > 1 && $4 != "TLS" && $7 != ".text" && $7 != "*COM*" \ + && $3 !~ /^[TtURr]$/ {print $3 " " $1} + ' diff --git a/toplevel/native/topeval.ml b/toplevel/native/topeval.ml index 0d4cc770821c..a685e0508ab7 100644 --- a/toplevel/native/topeval.ml +++ b/toplevel/native/topeval.ml @@ -53,7 +53,7 @@ let close_phrase lam = Ident.Set.fold (fun id l -> let glb, pos = toplevel_value id in let glob = - Lprim (Pfield pos, + Lprim (Pfield (pos, Pointer, Mutable), [Lprim (Pgetglobal glb, [], Loc_unknown)], Loc_unknown) in diff --git a/typing/predef.ml b/typing/predef.ml index 6d28f25f1f8a..cd2da42285fb 100644 --- a/typing/predef.ml +++ b/typing/predef.ml @@ -96,6 +96,9 @@ and ident_sys_blocked_io = ident_create "Sys_blocked_io" and ident_assert_failure = ident_create "Assert_failure" and ident_undefined_recursive_module = ident_create "Undefined_recursive_module" +and ident_continuation_already_taken = + ident_create "Continuation_already_taken" +and ident_unhandled = ident_create "Unhandled" let all_predef_exns = [ ident_match_failure; @@ -110,6 +113,8 @@ let all_predef_exns = [ ident_sys_blocked_io; ident_assert_failure; ident_undefined_recursive_module; + ident_continuation_already_taken; + ident_unhandled; ] let path_match_failure = Pident ident_match_failure @@ -231,6 +236,7 @@ let common_initial_env add_type add_extension empty_env = (* Predefined exceptions - alphabetical order *) |> add_extension ident_assert_failure [newgenty (Ttuple[type_string; type_int; type_int])] + |> add_extension ident_continuation_already_taken [] |> add_extension ident_division_by_zero [] |> add_extension ident_end_of_file [] |> add_extension ident_failure [type_string] @@ -244,6 +250,7 @@ let common_initial_env add_type add_extension empty_env = |> add_extension ident_sys_error 
[type_string] |> add_extension ident_undefined_recursive_module [newgenty (Ttuple[type_string; type_int; type_int])] + |> add_extension ident_unhandled [] let build_initial_env add_type add_exception empty_env = let common = common_initial_env add_type add_exception empty_env in diff --git a/typing/tast_mapper.ml b/typing/tast_mapper.ml index 6d359a59a72f..87079db4fc23 100644 --- a/typing/tast_mapper.ml +++ b/typing/tast_mapper.ml @@ -275,7 +275,7 @@ let expr sub x = Texp_variant (l, Option.map (sub.expr sub) expo) | Texp_record { fields; representation; extended_expression } -> let fields = Array.map (function - | label, Kept t -> label, Kept t + | label, Kept (t, mut) -> label, Kept (t, mut) | label, Overridden (lid, exp) -> label, Overridden (lid, sub.expr sub exp)) fields diff --git a/typing/typecore.ml b/typing/typecore.ml index 1636bb240f42..1bf31c22249e 100644 --- a/typing/typecore.ml +++ b/typing/typecore.ml @@ -3242,7 +3242,7 @@ and type_expect_ unify_exp_types loc env ty_arg1 ty_arg2; with_explanation (fun () -> unify_exp_types loc env (instance ty_expected) ty_res2); - Kept ty_arg1 + Kept (ty_arg1, lbl.lbl_mut) end in let label_definitions = Array.map unify_kept lbl.lbl_all in diff --git a/typing/typedtree.ml b/typing/typedtree.ml index 9194a59c18c6..9e2257c0f73a 100644 --- a/typing/typedtree.ml +++ b/typing/typedtree.ml @@ -161,7 +161,7 @@ and 'k case = } and record_label_definition = - | Kept of Types.type_expr + | Kept of Types.type_expr * mutable_flag | Overridden of Longident.t loc * expression and binding_op = diff --git a/typing/typedtree.mli b/typing/typedtree.mli index f5460d1ea280..440a28cace79 100644 --- a/typing/typedtree.mli +++ b/typing/typedtree.mli @@ -293,7 +293,7 @@ and 'k case = } and record_label_definition = - | Kept of Types.type_expr + | Kept of Types.type_expr * mutable_flag | Overridden of Longident.t loc * expression and binding_op = diff --git a/utils/Makefile b/utils/Makefile index 7231fae28e08..352198ec1226 100644 --- 
a/utils/Makefile +++ b/utils/Makefile @@ -90,6 +90,7 @@ config.ml: config.mlp $(ROOTDIR)/Makefile.config Makefile $(call SUBST,FUNCTION_SECTIONS) \ $(call SUBST,CC_HAS_DEBUG_PREFIX_MAP) \ $(call SUBST,AS_HAS_DEBUG_PREFIX_MAP) \ + $(call SUBST,FORCE_INSTRUMENTED_RUNTIME) \ $< > $@ # Test for the substitution functions above diff --git a/utils/clflags.ml b/utils/clflags.ml index 46b61f418bcc..af4edc0e6c2d 100644 --- a/utils/clflags.ml +++ b/utils/clflags.ml @@ -165,7 +165,11 @@ let pic_code = ref (match Config.architecture with (* -fPIC *) | "amd64" -> true | _ -> false) -let runtime_variant = ref "";; (* -runtime-variant *) +let runtime_variant = + ref (match Config.force_instrumented_runtime with (* -runtime-variant *) + | true -> "i" + | false -> "") + let with_runtime = ref true;; (* -with-runtime *) let keep_docs = ref false (* -keep-docs *) diff --git a/utils/config.mli b/utils/config.mli index 7f70a52d52bf..4fc9d495d464 100644 --- a/utils/config.mli +++ b/utils/config.mli @@ -254,6 +254,10 @@ val supports_shared_libraries: bool @since 4.08.0 *) +val force_instrumented_runtime: bool +(** Force runtime-variant to be "i" at configure time + when ocamlc or ocamlopt link executables. 
*) + val afl_instrument : bool (** Whether afl-fuzz instrumentation is generated by default *) diff --git a/utils/config.mlp b/utils/config.mlp index 44c6ff8fa509..7f927f1fe6f5 100644 --- a/utils/config.mlp +++ b/utils/config.mlp @@ -85,6 +85,7 @@ let safe_string = %%FORCE_SAFE_STRING%% let default_safe_string = %%DEFAULT_SAFE_STRING%% let windows_unicode = %%WINDOWS_UNICODE%% != 0 let naked_pointers = %%NAKED_POINTERS%% +let force_instrumented_runtime = %%FORCE_INSTRUMENTED_RUNTIME%% let flat_float_array = %%FLAT_FLOAT_ARRAY%% @@ -114,14 +115,14 @@ and linear_magic_number = "Caml1999L031" let interface_suffix = ref ".mli" -let max_tag = 245 +let max_tag = 243 (* This is normally the same as in obj.ml, but we have to define it separately because it can differ when we're in the middle of a bootstrapping phase. *) let lazy_tag = 246 let max_young_wosize = 256 -let stack_threshold = 256 (* see runtime/caml/config.h *) +let stack_threshold = 16 (* see runtime/caml/config.h *) let stack_safety_margin = 60 let architecture = "%%ARCH%%" diff --git a/utils/domainstate.ml.c b/utils/domainstate.ml.c index 7ece1ad851b2..6dbae1d07af5 100644 --- a/utils/domainstate.ml.c +++ b/utils/domainstate.ml.c @@ -14,6 +14,10 @@ /* */ /**************************************************************************/ +#define CAML_CONFIG_H_NO_TYPEDEFS +#include "config.h" +let stack_ctx_words = Stack_ctx_words + type t = #define DOMAIN_STATE(type, name) | Domain_##name #include "domain_state.tbl" diff --git a/utils/domainstate.mli.c b/utils/domainstate.mli.c index 1da60c94aa23..66a4750d4c01 100644 --- a/utils/domainstate.mli.c +++ b/utils/domainstate.mli.c @@ -14,6 +14,8 @@ /* */ /**************************************************************************/ +val stack_ctx_words : int + type t = #define DOMAIN_STATE(type, name) | Domain_##name #include "domain_state.tbl"