From c5d9f63eae767e105f731d3e2b6f37399d19d480 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 01/18] Introduce ggml_compute_threadpool - OpenMP functional: check - Vanilla ggml functional: Check - ggml w/threadpool functional: Check - OpenMP no regression: No glaring problems - Vanilla ggml no regression: No glaring problems - ggml w/threadpool no regression: No glaring problems --- CMakePresets.json | 256 ++++-- common/common.cpp | 292 ++++++- common/common.h | 29 +- examples/CMakeLists.txt | 2 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/benchmark/benchmark-matmult.cpp | 2 +- .../cvector-generator/cvector-generator.cpp | 4 +- examples/export-lora/export-lora.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 51 ++ examples/llava/llava-cli.cpp | 4 +- examples/main/main.cpp | 30 + examples/server/server.cpp | 4 +- ggml/CMakeLists.txt | 2 +- ggml/include/ggml-alloc.h | 5 +- ggml/include/ggml-backend.h | 1 + ggml/include/ggml.h | 28 +- ggml/src/ggml-backend.c | 16 +- ggml/src/ggml.c | 800 ++++++++++++++---- include/llama.h | 12 + src/llama.cpp | 96 ++- tests/test-rope.cpp | 2 +- 21 files changed, 1362 insertions(+), 278 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index bdad38952d3cb..ae2bf25c12786 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,65 +1,197 @@ { - "version": 4, - "configurePresets": [ - { - "name": "base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + "version": 4, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "CMAKE_C_COMPILER": "cl", + "GGML_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
+ } + }, + { + "name": "debug", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug" + } + }, + { + "name": "release", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, + { + "name": "reldbg", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo" + } + }, + { + "name": "static", + "hidden": true, + "cacheVariables": { + "GGML_STATIC": "ON" + } + }, + { + "name": "arm64-windows-msvc", + "hidden": true, + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "toolset": { + "value": "host=x86_64", + "strategy": "external" + }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" + } + }, + { + "name": "arm64-windows-llvm", + "hidden": true, + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "toolset": { + "value": "host=x86_64", + "strategy": "external" + }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" + } + }, + { + "name": "arm64-windows-llvm-debug", + "inherits": [ + "base", + "arm64-windows-llvm", + "debug" + ] + }, + { + "name": "arm64-windows-llvm-release", + "inherits": [ + "base", + "arm64-windows-llvm", + "reldbg" + ] + }, + { + "name": "arm64-windows-llvm+static-release", + "inherits": [ + "base", + "arm64-windows-llvm", + "reldbg", + "static" + ] + }, + { + "name": "arm64-windows-msvc-debug", + "inherits": [ + "base", + "arm64-windows-msvc", + "debug" + ] + }, + { + "name": "arm64-windows-msvc-release", + "inherits": [ + "base", + "arm64-windows-msvc", + "reldbg" + ] + }, + { + "name": "arm64-windows-msvc+static-release", + "inherits": [ + "base", + "arm64-windows-msvc", + "reldbg", + "static" + ] + }, + { + "name": "x64-windows-msvc-debug", + "inherits": [ + "base", + "debug" + ] + }, + { + "name": "x64-windows-msvc-release", + "inherits": [ + "base", + "reldbg" + ] + }, + { + "name": "x64-windows-msvc+static-release", + "inherits": [ + "base", + "reldbg", + "static" + ] + }, + { + "name": "x64-windows-sycl-debug", + "inherits": [ + "sycl-base", + "debug" + ] + }, + { + "name": "x64-windows-sycl-release", + "inherits": [ + "sycl-base", + "release" + ] + }, + { + "name": "clang10", + "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu", + "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/clang", + "CMAKE_CXX_COMPILER": "/usr/bin/clang++", + "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10", + "CMAKE_BUILD_TYPE": "Debug" + } + }, + { + "name": "gcc8.4", + "displayName": "GCC 8.4.0 x86_64-linux-gnu", + "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/gcc", + "CMAKE_CXX_COMPILER": "/usr/bin/g++", + "CMAKE_BUILD_TYPE": "Debug" + } } - }, - { - "name": "sycl-base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_CXX_COMPILER": "icx", - "CMAKE_C_COMPILER": "cl", - "GGML_SYCL": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
- } - }, - { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, - { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, - - { - "name": "arm64-windows-msvc", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" - } - }, - - { - "name": "arm64-windows-llvm", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" - } - }, - - { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, - - { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, - - { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, - { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, - { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, - - { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, - { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } - ] + ] } diff --git a/common/common.cpp b/common/common.cpp index 2e8374d50cafa..c08ebda4aa6a1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -222,6 +222,36 @@ void gpt_params_handle_model_default(gpt_params & params) { } } +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { + int32_t n_set = 0; + + if (cpuparams.n_threads < 0) { + // Assuming everything about cpuparams is invalid + if (role_model != nullptr) { + cpuparams = *role_model; + } else { + cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (cpuparams.cpumask[i]) { + n_set++; + } + } + + if (n_set == 0) { + // You hit the jackpot! + memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS); + n_set = GGML_MAX_N_THREADS; + } + + if (n_set < cpuparams.n_threads) { + // Not enough set bits, may experience performance issues. 
+ fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -241,6 +271,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -285,6 +320,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { + size_t dash_loc = range.find('-'); + if (dash_loc == std::string::npos) { + fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + return false; + } + + size_t start_i; + size_t end_i; + + if (dash_loc == 0) { + start_i = 0; + } else { + start_i = std::stoull(range.substr(0, dash_loc)); + if (start_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "Start index out of bounds!\n"); + return false; + } + } + + if (dash_loc == range.length() - 1) { + end_i = GGML_MAX_N_THREADS - 1; + } else { + end_i = std::stoull(range.substr(dash_loc + 1)); + if (end_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "End index out of bounds!\n"); + return false; + } + } + + for (size_t i = start_i; i <= end_i; i++) { + boolmask[i] = true; + } + + return true; +} + +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { + // Discard potential 0x prefix + size_t start_i = 0; + if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { + start_i = 2; + } + + size_t num_digits = mask.length() - start_i; + if (num_digits > 128) num_digits = 128; + + size_t end_i = num_digits + start_i; + + for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { + char c = mask.at(i); + int8_t id = c; + + if ((c >= '0' && c <= '9')) { + id -= '0'; + } else if (c >= 'a' && c <= 'f') { + id -= 'a' - 10; + } else if (c >= 'A' && c <= 'F') { + id -= 'A' - 10; + } else { + fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + return false; + } + + boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); + boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); + boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); + boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + } + + return true; +} + #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { @@ -301,36 +409,137 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "-t" || arg == "--threads") { CHECK_ARG - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = std::stoi(argv[i]); + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-C" || arg == "--cpu-mask") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = 
!parse_cpu_mask(mask, params.cpuparams.cpumask); + return true; + } + if (arg == "-Cr" || arg == "--cpu-range") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); + return true; + } + if (arg == "--prio") { + CHECK_ARG + params.cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll") { + params.cpuparams.poll = true; + return true; + } if (arg == "-tb" || arg == "--threads-batch") { CHECK_ARG - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Cb" || arg == "--cpu-mask-batch") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crb" || arg == "--cpu-range_batch") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch") { + CHECK_ARG + params.cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch") { + params.cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch") { + params.cpuparams_batch.poll = true; + return true; + } if (arg == "-td" || arg == "--threads-draft") { CHECK_ARG - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; + } + if (arg == "-Cd" || arg == "--cpu-mask-draft") { + CHECK_ARG + std::string mask = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "-Crd" || arg == "--cpu-range-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "--prio-draft") { + CHECK_ARG + params.draft_cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-draft") { + params.draft_cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll-draft") { + params.draft_cpuparams.poll = true; + return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { CHECK_ARG - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, 
params.draft_cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch-draft") { + CHECK_ARG + params.draft_cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch-draft") { + params.draft_cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch-draft") { + params.draft_cpuparams_batch.poll = true; + return true; + } if (arg == "-p" || arg == "--prompt") { CHECK_ARG params.prompt = argv[i]; @@ -1415,11 +1624,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); + options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); + options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); + options.push_back({ "*", " --cpu-strict", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); + options.push_back({ "*", " --poll", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); + options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); + options.push_back({ "*", " --cpu-strict-batch", "use strict CPU placement (default: same as --cpu-strict)"}); + options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); + options.push_back({ "*", " --poll-batch", "use polling to wait for work (default: --poll)"}); options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); + options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft"}); + options.push_back({ "speculative", " --cpu-strict-draft", "Use strict CPU placement for draft model (default: same as --cpu-strict)"}); + options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); + options.push_back({ "speculative", " --poll-draft", "Use polling to wait for draft model work (default: same as --poll])"}); options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M", + "Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); + options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); + options.push_back({ "speculative", " --cpu-strict-batch-draft", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); + options.push_back({ "speculative", " --priority-batch-draft N", + "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft", "Use polling to wait for draft model work (default: --poll-draft)"}); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", @@ -1691,7 +1927,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); printf("usage: %s [options]\n", argv[0]); @@ -1723,9 +1958,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.cpuparams.n_threads; + if (params.cpuparams_batch.n_threads != -1) { + os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")"; } os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); @@ -2224,8 +2459,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch;
+    cparams.n_threads       = params.cpuparams.n_threads;
+    cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
+                                  params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
@@ -2251,6 +2487,22 @@ struct llama_context_params llama_context_params_from_gpt_param
     return cparams;
 }
 
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    tpp.mask_specified = params.mask_valid;
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.n_threads  = params.n_threads;
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -3246,7 +3498,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index d88966ece20aa..9865133ed8575 100644
--- a/common/common.h
+++ b/common/common.h
@@ -67,13 +67,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params {
+    int32_t n_threads = -1;
+    bool    cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
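+    // (cpumask[i] == true allows threads to run on CPU i; it is filled in
+    //  from --cpu-mask / --cpu-range by parse_cpu_mask() / parse_cpu_range().)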
+ bool mask_valid = false; // Default: any CPU + int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + bool poll = false; // Use polling (busywait) to wait for work +}; + struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -100,6 +105,11 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + struct cpu_params draft_cpuparams; + struct cpu_params draft_cpuparams_batch; + ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -204,7 +214,7 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int32_t n_threads_http = -1; // number of threads to process HTTP requests + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -277,6 +287,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params); +bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); + // // String utils // @@ -325,8 +339,9 @@ struct llama_init_result { struct llama_init_result llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 67b3d27747850..247d52c6d3454 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,6 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) - add_subdirectory(speculative) + #add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index aca332e9464d2..3ce91070b4ed7 100644 --- a/examples/baby-llama/baby-llama.cpp +++ 
b/examples/baby-llama/baby-llama.cpp
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 47cb16c69d536..e78f6b388ef6e 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index a12e90d828275..49a3659e4ad8b 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
     if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
-        pca_params.n_threads    = params.n_threads;
-        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
     } else {
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index d228ae66eeeec..7217b7b6ef354 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -407,7 +407,7 @@ int main(int argc, char ** argv) {
     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 42918bfc79f22..5a929ceddafbe 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -235,6 +235,7 @@ struct cmd_params {
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
+    cpu_params cpuparams;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -261,6 +262,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap      */ {true},
     /* embeddings    */ {false},
     /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cpuparams     */ {},
     /* reps          */ 5,
     /* verbose       */ false,
     /* output_format */ MARKDOWN,
@@ -289,6 +291,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>               (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                    (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl>   (default: disabled)\n");
+    printf("  -mt, --max-threads <n>                (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                  (default: 0x0)\n");
+    printf("  --cpu-strict <0|1>                    (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --priority <0|1|2|3>                  (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll <0|1>                          (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>             (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>      (default: 0)\n");
     printf("  -r, --repetitions <n>                 (default: %d)\n", cmd_params_defaults.reps);
@@ -492,6 +499,30 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+        } else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string mask = argv[i];
+            params.cpuparams.mask_valid = true;
+            invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1402,6 +1433,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads      = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu     = params.cpuparams.strict_cpu;
+    tpp.prio           = params.cpuparams.priority;
+    tpp.poll           = params.cpuparams.poll;
+
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
+
+    struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1427,6 +1475,7 @@ int main(int argc, char ** argv) {
         test t(inst, lmodel, ctx);
 
         llama_kv_cache_clear(ctx);
+        llama_attach_threadpool(ctx, threadpool);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1468,6 +1517,8 @@ int main(int argc, char ** argv) {
         llama_free(ctx);
     }
 
+    ggml_release_threadpool(threadpool);
+
     llama_free_model(lmodel);
 
     if (p) {
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8c7dd2ae3d0dc..86b39f20eea6e 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
             LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
diff
--git a/examples/main/main.cpp b/examples/main/main.cpp index 6e0635a66cd06..bb4cef1d246e2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -221,6 +221,33 @@ int main(int argc, char ** argv) { return 1; } + LOG("%s: llama threadpool init = n_threads = %d\n", + __func__, + (int32_t) params.cpuparams.n_threads + ); + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_batch_threadpool(ctx, threadpool_batch); + llama_attach_threadpool(ctx, threadpool); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + llama_attach_threadpool(ctx_guidance, threadpool); + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); @@ -989,6 +1016,9 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); + ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool_batch); + #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 360f571e42867..231611f30916c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2536,8 +2536,8 @@ int main(int argc, char ** argv) { }); LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, + {"n_threads", params.cpuparams.n_threads}, + {"n_threads_batch", params.cpuparams_batch.n_threads}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 7fe1661bb96b4..e853f6248af2b 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -146,7 +146,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING "ggml: metal minimum macOS version") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -option(GGML_OPENMP "ggml: use OpenMP" ON) +option(GGML_OPENMP "ggml: use OpenMP" OFF) option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 434c13b34a929..cd85b6ee70560 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -7,8 +7,9 @@ extern "C" { #endif typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 5f3f1e286990e..c59f9f54a44b9 100644 --- a/ggml/include/ggml-backend.h +++ 
b/ggml/include/ggml-backend.h @@ -102,6 +102,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 15602a96df7ad..af3934c2bf88c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -231,6 +231,8 @@ #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#define GGML_MAX_N_THREADS 512 + #endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 @@ -622,6 +624,17 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; + bool mask_specified; + int32_t n_threads; + int32_t prio; + bool poll; + bool strict_cpu; + }; + + struct ggml_compute_threadpool; // forward declaration, see ggml.c + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -629,6 +642,7 @@ extern "C" { uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; + struct ggml_compute_threadpool * threadpool; // abort ggml_graph_compute when true ggml_abort_callback abort_callback; @@ -2010,10 +2024,20 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); + GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool); + // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, /* = GGML_DEFAULT_N_THREADS */ + struct ggml_compute_threadpool * threadpool /* = NULL */ ); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index e1651cc645c42..d45b1e2aca3dc 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -722,7 +722,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct 
ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -903,6 +906,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c937b5e537c54..53d73d00722f5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1868,28 +1868,104 @@ struct ggml_context_container { struct ggml_context context; }; -struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; +// +// Threading defs +// + +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; - int n_threads; +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) + +#define ggml_lock_init(x) UNUSED(x) 
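+// (The ggml_lock_* macros here are intentionally no-ops -- on x86-64,
+//  ggml_lock_lock lowers to _mm_pause() as a spin-wait hint. Actual
+//  synchronization in the new threadpool is done with the mutex/cond
+//  pair and the atomic n_barrier counters defined below.)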
+#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 +#define ggml_cond_init(c) pthread_cond_init(c, NULL) +#define ggml_cond_destroy(c) pthread_cond_destroy(c) +#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) +#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#endif + +// Threadpool def +struct ggml_compute_threadpool { + ggml_mutex_t mutex; // mutex for cond.var + ggml_cond_t cond; // cond.var for waiting for new work + + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; // synchronization primitives atomic_int n_barrier; atomic_int n_barrier_passed; + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. + + volatile bool stop; // Used for stopping the threadpool altogether + volatile bool pause; // Used for pausing the threadpool or individual threads + volatile bool new_work; // Set when there is work to be done, unset after it's done + + struct ggml_compute_state * workers; // per thread state + int32_t n_threads_max; // number of threads in the pool + int32_t n_threads_cur; // number of threads used in the current graph + + int32_t prio; // Scheduling priority + bool disposable; // Doesn't initialize a conv-var + bool poll; // Use polling (busywait) // TODO ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; - atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads - enum ggml_status ec; }; +// Per-thread state struct ggml_compute_state { +#ifndef GGML_USE_OPENMP ggml_thread_t thrd; + bool cpumask[GGML_MAX_N_THREADS]; + bool mask_specified; +#endif + struct ggml_compute_threadpool * threadpool; int ith; - struct ggml_compute_state_shared * shared; }; struct ggml_compute_params { @@ -1900,7 +1976,7 @@ struct ggml_compute_params { size_t wsize; void * wdata; - struct ggml_compute_state_shared * shared; + struct ggml_compute_threadpool * threadpool; }; // @@ -2995,23 +3071,23 @@ inline static void ggml_critical_section_start(void) { } #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { + if (threadpool->n_threads_cur == 1) { return; } #pragma omp barrier } #else -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { + if (threadpool->n_threads_cur == 1) { return; } - atomic_int * n_barrier = &shared->n_barrier; - atomic_int * n_barrier_passed = &shared->n_barrier_passed; + atomic_int * n_barrier = &threadpool->n_barrier; + atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; - int n_threads = shared->n_threads; + int n_threads = threadpool->n_threads_cur; int passed_old = atomic_load(n_barrier_passed); if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { @@ -9998,7 +10074,7 @@ static void ggml_compute_forward_acc_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } const int ith = params->ith; @@ -12373,10 +12449,10 @@ UseGgmlGemm1:; if (ith == 0) { // Every thread starts at ith, 
so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-        atomic_store(&params->shared->current_chunk, nth);
+        atomic_store(&params->threadpool->current_chunk, nth);
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
@@ -12484,7 +12560,7 @@ UseGgmlGemm2:;
             break;
         }
 
-        current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
+        current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1);
     }
 }
 
@@ -12579,7 +12655,7 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -12733,7 +12809,7 @@ static void ggml_compute_forward_out_prod_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // dst[:,:,:,:] = 0
     // for i2,i3:
@@ -12851,7 +12927,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // parallelize by last three dimensions
 
@@ -13037,7 +13113,7 @@ static void ggml_compute_forward_set_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     const int ith = params->ith;
@@ -13616,7 +13692,7 @@ static void ggml_compute_forward_diag_mask_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     // TODO: handle transposed/permuted matrices
@@ -14392,7 +14468,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14480,7 +14556,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14767,7 +14843,7 @@ static void ggml_compute_forward_conv_transpose_2d(
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t stride = ggml_get_op_params_i32(dst, 0);
 
@@ -15501,7 +15577,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
     if (ith == 0) {
         memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int64_t elem_q = ggml_nelements(q);
     const int64_t elem_k = ggml_nelements(k);
@@ -16273,7 +16349,7 @@ static void ggml_compute_forward_add_rel_pos_f32(
         if (params->ith == 0) {
             memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
     // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
@@ -16558,7 +16634,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     if (ith == 0) {
         memset(sums, 0, sizeof(float) * (nth + nth * nc));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const double eps = 1e-9;
 
@@ -16606,7 +16682,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         }
 #endif
     }
-    ggml_barrier(params->shared);
+
ggml_barrier(params->threadpool); if (ith == 0) { float * dp = (float *) dst->data; @@ -18347,65 +18423,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -18682,9 +18699,292 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } -struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +enum { + SCHED_PRIO_NORMAL, + SCHED_PRIO_MEDIUM, + SCHED_PRIO_HIGH, + SCHED_PRIO_REALTIME +}; + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +static bool __thread_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_MAX_N_THREADS >= 64); + + for (int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)bitmask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool __process_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + return SetPriorityClass(GetCurrentProcess(), p); +} + +static bool 
__thread_priority(int32_t prio) {
+    DWORD p = NORMAL_PRIORITY_CLASS;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
+        case SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
+        case SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
+        case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+    }
+
+    return SetThreadPriority(GetCurrentThread(), p);
+
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool __thread_affinity(const bool * mask) {
+    UNUSED(mask);
+    return true;
+}
+
+static bool __process_priority(int32_t prio) {
+    int32_t p = 0;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p =   0; break;
+        case SCHED_PRIO_MEDIUM:   p =  -5; break;
+        case SCHED_PRIO_HIGH:     p = -10; break;
+        case SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    int32_t r = setpriority(PRIO_PROCESS, 0, p);
+    return r != -1;
+}
+
+static bool __thread_priority(int32_t prio) {
+    UNUSED(prio);
+    return true;
+}
+
+#else // posix?
+
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <sched.h>
+
+static bool __thread_affinity(const bool * mask) {
+    cpu_set_t cpuset;
+    int32_t err;
+
+    CPU_ZERO(&cpuset);
+
+    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            CPU_SET(i, &cpuset);
+        }
+    }
+
+#ifdef __ANDROID__
+    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+    if (err < 0) {
+        err = errno;
+    }
+#else
+    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err));
+        return false;
+    }
+
+    return true;
+}
+
+static bool __process_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = sched_setscheduler(0, policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err);
+        return false;
+    }
+
+    return true;
+}
+
+static bool __thread_priority(int32_t prio) {
+    struct sched_param p;
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err);
+        return false;
+    }
+
+    return true;
+}
+
+#endif
+
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;}
+#endif
+
+static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
+    if (!global_mask) {
+        memset(local_mask, 1, GGML_MAX_N_THREADS);
+        return;
+    }
+    if (!strict) {
+        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS);
+
return; + } else { + memset(local_mask, 0, GGML_MAX_N_THREADS); + int32_t base_idx = *iter; + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + int32_t idx = base_idx + i; + if (idx >= GGML_MAX_N_THREADS) { + // Just a cheaper modulo + idx -= GGML_MAX_N_THREADS; + } + if (global_mask[idx]) { + local_mask[idx] = 1; + *iter = idx + 1; + return; + } + } + } +} + +void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { + if (!threadpool) return; + +#ifndef GGML_USE_OPENMP + struct ggml_compute_state* workers = threadpool->workers; + const int32_t n_threads = threadpool->n_threads_max; + + if (!threadpool->disposable) { + ggml_mutex_lock(&threadpool->mutex); + } + threadpool->n_threads_cur = n_threads; + threadpool->stop = true; + threadpool->pause = false; + if (!threadpool->disposable) { + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + } + + for (int32_t j = 1; j < n_threads; j++) { + int32_t rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); + UNUSED(rc); + } + + GGML_ALIGNED_FREE(workers); + + if (!threadpool->disposable) { + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); + } +#endif // GGML_USE_OPENMP + + GGML_ALIGNED_FREE(threadpool); +} + +void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + GGML_ASSERT(!threadpool->disposable); + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; +#else + UNUSED(threadpool); +#endif +} + +void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + GGML_ASSERT(!threadpool->disposable); + GGML_PRINT_DEBUG("Resuming threadpool\n"); + + ggml_mutex_lock(&threadpool->mutex); + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int32_t n_threads, + struct ggml_compute_threadpool * threadpool) { + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool\n"); + } if (n_threads <= 0) { - n_threads = GGML_DEFAULT_N_THREADS; + n_threads = threadpool ? 
threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; } size_t work_size = 0; @@ -18840,12 +19140,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } if (work_size > 0) { - work_size += CACHE_LINE_SIZE*(n_threads - 1); + work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cplan.threadpool = threadpool; + cplan.n_threads = MIN(max_tasks, n_threads); + cplan.work_size = work_size; + cplan.work_data = NULL; return cplan; } @@ -18853,36 +19154,206 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - const struct ggml_cgraph * cgraph = state->shared->cgraph; - const struct ggml_cplan * cplan = state->shared->cplan; + const struct ggml_cgraph * cgraph = state->threadpool->cgraph; + const struct ggml_cplan * cplan = state->threadpool->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->shared->n_threads, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.shared=*/ state->shared, + /*.ith =*/ state->ith, + /*.nth =*/ state->threadpool->n_threads_cur, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ state->threadpool, }; - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; + struct ggml_tensor * node = cgraph->nodes[0]; + + ggml_compute_forward(¶ms, node); + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->threadpool->ec = GGML_STATUS_ABORTED; + } + for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) { + ggml_barrier(state->threadpool); + + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { + break; + } + + node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->ec = GGML_STATUS_ABORTED; + state->threadpool->ec = GGML_STATUS_ABORTED; } + } - ggml_barrier(state->shared); + if (!state->threadpool->disposable && state->ith == 0) { + state->threadpool->new_work = false; + } - if (state->shared->ec != GGML_STATUS_SUCCESS) { - break; + ggml_barrier(state->threadpool); + + return 0; +} + + + +#ifndef GGML_USE_OPENMP + +static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; + + do { + if (threadpool->poll) { + while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { + // No new work. Yield and keep polling. + //__cpu_relax(); + } + } else { + ggml_mutex_lock_shared(&threadpool->mutex); + while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { + // No new work. Wait for the signal. 
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } + } while (state->ith >= threadpool->n_threads_cur); + return threadpool->new_work; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_compute_threadpool * threadpool = state->threadpool; + + GGML_ASSERT(!threadpool->disposable); + + __thread_priority(threadpool->prio); + if (state->mask_specified) + __thread_affinity(state->cpumask); + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // The main thread is the only one that can dispatch new work + + bool new_work = ggml_graph_compute_check_for_work(state); + if (new_work) { + int64_t ret = (int64_t) ggml_graph_compute_thread(state); + if (ret == GGML_EXIT_ABORTED) + return (thread_ret_t) ret; + + if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) { + fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret); + GGML_ASSERT(false); + } } } - return 0; + return (thread_ret_t) 0; +} + +#endif // GGML_USE_OPENMP + +static struct ggml_compute_threadpool * ggml_create_threadpool_impl( + struct ggml_threadpool_params * tpp, + bool disposable, + struct ggml_cgraph * cgraph, + struct ggml_cplan * cplan) { + + struct ggml_compute_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool)); + { + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->stop = false; + threadpool->pause = disposable ? false : true; + threadpool->new_work = false; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; + threadpool->disposable = disposable; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + + threadpool->abort_callback = NULL; + threadpool->abort_callback_data = NULL; + threadpool->ec = GGML_STATUS_SUCCESS; + } + +#ifndef GGML_USE_OPENMP + if (!disposable) { + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + } +#endif // GGML_USE_OPENMP + + struct ggml_compute_state * workers = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads); + + threadpool->workers = workers; + +#ifdef GGML_USE_OPENMP + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .threadpool = threadpool, + .ith = j + }; + } +#else // Not using OPENMP + int32_t cpumask_iter = 0; + + __process_priority(tpp->prio); + __thread_priority(tpp->prio); + + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .mask_specified = tpp->mask_specified, + .threadpool = threadpool, + .ith = j + }; + + if (tpp->mask_specified) { + __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + } + + // Disposable threadpools need to have a valid cplan and cgraph immediately. 
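+        // (disposable workers jump straight into ggml_graph_compute_thread below, so there is no later point at which a graph could be handed to them)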
+ thread_ret_t (*thread_entrypoint)(void*) = disposable ? ggml_graph_compute_thread : ggml_graph_compute_secondary_thread; + // Spin threads for all secondary workers + if (j > 0) { + int32_t rc = ggml_thread_create( + &workers[j].thrd, + NULL, + thread_entrypoint, + &workers[j] + ); + GGML_ASSERT(rc == 0); + } + } +#endif // GGML_USE_OPENMP + + return threadpool; +} + +struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { + return ggml_create_threadpool_impl(tpp, false, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -18890,19 +19361,41 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - int n_threads = cplan->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.n_threads =*/ n_threads, - /*.n_barrier =*/ 0, - /*.n_barrier_passed =*/ 0, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, - /*.current_chunk =*/ 0, - /*.ec =*/ GGML_STATUS_SUCCESS, - }; + int32_t n_threads = cplan->n_threads; + struct ggml_compute_threadpool * threadpool = cplan->threadpool; + + bool disposable_threadpool = false; + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("NOTE: No threadpool was specified in this cplan. Will create a disposable threadpool\n"); + disposable_threadpool = true; + + struct ggml_threadpool_params ttp = { + .mask_specified = false, + .n_threads = n_threads, + .prio = 1, + .poll = false, + .strict_cpu = false + }; + + threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); + } else if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n");
+    }
+
+    // Set up work
+    threadpool->cgraph        = cgraph;
+    threadpool->cplan         = cplan;
+    threadpool->n_threads_cur = n_threads;
+
+    if (!disposable_threadpool) {
+        // Reset some of the parameters that need resetting
+        // No worker threads should be accessing the parameters below at this stage
+        threadpool->n_barrier        = 0;
+        threadpool->n_barrier_passed = 0;
+        threadpool->current_chunk    = 0;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }

 #ifdef GGML_USE_OPENMP
     if (n_threads > 1) {
@@ -18912,63 +19405,52 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         {
             // update the number of threads from the actual number of threads that we got from OpenMP
             n_threads = omp_get_num_threads();
-            state_shared.n_threads = n_threads;
+            threadpool->n_threads_cur = n_threads;
         }

             struct ggml_compute_state worker = {
-                .thrd   = 0,
-                .ith    = omp_get_thread_num(),
-                .shared = &state_shared,
+                .ith        = omp_get_thread_num(),
+                .threadpool = threadpool,
             };
             ggml_graph_compute_thread(&worker);
         }
     } else {
         struct ggml_compute_state worker = {
-            .thrd   = 0,
-            .ith    = 0,
-            .shared = &state_shared,
+            .ith        = 0,
+            .threadpool = threadpool,
         };
         ggml_graph_compute_thread(&worker);
     }
 #else
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    for (int j = 0; j < n_threads; ++j) {
-        workers[j] = (struct ggml_compute_state) {
-            .thrd   = 0,
-            .ith    = j,
-            .shared = &state_shared,
-        };
-    }
-
-    // create thread pool
-    for (int j = 1; j < n_threads; ++j) {
-        const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-        GGML_ASSERT(rc == 0);
-        UNUSED(rc);
-    }
-
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+    if (!disposable_threadpool) {
+        // Update main thread affinity to match the current threadpool
+        if (threadpool->workers[0].mask_specified) {
+            __thread_affinity(threadpool->workers[0].cpumask);
+        }

-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
+        threadpool->new_work = true;
+        if (!threadpool->poll) {
+            ggml_mutex_lock(&threadpool->mutex);
+            ggml_cond_broadcast(&threadpool->cond);
+            ggml_mutex_unlock(&threadpool->mutex);
         }
     }
+    // this is a work thread too
+    ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif

     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();

-    return state_shared.ec;
+    if (disposable_threadpool) {
+        ggml_release_threadpool(threadpool);
+    }
+
+    return threadpool->ec;
 }

 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);

     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);

@@ -19763,7 +20245,7 @@ static enum ggml_opt_result ggml_opt_adam(
     float * pf = params.past > 0 ? 
opt->adam.pf->data : NULL; // past function values

-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

@@ -20110,7 +20592,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         opt->iter = iter;
     }

-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;

diff --git a/include/llama.h b/include/llama.h
index 66c266298e86f..b6f1d94de422b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -428,6 +428,18 @@ extern "C" {
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
+    LLAMA_API void llama_attach_threadpool(
+            struct llama_context * ctx,
+            ggml_compute_threadpool_t threadpool);
+    LLAMA_API void llama_attach_batch_threadpool(
+            struct llama_context * ctx,
+            ggml_compute_threadpool_t threadpool);
+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+    LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
+    LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
+
+
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);

diff --git a/src/llama.cpp b/src/llama.cpp
index be6dbf88a7790..2e2b6332e211d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2684,6 +2684,9 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;

+    ggml_compute_threadpool_t threadpool       = nullptr;
+    ggml_compute_threadpool_t threadpool_batch = nullptr;
+
     bool has_evaluated_once = false;

     int64_t t_start_us;
@@ -14410,11 +14413,11 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     return n_outputs_max;
 }

-
 static void llama_graph_compute(
-        llama_context & lctx,
-          ggml_cgraph * gf,
-                  int   n_threads) {
+          llama_context & lctx,
+            ggml_cgraph * gf,
+                    int   n_threads,
+    ggml_compute_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -14423,6 +14426,7 @@ static void llama_graph_compute(

     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 #ifdef GGML_USE_BLAS
@@ -14436,6 +14440,42 @@ static void llama_graph_compute(
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }

+// Optionally swaps the batch and single-tok threadpools.
+// Returns the number of threads, and if a valid threadpool exists, returns it too.
+static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
+        llama_context & lctx,
+        int32_t n_tokens) {
+
+    const auto & cparams = lctx.cparams;
+    int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+
+    ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool
+
+    // A batch threadpool without a non-batch threadpool isn't supported.
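+    // (single-token graphs always fall back to the non-batch pool, so it has to exist whenever the batch pool does)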
+    GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool);
+
+    if (lctx.threadpool_batch && lctx.threadpool) {
+        // Switch between the 2 threadpools as needed
+        if (n_tokens > 1) {
+            ggml_pause_threadpool(lctx.threadpool);
+            ggml_resume_threadpool(lctx.threadpool_batch);
+            threadpool = lctx.threadpool_batch;
+            n_threads  = cparams.n_threads_batch;
+        } else {
+            ggml_pause_threadpool(lctx.threadpool_batch);
+            ggml_resume_threadpool(lctx.threadpool);
+            threadpool = lctx.threadpool;
+            n_threads  = cparams.n_threads;
+        }
+    } else if (lctx.threadpool) {
+        ggml_resume_threadpool(lctx.threadpool);
+        threadpool = lctx.threadpool;
+        n_threads  = cparams.n_threads;
+    }
+    return std::make_pair(n_threads, threadpool);
+}
+
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
@@ -14559,7 +14599,12 @@ static int llama_decode_internal(
         lctx.n_outputs = n_outputs_new;
     }

-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    std::pair<int32_t, ggml_compute_threadpool_t> threads =
+        llama_swap_threadpools(lctx, n_tokens);
+
+    int32_t n_threads = threads.first;
+    ggml_compute_threadpool_t threadpool = threads.second;
+
     GGML_ASSERT(n_threads > 0);

     // helpers for smoother batch API transition
@@ -14644,7 +14689,7 @@ static int llama_decode_internal(

         llama_set_inputs(lctx, u_batch);

-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);

         // update the kv ring buffer
         {
@@ -14805,7 +14850,11 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;

-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    std::pair<int32_t, ggml_compute_threadpool_t> threads =
+        llama_swap_threadpools(lctx, n_tokens);
+
+    int32_t n_threads = threads.first;
+    ggml_compute_threadpool_t threadpool = threads.second;

     GGML_ASSERT(n_threads > 0);

     // helpers for smoother batch API transition
@@ -14848,7 +14897,7 @@ static int llama_encode_internal(

     llama_set_inputs(lctx, batch);

-    llama_graph_compute(lctx, gf, n_threads);
+    llama_graph_compute(lctx, gf, n_threads, threadpool);

     // extract embeddings
     if (embd) {
@@ -15093,7 +15142,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {

     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);

-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif

     //const int64_t t_end = ggml_time_us();
@@ -15119,7 +15168,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {

         llama_set_k_shift(lctx);

-        llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);

         need_reserve = true;
     }
@@ -15145,7 +15194,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {

             llama_set_s_copy(lctx);

-            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);

             need_reserve = true;
         }
@@ -16387,6 +16436,31 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
     }
 }

+void llama_attach_threadpool(
+        struct llama_context * ctx,
+        ggml_compute_threadpool_t threadpool) {
+    ctx->threadpool = threadpool;
+}
+
+void llama_attach_batch_threadpool(
+        struct llama_context * ctx,
+        ggml_compute_threadpool_t threadpool_batch) {
+    ctx->threadpool_batch = threadpool_batch;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool = nullptr;
+}
+
+void llama_detach_batch_threadpool(struct llama_context * ctx) {
+    ctx->threadpool_batch = nullptr;
+}
+
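+// Note: the detach helpers only clear the context's reference to a pool;
+// stopping its workers and freeing it remains the caller's job, done via
+// ggml_release_threadpool().
+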
+void llama_detach_threadpools(struct llama_context * ctx) { + llama_detach_threadpool(ctx); + llama_detach_batch_threadpool(ctx); +} + void llama_backend_free(void) { ggml_quantize_free(); } diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 8159e276af617..246bb227d1e19 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32( } static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); From 76d2461a9c1dd3776417722aa6aefa0c1f612de6 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 02/18] Minor fixes --- CMakePresets.json | 256 +++++++-------------------- examples/CMakeLists.txt | 2 +- examples/speculative/speculative.cpp | 7 +- ggml/src/ggml.c | 4 +- include/llama.h | 2 + src/llama.cpp | 9 + 6 files changed, 80 insertions(+), 200 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index ae2bf25c12786..bdad38952d3cb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,197 +1,65 @@ { - "version": 4, - "configurePresets": [ - { - "name": "base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." - } - }, - { - "name": "sycl-base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_CXX_COMPILER": "icx", - "CMAKE_C_COMPILER": "cl", - "GGML_SYCL": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
- } - }, - { - "name": "debug", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Debug" - } - }, - { - "name": "release", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release" - } - }, - { - "name": "reldbg", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "RelWithDebInfo" - } - }, - { - "name": "static", - "hidden": true, - "cacheVariables": { - "GGML_STATIC": "ON" - } - }, - { - "name": "arm64-windows-msvc", - "hidden": true, - "architecture": { - "value": "arm64", - "strategy": "external" - }, - "toolset": { - "value": "host=x86_64", - "strategy": "external" - }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" - } - }, - { - "name": "arm64-windows-llvm", - "hidden": true, - "architecture": { - "value": "arm64", - "strategy": "external" - }, - "toolset": { - "value": "host=x86_64", - "strategy": "external" - }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" - } - }, - { - "name": "arm64-windows-llvm-debug", - "inherits": [ - "base", - "arm64-windows-llvm", - "debug" - ] - }, - { - "name": "arm64-windows-llvm-release", - "inherits": [ - "base", - "arm64-windows-llvm", - "reldbg" - ] - }, - { - "name": "arm64-windows-llvm+static-release", - "inherits": [ - "base", - "arm64-windows-llvm", - "reldbg", - "static" - ] - }, - { - "name": "arm64-windows-msvc-debug", - "inherits": [ - "base", - "arm64-windows-msvc", - "debug" - ] - }, - { - "name": "arm64-windows-msvc-release", - "inherits": [ - "base", - "arm64-windows-msvc", - "reldbg" - ] - }, - { - "name": "arm64-windows-msvc+static-release", - "inherits": [ - "base", - "arm64-windows-msvc", - "reldbg", - "static" - ] - }, - { - "name": "x64-windows-msvc-debug", - "inherits": [ - "base", - "debug" - ] - }, - { - "name": "x64-windows-msvc-release", - "inherits": [ - "base", - "reldbg" - ] - }, - { - "name": "x64-windows-msvc+static-release", - "inherits": [ - "base", - "reldbg", - "static" - ] - }, - { - "name": "x64-windows-sycl-debug", - "inherits": [ - "sycl-base", - "debug" - ] - }, - { - "name": "x64-windows-sycl-release", - "inherits": [ - "sycl-base", - "release" - ] - }, - { - "name": "clang10", - "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu", - "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++", - "binaryDir": "${sourceDir}/out/build/${presetName}", - "cacheVariables": { - "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", - "CMAKE_C_COMPILER": "/usr/bin/clang", - "CMAKE_CXX_COMPILER": "/usr/bin/clang++", - "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10", - "CMAKE_BUILD_TYPE": "Debug" - } - }, - { - "name": "gcc8.4", - "displayName": "GCC 8.4.0 x86_64-linux-gnu", - "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++", - "binaryDir": "${sourceDir}/out/build/${presetName}", - "cacheVariables": { - "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", - "CMAKE_C_COMPILER": "/usr/bin/gcc", - "CMAKE_CXX_COMPILER": "/usr/bin/g++", - "CMAKE_BUILD_TYPE": "Debug" - } + "version": 4, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
} - ] + }, + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "CMAKE_C_COMPILER": "cl", + "GGML_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, + { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, + { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, + + { + "name": "arm64-windows-msvc", "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" + } + }, + + { + "name": "arm64-windows-llvm", "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" + } + }, + + { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, + { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, + { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, + + { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, + + { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, + { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, + { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, + + { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, + { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } + ] } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 247d52c6d3454..67b3d27747850 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,6 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) - #add_subdirectory(speculative) + add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b051a18f169c2..1616edecbbef6 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -73,10 +73,11 @@ int main(int argc, char ** argv) { // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; + if (params.draft_cpuparams.n_threads > 0) { + params.cpuparams.n_threads = params.draft_cpuparams.n_threads; } - params.n_threads_batch = params.n_threads_batch_draft; + + params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; llama_init_result llama_init_dft = llama_init_from_gpt_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; diff --git a/ggml/src/ggml.c 
b/ggml/src/ggml.c index 53d73d00722f5..a140b9a6b309e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18816,7 +18816,7 @@ static bool __thread_affinity(const bool * mask) { for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { if (mask[i]) { - printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); CPU_SET(i, &cpuset); } } @@ -19209,7 +19209,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) if (threadpool->poll) { while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { // No new work. Yield and keep polling. - //__cpu_relax(); + __cpu_relax(); } } else { ggml_mutex_lock_shared(&threadpool->mutex); diff --git a/include/llama.h b/include/llama.h index b6f1d94de422b..495da52101b5b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -439,6 +439,8 @@ extern "C" { LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + // Pauses all attached threadpools + LLAMA_API void llama_pause_threadpools(struct llama_context * ctx); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 2e2b6332e211d..6123510c93b77 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16461,6 +16461,15 @@ void llama_detach_threadpools(struct llama_context * ctx) { llama_detach_batch_threadpool(ctx); } +void llama_pause_threadpools(struct llama_context * ctx) { + if (ctx->threadpool) { + ggml_pause_threadpool(ctx->threadpool); + } + if (ctx->threadpool_batch) { + ggml_pause_threadpool(ctx->threadpool_batch); + } +} + void llama_backend_free(void) { ggml_quantize_free(); } From 9bd436742a24d8c29917abb1e35552c0b2111aac Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 03/18] fixed use after release bug --- ggml/src/ggml.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a140b9a6b309e..f491a11f53c31 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19442,11 +19442,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // don't leave affinity set on the main thread clear_numa_thread_affinity(); + enum ggml_status ret = threadpool->ec; + if (disposable_threadpool) { ggml_release_threadpool(threadpool); } - return threadpool->ec; + return ret; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { From dc33f83e43968c66338225c26c835944226fdc9e Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 31 Jul 2024 12:42:30 -0400 Subject: [PATCH 04/18] fixed a harmless race condition --- ggml/src/ggml.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f491a11f53c31..96de421d33d05 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19379,18 +19379,16 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl }; threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); - } else if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n");
-    }
-
-    // Set up work
-    threadpool->cgraph        = cgraph;
-    threadpool->cplan         = cplan;
-    threadpool->n_threads_cur = n_threads;
-
-    if (!disposable_threadpool) {
+    } else {
+        if (n_threads > threadpool->n_threads_max) {
+            GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
+        }
+        // Not a disposable threadpool:
         // Reset some of the parameters that need resetting
         // No worker threads should be accessing the parameters below at this stage
+        threadpool->cgraph        = cgraph;
+        threadpool->cplan         = cplan;
+        threadpool->n_threads_cur = n_threads;
         threadpool->n_barrier        = 0;
         threadpool->n_barrier_passed = 0;
         threadpool->current_chunk    = 0;

From 32048c726457e406636fd5d7a8234324167c3f3e Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul
Date: Wed, 31 Jul 2024 12:42:30 -0400
Subject: [PATCH 05/18] Fix Android build issue

---
 ggml/src/ggml.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 96de421d33d05..547443116812f 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -121,8 +121,14 @@ static int sched_yield (void) {
     return 0;
 }
 #else
+
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+
 #include <pthread.h>
 #include <stdatomic.h>
+#include <sched.h>

 typedef void * thread_ret_t;

@@ -18803,11 +18809,6 @@ static bool __thread_priority(int32_t prio) {

 #else // posix?

-#ifndef __USE_GNU
-#define __USE_GNU
-#endif
-#include <sched.h>
-
 static bool __thread_affinity(const bool * mask) {
     cpu_set_t cpuset;
     int32_t err;

From 81522b963eeb2082afbdc82a6f9173eec3bb5651 Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul
Date: Wed, 31 Jul 2024 12:42:31 -0400
Subject: [PATCH 06/18] fix more race conditions

---
 ggml/src/ggml.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 547443116812f..b915a5d052e7c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18956,7 +18956,10 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
     GGML_PRINT_DEBUG("Pausing threadpool\n");
+    ggml_mutex_lock(&threadpool->mutex);
     threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+    ggml_mutex_unlock(&threadpool->mutex);
 #else
     UNUSED(threadpool);
 #endif
@@ -19427,9 +19430,9 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             __thread_affinity(threadpool->workers[0].cpumask);
         }

-        threadpool->new_work = true;
         if (!threadpool->poll) {
             ggml_mutex_lock(&threadpool->mutex);
+            threadpool->new_work = true;
             ggml_cond_broadcast(&threadpool->cond);
             ggml_mutex_unlock(&threadpool->mutex);
         }

From 4512d1a62ff0991f659b2fc76904091ba984fde5 Mon Sep 17 00:00:00 2001
From: Faisal Zaghloul
Date: Wed, 31 Jul 2024 12:42:31 -0400
Subject: [PATCH 07/18] fix deadlock for cases where cgraph.n_nodes == 1 and fix --poll case

---
 ggml/src/ggml.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b915a5d052e7c..40fe64874de14 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -19193,7 +19193,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         }
     }

+    if (cgraph->n_nodes == 1) {
+        // We need a barrier before disabling new_work in case we have a trivial graph
+        ggml_barrier(state->threadpool);
+    }
+
     if (!state->threadpool->disposable && state->ith == 0) {
+        // Don't need a lock, because there is a barrier after this, and only after that
+        // do the secondary threads go into standby
         state->threadpool->new_work = false;
     }

@@ -19435,6 +19442,8 
@@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl threadpool->new_work = true; ggml_cond_broadcast(&threadpool->cond); ggml_mutex_unlock(&threadpool->mutex); + } else { + threadpool->new_work = true; } } // this is a work thread too From 5f44e286fac344c8bfeb49fa81b22fc85f8f2f52 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 3 Aug 2024 16:14:04 -0700 Subject: [PATCH 08/18] threadpool: use cpu_get_num_math to set the default number of threadpool threads This way we avoid using E-Cores and Hyperthreaded siblings. --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index c08ebda4aa6a1..0e6417489b8fe 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -230,7 +230,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = std::thread::hardware_concurrency(); + cpuparams.n_threads = cpu_get_num_math(); } } From 152fc7345c1ed7c5264f8ffed21be20cf36f7dfa Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 3 Aug 2024 17:17:39 -0700 Subject: [PATCH 09/18] bench: create fresh threadpool for each test For benchmarking it's better to start a fresh pool for each test with the exact number of threads needed for that test. Having larger pools is suboptimal (causes more load, etc). --- examples/llama-bench/llama-bench.cpp | 42 ++++++++++++---------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 5a929ceddafbe..1009ac57b7be2 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -291,7 +291,6 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); - printf(" -mt, --max-threads (default: %d)\n", cmd_params_defaults.cpuparams.n_threads); printf(" -C, --cpu-mask (default: 0x0)\n"); printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); @@ -499,12 +498,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } - } else if (arg == "-mt" || arg == "--max-threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.cpuparams.n_threads = std::stoi(argv[i]); } else if (arg == "-C" || arg == "--cpu-mask") { if (++i >= argc) { invalid_param = true; @@ -1435,21 +1428,6 @@ int main(int argc, char ** argv) { postprocess_cpu_params(params.cpuparams); - struct ggml_threadpool_params tpp; - tpp.n_threads = params.cpuparams.n_threads; - tpp.mask_specified = params.cpuparams.mask_valid; - tpp.strict_cpu = params.cpuparams.strict_cpu; - tpp.prio = params.cpuparams.priority; - tpp.poll = params.cpuparams.poll; - - std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_MAX_N_THREADS); - - struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); - if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - for (const auto & inst : params_instances) { // keep the same 
model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1475,6 +1453,22 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); llama_kv_cache_clear(ctx); + + struct ggml_threadpool_params tpp; + tpp.n_threads = t.n_threads; + tpp.mask_specified = params.cpuparams.mask_valid; + tpp.strict_cpu = params.cpuparams.strict_cpu; + tpp.prio = params.cpuparams.priority; + tpp.poll = params.cpuparams.poll; + + std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + + struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + llama_attach_threadpool(ctx, threadpool); // warmup run @@ -1515,9 +1509,9 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); - } - ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool); + } llama_free_model(lmodel); From 3cfce8d7cdefb9bd549c6cb82b35bcbb4a08ffb3 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 5 Aug 2024 14:25:49 -0700 Subject: [PATCH 10/18] atomics: always use stdatomics with clang and use relaxed memory order when polling in ggml_barrier This also removes sched_yield() calls from ggml_barrier() to match OpenMP behavior. --- ggml/src/ggml.c | 63 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 40fe64874de14..7ea0252b73b27 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -69,23 +69,38 @@ int ggml_sve_cnt_b = 0; #endif #include +#if !defined(__clang__) typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; typedef atomic_int atomic_flag; #define ATOMIC_FLAG_INIT 0 +typedef enum { + memory_order_relaxed, + memory_order_consume, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst +} memory_order; + static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } +static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedCompareExchange(ptr, 0, 0); +} static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { - return atomic_fetch_add(ptr, -(dec)); +static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) { + // TODO: add support for explicit memory order + return InterlockedExchangeAdd(ptr, inc); } static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { return InterlockedExchange(ptr, 1); @@ -93,6 +108,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) { static void atomic_flag_clear(atomic_flag * ptr) { InterlockedExchange(ptr, 0); } +#else // clang +#include +#endif typedef HANDLE pthread_t; @@ -3030,6 +3048,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// Helpers for polling loops +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) +static inline void __cpu_relax(void) { + __asm__ volatile("yield" 
::: "memory"); +} +#elif defined(__x86_64__) +static inline void __cpu_relax(void) { + _mm_pause(); +} +#else +static inline void __cpu_relax(void) {;} +#endif + // // NUMA support // @@ -3094,25 +3125,19 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; int n_threads = threadpool->n_threads_cur; - int passed_old = atomic_load(n_barrier_passed); + int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { // last thread atomic_store(n_barrier, 0); - atomic_fetch_add(n_barrier_passed, 1); + atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); } else { // wait for other threads - const int n_spin_before_sleep = 100000; while (true) { - for (int i = 0; i < n_spin_before_sleep; i++) { - if (atomic_load(n_barrier_passed) != passed_old) { - return; - } - #if defined(__SSE3__) - _mm_pause(); - #endif + if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { + return; } - sched_yield(); + __cpu_relax(); } } } @@ -18879,18 +18904,6 @@ static bool __thread_priority(int32_t prio) { #endif -#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) -static inline void __cpu_relax(void) { - __asm__ volatile("yield" ::: "memory"); -} -#elif defined(__x86_64__) -static inline void __cpu_relax(void) { - _mm_pause(); -} -#else -static inline void __cpu_relax(void) {;} -#endif - static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { if (!global_mask) { memset(local_mask, 1, GGML_MAX_N_THREADS); From 26ff44fed68043a37c5065ba632730aaad2d8855 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 6 Aug 2024 18:35:53 -0700 Subject: [PATCH 11/18] threadpool: make polling the default to match openmp behavior All command line args now allow for setting poll to 0 (false). 
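For illustration, here is a minimal sketch of how a caller might opt out of polling through this API (the helper name is arbitrary, the thread count is an example value, `ctx` is assumed to be an already-created llama_context, and error handling is elided):

    #include "ggml.h"
    #include "llama.h"

    static void attach_non_polling_pool(struct llama_context * ctx) {
        struct ggml_threadpool_params tpp = {0}; // zero cpumask and flags

        tpp.n_threads      = 8;     // example value
        tpp.prio           = 0;     // normal scheduling priority
        tpp.poll           = false; // wait on a cond var instead of busy-polling (--poll 0)
        tpp.strict_cpu     = false;
        tpp.mask_specified = false; // no explicit CPU affinity mask

        struct ggml_compute_threadpool * tp = ggml_create_threadpool(&tpp);
        llama_attach_threadpool(ctx, tp);
        // ... decode/encode as usual, then:
        // llama_detach_threadpool(ctx);
        // ggml_release_threadpool(tp);
    }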
--- common/common.cpp | 34 +++++++++++++++++----------- common/common.h | 2 +- examples/llama-bench/llama-bench.cpp | 12 ++++++++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 0e6417489b8fe..5abddaefa6381 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -435,11 +435,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--cpu-strict") { - params.cpuparams.strict_cpu = true; + CHECK_ARG + params.cpuparams.strict_cpu = std::stoul(argv[i]); return true; } if (arg == "--poll") { - params.cpuparams.poll = true; + CHECK_ARG + params.cpuparams.poll = std::stoul(argv[i]); return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -474,7 +476,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-batch") { - params.cpuparams_batch.poll = true; + CHECK_ARG + params.cpuparams_batch.poll = std::stoul(argv[i]); return true; } if (arg == "-td" || arg == "--threads-draft") { @@ -509,7 +512,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-draft") { - params.draft_cpuparams.poll = true; + CHECK_ARG + params.draft_cpuparams.poll = std::stoul(argv[i]); return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { @@ -537,7 +541,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "--poll-batch-draft") { - params.draft_cpuparams_batch.poll = true; + CHECK_ARG + params.draft_cpuparams_batch.poll = std::stoul(argv[i]); return true; } if (arg == "-p" || arg == "--prompt") { @@ -1627,34 +1632,37 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); + options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. 
Complements --cpu-mask-batch"});
-    options.push_back({ "*",           "       --cpu-strict-batch",     "use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*",           "       --cpu-strict-batch <0|1>",
+                                                                        "use strict CPU placement (default: same as --cpu-strict)"});
     options.push_back({ "*",           "       --priority-batch N",     "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
-    options.push_back({ "*",           "       --poll-batch",           "use polling to wait for work (default: --poll)"});
+    options.push_back({ "*",           "       --poll-batch <0|1>",     "use polling to wait for work (default: same as --poll)"});
     options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
     options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",     "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
     options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
-    options.push_back({ "speculative", "       --cpu-strict-draft",     "Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", "       --cpu-strict-draft <0|1>",
+                                                                        "Use strict CPU placement for draft model (default: same as --cpu-strict)"});
     options.push_back({ "speculative", "       --priority-draft N",     "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
-    options.push_back({ "speculative", "       --poll-draft",           "Use polling to wait for draft model work (default: same as --poll])"});
+    options.push_back({ "speculative", "       --poll-draft <0|1>",     "Use polling to wait for draft model work (default: same as --poll)"});
     options.push_back({ "speculative", "-tbd,  --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
     options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
     options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
                                                                         "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft", + options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); options.push_back({ "speculative", " --priority-batch-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft", "Use polling to wait for draft model work (default: --poll-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft <0|1>", + "Use polling to wait for draft model work (default: --poll-draft)"}); options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); diff --git a/common/common.h b/common/common.h index 9865133ed8575..b0c32f949b273 100644 --- a/common/common.h +++ b/common/common.h @@ -73,7 +73,7 @@ struct cpu_params { bool mask_valid = false; // Default: any CPU int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement - bool poll = false; // Use polling (busywait) to wait for work + bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP) }; struct gpt_params { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 1009ac57b7be2..571ca6dd2eb2b 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -513,9 +513,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } params.cpuparams.priority = std::stoul(argv[i]); } else if (arg == "--cpu-strict") { - params.cpuparams.strict_cpu = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.strict_cpu = std::stoul(argv[i]); } else if (arg == "--poll") { - params.cpuparams.poll = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.poll = std::stoul(argv[i]); } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; From 5c564e57be2e050c18ce65a96c9939ab32041c5a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 7 Aug 2024 23:08:31 -0700 Subject: [PATCH 12/18] threadpool: do not wakeup threads in already paused threadpool --- ggml/src/ggml.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7ea0252b73b27..8006dd40ce258 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18970,8 +18970,10 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { GGML_ASSERT(!threadpool->disposable); GGML_PRINT_DEBUG("Pausing threadpool\n"); ggml_mutex_lock(&threadpool->mutex); - threadpool->pause = true; - ggml_cond_broadcast(&threadpool->cond); + if (!threadpool->pause) { + threadpool->pause = true; + ggml_cond_broadcast(&threadpool->cond); + } ggml_mutex_unlock(&threadpool->mutex); #else UNUSED(threadpool); @@ -18984,8 +18986,10 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { GGML_PRINT_DEBUG("Resuming threadpool\n"); ggml_mutex_lock(&threadpool->mutex); - threadpool->pause = false; - ggml_cond_broadcast(&threadpool->cond); + if (threadpool->pause) { + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); + } 
ggml_mutex_unlock(&threadpool->mutex); #else UNUSED(threadpool); From 220152904cc81010936f336407a481a8e884dcd1 Mon Sep 17 00:00:00 2001 From: fmz Date: Thu, 8 Aug 2024 05:59:20 -0700 Subject: [PATCH 13/18] fix potential race condition in check_for_work --- examples/llava/minicpmv-cli.cpp | 2 +- ggml/src/ggml.c | 35 +++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f951b57b29158..2417268851688 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -174,7 +174,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling, static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; return NULL; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 8006dd40ce258..1d12a50f1f29f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18940,7 +18940,6 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { if (!threadpool->disposable) { ggml_mutex_lock(&threadpool->mutex); } - threadpool->n_threads_cur = n_threads; threadpool->stop = true; threadpool->pause = false; if (!threadpool->disposable) { @@ -19233,21 +19232,27 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - do { - if (threadpool->poll) { - while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { - // No new work. Yield and keep polling. - __cpu_relax(); - } - } else { - ggml_mutex_lock_shared(&threadpool->mutex); - while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { - // No new work. Wait for the signal. - ggml_cond_wait(&threadpool->cond, &threadpool->mutex); - } - ggml_mutex_unlock_shared(&threadpool->mutex); + if (threadpool->poll) { + while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || + threadpool->stop || + threadpool->pause + ) + ) { + // No new work. Yield and keep polling. + __cpu_relax(); } - } while (state->ith >= threadpool->n_threads_cur); + } else { + ggml_mutex_lock_shared(&threadpool->mutex); + while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) || + threadpool->stop || + threadpool->pause + ) + ) { + // No new work. Wait for the signal. 
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } return threadpool->new_work; } From 20db9f4162673aa71496ab8d42836db75d6c4d7d Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 8 Aug 2024 16:26:49 -0700 Subject: [PATCH 14/18] threadpool: do not create two threadpools if their params are identical --- examples/main/main.cpp | 21 ++++++++++++++------- ggml/include/ggml.h | 1 + ggml/src/ggml.c | 13 +++++++++++++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index bb4cef1d246e2..56e8730593115 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -230,24 +230,31 @@ int main(int argc, char ** argv) { struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); - struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); - if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); - } struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); } - llama_attach_batch_threadpool(ctx, threadpool_batch); llama_attach_threadpool(ctx, threadpool); if (ctx_guidance) { - llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); llama_attach_threadpool(ctx_guidance, threadpool); } + struct ggml_compute_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + + llama_attach_batch_threadpool(ctx, threadpool_batch); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + } + } + const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index af3934c2bf88c..af74231565786 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2024,6 +2024,7 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1d12a50f1f29f..81c7a33878253 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19301,6 +19301,19 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { #endif // GGML_USE_OPENMP +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + if (p0->mask_specified != p1->mask_specified) return false; + if (p0->mask_specified) { + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; + } + + return true; +} + static 
struct ggml_compute_threadpool * ggml_create_threadpool_impl(
           struct ggml_threadpool_params * tpp,
           bool disposable,

From b18719b3027d0bd72d45419e9599c6460ef18908 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Sat, 10 Aug 2024 16:12:06 -0700
Subject: [PATCH 15/18] threadpool: reduce pause/resume/wakeup overhead in
 common cases

We now start the threadpool in the paused state only if we have two of
them (i.e. a separate batch threadpool). Resume is now implicit (new
work arriving), which allows for reduced locking and context-switch
overhead.
---
 common/common.cpp      |  1 +
 examples/main/main.cpp | 25 ++++++++++++++-----------
 ggml/include/ggml.h    |  1 +
 ggml/src/ggml.c        | 42 +++++++++++++++++++++++++++++-----------
 src/llama.cpp          |  3 ---
 5 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 5abddaefa6381..df6e1624ef7b5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2507,6 +2507,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
     tpp.prio       = params.priority;
     tpp.poll       = params.poll;
     tpp.strict_cpu = params.strict_cpu;
+    tpp.paused     = false;
 
     return tpp;
 }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 56e8730593115..e7d42e9cec731 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -230,17 +230,6 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
 
-    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-    if (!threadpool) {
-        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-        exit(1);
-    }
-
-    llama_attach_threadpool(ctx, threadpool);
-    if (ctx_guidance) {
-        llama_attach_threadpool(ctx_guidance, threadpool);
-    }
-
     struct ggml_compute_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
         threadpool_batch = ggml_create_threadpool(&tpp_batch);
@@ -253,6 +242,20 @@ int main(int argc, char ** argv) {
         if (ctx_guidance) {
             llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
         }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool);
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index af74231565786..923182d9d9710 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -631,6 +631,7 @@ extern "C" {
         int32_t prio;
         bool    poll;
         bool    strict_cpu;
+        bool    paused;
     };
 
     struct ggml_compute_threadpool;     // forward declaration, see ggml.c
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 81c7a33878253..308e569856c70 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18964,14 +18964,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
     GGML_ALIGNED_FREE(threadpool);
 }
 
+#ifndef GGML_USE_OPENMP
+// pause/resume must be called under mutex
+static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true;
+    ggml_cond_broadcast(&threadpool->cond);
+}
+
+static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+    threadpool->pause = false;
+    
ggml_cond_broadcast(&threadpool->cond);
+}
+#endif
+
 void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Pausing threadpool\n");
     ggml_mutex_lock(&threadpool->mutex);
     if (!threadpool->pause) {
-        threadpool->pause = true;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_pause_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -18982,12 +18995,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     GGML_ASSERT(!threadpool->disposable);
-    GGML_PRINT_DEBUG("Resuming threadpool\n");
-
     ggml_mutex_lock(&threadpool->mutex);
     if (threadpool->pause) {
-        threadpool->pause = false;
-        ggml_cond_broadcast(&threadpool->cond);
+        __ggml_resume_threadpool(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -19329,7 +19339,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
     threadpool->n_barrier_passed = 0;
     threadpool->current_chunk    = 0;
     threadpool->stop             = false;
-    threadpool->pause            = disposable ? false : true;
+    threadpool->pause            = disposable ? false : tpp->paused;
     threadpool->new_work         = false;
     threadpool->workers          = NULL;
     threadpool->n_threads_max    = tpp->n_threads;
@@ -19419,9 +19429,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         struct ggml_threadpool_params ttp = {
             .mask_specified = false,
             .n_threads      = n_threads,
-            .prio           = 1,
+            .prio           = 0,
             .poll           = false,
-            .strict_cpu     = false
+            .strict_cpu     = false,
+            .paused         = false
         };
 
         threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
@@ -19475,10 +19486,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     if (!threadpool->poll) {
         ggml_mutex_lock(&threadpool->mutex);
         threadpool->new_work = true;
-        ggml_cond_broadcast(&threadpool->cond);
+        if (threadpool->pause) {
+            __ggml_resume_threadpool(threadpool);
+        } else {
+            ggml_cond_broadcast(&threadpool->cond);
+        }
         ggml_mutex_unlock(&threadpool->mutex);
     } else {
         threadpool->new_work = true;
+        if (threadpool->pause) {
+            ggml_mutex_lock(&threadpool->mutex);
+            __ggml_resume_threadpool(threadpool);
+            ggml_mutex_unlock(&threadpool->mutex);
+        }
     }
 }
 // this is a work thread too
diff --git a/src/llama.cpp b/src/llama.cpp
index 6123510c93b77..70194b54b6db0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -14458,17 +14458,14 @@ static std::pair llama_swap_threadpools(
         // Switch between the 2 threadpools as needed
         if (n_tokens > 1) {
             ggml_pause_threadpool(lctx.threadpool);
-            ggml_resume_threadpool(lctx.threadpool_batch);
             threadpool = lctx.threadpool_batch;
             n_threads  = cparams.n_threads_batch;
         } else {
             ggml_pause_threadpool(lctx.threadpool_batch);
-            ggml_resume_threadpool(lctx.threadpool);
             threadpool = lctx.threadpool;
             n_threads  = cparams.n_threads;
         }
     } else if (lctx.threadpool) {
-        ggml_resume_threadpool(lctx.threadpool);
         threadpool = lctx.threadpool;
         n_threads  = cparams.n_threads;
     }

From 323181f2abb7ab9f03f03b8ee160efe66f5624eb Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Sun, 11 Aug 2024 11:20:32 -0700
Subject: [PATCH 16/18] threadpool: add support for hybrid polling

Poll params (--poll, ...) now specify a "polling level", i.e. how
aggressively we poll before waiting on the cond. variable. poll=0 means
no polling, 1 means poll for 128K rounds then wait, 2 for 256K rounds,
and so on.

The default value of 50 (i.e. 50x128K rounds) seems like a decent
default across modern platforms.
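As an illustration of the scale involved: the worker translates the
polling level into a spin count before it falls back to the condition
variable. A quick sketch (the formula matches the code added below;
the expanded values are just arithmetic):

    // polling level N spins for N * 128K rounds before sleeping:
    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;

    // poll = 0  ->         0 rounds (no polling, wait immediately)
    // poll = 1  ->   131,072 rounds
    // poll = 50 -> 6,553,600 rounds (the default)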
We can tune this further as things evolve. --- common/common.cpp | 2 +- common/common.h | 2 +- ggml/include/ggml.h | 14 ++++---- ggml/src/ggml.c | 86 +++++++++++++++++++++++++-------------------- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index df6e1624ef7b5..3a202be1b73d8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); + options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", diff --git a/common/common.h b/common/common.h index b0c32f949b273..eb87e8880ffa5 100644 --- a/common/common.h +++ b/common/common.h @@ -73,7 +73,7 @@ struct cpu_params { bool mask_valid = false; // Default: any CPU int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement - bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP) + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; struct gpt_params { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 923182d9d9710..910171c07c00c 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -625,13 +625,13 @@ extern "C" { typedef bool (*ggml_abort_callback)(void * data); struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; - bool mask_specified; - int32_t n_threads; - int32_t prio; - bool poll; - bool strict_cpu; - bool paused; + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores + bool mask_specified; // mask is non-empty + int32_t n_threads; // number of threads + int32_t prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state }; struct ggml_compute_threadpool; // forward declaration, see ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 308e569856c70..f0c0f4fb0a29a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool { int32_t prio; // Scheduling priority bool disposable; // Doesn't initialize a conv-var - bool poll; // Use polling (busywait) // TODO + uint32_t poll; // Polling level (0 - no polling) ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; @@ -19235,35 +19235,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } +#ifndef GGML_USE_OPENMP +static inline bool 
ggml_graph_compute_got_work(struct ggml_compute_state *state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
+}
 
-#ifndef GGML_USE_OPENMP
+static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    if (threadpool->stop || threadpool->pause) return true;
+    return ggml_graph_compute_got_work(state);
+}
+
+static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    // This seems to make 0 ... 100 a decent range for polling level across modern processors.
+    // Perhaps, we can adjust it dynamically based on load and things.
+    const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
+
+    for (uint64_t i=0; !ggml_graph_compute_ready(state) && i < n_rounds; i++) {
+        // No new work. Keep polling.
+        __cpu_relax();
+    }
+
+    return ggml_graph_compute_got_work(state);
+}
+
 static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
     struct ggml_compute_threadpool * threadpool = state->threadpool;
 
-    if (threadpool->poll) {
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Yield and keep polling.
-            __cpu_relax();
-        }
-    } else {
-        ggml_mutex_lock_shared(&threadpool->mutex);
-        while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
-                  threadpool->stop ||
-                  threadpool->pause
-                )
-              ) {
-            // No new work. Wait for the signal.
-            ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
-        }
-        ggml_mutex_unlock_shared(&threadpool->mutex);
+    if (ggml_graph_compute_poll_for_work(state)) {
+        return ggml_graph_compute_got_work(state);
+    }
+
+    ggml_mutex_lock_shared(&threadpool->mutex);
+    while (!ggml_graph_compute_ready(state)) {
+        // No new work. Wait for the signal.
+        GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+        ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
     }
-    return threadpool->new_work;
+    ggml_mutex_unlock_shared(&threadpool->mutex);
+
+    return ggml_graph_compute_got_work(state);
 }
 
 static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
@@ -19483,24 +19498,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         __thread_affinity(threadpool->workers[0].cpumask);
     }
 
-    if (!threadpool->poll) {
-        ggml_mutex_lock(&threadpool->mutex);
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            __ggml_resume_threadpool(threadpool);
-        } else {
-            ggml_cond_broadcast(&threadpool->cond);
-        }
-        ggml_mutex_unlock(&threadpool->mutex);
+    // always take the mutex here because the worker threads are doing hybrid poll/wait
+
+    ggml_mutex_lock(&threadpool->mutex);
+    threadpool->new_work = true;
+    if (!threadpool->pause) {
+        ggml_cond_broadcast(&threadpool->cond);
     } else {
-        threadpool->new_work = true;
-        if (threadpool->pause) {
-            ggml_mutex_lock(&threadpool->mutex);
-            __ggml_resume_threadpool(threadpool);
-            ggml_mutex_unlock(&threadpool->mutex);
-        }
+        // resume does cond broadcast
+        __ggml_resume_threadpool(threadpool);
     }
+    ggml_mutex_unlock(&threadpool->mutex);
 }
+
     // this is a work thread too
     ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif

From 160fc8de96503bcd5b32f900fe4d65a2296c3937 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Mon, 12 Aug 2024 19:04:01 -0700
Subject: [PATCH 17/18] threadpool: reduce the number of barriers required

New work is now indicated with an atomic counter that is incremented
for each new graph that needs to be computed.
This removes the need for an extra barrier for clearing the "new_work"
flag and removes the special case for trivial graphs.
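In other words, dispatch becomes a simple counter handshake. A minimal
sketch using the names from the diff below (locking and the hybrid
poll/wait path omitted):

    // main thread: publish a new graph
    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);

    // each worker: detect it against a per-thread copy
    int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
    if (new_graph != state->last_graph) {
        state->pending    = (state->ith < threadpool->n_threads_cur);
        state->last_graph = new_graph;
    }

Because each worker keeps its own last_graph, no thread has to clear a
shared flag, which is what previously required the extra barrier.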
--- ggml/src/ggml.c | 78 +++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f0c0f4fb0a29a..3e3061279517a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1959,13 +1959,13 @@ struct ggml_compute_threadpool { struct ggml_cplan * cplan; // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) atomic_int n_barrier; atomic_int n_barrier_passed; atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. volatile bool stop; // Used for stopping the threadpool altogether volatile bool pause; // Used for pausing the threadpool or individual threads - volatile bool new_work; // Set when there is work to be done, unset after it's done struct ggml_compute_state * workers; // per thread state int32_t n_threads_max; // number of threads in the pool @@ -1987,6 +1987,8 @@ struct ggml_compute_state { ggml_thread_t thrd; bool cpumask[GGML_MAX_N_THREADS]; bool mask_specified; + int last_graph; + bool pending; #endif struct ggml_compute_threadpool * threadpool; int ith; @@ -19197,55 +19199,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ state->threadpool, }; - struct ggml_tensor * node = cgraph->nodes[0]; + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + struct ggml_tensor * node = cgraph->nodes[node_n]; - ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->threadpool->ec = GGML_STATUS_ABORTED; - } - - for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) { - ggml_barrier(state->threadpool); - - if (state->threadpool->ec != GGML_STATUS_SUCCESS) { - break; - } - - node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { state->threadpool->ec = GGML_STATUS_ABORTED; } - } - if (cgraph->n_nodes == 1) { - // We need a barrier before disabling new_work in case we have a trivial graph ggml_barrier(state->threadpool); - } - if (!state->threadpool->disposable && state->ith == 0) { - // Don't need a lock, because there is a barrier after this, and only after that - // do the secondary threads go into standby - state->threadpool->new_work = false; + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { + break; + } } - ggml_barrier(state->threadpool); - return 0; } #ifndef GGML_USE_OPENMP -static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) { - struct ggml_compute_threadpool * threadpool = state->threadpool; - return (threadpool->new_work && state->ith < threadpool->n_threads_cur); -} - static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; - if (threadpool->stop || threadpool->pause) return true; - return ggml_graph_compute_got_work(state); + if (threadpool->stop || threadpool->pause || state->pending) { return true; } + + // check for new graph/work + int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); + if (new_graph != state->last_graph) { + state->pending = (state->ith < threadpool->n_threads_cur); + state->last_graph = new_graph; + } + + return state->pending; } static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { @@ -19260,14 +19246,14 @@ static inline bool 
ggml_graph_compute_poll_for_work(struct ggml_compute_state * __cpu_relax(); } - return ggml_graph_compute_got_work(state); + return state->pending; } -static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { +static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { struct ggml_compute_threadpool * threadpool = state->threadpool; if (ggml_graph_compute_poll_for_work(state)) { - return ggml_graph_compute_got_work(state); + return state->pending; } ggml_mutex_lock_shared(&threadpool->mutex); @@ -19278,7 +19264,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) } ggml_mutex_unlock_shared(&threadpool->mutex); - return ggml_graph_compute_got_work(state); + return state->pending; } static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { @@ -19308,8 +19294,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { // Check if there is new work // The main thread is the only one that can dispatch new work - bool new_work = ggml_graph_compute_check_for_work(state); - if (new_work) { + ggml_graph_compute_check_for_work(state); + if (state->pending) { + state->pending = false; + int64_t ret = (int64_t) ggml_graph_compute_thread(state); if (ret == GGML_EXIT_ABORTED) return (thread_ret_t) ret; @@ -19350,12 +19338,12 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( { threadpool->cgraph = cgraph; threadpool->cplan = cplan; + threadpool->n_graph = 0; threadpool->n_barrier = 0; threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = disposable ? false : tpp->paused; - threadpool->new_work = false; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; @@ -19398,7 +19386,9 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( .thrd = 0, .mask_specified = tpp->mask_specified, .threadpool = threadpool, - .ith = j + .ith = j, + .last_graph = 0, + .pending = false }; if (tpp->mask_specified) { @@ -19501,12 +19491,12 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // always take the mutex here because the worker threads are doing hybrid poll/wait ggml_mutex_lock(&threadpool->mutex); - threadpool->new_work = true; - if (!threadpool->pause) { - ggml_cond_broadcast(&threadpool->cond); - } else { + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + if (threadpool->pause) { // resume does cond broadcast __ggml_resume_threadpool(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); } ggml_mutex_unlock(&threadpool->mutex); } From fee969dd9de160a01cb21227bdbb801ffe3e1340 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 12 Aug 2024 22:18:16 -0700 Subject: [PATCH 18/18] threadpool: remove special-casing for disposable threadpools With the efficient hybrid polling there is no need to make disposable pools any different. This simplifies the overall logic and reduces branching. 
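Concretely, the implicit threadpool that ggml_graph_compute() creates
when the caller did not attach one now takes the same path as an
explicitly created pool. A caller-side sketch (function names from this
series; error handling omitted):

    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);

    // explicit pool: created once, reused for every graph
    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
    llama_attach_threadpool(ctx, threadpool);

    // ... decode as usual; with no attached pool, ggml_graph_compute()
    // would create its own with default params instead ...

    ggml_release_threadpool(threadpool);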
--- ggml/src/ggml.c | 117 +++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 62 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3e3061279517a..c8a5f02459a8f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1972,7 +1972,6 @@ struct ggml_compute_threadpool { int32_t n_threads_cur; // number of threads used in the current graph int32_t prio; // Scheduling priority - bool disposable; // Doesn't initialize a conv-var uint32_t poll; // Polling level (0 - no polling) ggml_abort_callback abort_callback; // abort ggml_graph_compute when true @@ -18939,15 +18938,13 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { struct ggml_compute_state* workers = threadpool->workers; const int32_t n_threads = threadpool->n_threads_max; - if (!threadpool->disposable) { - ggml_mutex_lock(&threadpool->mutex); - } + ggml_mutex_lock(&threadpool->mutex); + threadpool->stop = true; threadpool->pause = false; - if (!threadpool->disposable) { - ggml_cond_broadcast(&threadpool->cond); - ggml_mutex_unlock(&threadpool->mutex); - } + + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); for (int32_t j = 1; j < n_threads; j++) { int32_t rc = ggml_thread_join(workers[j].thrd, NULL); @@ -18957,10 +18954,8 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { GGML_ALIGNED_FREE(workers); - if (!threadpool->disposable) { - ggml_mutex_destroy(&threadpool->mutex); - ggml_cond_destroy(&threadpool->cond); - } + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP GGML_ALIGNED_FREE(threadpool); @@ -18983,7 +18978,6 @@ static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP - GGML_ASSERT(!threadpool->disposable); ggml_mutex_lock(&threadpool->mutex); if (!threadpool->pause) { __ggml_pause_threadpool(threadpool); @@ -18996,7 +18990,6 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { #ifndef GGML_USE_OPENMP - GGML_ASSERT(!threadpool->disposable); ggml_mutex_lock(&threadpool->mutex); if (threadpool->pause) { __ggml_resume_threadpool(threadpool); @@ -19271,8 +19264,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; - GGML_ASSERT(!threadpool->disposable); - __thread_priority(threadpool->prio); if (state->mask_specified) __thread_affinity(state->cpumask); @@ -19312,6 +19303,25 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { return (thread_ret_t) 0; } +// Start processing new graph +static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpool) +{ + // always take the mutex here because the worker threads are doing hybrid poll/wait + + ggml_mutex_lock(&threadpool->mutex); + + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + + if (threadpool->pause) { + // resume does cond broadcast + __ggml_resume_threadpool(threadpool); + } else { + ggml_cond_broadcast(&threadpool->cond); + } + + ggml_mutex_unlock(&threadpool->mutex); +} + #endif // GGML_USE_OPENMP bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { @@ -19329,7 +19339,6 @@ bool 
ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons static struct ggml_compute_threadpool * ggml_create_threadpool_impl( struct ggml_threadpool_params * tpp, - bool disposable, struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19343,11 +19352,10 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( threadpool->n_barrier_passed = 0; threadpool->current_chunk = 0; threadpool->stop = false; - threadpool->pause = disposable ? false : tpp->paused; + threadpool->pause = tpp->paused; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; - threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; - threadpool->disposable = disposable; + threadpool->n_threads_cur = tpp->n_threads; threadpool->poll = tpp->poll; threadpool->prio = tpp->prio; @@ -19357,10 +19365,8 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( } #ifndef GGML_USE_OPENMP - if (!disposable) { - ggml_mutex_init(&threadpool->mutex); - ggml_cond_init(&threadpool->cond); - } + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); #endif // GGML_USE_OPENMP struct ggml_compute_state * workers = @@ -19395,14 +19401,12 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); } - // Disposable threadpools need to have a valid cplan and cgraph immediately. - thread_ret_t (*thread_entrypoint)(void*) = disposable ? ggml_graph_compute_thread : ggml_graph_compute_secondary_thread; // Spin threads for all secondary workers if (j > 0) { int32_t rc = ggml_thread_create( &workers[j].thrd, NULL, - thread_entrypoint, + ggml_graph_compute_secondary_thread, &workers[j] ); GGML_ASSERT(rc == 0); @@ -19414,7 +19418,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( } struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { - return ggml_create_threadpool_impl(tpp, false, NULL, NULL); + return ggml_create_threadpool_impl(tpp, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -19435,28 +19439,28 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl .mask_specified = false, .n_threads = n_threads, .prio = 0, - .poll = false, + .poll = 1, .strict_cpu = false, .paused = false }; - threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); - } else { - if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n"); - } - // Not a disposable threadpool: - // Reset some of the paramters that need resetting - // No worker threads should be accessing the parameters below at this stage - threadpool->cgraph = cgraph; - threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; - threadpool->n_barrier = 0; - threadpool->n_barrier_passed = 0; - threadpool->current_chunk = 0; - threadpool->ec = GGML_STATUS_SUCCESS; + threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan); } + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); + } + + // Reset some of the parameters that need resetting + // No worker threads should be accessing the parameters below at this stage + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; + #ifdef GGML_USE_OPENMP if (n_threads > 1) { #pragma omp parallel num_threads(n_threads) @@ -19482,26 +19486,15 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_graph_compute_thread(&worker); } #else - if (!disposable_threadpool) { - // Update main thread affinity to match the current threadpool - if (threadpool->workers[0].mask_specified) { - __thread_affinity(threadpool->workers[0].cpumask); - } - - // always take the mutex here because the worker threads are doing hybrid poll/wait - - ggml_mutex_lock(&threadpool->mutex); - atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); - if (threadpool->pause) { - // resume does cond broadcast - __ggml_resume_threadpool(threadpool); - } else { - ggml_cond_broadcast(&threadpool->cond); - } - ggml_mutex_unlock(&threadpool->mutex); + // Update main thread affinity to match the current threadpool + if (threadpool->workers[0].mask_specified) { + __thread_affinity(threadpool->workers[0].cpumask); } - // this is a work thread too + // Kick all threads to start the new graph + ggml_graph_compute_kickoff(threadpool); + + // This is a work thread too ggml_graph_compute_thread(&threadpool->workers[0]); #endif