From 9328133b3b3a57747442c7f3566edbbd4cf5ae9f Mon Sep 17 00:00:00 2001
From: fmz <quic_fzaghlou@quic.com>
Date: Thu, 11 Jul 2024 07:52:23 -0700
Subject: [PATCH 1/4] Introduce ggml_compute_threadpool

- OpenMP functional: Check
- Vanilla ggml functional: Check
- ggml w/threadpool functional: Check
- OpenMP no regression: No glaring problems
- Vanilla ggml no regression: No glaring problems
- ggml w/threadpool no regression: No glaring problems
---
 CMakePresets.json                             | 256 ++++--
 common/common.cpp                             | 291 ++++++-
 common/common.h                               |  29 +-
 examples/CMakeLists.txt                       |   2 +-
 examples/baby-llama/baby-llama.cpp            |   2 +-
 examples/benchmark/benchmark-matmult.cpp      |   2 +-
 .../cvector-generator/cvector-generator.cpp   |   4 +-
 examples/llama-bench/llama-bench.cpp          |  51 ++
 examples/llava/llava-cli.cpp                  |   4 +-
 examples/main/main.cpp                        |  30 +
 examples/server/server.cpp                    |   4 +-
 ggml/CMakeLists.txt                           |   2 +-
 ggml/include/ggml-alloc.h                     |   5 +-
 ggml/include/ggml-backend.h                   |   1 +
 ggml/include/ggml.h                           |  27 +-
 ggml/src/ggml-backend.c                       |  16 +-
 ggml/src/ggml.c                               | 800 ++++++++++++++----
 include/llama.h                               |  12 +
 src/llama.cpp                                 |  96 ++-
 tests/test-rope.cpp                           |   2 +-
 20 files changed, 1360 insertions(+), 276 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index bdad38952d3cb..ae2bf25c12786 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,65 +1,197 @@
 {
-  "version": 4,
-  "configurePresets": [
-    {
-        "name":  "base",
-        "hidden": true,
-        "generator":   "Ninja",
-        "binaryDir":   "${sourceDir}/build-${presetName}",
-        "cacheVariables": {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+    "version": 4,
+    "configurePresets": [
+        {
+            "name": "base",
+            "hidden": true,
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "cacheVariables": {
+                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+            }
+        },
+        {
+            "name": "sycl-base",
+            "hidden": true,
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "cacheVariables": {
+                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+                "CMAKE_CXX_COMPILER": "icx",
+                "CMAKE_C_COMPILER": "cl",
+                "GGML_SYCL": "ON",
+                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+            }
+        },
+        {
+            "name": "debug",
+            "hidden": true,
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Debug"
+            }
+        },
+        {
+            "name": "release",
+            "hidden": true,
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Release"
+            }
+        },
+        {
+            "name": "reldbg",
+            "hidden": true,
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "RelWithDebInfo"
+            }
+        },
+        {
+            "name": "static",
+            "hidden": true,
+            "cacheVariables": {
+                "GGML_STATIC": "ON"
+            }
+        },
+        {
+            "name": "arm64-windows-msvc",
+            "hidden": true,
+            "architecture": {
+                "value": "arm64",
+                "strategy": "external"
+            },
+            "toolset": {
+                "value": "host=x86_64",
+                "strategy": "external"
+            },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+            }
+        },
+        {
+            "name": "arm64-windows-llvm",
+            "hidden": true,
+            "architecture": {
+                "value": "arm64",
+                "strategy": "external"
+            },
+            "toolset": {
+                "value": "host=x86_64",
+                "strategy": "external"
+            },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+            }
+        },
+        {
+            "name": "arm64-windows-llvm-debug",
+            "inherits": [
+                "base",
+                "arm64-windows-llvm",
+                "debug"
+            ]
+        },
+        {
+            "name": "arm64-windows-llvm-release",
+            "inherits": [
+                "base",
+                "arm64-windows-llvm",
+                "reldbg"
+            ]
+        },
+        {
+            "name": "arm64-windows-llvm+static-release",
+            "inherits": [
+                "base",
+                "arm64-windows-llvm",
+                "reldbg",
+                "static"
+            ]
+        },
+        {
+            "name": "arm64-windows-msvc-debug",
+            "inherits": [
+                "base",
+                "arm64-windows-msvc",
+                "debug"
+            ]
+        },
+        {
+            "name": "arm64-windows-msvc-release",
+            "inherits": [
+                "base",
+                "arm64-windows-msvc",
+                "reldbg"
+            ]
+        },
+        {
+            "name": "arm64-windows-msvc+static-release",
+            "inherits": [
+                "base",
+                "arm64-windows-msvc",
+                "reldbg",
+                "static"
+            ]
+        },
+        {
+            "name": "x64-windows-msvc-debug",
+            "inherits": [
+                "base",
+                "debug"
+            ]
+        },
+        {
+            "name": "x64-windows-msvc-release",
+            "inherits": [
+                "base",
+                "reldbg"
+            ]
+        },
+        {
+            "name": "x64-windows-msvc+static-release",
+            "inherits": [
+                "base",
+                "reldbg",
+                "static"
+            ]
+        },
+        {
+            "name": "x64-windows-sycl-debug",
+            "inherits": [
+                "sycl-base",
+                "debug"
+            ]
+        },
+        {
+            "name": "x64-windows-sycl-release",
+            "inherits": [
+                "sycl-base",
+                "release"
+            ]
+        },
+        {
+            "name": "clang10",
+            "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu",
+            "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++",
+            "binaryDir": "${sourceDir}/out/build/${presetName}",
+            "cacheVariables": {
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
+                "CMAKE_C_COMPILER": "/usr/bin/clang",
+                "CMAKE_CXX_COMPILER": "/usr/bin/clang++",
+                "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10",
+                "CMAKE_BUILD_TYPE": "Debug"
+            }
+        },
+        {
+            "name": "gcc8.4",
+            "displayName": "GCC 8.4.0 x86_64-linux-gnu",
+            "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++",
+            "binaryDir": "${sourceDir}/out/build/${presetName}",
+            "cacheVariables": {
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
+                "CMAKE_C_COMPILER": "/usr/bin/gcc",
+                "CMAKE_CXX_COMPILER": "/usr/bin/g++",
+                "CMAKE_BUILD_TYPE": "Debug"
+            }
         }
-    },
-    {
-        "name": "sycl-base",
-        "hidden": true,
-        "generator": "Ninja",
-        "binaryDir": "${sourceDir}/build-${presetName}",
-        "cacheVariables": {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-            "CMAKE_CXX_COMPILER": "icx",
-            "CMAKE_C_COMPILER": "cl",
-            "GGML_SYCL": "ON",
-            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-        }
-    },
-    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
-
-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },
-
-    {
-        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
-        }
-    },
-
-    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
-
-    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
-
-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
-
-    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
-  ]
+    ]
 }
diff --git a/common/common.cpp b/common/common.cpp
index ec44a05521c9d..7eb0850d7f889 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -222,6 +222,36 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }
 
+// Fills in whatever was left unset in `cpuparams` after argument parsing.
+// A negative n_threads marks the whole struct as unset: it is replaced by a copy of `role_model`
+// when one is given, otherwise n_threads gets a default. An all-clear CPU mask becomes "all CPUs".
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+    if (cpuparams.n_threads < 0) {
+        if (role_model != nullptr) {
+            // NOTE: this also overwrites a mask/priority/poll that was given without a thread count.
+            cpuparams = *role_model;
+        } else {
+            // Keep the default that gpt_params::n_threads had before it moved into cpu_params.
+            cpuparams.n_threads = cpu_get_num_math();
+        }
+    }
+
+    int32_t n_set = 0;
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        n_set += cpuparams.cpumask[i] ? 1 : 0;
+    }
+
+    if (n_set == 0) {
+        // No CPU selected at all: allow every CPU (mask_valid is left as it is).
+        memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS);
+        n_set = GGML_MAX_N_THREADS;
+    }
+    if (n_set < cpuparams.n_threads) {
+        // Not enough set bits, may experience performance issues.
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
@@ -241,6 +271,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
@@ -285,6 +320,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+// Parses a CPU range "[<start>]-[<end>]" (inclusive bounds) and sets the covered entries of
+// `boolmask` to true. A missing start means CPU 0, a missing end means the last CPU.
+// Returns false, leaving `boolmask` untouched, if the range is malformed, out of bounds or reversed.
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    const size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+    size_t start_i = 0;
+    size_t end_i   = GGML_MAX_N_THREADS - 1;
+    try {
+        // std::stoull throws on non-numeric or overflowing text; report that as a format error.
+        if (dash_loc > 0) {
+            start_i = std::stoull(range.substr(0, dash_loc));
+        }
+        if (dash_loc + 1 < range.length()) {
+            end_i = std::stoull(range.substr(dash_loc + 1));
+        }
+    } catch (const std::exception &) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+    if (start_i >= GGML_MAX_N_THREADS || end_i >= GGML_MAX_N_THREADS) {
+        fprintf(stderr, "%s index out of bounds!\n", start_i >= GGML_MAX_N_THREADS ? "Start" : "End");
+        return false;
+    }
+    if (start_i > end_i) {
+        fprintf(stderr, "Start index of CPU range is greater than end index!\n");
+        return false;
+    }
+    for (size_t i = start_i; i <= end_i; i++) {
+        boolmask[i] = true;
+    }
+    return true;
+}
+
+// Parses a hexadecimal CPU affinity mask (optional "0x" prefix) and ORs it into `boolmask`:
+// bit k of the mask selects CPU k. Digits above GGML_MAX_N_THREADS bits are ignored.
+// Returns false on the first non-hex character (digits before it may already be applied).
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    // Discard potential 0x prefix
+    size_t start_i = (mask.length() >= 2 && mask.substr(0, 2) == "0x") ? 2 : 0;
+
+    // Keep only the least-significant digits that fit into boolmask, so the kept digits
+    // still map to the right CPU indices and never index past the end of the array.
+    const size_t max_digits = GGML_MAX_N_THREADS / 4;
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > max_digits) {
+        start_i   += num_digits - max_digits;
+        num_digits = max_digits;
+    }
+    const size_t end_i = start_i + num_digits;
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        const char c = mask.at(i);
+        int8_t id = c;
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            return false;
+        }
+        // n is the index of this digit's top bit; bit b of the digit selects CPU n - 3 + b.
+        for (int b = 0; b < 4; b++) {
+            boolmask[n - 3 + b] = boolmask[n - 3 + b] || (((id >> b) & 1) != 0);
+        }
+    }
+    return true;
+}
+
 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
 
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@@ -301,36 +409,137 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "-t" || arg == "--threads") {
         CHECK_ARG
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        params.cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams.n_threads <= 0) {
+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-C" || arg == "--cpu-mask") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Cr" || arg == "--cpu-range") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio") {
+        CHECK_ARG
+        params.cpuparams.priority = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict") {
+        params.cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll") {
+        params.cpuparams.poll = true;
+        return true;
+    }
     if (arg == "-tb" || arg == "--threads-batch") {
         CHECK_ARG
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams_batch.n_threads <= 0) {
+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crb" || arg == "--cpu-range-batch") {
+        // CPU range for batch/prompt processing; the long name matches the usage text.
+        CHECK_ARG
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(argv[i], params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.priority = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch") {
+        params.cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch") {
+        params.cpuparams_batch.poll = true;
+        return true;
+    }
     if (arg == "-td" || arg == "--threads-draft") {
         CHECK_ARG
-        params.n_threads_draft = std::stoi(argv[i]);
-        if (params.n_threads_draft <= 0) {
-            params.n_threads_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams.n_threads <= 0) {
+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
+    }
+        if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.priority = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-draft") {
+        params.draft_cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-draft") {
+        params.draft_cpuparams.poll = true;
+        return true;
     }
     if (arg == "-tbd" || arg == "--threads-batch-draft") {
         CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cbd" || arg == "--cpu-mask-batch-draft" || arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        const bool is_range = (arg == "-Crbd" || arg == "--cpu-range-batch-draft"); // -Crbd: lo-hi range, -Cbd: hex mask
+        CHECK_ARG
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !(is_range ? parse_cpu_range(argv[i], params.draft_cpuparams_batch.cpumask) : parse_cpu_mask(argv[i], params.draft_cpuparams_batch.cpumask));
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        params.draft_cpuparams_batch.poll = true;
+        return true;
+    }
     if (arg == "-p" || arg == "--prompt") {
         CHECK_ARG
         params.prompt = argv[i];
@@ -1401,11 +1610,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
     options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
+    options.push_back({ "*",           "-C,    --cpu-mask M",           "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+    options.push_back({ "*",           "-Cr,   --cpu-range lo-hi",      "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*",           "       --cpu-strict",           "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*",           "       --prio N",               "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*",           "       --poll",                 "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
     options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
+    options.push_back({ "*",           "-Cb,   --cpu-mask-batch M",     "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*",           "-Crb,  --cpu-range-batch lo-hi",
+                                                                        "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*",           "       --cpu-strict-batch",     "use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*",           "       --prio-batch N",         "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio)"});
+    options.push_back({ "*",           "       --poll-batch",           "use polling to wait for work (default: --poll)"});
     options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
+    options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",     "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi",
+                                                                        "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", "       --cpu-strict-draft",     "Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", "       --prio-draft N",         "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"});
+    options.push_back({ "speculative", "       --poll-draft",           "Use polling to wait for draft model work (default: same as --poll)"});
     options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
                                                                         "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M",
+                                                                        "Draft model CPU affinity mask. Complements cpu-range-batch-draft (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                                                                        "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"});
+    options.push_back({ "speculative", "       --cpu-strict-batch-draft",
+                                                                        "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+    options.push_back({ "speculative", "       --prio-batch-draft N",
+                                                                        "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio-draft)"});
+    options.push_back({ "speculative", "       --poll-batch-draft",     "Use polling to wait for draft model work (default: --poll-draft)"});
+
     options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",
@@ -1707,9 +1943,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
 
@@ -2192,8 +2428,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads         = params.cpuparams.n_threads;
+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
+                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
@@ -2219,6 +2456,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return cparams;
 }
 
+// Converts the command-line level cpu_params into ggml threadpool parameters.
+// The CPU mask is only forwarded when one was explicitly given (mask_valid).
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    // Zero-initialize so cpumask (and any field not assigned below) is never left indeterminate.
+    struct ggml_threadpool_params tpp = {};
+    tpp.mask_specified = params.mask_valid;
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+    tpp.n_threads  = params.n_threads;
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -3215,7 +3468,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index 8240ff99b8e2a..f9aa61edf4a6b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -58,13 +58,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params { // Threading/affinity options for one workload; unset values are resolved by postprocess_cpu_params()
+    int32_t  n_threads                   = -1;      // Number of threads; negative means "not set"
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask, one flag per CPU index.
+    bool     mask_valid                  = false;   // Set when a mask/range option was parsed. Default: any CPU
+    int32_t  priority                    =  0;      // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    bool     poll                        = false;   // Use polling (busywait) to wait for work
+};
+
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =     0; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -91,6 +96,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
     float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
 
@@ -195,7 +205,7 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";
@@ -268,6 +278,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+
 //
 // String utils
 //
@@ -311,8 +325,9 @@ std::string fs_get_cache_file(const std::string & filename);
 // TODO: avoid tuplue, use struct
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67b3d27747850..247d52c6d3454 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    add_subdirectory(speculative)
+    #add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 4f6c3746a106c..22818304fc6ff 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -19,7 +19,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 47cb16c69d536..e78f6b388ef6e 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp
index d4e126ac22e6f..fa40be670268f 100644
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -485,8 +485,8 @@ int main(int argc, char ** argv) {
     if (use_pca) {
         // run PCA
         PCA::pca_params pca_params;
-        pca_params.n_threads = params.n_threads;
-        pca_params.n_batch = params.n_pca_batch;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
         pca_params.n_iterations = params.n_pca_iterations;
         PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
     } else {
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index a6497b6e0bf82..a4b3b3bb8cd8f 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -206,6 +206,7 @@ struct cmd_params {
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
+    cpu_params cpuparams;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -232,6 +233,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap             */ {true},
     /* embeddings           */ {false},
     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cpuparams            */ {},
     /* reps                 */ 5,
     /* verbose              */ false,
     /* output_format        */ MARKDOWN,
@@ -260,6 +262,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf("  -mt, --max-threads <n>              (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                (default: 0x0)\n");
+    printf("  --cpu-strict                        (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --prio <0|1|2|3>                    (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll                              (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -463,6 +470,30 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+        } else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            // mark the mask valid only if it parsed; stop here so a later -C cannot overwrite the error
+            params.cpuparams.mask_valid = parse_cpu_mask(argv[i], params.cpuparams.cpumask);
+            if (!params.cpuparams.mask_valid) { invalid_param = true; break; }
+        } else if (arg == "--prio" || arg == "--priority") { // accept both spellings
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1373,6 +1404,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads      = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu     = params.cpuparams.strict_cpu;
+    tpp.prio           = params.cpuparams.priority;
+    tpp.poll           = params.cpuparams.poll;
+    // both masks are bool[GGML_MAX_N_THREADS]; copy by byte size rather than by element count
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], sizeof(tpp.cpumask));
+
+    struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1398,6 +1446,7 @@ int main(int argc, char ** argv) {
         test t(inst, lmodel, ctx);
 
         llama_kv_cache_clear(ctx);
+        llama_attach_threadpool(ctx, threadpool);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1439,6 +1488,8 @@ int main(int argc, char ** argv) {
         llama_free(ctx);
     }
 
+    ggml_release_threadpool(threadpool);
+
     llama_free_model(lmodel);
 
     if (p) {
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8c7dd2ae3d0dc..86b39f20eea6e 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
             LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 61e960ea2abe6..ef5b0946c0a1e 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -218,6 +218,33 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int32_t) params.cpuparams.n_threads
+    );
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch);
+    if (!threadpool_batch) {
+        LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+        exit(1);
+    }
+    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_batch_threadpool(ctx, threadpool_batch);
+    llama_attach_threadpool(ctx, threadpool);
+    if (ctx_guidance) {
+        llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
+        llama_attach_threadpool(ctx_guidance, threadpool);
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
@@ -986,6 +1013,9 @@ int main(int argc, char ** argv) {
     llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
+    ggml_release_threadpool(threadpool);
+    ggml_release_threadpool(threadpool_batch);
+
 #ifndef LOG_DISABLE_LOGS
     LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7813a2957d6bc..96404fc53d544 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2520,8 +2520,8 @@ int main(int argc, char ** argv) {
     });
 
     LOG_INFO("system info", {
-        {"n_threads",       params.n_threads},
-        {"n_threads_batch", params.n_threads_batch},
+        {"n_threads",       params.cpuparams.n_threads},
+        {"n_threads_batch", params.cpuparams_batch.n_threads},
         {"total_threads",   std::thread::hardware_concurrency()},
         {"system_info",     llama_print_system_info()},
     });
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index be22a74606c0b..a8050fa67b473 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -139,7 +139,7 @@ option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"
 set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                             "ggml: metal minimum macOS version")
 set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
-option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
+option(GGML_OPENMP                          "ggml: use OpenMP"                                OFF)
 option(GGML_RPC                             "ggml: use RPC"                                   OFF)
 option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
index 434c13b34a929..cd85b6ee70560 100644
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@@ -7,8 +7,9 @@ extern "C" {
 #endif
 
 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;
+typedef struct  ggml_compute_threadpool * ggml_compute_threadpool_t;
 
 // Tensor allocator
 struct ggml_tallocr {
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 5f3f1e286990e..c59f9f54a44b9 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -102,6 +102,7 @@ extern "C" {
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
     GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
     GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 548661b9bb636..e58ef9f340d77 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -231,6 +231,8 @@
 #define GGML_MAX_SRC            10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
+#define GGML_MAX_N_THREADS      512
+
 #endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -617,6 +619,17 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
+    struct ggml_threadpool_params { // settings handed to ggml_create_threadpool()
+        bool    cpumask[GGML_MAX_N_THREADS]; // CPU affinity flags; presumably cpumask[i] selects CPU i -- TODO confirm
+        bool    mask_specified;              // callers set this when an explicit cpumask was supplied
+        int32_t n_threads;                   // number of threads requested for the pool
+        int32_t prio;                        // scheduling priority; presumably one of the SCHED_PRIO_* values in ggml.c -- TODO confirm
+        bool    poll;                        // use polling (busy-wait) instead of blocking while waiting for work
+        bool    strict_cpu;                  // strict CPU placement flag; exact semantics are implemented in ggml.c -- TODO confirm
+    };
+
+    struct ggml_compute_threadpool;     // forward declaration, see ggml.c
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -624,6 +637,7 @@ extern "C" {
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
+        struct ggml_compute_threadpool * threadpool;
 
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
@@ -2003,10 +2017,19 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
+    GGML_API struct ggml_compute_threadpool* ggml_create_threadpool       (struct ggml_threadpool_params  * params);
+    GGML_API void                            ggml_release_threadpool      (struct ggml_compute_threadpool * threadpool);
+    GGML_API int32_t                         ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
+    GGML_API void                            ggml_pause_threadpool        (struct ggml_compute_threadpool * threadpool);
+    GGML_API void                            ggml_resume_threadpool       (struct ggml_compute_threadpool * threadpool);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan            (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute         (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads,
+            struct ggml_compute_threadpool * threadpool);
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index d39cfed8886f4..d63f49cfdaf8c 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -727,7 +727,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif
 
 struct ggml_backend_cpu_context {
-    int n_threads;
+    int                       n_threads;
+    ggml_compute_threadpool_t threadpool;
+
     void * work_data;
     size_t work_size;
 
@@ -764,7 +766,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
@@ -801,7 +803,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -878,6 +880,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }
 
     ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
     ctx->work_data           = NULL;
     ctx->work_size           = 0;
     ctx->abort_callback      = NULL;
@@ -908,6 +911,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
+    // only a CPU backend carries a ggml_backend_cpu_context
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+    // remember the pool; the context's threadpool is what the CPU backend passes to ggml_graph_plan()
+    ((struct ggml_backend_cpu_context *)backend_cpu->context)->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f65837e856ac3..b0630729f9175 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1779,28 +1779,104 @@ struct ggml_context_container {
     struct ggml_context context;
 };
 
-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan * cplan;
+//
+// Threading defs
+//
+
+typedef pthread_t          ggml_thread_t;
+
+#if defined(_WIN32)
+
+typedef CONDITION_VARIABLE ggml_cond_t;
+typedef SRWLOCK            ggml_mutex_t;
+
+#define ggml_mutex_init(m)   InitializeSRWLock(m)
+#define ggml_mutex_destroy(m)
+#define ggml_mutex_lock(m)   AcquireSRWLockExclusive(m)
+#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m)
+#define ggml_mutex_lock_shared(m)   AcquireSRWLockShared(m)
+#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m)
+
+#define ggml_cond_init(c)    InitializeConditionVariable(c)
+#define ggml_cond_destroy(c)
+#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED)
+#define ggml_cond_broadcast(c) WakeAllConditionVariable(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#else
+
+typedef pthread_cond_t     ggml_cond_t;
+typedef pthread_mutex_t    ggml_mutex_t;
 
-    int n_threads;
+#define ggml_mutex_init(m)          pthread_mutex_init(m, NULL)
+#define ggml_mutex_destroy(m)       pthread_mutex_destroy(m)
+#define ggml_mutex_lock(m)          pthread_mutex_lock(m)
+#define ggml_mutex_unlock(m)        pthread_mutex_unlock(m)
+#define ggml_mutex_lock_shared(m)   pthread_mutex_lock(m)
+#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m)
+
+#define ggml_lock_init(x)    UNUSED(x)
+#define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x)    _mm_pause()
+#else
+#define ggml_lock_lock(x)    UNUSED(x)
+#endif
+#define ggml_lock_unlock(x)  UNUSED(x)
+
+#define GGML_LOCK_INITIALIZER 0
+#define ggml_cond_init(c)      pthread_cond_init(c, NULL)
+#define ggml_cond_destroy(c)   pthread_cond_destroy(c)
+#define ggml_cond_wait(c, m)   pthread_cond_wait(c, m)
+#define ggml_cond_broadcast(c) pthread_cond_broadcast(c)
+
+#define ggml_thread_create pthread_create
+#define ggml_thread_join   pthread_join
+
+#endif
+
+// Threadpool def: state shared by all threads of one pool (takes over the role of ggml_compute_state_shared)
+struct ggml_compute_threadpool {
+    ggml_mutex_t mutex;       // mutex for cond.var
+    ggml_cond_t  cond;        // cond.var for waiting for new work
+
+    struct ggml_cgraph * cgraph; // graph being computed; presumably set per compute call -- TODO confirm
+    struct ggml_cplan  * cplan;  // plan that belongs to cgraph; presumably set per compute call -- TODO confirm
 
     // synchronization primitives
     atomic_int n_barrier;
     atomic_int n_barrier_passed;
+    atomic_int current_chunk; // next unprocessed chunk during mul_mat, shared between all the threads.
+
+    volatile bool stop;      // Used for stopping the threadpool altogether; NOTE(review): volatile gives no atomicity/ordering between threads, consider an atomic type
+    volatile bool pause;     // Used for pausing the threadpool or individual threads; NOTE(review): same volatile caveat as stop
+    volatile bool new_work;  // Set when there is work to be done, unset after it's done; NOTE(review): same volatile caveat as stop
+
+    struct ggml_compute_state * workers;   // per thread state
+    int32_t                     n_threads_max; // number of threads in the pool
+    int32_t                     n_threads_cur; // number of threads used in the current graph (the count ggml_barrier() waits for)
+
+    int32_t      prio;       // Scheduling priority
+    bool         disposable; // Doesn't initialize a cond-var
+    bool         poll;       // Use polling (busywait)  // TODO
 
     ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 
-    atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads
-
     enum ggml_status ec;
 };
 
+// Per-thread state
 struct ggml_compute_state {
+#ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
+    bool cpumask[GGML_MAX_N_THREADS];
+    bool mask_specified;
+#endif
+    struct ggml_compute_threadpool * threadpool;
     int ith;
-    struct ggml_compute_state_shared * shared;
 };
 
 struct ggml_compute_params {
@@ -1811,7 +1887,7 @@ struct ggml_compute_params {
     size_t wsize;
     void * wdata;
 
-    struct ggml_compute_state_shared * shared;
+    struct ggml_compute_threadpool * threadpool;
 };
 
 //
@@ -2906,23 +2982,23 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
-    if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+    if (threadpool->n_threads_cur == 1) {
         return;
     }
 
     #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
-    if (shared->n_threads == 1) {
+static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+    if (threadpool->n_threads_cur == 1) {
         return;
     }
 
-    atomic_int * n_barrier = &shared->n_barrier;
-    atomic_int * n_barrier_passed = &shared->n_barrier_passed;
+    atomic_int * n_barrier = &threadpool->n_barrier;
+    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
 
-    int n_threads = shared->n_threads;
+    int n_threads = threadpool->n_threads_cur;
     int passed_old = atomic_load(n_barrier_passed);
 
     if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
@@ -9904,7 +9980,7 @@ static void ggml_compute_forward_acc_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     const int ith = params->ith;
@@ -12278,10 +12354,10 @@ UseGgmlGemm1:;
 
     if (ith == 0) {
         // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        atomic_store(&params->shared->current_chunk, nth);
+        atomic_store(&params->threadpool->current_chunk, nth);
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
 #if GGML_USE_LLAMAFILE
     if (src1->type != vec_dot_type) {
@@ -12389,7 +12465,7 @@ UseGgmlGemm2:;
             break;
         }
 
-        current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1);
+        current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1);
     }
 }
 
@@ -12484,7 +12560,7 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }
 
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -12638,7 +12714,7 @@ static void ggml_compute_forward_out_prod_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // dst[:,:,:,:] = 0
     // for i2,i3:
@@ -12756,7 +12832,7 @@ static void ggml_compute_forward_out_prod_q_f32(
     if (ith == 0) {
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     // parallelize by last three dimensions
 
@@ -12942,7 +13018,7 @@ static void ggml_compute_forward_set_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     const int ith = params->ith;
@@ -13521,7 +13597,7 @@ static void ggml_compute_forward_diag_mask_f32(
                 ((char *) src0->data),
                 ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
 
     // TODO: handle transposed/permuted matrices
@@ -14297,7 +14373,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14385,7 +14461,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
         // need to zero dst since we are accumulating into it
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
 
@@ -14672,7 +14748,7 @@ static void ggml_compute_forward_conv_transpose_2d(
 
         memset(dst->data, 0, ggml_nbytes(dst));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int32_t stride = ggml_get_op_params_i32(dst, 0);
 
@@ -15406,7 +15482,7 @@ static void ggml_compute_forward_flash_attn_back_f32(
     if (ith == 0) {
         memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3);
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const int64_t elem_q = ggml_nelements(q);
     const int64_t elem_k = ggml_nelements(k);
@@ -16178,7 +16254,7 @@ static void ggml_compute_forward_add_rel_pos_f32(
         if (params->ith == 0) {
             memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         }
-        ggml_barrier(params->shared);
+        ggml_barrier(params->threadpool);
     }
     // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
 
@@ -16463,7 +16539,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
     if (ith == 0) {
         memset(sums, 0, sizeof(float) * (nth + nth * nc));
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     const double eps = 1e-9;
 
@@ -16511,7 +16587,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
         }
 #endif
     }
-    ggml_barrier(params->shared);
+    ggml_barrier(params->threadpool);
 
     if (ith == 0) {
         float * dp = (float *) dst->data;
@@ -18282,65 +18358,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
     memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
 }
 
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x)    UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock       os_unfair_lock_lock
-//#define ggml_lock_unlock     os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock    pthread_spin_lock
-//#define ggml_lock_unlock  pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x)    _mm_pause()
-#else
-#define ggml_lock_lock(x)    UNUSED(x)
-#endif
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#endif
-
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
@@ -18617,9 +18634,292 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
-struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
+
+enum {
+    SCHED_PRIO_NORMAL,
+    SCHED_PRIO_MEDIUM,
+    SCHED_PRIO_HIGH,
+    SCHED_PRIO_REALTIME
+};
+
+#if defined(_WIN32)
+#include "windows.h"
+
+// TODO: support > 64 CPUs
+static bool __thread_affinity(bool * mask) {
+    HANDLE    h = GetCurrentThread();
+    uint64_t  bitmask = 0ULL;
+
+    assert(GGML_MAX_N_THREADS >= 64);
+
+    for (int32_t i = 0; i < 8; i++) {
+        int32_t idx = i * 8;
+        uint8_t val = 0;
+        val |= mask[idx + 0] << 0;
+        val |= mask[idx + 1] << 1;
+        val |= mask[idx + 2] << 2;
+        val |= mask[idx + 3] << 3;
+        val |= mask[idx + 4] << 4;
+        val |= mask[idx + 5] << 5;
+        val |= mask[idx + 6] << 6;
+        val |= mask[idx + 7] << 7;
+        bitmask |= (uint64_t)val << idx;
+    }
+
+    for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n");
+            break;
+        }
+    }
+
+    DWORD_PTR m = (DWORD_PTR)bitmask;
+
+    m = SetThreadAffinityMask(h, m);
+
+    return m != 0;
+}
+
+static bool __process_priority(int32_t prio) {
+    DWORD p = NORMAL_PRIORITY_CLASS;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    return SetPriorityClass(GetCurrentProcess(), p);
+}
+
+// Sets the calling thread's Win32 priority from a SCHED_PRIO_* level; returns true on success.
+static bool __thread_priority(int32_t prio) {
+    int p = THREAD_PRIORITY_NORMAL; // fallback must be a thread priority, not a process priority class
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p = THREAD_PRIORITY_NORMAL;        break;
+        case SCHED_PRIO_MEDIUM:   p = THREAD_PRIORITY_ABOVE_NORMAL;  break;
+        case SCHED_PRIO_HIGH:     p = THREAD_PRIORITY_HIGHEST;       break;
+        case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
+    }
+
+    return SetThreadPriority(GetCurrentThread(), p);
+}
+
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/resource.h>
+
+static bool __thread_affinity(const bool * mask) {
+    UNUSED(mask);
+    return true;
+}
+
+static bool __process_priority(int32_t prio) {
+    int32_t p = 0;
+
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   p =  0;  break;
+        case SCHED_PRIO_MEDIUM:   p = -5;  break;
+        case SCHED_PRIO_HIGH:     p = -10; break;
+        case SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    int32_t r = setpriority(PRIO_PROCESS, 0, p);
+    return r != -1;
+}
+
+static bool __thread_priority(int32_t prio) {
+    UNUSED(prio);
+    return true;
+}
+
+#else // posix?
+
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+#include <sched.h>
+
+// Pins the calling thread to every CPU index i for which mask[i] is set; returns false if the OS call fails.
+static bool __thread_affinity(const bool * mask) {
+    cpu_set_t cpuset;
+    int32_t err;
+    CPU_ZERO(&cpuset);
+
+    for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (mask[i]) {
+            GGML_PRINT_DEBUG("Thread %lx: adding %u to cpuset\n", (unsigned long) pthread_self(), i);
+            CPU_SET(i, &cpuset);
+        }
+    }
+
+#ifdef __ANDROID__
+    err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+    if (err < 0) {
+        err = errno;
+    }
+#else
+    err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+#endif
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err));
+        return false;
+    }
+
+    return true;
+}
+
+// Applies a SCHED_PRIO_* level via sched_setscheduler(0, ...); returns false if the call fails.
+static bool __process_priority(int32_t prio) {
+    struct sched_param p = { .sched_priority = 0 }; // defined value even when prio matches no case below
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = sched_setscheduler(0, policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err);
+        return false;
+    }
+
+    return true;
+}
+
+// Applies a SCHED_PRIO_* level to the calling thread via pthread_setschedparam; returns false on failure.
+static bool __thread_priority(int32_t prio) {
+    struct sched_param p = { .sched_priority = 0 }; // defined value even when prio matches no case below
+    int32_t policy = SCHED_OTHER;
+    switch (prio) {
+        case SCHED_PRIO_NORMAL:   policy = SCHED_OTHER; p.sched_priority = 0;  break;
+        case SCHED_PRIO_MEDIUM:   policy = SCHED_FIFO;  p.sched_priority = 40; break;
+        case SCHED_PRIO_HIGH:     policy = SCHED_FIFO;  p.sched_priority = 80; break;
+        case SCHED_PRIO_REALTIME: policy = SCHED_FIFO;  p.sched_priority = 90; break;
+    }
+
+    int32_t err = pthread_setschedparam(pthread_self(), policy, &p);
+    if (err != 0) {
+        //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) // MSVC x64 does not define __x86_64__
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;} // no spin-wait hint used on other targets: no-op
+#endif
+
+static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { // fills local_mask for one worker
+    if (!global_mask) {
+        memset(local_mask, 1, GGML_MAX_N_THREADS); // no global mask: every entry enabled (assumes sizeof(bool) == 1 — TODO confirm)
+        return;
+    }
+    if (!strict) {
+        memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); // non-strict: worker gets a copy of the whole global mask
+        return;
+    } else {
+        memset(local_mask, 0, GGML_MAX_N_THREADS);
+        int32_t base_idx = *iter; // start scanning at the position stored by the caller
+        for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+            int32_t idx = base_idx + i;
+            if (idx >= GGML_MAX_N_THREADS) {
+                // Just a cheaper modulo
+                idx -= GGML_MAX_N_THREADS;
+            }
+            if (global_mask[idx]) {
+                local_mask[idx] = 1; // strict: enable exactly one entry, the first set one found
+                *iter = idx + 1; // next call resumes after this entry
+                return;
+            }
+        }
+    }
+}
+
+// Stops and joins the pool's worker threads, then frees the workers array and the pool. NULL is a no-op.
+void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
+    if (!threadpool) return;
+
+    struct ggml_compute_state* workers = threadpool->workers;
+#ifndef GGML_USE_OPENMP
+    const int32_t n_threads = threadpool->n_threads_max;
+
+    if (!threadpool->disposable) {
+        ggml_mutex_lock(&threadpool->mutex);
+    }
+    threadpool->n_threads_cur = n_threads;
+    threadpool->stop = true;
+    threadpool->pause = false;
+    if (!threadpool->disposable) {
+        ggml_cond_broadcast(&threadpool->cond);
+        ggml_mutex_unlock(&threadpool->mutex);
+    }
+
+    for (int32_t j = 1; j < n_threads; j++) {
+        int32_t rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED);
+        UNUSED(rc);
+    }
+
+    if (!threadpool->disposable) {
+        ggml_mutex_destroy(&threadpool->mutex);
+        ggml_cond_destroy(&threadpool->cond);
+    }
+#endif // GGML_USE_OPENMP
+
+    GGML_ALIGNED_FREE(workers); // allocated in every build, so it must be freed outside the #ifndef
+    GGML_ALIGNED_FREE(threadpool);
+}
+
+void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    GGML_ASSERT(!threadpool->disposable); // only non-disposable pools may be paused
+    GGML_PRINT_DEBUG("Pausing threadpool\n");
+    threadpool->pause = true; // NOTE(review): written without threadpool->mutex, unlike ggml_resume_threadpool — confirm this is intended
+#else
+    UNUSED(threadpool); // OpenMP build: nothing to pause
+#endif
+}
+
+void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+#ifndef GGML_USE_OPENMP
+    GGML_ASSERT(!threadpool->disposable);
+    GGML_PRINT_DEBUG("Resuming threadpool\n");
+
+    ggml_mutex_lock(&threadpool->mutex);
+    threadpool->pause = false;
+    ggml_cond_broadcast(&threadpool->cond);
+    ggml_mutex_unlock(&threadpool->mutex);
+#else
+    UNUSED(threadpool);
+#endif
+}
+
+struct ggml_cplan ggml_graph_plan(
+          const struct ggml_cgraph * cgraph,
+                           int32_t   n_threads,
+    struct ggml_compute_threadpool * threadpool) {
+
+    if (threadpool == NULL) {
+        GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool\n");
+    }
     if (n_threads <= 0) {
-        n_threads = GGML_DEFAULT_N_THREADS;
+        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
     size_t work_size = 0;
@@ -18775,12 +19075,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
     }
 
     if (work_size > 0) {
-        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+        work_size += CACHE_LINE_SIZE*(n_threads);
     }
 
-    cplan.n_threads = MIN(max_tasks, n_threads);
-    cplan.work_size = work_size;
-    cplan.work_data = NULL;
+    cplan.threadpool = threadpool;
+    cplan.n_threads  = MIN(max_tasks, n_threads);
+    cplan.work_size  = work_size;
+    cplan.work_data  = NULL;
 
     return cplan;
 }
@@ -18788,36 +19089,206 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
-    const struct ggml_cgraph * cgraph = state->shared->cgraph;
-    const struct ggml_cplan  * cplan  = state->shared->cplan;
+    const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
+    const struct ggml_cplan  * cplan  = state->threadpool->cplan;
 
     set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
-        /*.ith   =*/ state->ith,
-        /*.nth   =*/ state->shared->n_threads,
-        /*.wsize =*/ cplan->work_size,
-        /*.wdata =*/ cplan->work_data,
-        /*.shared=*/ state->shared,
+        /*.ith       =*/ state->ith,
+        /*.nth       =*/ state->threadpool->n_threads_cur,
+        /*.wsize     =*/ cplan->work_size,
+        /*.wdata     =*/ cplan->work_data,
+        /*.threadpool=*/ state->threadpool,
     };
 
-    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
-        struct ggml_tensor * node = cgraph->nodes[node_n];
+    struct ggml_tensor * node = cgraph->nodes[0];
+
+    ggml_compute_forward(&params, node);
+    if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
+        state->threadpool->ec = GGML_STATUS_ABORTED;
+    }
 
+    for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) {
+        ggml_barrier(state->threadpool);
+
+        if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
+            break;
+        }
+
+        node = cgraph->nodes[node_n];
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->shared->ec = GGML_STATUS_ABORTED;
+            state->threadpool->ec = GGML_STATUS_ABORTED;
         }
+    }
 
-        ggml_barrier(state->shared);
+    if (!state->threadpool->disposable && state->ith == 0) {
+        state->threadpool->new_work = false;
+    }
 
-        if (state->shared->ec != GGML_STATUS_SUCCESS) {
-            break;
+    ggml_barrier(state->threadpool);
+
+    return 0;
+}
+
+
+
+#ifndef GGML_USE_OPENMP
+
+static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    do {
+        if (threadpool->poll) {
+            while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
+                // No new work. Yield and keep polling.
+                //__cpu_relax();
+            }
+        } else {
+            ggml_mutex_lock_shared(&threadpool->mutex);
+            while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
+                // No new work. Wait for the signal.
+                ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+            }
+            ggml_mutex_unlock_shared(&threadpool->mutex);
+        }
+    } while (state->ith >= threadpool->n_threads_cur);
+    return threadpool->new_work;
+}
+
+static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
+    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_compute_threadpool * threadpool = state->threadpool;
+
+    GGML_ASSERT(!threadpool->disposable);
+
+    __thread_priority(threadpool->prio);
+    if (state->mask_specified)
+        __thread_affinity(state->cpumask);
+
+    while (true) {
+        // Check if we need to sleep
+        while (threadpool->pause) {
+            GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith);
+            ggml_mutex_lock_shared(&threadpool->mutex);
+            if (threadpool->pause) {
+                ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
+            }
+            GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith);
+            ggml_mutex_unlock_shared(&threadpool->mutex);
+        }
+        // This needs to be checked for after the cond_wait
+        if (threadpool->stop) break;
+
+        // Check if there is new work
+        // The main thread is the only one that can dispatch new work
+
+        bool new_work = ggml_graph_compute_check_for_work(state);
+        if (new_work) {
+            int64_t ret = (int64_t) ggml_graph_compute_thread(state);
+            if (ret == GGML_EXIT_ABORTED)
+                return (thread_ret_t) ret;
+
+            if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) {
+                fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret);
+                GGML_ASSERT(false);
+            }
         }
     }
 
-    return 0;
+    return (thread_ret_t) 0;
+}
+
+#endif // GGML_USE_OPENMP
+
+static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
+    struct ggml_threadpool_params * tpp,
+                             bool   disposable,
+               struct ggml_cgraph * cgraph,
+                struct ggml_cplan * cplan) {
+
+    struct ggml_compute_threadpool * threadpool =
+        GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool));
+    {
+        threadpool->cgraph           = cgraph;
+        threadpool->cplan            = cplan;
+        threadpool->n_barrier        = 0;
+        threadpool->n_barrier_passed = 0;
+        threadpool->current_chunk    = 0;
+        threadpool->stop             = false;
+        threadpool->pause            = disposable ? false : true;
+        threadpool->new_work         = false;
+        threadpool->workers          = NULL;
+        threadpool->n_threads_max    = tpp->n_threads;
+        threadpool->n_threads_cur    = disposable ? tpp->n_threads : 0;
+        threadpool->disposable       = disposable;
+        threadpool->poll             = tpp->poll;
+        threadpool->prio             = tpp->prio;
+
+        threadpool->abort_callback      = NULL;
+        threadpool->abort_callback_data = NULL;
+        threadpool->ec                  = GGML_STATUS_SUCCESS;
+    }
+
+#ifndef GGML_USE_OPENMP
+    if (!disposable) {
+        ggml_mutex_init(&threadpool->mutex);
+        ggml_cond_init(&threadpool->cond);
+    }
+#endif // GGML_USE_OPENMP
+
+    struct ggml_compute_state * workers =
+        GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads);
+
+    threadpool->workers = workers;
+
+#ifdef GGML_USE_OPENMP
+    for (int j = 0; j < tpp->n_threads; j++) {
+        workers[j] = (struct ggml_compute_state) {
+            .threadpool     = threadpool,
+            .ith            = j
+        };
+    }
+#else  // Not using OPENMP
+    int32_t cpumask_iter = 0;
+
+    __process_priority(tpp->prio);
+    __thread_priority(tpp->prio);
+
+    for (int j = 0; j < tpp->n_threads; j++) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd           = 0,
+            .mask_specified = tpp->mask_specified,
+            .threadpool     = threadpool,
+            .ith            = j
+        };
+
+        if (tpp->mask_specified) {
+            __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+        }
+
+        // Disposable threadpools need to have a valid cplan and cgraph immediately.
+        thread_ret_t (*thread_entrypoint)(void*) = disposable ? ggml_graph_compute_thread : ggml_graph_compute_secondary_thread;
+        // Spin threads for all secondary workers
+        if (j > 0) {
+            int32_t rc = ggml_thread_create(
+                &workers[j].thrd,
+                NULL,
+                thread_entrypoint,
+                &workers[j]
+            );
+            GGML_ASSERT(rc == 0);
+        }
+    }
+#endif // GGML_USE_OPENMP
+
+    return threadpool;
+}
+
+struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) {
+    return ggml_create_threadpool_impl(tpp, false, NULL, NULL);
 }
 
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
@@ -18825,19 +19296,41 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     GGML_ASSERT(cplan->n_threads > 0);
     GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
 
-    int n_threads = cplan->n_threads;
-
-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.cgraph_plan             =*/ cplan,
-        /*.n_threads               =*/ n_threads,
-        /*.n_barrier               =*/ 0,
-        /*.n_barrier_passed        =*/ 0,
-        /*.abort_callback          =*/ NULL,
-        /*.abort_callback_data     =*/ NULL,
-        /*.current_chunk           =*/ 0,
-        /*.ec                      =*/ GGML_STATUS_SUCCESS,
-    };
+    int32_t n_threads                           = cplan->n_threads;
+    struct ggml_compute_threadpool * threadpool = cplan->threadpool;
+
+    bool disposable_threadpool = false;
+
+    if (threadpool == NULL) {
+        GGML_PRINT_DEBUG("NOTE: No threadpool was specified in this cplan. Will create a disposable threadpool\n");
+        disposable_threadpool = true;
+
+        struct ggml_threadpool_params ttp = {
+            .mask_specified = false,
+            .n_threads      = n_threads,
+            .prio           = SCHED_PRIO_NORMAL, // implicit pools must not request an elevated (SCHED_FIFO) priority
+            .poll           = false,
+            .strict_cpu     = false
+        };
+
+        threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
+    } else if (n_threads > threadpool->n_threads_max) {
+        GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Clamping to the threadpool size.\n");
+        n_threads = threadpool->n_threads_max; // the pool has no workers beyond n_threads_max
+    }
+    // Set up work
+    threadpool->cgraph        = cgraph;
+    threadpool->cplan         = cplan;
+    threadpool->n_threads_cur = n_threads;
+
+    if (!disposable_threadpool) {
+        // Reset some of the parameters that need resetting
+        // No worker threads should be accessing the parameters below at this stage
+        threadpool->n_barrier        = 0;
+        threadpool->n_barrier_passed = 0;
+        threadpool->current_chunk    = 0;
+        threadpool->ec               = GGML_STATUS_SUCCESS;
+    }
 
 #ifdef GGML_USE_OPENMP
     if (n_threads > 1) {
@@ -18847,63 +19340,52 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             {
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
-                state_shared.n_threads = n_threads;
+                threadpool->n_threads_cur = n_threads;
             }
 
             struct ggml_compute_state worker = {
-                .thrd   = 0,
-                .ith    = omp_get_thread_num(),
-                .shared = &state_shared,
+                .ith        = omp_get_thread_num(),
+                .threadpool = threadpool,
             };
             ggml_graph_compute_thread(&worker);
         }
     } else {
         struct ggml_compute_state worker = {
-            .thrd   = 0,
-            .ith    = 0,
-            .shared = &state_shared,
+            .ith        = 0,
+            .threadpool = threadpool,
         };
         ggml_graph_compute_thread(&worker);
     }
 #else
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    for (int j = 0; j < n_threads; ++j) {
-        workers[j] = (struct ggml_compute_state) {
-            .thrd   = 0,
-            .ith    = j,
-            .shared = &state_shared,
-        };
-    }
-
-    // create thread pool
-    for (int j = 1; j < n_threads; ++j) {
-        const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-        GGML_ASSERT(rc == 0);
-        UNUSED(rc);
-    }
-
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+    if (!disposable_threadpool) {
+        // Update main thread affinity to match the current threadpool
+        if (threadpool->workers[0].mask_specified) {
+            __thread_affinity(threadpool->workers[0].cpumask);
+        }
 
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
+        threadpool->new_work = true;
+        if (!threadpool->poll) {
+            ggml_mutex_lock(&threadpool->mutex);
+            ggml_cond_broadcast(&threadpool->cond);
+            ggml_mutex_unlock(&threadpool->mutex);
         }
     }
+    // this is a work thread too
+    ggml_graph_compute_thread(&threadpool->workers[0]);
 #endif
 
     // don't leave affinity set on the main thread
     clear_numa_thread_affinity();
 
-    return state_shared.ec;
+    const enum ggml_status ec = threadpool->ec; // read before a disposable pool is freed below
+    if (disposable_threadpool) {
+        ggml_release_threadpool(threadpool);
+    }
+    return ec;
 }
 
 enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL);
 
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
 
@@ -19698,7 +20180,7 @@ static enum ggml_opt_result ggml_opt_adam(
 
     float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values
 
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
@@ -20045,7 +20527,7 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         opt->iter = iter;
     }
 
-    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL);
     struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size);
     cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
 
diff --git a/include/llama.h b/include/llama.h
index 413070d95a5c4..9a2ccb1710f9a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -430,6 +430,18 @@ extern "C" {
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 
+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
+    LLAMA_API void llama_attach_threadpool(
+               struct   llama_context * ctx,
+            ggml_compute_threadpool_t   threadpool);
+    LLAMA_API void llama_attach_batch_threadpool(
+               struct   llama_context * ctx,
+            ggml_compute_threadpool_t   threadpool);
+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+    LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
+    LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
+
+
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 972f870b072b8..30a39e34d60b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2690,6 +2690,9 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
+    ggml_compute_threadpool_t threadpool       = nullptr;
+    ggml_compute_threadpool_t threadpool_batch = nullptr;
+
     bool has_evaluated_once = false;
 
     int64_t t_start_us;
@@ -14384,11 +14387,11 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     return n_outputs_max;
 }
 
-
 static void llama_graph_compute(
-        llama_context & lctx,
-          ggml_cgraph * gf,
-                  int   n_threads) {
+                  llama_context & lctx,
+                    ggml_cgraph * gf,
+                            int   n_threads,
+        ggml_compute_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -14397,6 +14400,7 @@ static void llama_graph_compute(
 
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 #ifdef GGML_USE_BLAS
@@ -14410,6 +14414,42 @@ static void llama_graph_compute(
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
 }
 
+// Optionally swaps the batch and single-tok threadpools.
+// Returns the number of threads, and if a valid threadpool exists, returns it too.
+static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
+        llama_context & lctx,
+              int32_t   n_tokens) {
+
+    const auto & cparams = lctx.cparams;
+    int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+
+    ggml_compute_threadpool_t threadpool = nullptr;  // nullptr -> disposable threadpool
+
+    // A batch threadpool without a non-batch threadpool isn't supported.
+    GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool);
+
+    if (lctx.threadpool_batch && lctx.threadpool) {
+        // Switch between the 2 threadpools as needed
+        if (n_tokens > 1) {
+            ggml_pause_threadpool(lctx.threadpool);
+            ggml_resume_threadpool(lctx.threadpool_batch);
+            threadpool = lctx.threadpool_batch;
+            n_threads = cparams.n_threads_batch;
+        } else {
+            ggml_pause_threadpool(lctx.threadpool_batch);
+            ggml_resume_threadpool(lctx.threadpool);
+            threadpool = lctx.threadpool;
+            n_threads = cparams.n_threads;
+        }
+    } else if (lctx.threadpool) {
+        ggml_resume_threadpool(lctx.threadpool);
+        threadpool = lctx.threadpool;
+        n_threads = cparams.n_threads;
+    }
+    return std::make_pair(n_threads, threadpool);
+}
+
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
@@ -14533,7 +14573,12 @@ static int llama_decode_internal(
             lctx.n_outputs = n_outputs_new;
         }
 
-        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+        std::pair<int32_t, ggml_compute_threadpool_t> threads =
+            llama_swap_threadpools(lctx, n_tokens);
+
+        int32_t n_threads                    = threads.first;
+        ggml_compute_threadpool_t threadpool = threads.second;
+
         GGML_ASSERT(n_threads > 0);
 
         // helpers for smoother batch API transition
@@ -14618,7 +14663,7 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, u_batch);
 
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
 
         // update the kv ring buffer
         {
@@ -14779,7 +14824,11 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
 
-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    std::pair<int32_t, ggml_compute_threadpool_t> threads =
+        llama_swap_threadpools(lctx, n_tokens);
+
+    int32_t n_threads                    = threads.first;
+    ggml_compute_threadpool_t threadpool = threads.second;
     GGML_ASSERT(n_threads > 0);
 
     // helpers for smoother batch API transition
@@ -14822,7 +14871,7 @@ static int llama_encode_internal(
 
     llama_set_inputs(lctx, batch);
 
-    llama_graph_compute(lctx, gf, n_threads);
+    llama_graph_compute(lctx, gf, n_threads, threadpool);
 
     // extract embeddings
     if (embd) {
@@ -15067,7 +15116,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif
 
     //const int64_t t_end = ggml_time_us();
@@ -15093,7 +15142,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
             llama_set_k_shift(lctx);
 
-            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
             need_reserve = true;
         }
@@ -15119,7 +15168,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
             llama_set_s_copy(lctx);
 
-            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 
             need_reserve = true;
         }
@@ -16361,6 +16410,31 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
     }
 }
 
+void llama_attach_threadpool(
+             struct llama_context * ctx,
+        ggml_compute_threadpool_t   threadpool) {
+    ctx->threadpool = threadpool;
+}
+
+void llama_attach_batch_threadpool(
+             struct llama_context * ctx,
+        ggml_compute_threadpool_t   threadpool_batch) {
+    ctx->threadpool_batch = threadpool_batch;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool = nullptr;
+}
+
+void llama_detach_batch_threadpool(struct llama_context * ctx) {
+    ctx->threadpool_batch = nullptr; // clears only the batch pool; ctx->threadpool is left attached
+}
+
+void llama_detach_threadpools(struct llama_context * ctx) {
+    llama_detach_threadpool(ctx);
+    llama_detach_batch_threadpool(ctx);
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 8159e276af617..246bb227d1e19 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 }
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);

From a4e97f320c7c1fd82d104a844cafc0fa3bb53d13 Mon Sep 17 00:00:00 2001
From: fmz <quic_fzaghlou@quic.com>
Date: Tue, 23 Jul 2024 06:57:43 -0700
Subject: [PATCH 2/4] uncomment cpu-relax

---
 ggml/src/ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index b0630729f9175..d2deb0f6530dd 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -19144,7 +19144,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
         if (threadpool->poll) {
             while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
                 // No new work. Yield and keep polling.
-                //__cpu_relax();
+                __cpu_relax();
             }
         } else {
             ggml_mutex_lock_shared(&threadpool->mutex);

From bc7eaecfe42435aa0aa699f389cefb53c46aa808 Mon Sep 17 00:00:00 2001
From: fmz <quic_fzaghlou@quic.com>
Date: Tue, 23 Jul 2024 12:15:08 -0700
Subject: [PATCH 3/4] re-enable speculative

... facing segfaults on master ...
---
 CMakePresets.json                    | 256 +++++++--------------------
 examples/CMakeLists.txt              |   2 +-
 examples/speculative/speculative.cpp |  48 ++++-
 ggml/src/ggml.c                      |   2 +-
 include/llama.h                      |   2 +
 src/llama.cpp                        |   9 +
 6 files changed, 119 insertions(+), 200 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index ae2bf25c12786..bdad38952d3cb 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,197 +1,65 @@
 {
-    "version": 4,
-    "configurePresets": [
-        {
-            "name": "base",
-            "hidden": true,
-            "generator": "Ninja",
-            "binaryDir": "${sourceDir}/build-${presetName}",
-            "cacheVariables": {
-                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-            }
-        },
-        {
-            "name": "sycl-base",
-            "hidden": true,
-            "generator": "Ninja",
-            "binaryDir": "${sourceDir}/build-${presetName}",
-            "cacheVariables": {
-                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-                "CMAKE_CXX_COMPILER": "icx",
-                "CMAKE_C_COMPILER": "cl",
-                "GGML_SYCL": "ON",
-                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-            }
-        },
-        {
-            "name": "debug",
-            "hidden": true,
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Debug"
-            }
-        },
-        {
-            "name": "release",
-            "hidden": true,
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "Release"
-            }
-        },
-        {
-            "name": "reldbg",
-            "hidden": true,
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "RelWithDebInfo"
-            }
-        },
-        {
-            "name": "static",
-            "hidden": true,
-            "cacheVariables": {
-                "GGML_STATIC": "ON"
-            }
-        },
-        {
-            "name": "arm64-windows-msvc",
-            "hidden": true,
-            "architecture": {
-                "value": "arm64",
-                "strategy": "external"
-            },
-            "toolset": {
-                "value": "host=x86_64",
-                "strategy": "external"
-            },
-            "cacheVariables": {
-                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-            }
-        },
-        {
-            "name": "arm64-windows-llvm",
-            "hidden": true,
-            "architecture": {
-                "value": "arm64",
-                "strategy": "external"
-            },
-            "toolset": {
-                "value": "host=x86_64",
-                "strategy": "external"
-            },
-            "cacheVariables": {
-                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
-            }
-        },
-        {
-            "name": "arm64-windows-llvm-debug",
-            "inherits": [
-                "base",
-                "arm64-windows-llvm",
-                "debug"
-            ]
-        },
-        {
-            "name": "arm64-windows-llvm-release",
-            "inherits": [
-                "base",
-                "arm64-windows-llvm",
-                "reldbg"
-            ]
-        },
-        {
-            "name": "arm64-windows-llvm+static-release",
-            "inherits": [
-                "base",
-                "arm64-windows-llvm",
-                "reldbg",
-                "static"
-            ]
-        },
-        {
-            "name": "arm64-windows-msvc-debug",
-            "inherits": [
-                "base",
-                "arm64-windows-msvc",
-                "debug"
-            ]
-        },
-        {
-            "name": "arm64-windows-msvc-release",
-            "inherits": [
-                "base",
-                "arm64-windows-msvc",
-                "reldbg"
-            ]
-        },
-        {
-            "name": "arm64-windows-msvc+static-release",
-            "inherits": [
-                "base",
-                "arm64-windows-msvc",
-                "reldbg",
-                "static"
-            ]
-        },
-        {
-            "name": "x64-windows-msvc-debug",
-            "inherits": [
-                "base",
-                "debug"
-            ]
-        },
-        {
-            "name": "x64-windows-msvc-release",
-            "inherits": [
-                "base",
-                "reldbg"
-            ]
-        },
-        {
-            "name": "x64-windows-msvc+static-release",
-            "inherits": [
-                "base",
-                "reldbg",
-                "static"
-            ]
-        },
-        {
-            "name": "x64-windows-sycl-debug",
-            "inherits": [
-                "sycl-base",
-                "debug"
-            ]
-        },
-        {
-            "name": "x64-windows-sycl-release",
-            "inherits": [
-                "sycl-base",
-                "release"
-            ]
-        },
-        {
-            "name": "clang10",
-            "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu",
-            "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++",
-            "binaryDir": "${sourceDir}/out/build/${presetName}",
-            "cacheVariables": {
-                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
-                "CMAKE_C_COMPILER": "/usr/bin/clang",
-                "CMAKE_CXX_COMPILER": "/usr/bin/clang++",
-                "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10",
-                "CMAKE_BUILD_TYPE": "Debug"
-            }
-        },
-        {
-            "name": "gcc8.4",
-            "displayName": "GCC 8.4.0 x86_64-linux-gnu",
-            "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++",
-            "binaryDir": "${sourceDir}/out/build/${presetName}",
-            "cacheVariables": {
-                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
-                "CMAKE_C_COMPILER": "/usr/bin/gcc",
-                "CMAKE_CXX_COMPILER": "/usr/bin/g++",
-                "CMAKE_BUILD_TYPE": "Debug"
-            }
+  "version": 4,
+  "configurePresets": [
+    {
+        "name":  "base",
+        "hidden": true,
+        "generator":   "Ninja",
+        "binaryDir":   "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
         }
-    ]
+    },
+    {
+        "name": "sycl-base",
+        "hidden": true,
+        "generator": "Ninja",
+        "binaryDir": "${sourceDir}/build-${presetName}",
+        "cacheVariables": {
+            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+            "CMAKE_CXX_COMPILER": "icx",
+            "CMAKE_C_COMPILER": "cl",
+            "GGML_SYCL": "ON",
+            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+        }
+    },
+    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+
+    {
+        "name": "arm64-windows-msvc", "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+        }
+    },
+
+    {
+        "name": "arm64-windows-llvm", "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+        }
+    },
+
+    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
+    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg" ] },
+    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "reldbg", "static" ] },
+
+    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
+    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg" ] },
+    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "reldbg", "static" ] },
+
+    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
+    { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+    { "name": "x64-windows-sycl-debug"  , "inherits": [ "sycl-base", "debug"   ] },
+    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+  ]
 }
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 247d52c6d3454..67b3d27747850 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    #add_subdirectory(speculative)
+    add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 0939a1a6a7a38..8580839bc58ed 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -24,6 +24,14 @@ struct seq_draft {
     struct llama_sampling_context * ctx_sampling;
 };
 
+// static void switch_active_threadpool(
+//     llama_context* cur,
+//     llama_context* nxt
+// ) {
+//     ggml_pause_threadpool(cur);
+//     ggml_resume_threadpool(nxt);
+// }
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
@@ -68,15 +76,43 @@ int main(int argc, char ** argv) {
     // load the target model
     std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
+    ggml_threadpool_params tpp_batch_tgt =
+        ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    ggml_threadpool_params tpp_tgt = ggml_threadpool_params_from_cpu_params(params.cpuparams);
+    struct ggml_compute_threadpool * threadpool_batch_tgt = ggml_create_threadpool(&tpp_batch_tgt);
+    if (!threadpool_batch_tgt) {
+        LOG_TEE("%s: target batch threadpool create failed : n_threads %d\n", __func__, tpp_batch_tgt.n_threads);
+        exit(1);
+    }
+    ggml_compute_threadpool * threadpool_tgt = ggml_create_threadpool(&tpp_tgt);
+    if (!threadpool_tgt) {
+        LOG_TEE("%s: target threadpool create failed : n_threads %d\n", __func__, tpp_tgt.n_threads);
+        exit(1);
+    }
+    llama_attach_batch_threadpool(ctx_tgt, threadpool_batch_tgt);
+    llama_attach_threadpool(ctx_tgt, threadpool_tgt);
+
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
-    }
-    params.n_threads_batch = params.n_threads_batch_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
+    ggml_threadpool_params tpp_batch_dft =
+        ggml_threadpool_params_from_cpu_params(params.draft_cpuparams_batch);
+    ggml_threadpool_params tpp_dft = ggml_threadpool_params_from_cpu_params(params.draft_cpuparams);
+    struct ggml_compute_threadpool * threadpool_batch_dft = ggml_create_threadpool(&tpp_batch_dft); // draft model's batch pool
+    if (!threadpool_batch_dft) {
+        LOG_TEE("%s: draft batch threadpool create failed : n_threads %d\n", __func__, tpp_batch_dft.n_threads);
+        exit(1);
+    }
+    ggml_compute_threadpool * threadpool_dft = ggml_create_threadpool(&tpp_dft); // draft model's non-batch pool
+    if (!threadpool_dft) {
+        LOG_TEE("%s: draft threadpool create failed : n_threads %d\n", __func__, tpp_dft.n_threads);
+        exit(1);
+    }
+    llama_attach_batch_threadpool(ctx_dft, threadpool_batch_dft); // draft context gets its own batch pool, not the target's
+    llama_attach_threadpool(ctx_dft, threadpool_dft);
+
     const bool vocab_type_tgt = llama_vocab_type(model_tgt);
     LOG("vocab_type tgt: %d\n", vocab_type_tgt);
 
@@ -154,6 +190,7 @@ int main(int argc, char ** argv) {
     // eval the prompt with both models
     llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
     llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
+    llama_pause_threadpools(ctx_tgt);
     llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input,     0,           0));
 
     const auto t_enc_end = ggml_time_us();
@@ -550,6 +587,8 @@ int main(int argc, char ** argv) {
                 break;
             }
         }
+        llama_pause_threadpools(ctx_dft);
+
 
         // evaluate the target model on the drafted tokens
         {
@@ -560,6 +599,7 @@ int main(int argc, char ** argv) {
 
             // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
             llama_decode(ctx_tgt, batch_tgt);
+            llama_pause_threadpools(ctx_tgt);
             ++n_past_tgt;
         }
 
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index d2deb0f6530dd..f47035ce19c6a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18751,7 +18751,7 @@ static bool __thread_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
diff --git a/include/llama.h b/include/llama.h
index 9a2ccb1710f9a..88b56e6a15ac2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -441,6 +441,8 @@ extern "C" {
     LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
     LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
 
+    // Pauses all attached threadpools
+    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
diff --git a/src/llama.cpp b/src/llama.cpp
index 30a39e34d60b3..e8aead263fec5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16435,6 +16435,15 @@ void llama_detach_threadpools(struct llama_context * ctx) {
     llama_detach_batch_threadpool(ctx);
 }
 
+void llama_pause_threadpools(struct llama_context * ctx) { // pause every threadpool currently attached to ctx
+    if (ctx->threadpool) { // skip when no pool is attached
+        ggml_pause_threadpool(ctx->threadpool);
+    }
+    if (ctx->threadpool_batch) { // the batch pool is checked and paused independently
+        ggml_pause_threadpool(ctx->threadpool_batch);
+    }
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }

From e317ab61b98e613718e23c30771fdd66de04ad4d Mon Sep 17 00:00:00 2001
From: fmz <quic_fzaghlou@quic.com>
Date: Thu, 25 Jul 2024 15:09:11 -0400
Subject: [PATCH 4/4] add _GNU_SOURCE

---
 common/common.cpp                    | 1 -
 examples/export-lora/export-lora.cpp | 2 +-
 ggml/src/ggml.c                      | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 7eb0850d7f889..3ad5f33818d8e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1911,7 +1911,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
     options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
     options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
 
     printf("usage: %s [options]\n", argv[0]);
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 124ee167d2255..69467004bd7c2 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
 
     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.cpuparams.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f47035ce19c6a..3d662caf8bdb0 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -18740,6 +18740,7 @@ static bool __thread_priority(int32_t prio) {
 
 #ifndef __USE_GNU
 #define __USE_GNU
+#define _GNU_SOURCE
 #endif
 #include <sched.h>