From 9328133b3b3a57747442c7f3566edbbd4cf5ae9f Mon Sep 17 00:00:00 2001 From: fmz Date: Thu, 11 Jul 2024 07:52:23 -0700 Subject: [PATCH 1/4] Introduce ggml_compute_threadpool - OpenMP functional: check - Vanilla ggml functional: Check - ggml w/threadpool functional: Check - OpenMP no regression: No glaring problems - Vanilla ggml no regression: No glaring problems - ggml w/threadpool no regression: No glaring problems --- CMakePresets.json | 256 ++++-- common/common.cpp | 291 ++++++- common/common.h | 29 +- examples/CMakeLists.txt | 2 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/benchmark/benchmark-matmult.cpp | 2 +- .../cvector-generator/cvector-generator.cpp | 4 +- examples/llama-bench/llama-bench.cpp | 51 ++ examples/llava/llava-cli.cpp | 4 +- examples/main/main.cpp | 30 + examples/server/server.cpp | 4 +- ggml/CMakeLists.txt | 2 +- ggml/include/ggml-alloc.h | 5 +- ggml/include/ggml-backend.h | 1 + ggml/include/ggml.h | 27 +- ggml/src/ggml-backend.c | 16 +- ggml/src/ggml.c | 800 ++++++++++++++---- include/llama.h | 12 + src/llama.cpp | 96 ++- tests/test-rope.cpp | 2 +- 20 files changed, 1360 insertions(+), 276 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index bdad38952d3cb..ae2bf25c12786 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,65 +1,197 @@ { - "version": 4, - "configurePresets": [ - { - "name": "base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + "version": 4, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
+ } + }, + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "CMAKE_C_COMPILER": "cl", + "GGML_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, + { + "name": "debug", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug" + } + }, + { + "name": "release", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + }, + { + "name": "reldbg", + "hidden": true, + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo" + } + }, + { + "name": "static", + "hidden": true, + "cacheVariables": { + "GGML_STATIC": "ON" + } + }, + { + "name": "arm64-windows-msvc", + "hidden": true, + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "toolset": { + "value": "host=x86_64", + "strategy": "external" + }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" + } + }, + { + "name": "arm64-windows-llvm", + "hidden": true, + "architecture": { + "value": "arm64", + "strategy": "external" + }, + "toolset": { + "value": "host=x86_64", + "strategy": "external" + }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" + } + }, + { + "name": "arm64-windows-llvm-debug", + "inherits": [ + "base", + "arm64-windows-llvm", + "debug" + ] + }, + { + "name": "arm64-windows-llvm-release", + "inherits": [ + "base", + "arm64-windows-llvm", + "reldbg" + ] + }, + { + "name": "arm64-windows-llvm+static-release", + "inherits": [ + "base", + "arm64-windows-llvm", + "reldbg", + "static" + ] + }, + { + "name": "arm64-windows-msvc-debug", + "inherits": [ + "base", + "arm64-windows-msvc", + "debug" + ] + }, + { + "name": "arm64-windows-msvc-release", + "inherits": [ + "base", + "arm64-windows-msvc", + "reldbg" + ] + }, + { + "name": "arm64-windows-msvc+static-release", + "inherits": [ + "base", + 
"arm64-windows-msvc", + "reldbg", + "static" + ] + }, + { + "name": "x64-windows-msvc-debug", + "inherits": [ + "base", + "debug" + ] + }, + { + "name": "x64-windows-msvc-release", + "inherits": [ + "base", + "reldbg" + ] + }, + { + "name": "x64-windows-msvc+static-release", + "inherits": [ + "base", + "reldbg", + "static" + ] + }, + { + "name": "x64-windows-sycl-debug", + "inherits": [ + "sycl-base", + "debug" + ] + }, + { + "name": "x64-windows-sycl-release", + "inherits": [ + "sycl-base", + "release" + ] + }, + { + "name": "clang10", + "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu", + "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/clang", + "CMAKE_CXX_COMPILER": "/usr/bin/clang++", + "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10", + "CMAKE_BUILD_TYPE": "Debug" + } + }, + { + "name": "gcc8.4", + "displayName": "GCC 8.4.0 x86_64-linux-gnu", + "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/gcc", + "CMAKE_CXX_COMPILER": "/usr/bin/g++", + "CMAKE_BUILD_TYPE": "Debug" + } } - }, - { - "name": "sycl-base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_CXX_COMPILER": "icx", - "CMAKE_C_COMPILER": "cl", - "GGML_SYCL": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." 
- } - }, - { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, - { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, - { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, - { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, - - { - "name": "arm64-windows-msvc", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" - } - }, - - { - "name": "arm64-windows-llvm", "hidden": true, - "architecture": { "value": "arm64", "strategy": "external" }, - "toolset": { "value": "host=x86_64", "strategy": "external" }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" - } - }, - - { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, - { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, - { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, - - { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, - { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, - { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, - - { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, - { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, - { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, - - { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, - { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } - ] + ] } 
diff --git a/common/common.cpp b/common/common.cpp index ec44a05521c9d..7eb0850d7f889 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -222,6 +222,36 @@ void gpt_params_handle_model_default(gpt_params & params) { } } +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { + int32_t n_set = 0; + + if (cpuparams.n_threads < 0) { + // Assuming everything about cpuparams is invalid + if (role_model != nullptr) { + cpuparams = *role_model; + } else { + cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (cpuparams.cpumask[i]) { + n_set++; + } + } + + if (n_set == 0) { + // No CPU selected in the mask: allow every CPU + memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS); + n_set = GGML_MAX_N_THREADS; + } + + if (n_set < cpuparams.n_threads) { + // Not enough set bits, may experience performance issues. + fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -241,6 +271,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch); + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -285,6 +320,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return true; } +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { + size_t dash_loc = range.find('-'); + if (dash_loc == std::string::npos) { + fprintf(stderr, "Format of 
CPU range is invalid! Expected []-[].\n"); + return false; + } + + size_t start_i; + size_t end_i; + + if (dash_loc == 0) { + start_i = 0; + } else { + start_i = std::stoull(range.substr(0, dash_loc)); + if (start_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "Start index out of bounds!\n"); + return false; + } + } + + if (dash_loc == range.length() - 1) { + end_i = GGML_MAX_N_THREADS - 1; + } else { + end_i = std::stoull(range.substr(dash_loc + 1)); + if (end_i >= GGML_MAX_N_THREADS) { + fprintf(stderr, "End index out of bounds!\n"); + return false; + } + } + + for (size_t i = start_i; i <= end_i; i++) { + boolmask[i] = true; + } + + return true; +} + +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) { + // Discard potential 0x prefix + size_t start_i = 0; + if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { + start_i = 2; + } + + size_t num_digits = mask.length() - start_i; + if (num_digits > 128) num_digits = 128; + + size_t end_i = num_digits + start_i; + + for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) { + char c = mask.at(i); + int8_t id = c; + + if ((c >= '0' && c <= '9')) { + id -= '0'; + } else if (c >= 'a' && c <= 'f') { + id -= 'a' - 10; + } else if (c >= 'A' && c <= 'F') { + id -= 'A' - 10; + } else { + fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + return false; + } + + boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0); + boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0); + boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0); + boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0); + } + + return true; +} + #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { @@ -301,36 +409,137 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "-t" || arg == "--threads") { 
CHECK_ARG - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); + params.cpuparams.n_threads = std::stoi(argv[i]); + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-C" || arg == "--cpu-mask") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + return true; + } + if (arg == "-Cr" || arg == "--cpu-range") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); + return true; + } + if (arg == "--prio") { + CHECK_ARG + params.cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll") { + params.cpuparams.poll = true; + return true; + } if (arg == "-tb" || arg == "--threads-batch") { CHECK_ARG - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); + params.cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Cb" || arg == "--cpu-mask-batch") { + CHECK_ARG + std::string mask = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "-Crb" || arg == "--cpu-range-batch") { + CHECK_ARG + std::string range = argv[i]; + params.cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch") { + CHECK_ARG + params.cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + 
if (arg == "--cpu-strict-batch") { + params.cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch") { + params.cpuparams_batch.poll = true; + return true; + } if (arg == "-td" || arg == "--threads-draft") { CHECK_ARG - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } return true; + } + if (arg == "-Cd" || arg == "--cpu-mask-draft") { + CHECK_ARG + std::string mask = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "-Crd" || arg == "--cpu-range-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams.mask_valid = true; + invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); + return true; + } + if (arg == "--prio-draft") { + CHECK_ARG + params.draft_cpuparams.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-draft") { + params.draft_cpuparams.strict_cpu = true; + return true; + } + if (arg == "--poll-draft") { + params.draft_cpuparams.poll = true; + return true; } if (arg == "-tbd" || arg == "--threads-batch-draft") { CHECK_ARG - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); + params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } return true; } + if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { + CHECK_ARG + std::string range = argv[i]; + params.draft_cpuparams_batch.mask_valid = true; + invalid_param = !parse_cpu_range(range, 
params.draft_cpuparams_batch.cpumask); + return true; + } + if (arg == "--prio-batch-draft") { + CHECK_ARG + params.draft_cpuparams_batch.priority = std::stoul(argv[i]); + return true; + } + if (arg == "--cpu-strict-batch-draft") { + params.draft_cpuparams_batch.strict_cpu = true; + return true; + } + if (arg == "--poll-batch-draft") { + params.draft_cpuparams_batch.poll = true; + return true; + } if (arg == "-p" || arg == "--prompt") { CHECK_ARG params.prompt = argv[i]; @@ -1401,11 +1610,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); + options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); + options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask"}); + options.push_back({ "*", " --cpu-strict", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); + options.push_back({ "*", " --prio N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); + options.push_back({ "*", " --poll", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll}); options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); + options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); + options.push_back({ "*", " --cpu-strict-batch", "use strict CPU placement (default: same as --cpu-strict)"}); + options.push_back({ "*", " --prio-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio)"}); + options.push_back({ "*", " --poll-batch", "use polling to wait for work (default: --poll)"}); options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); + options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft"}); + options.push_back({ "speculative", " --cpu-strict-draft", "Use strict CPU placement for draft model (default: same as --cpu-strict)"}); + options.push_back({ "speculative", " --prio-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --prio)"}); + options.push_back({ "speculative", " --poll-draft", "Use polling to wait for draft model work (default: same as --poll)"}); options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M", + "Draft model CPU affinity mask. Complements --cpu-range-batch-draft (default: same as --cpu-mask-draft)"}); + options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft"}); + options.push_back({ "speculative", " --cpu-strict-batch-draft", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); + options.push_back({ "speculative", " --prio-batch-draft N", + "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --prio-draft)"}); + options.push_back({ "speculative", " --poll-batch-draft", "Use polling to wait for draft model work (default: --poll-draft)"}); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", @@ -1707,9 +1943,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; - os << "system_info: n_threads = " << 
params.n_threads; - if (params.n_threads_batch != -1) { - os << " (n_threads_batch = " << params.n_threads_batch << ")"; + os << "system_info: n_threads = " << params.cpuparams.n_threads; + if (params.cpuparams_batch.n_threads != -1) { + os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")"; } os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info(); @@ -2192,8 +2428,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + cparams.n_threads = params.cpuparams.n_threads; + cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ? + params.cpuparams.n_threads : params.cpuparams_batch.n_threads; cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embeddings = params.embedding; @@ -2219,6 +2456,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param return cparams; } +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { + struct ggml_threadpool_params tpp = {}; // zero-init so cpumask is defined even when no mask was given + + tpp.mask_specified = params.mask_valid; + if (params.mask_valid) { + std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS); + } + + tpp.n_threads = params.n_threads; + tpp.prio = params.priority; + tpp.poll = params.poll; + tpp.strict_cpu = params.strict_cpu; + + return tpp; +} + #ifdef LLAMA_USE_CURL static bool starts_with(const std::string & str, const std::string & prefix) { @@ -3215,7 +3468,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %u\n", params.n_threads, 
std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); diff --git a/common/common.h b/common/common.h index 8240ff99b8e2a..f9aa61edf4a6b 100644 --- a/common/common.h +++ b/common/common.h @@ -58,13 +58,18 @@ enum dimre_method { DIMRE_METHOD_MEAN, }; +struct cpu_params { + int32_t n_threads = -1; + bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + bool poll = false; // Use polling (busywait) to wait for work +}; + struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -91,6 +96,11 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + struct cpu_params draft_cpuparams; + struct cpu_params draft_cpuparams_batch; + ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -195,7 +205,7 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout 
in seconds - int32_t n_threads_http = -1; // number of threads to process HTTP requests + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -268,6 +278,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params); +bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); + // // String utils // @@ -311,8 +325,9 @@ std::string fs_get_cache_file(const std::string & filename); // TODO: avoid tuplue, use struct std::tuple llama_init_from_gpt_params(gpt_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); +struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 67b3d27747850..247d52c6d3454 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,6 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) - add_subdirectory(speculative) + 
#add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 4f6c3746a106c..22818304fc6ff 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -19,7 +19,7 @@ constexpr float rms_norm_eps = 5e-6f; #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 47cb16c69d536..e78f6b388ef6e 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -21,7 +21,7 @@ #endif static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d4e126ac22e6f..fa40be670268f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -485,8 +485,8 @@ int main(int argc, char ** argv) { if (use_pca) { // run PCA PCA::pca_params pca_params; - pca_params.n_threads = params.n_threads; - pca_params.n_batch = params.n_pca_batch; + pca_params.n_threads = params.cpuparams.n_threads; + pca_params.n_batch = params.n_pca_batch; pca_params.n_iterations = params.n_pca_iterations; PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); } else { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a6497b6e0bf82..a4b3b3bb8cd8f 100644 --- 
a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -206,6 +206,7 @@ struct cmd_params { std::vector use_mmap; std::vector embeddings; ggml_numa_strategy numa; + cpu_params cpuparams; int reps; bool verbose; output_formats output_format; @@ -232,6 +233,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* cpuparams */ {}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN, @@ -260,6 +262,11 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); + printf(" -mt, --max-threads (default: %d)\n", cmd_params_defaults.cpuparams.n_threads); + printf(" -C, --cpu-mask (default: 0x0)\n"); + printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); + printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); + printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -463,6 +470,30 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } + } else if (arg == "-mt" || arg == "--max-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.n_threads = std::stoi(argv[i]); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + 
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.priority = std::stoul(argv[i]); + } else if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + } else if (arg == "--poll") { + params.cpuparams.poll = true; } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { invalid_param = true; @@ -1373,6 +1404,23 @@ int main(int argc, char ** argv) { llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; + postprocess_cpu_params(params.cpuparams); + + struct ggml_threadpool_params tpp; + tpp.n_threads = params.cpuparams.n_threads; + tpp.mask_specified = params.cpuparams.mask_valid; + tpp.strict_cpu = params.cpuparams.strict_cpu; + tpp.prio = params.cpuparams.priority; + tpp.poll = params.cpuparams.poll; + + std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + + struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + for (const auto & inst : params_instances) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1398,6 +1446,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); llama_kv_cache_clear(ctx); + llama_attach_threadpool(ctx, threadpool); // warmup run if (t.n_prompt > 0) { @@ -1439,6 +1488,8 @@ int main(int argc, char ** argv) { llama_free(ctx); } + ggml_release_threadpool(threadpool); + llama_free_model(lmodel); if (p) { diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 8c7dd2ae3d0dc..86b39f20eea6e 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n"); } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); + embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { LOG_TEE("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embed) { fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); return NULL; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 61e960ea2abe6..ef5b0946c0a1e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -218,6 +218,33 @@ int main(int argc, char ** argv) { return 1; } + LOG("%s: llama threadpool init = n_threads = %d\n", + __func__, + (int32_t) params.cpuparams.n_threads + ); + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch); + if (!threadpool_batch) { + LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + exit(1); + } + struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + + llama_attach_batch_threadpool(ctx, threadpool_batch); + llama_attach_threadpool(ctx, threadpool); + if (ctx_guidance) { + llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); + llama_attach_threadpool(ctx_guidance, threadpool); + } + const int 
n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG("n_ctx: %d\n", n_ctx); @@ -986,6 +1013,9 @@ int main(int argc, char ** argv) { llama_sampling_free(ctx_sampling); llama_backend_free(); + ggml_release_threadpool(threadpool); + ggml_release_threadpool(threadpool_batch); + #ifndef LOG_DISABLE_LOGS LOG_TEE("Log end\n"); #endif // LOG_DISABLE_LOGS diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7813a2957d6bc..96404fc53d544 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2520,8 +2520,8 @@ int main(int argc, char ** argv) { }); LOG_INFO("system info", { - {"n_threads", params.n_threads}, - {"n_threads_batch", params.n_threads_batch}, + {"n_threads", params.cpuparams.n_threads}, + {"n_threads_batch", params.cpuparams_batch.n_threads}, {"total_threads", std::thread::hardware_concurrency()}, {"system_info", llama_print_system_info()}, }); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index be22a74606c0b..a8050fa67b473 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -139,7 +139,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING "ggml: metal minimum macOS version") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -option(GGML_OPENMP "ggml: use OpenMP" ON) +option(GGML_OPENMP "ggml: use OpenMP" OFF) option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 434c13b34a929..cd85b6ee70560 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -7,8 +7,9 @@ extern "C" { #endif typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; -typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_backend_buffer * 
ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; +typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t; // Tensor allocator struct ggml_tallocr { diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 5f3f1e286990e..c59f9f54a44b9 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -102,6 +102,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool); GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); // Create a backend buffer from an existing pointer diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 548661b9bb636..e58ef9f340d77 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -231,6 +231,8 @@ #define GGML_MAX_SRC 10 #ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#define GGML_MAX_N_THREADS 512 + #endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 @@ -617,6 +619,17 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; + bool mask_specified; + int32_t n_threads; + int32_t prio; + bool poll; + bool strict_cpu; + }; + + struct ggml_compute_threadpool; // forward declaration, see ggml.c + // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -624,6 +637,7 @@ extern "C" { uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; + struct ggml_compute_threadpool * threadpool; // abort ggml_graph_compute when true ggml_abort_callback 
abort_callback; @@ -2003,10 +2017,19 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); + GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool); + GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool); + // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, + struct ggml_compute_threadpool * threadpool); + GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index d39cfed8886f4..d63f49cfdaf8c 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -727,7 +727,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { #endif struct ggml_backend_cpu_context { - int n_threads; + int n_threads; + ggml_compute_threadpool_t threadpool; + void * work_data; size_t work_size; @@ -764,7 
+766,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); - cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { @@ -801,7 +803,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; - struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); @@ -878,6 +880,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->threadpool = NULL; ctx->work_data = NULL; ctx->work_size = 0; ctx->abort_callback = NULL; @@ -908,6 +911,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } +void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->threadpool = threadpool; +} + void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) { GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f65837e856ac3..b0630729f9175 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1779,28 +1779,104 @@ struct ggml_context_container { struct ggml_context 
context; }; -struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; +// +// Threading defs +// + +typedef pthread_t ggml_thread_t; + +#if defined(_WIN32) + +typedef CONDITION_VARIABLE ggml_cond_t; +typedef SRWLOCK ggml_mutex_t; + +#define ggml_mutex_init(m) InitializeSRWLock(m) +#define ggml_mutex_destroy(m) +#define ggml_mutex_lock(m) AcquireSRWLockExclusive(m) +#define ggml_mutex_unlock(m) ReleaseSRWLockExclusive(m) +#define ggml_mutex_lock_shared(m) AcquireSRWLockShared(m) +#define ggml_mutex_unlock_shared(m) ReleaseSRWLockShared(m) + +#define ggml_cond_init(c) InitializeConditionVariable(c) +#define ggml_cond_destroy(c) +#define ggml_cond_wait(c, m) SleepConditionVariableSRW(c, m, INFINITE, CONDITION_VARIABLE_LOCKMODE_SHARED) +#define ggml_cond_broadcast(c) WakeAllConditionVariable(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + +#else + +typedef pthread_cond_t ggml_cond_t; +typedef pthread_mutex_t ggml_mutex_t; - int n_threads; +#define ggml_mutex_init(m) pthread_mutex_init(m, NULL) +#define ggml_mutex_destroy(m) pthread_mutex_destroy(m) +#define ggml_mutex_lock(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock(m) pthread_mutex_unlock(m) +#define ggml_mutex_lock_shared(m) pthread_mutex_lock(m) +#define ggml_mutex_unlock_shared(m) pthread_mutex_unlock(m) + +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#define ggml_lock_lock(x) _mm_pause() +#else +#define ggml_lock_lock(x) UNUSED(x) +#endif +#define ggml_lock_unlock(x) UNUSED(x) + +#define GGML_LOCK_INITIALIZER 0 +#define ggml_cond_init(c) pthread_cond_init(c, NULL) +#define ggml_cond_destroy(c) pthread_cond_destroy(c) +#define ggml_cond_wait(c, m) pthread_cond_wait(c, m) +#define ggml_cond_broadcast(c) pthread_cond_broadcast(c) + +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join + 
+#endif + +// Threadpool def +struct ggml_compute_threadpool { + ggml_mutex_t mutex; // mutex for cond.var + ggml_cond_t cond; // cond.var for waiting for new work + + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; // synchronization primitives atomic_int n_barrier; atomic_int n_barrier_passed; + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. + + volatile bool stop; // Used for stopping the threadpool altogether + volatile bool pause; // Used for pausing the threadpool or individual threads + volatile bool new_work; // Set when there is work to be done, unset after it's done + + struct ggml_compute_state * workers; // per thread state + int32_t n_threads_max; // number of threads in the pool + int32_t n_threads_cur; // number of threads used in the current graph + + int32_t prio; // Scheduling priority + bool disposable; // Doesn't initialize a conv-var + bool poll; // Use polling (busywait) // TODO ggml_abort_callback abort_callback; // abort ggml_graph_compute when true void * abort_callback_data; - atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads - enum ggml_status ec; }; +// Per-thread state struct ggml_compute_state { +#ifndef GGML_USE_OPENMP ggml_thread_t thrd; + bool cpumask[GGML_MAX_N_THREADS]; + bool mask_specified; +#endif + struct ggml_compute_threadpool * threadpool; int ith; - struct ggml_compute_state_shared * shared; }; struct ggml_compute_params { @@ -1811,7 +1887,7 @@ struct ggml_compute_params { size_t wsize; void * wdata; - struct ggml_compute_state_shared * shared; + struct ggml_compute_threadpool * threadpool; }; // @@ -2906,23 +2982,23 @@ inline static void ggml_critical_section_start(void) { } #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { + if (threadpool->n_threads_cur == 
1) { return; } #pragma omp barrier } #else -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { +static void ggml_barrier(struct ggml_compute_threadpool * threadpool) { + if (threadpool->n_threads_cur == 1) { return; } - atomic_int * n_barrier = &shared->n_barrier; - atomic_int * n_barrier_passed = &shared->n_barrier_passed; + atomic_int * n_barrier = &threadpool->n_barrier; + atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; - int n_threads = shared->n_threads; + int n_threads = threadpool->n_threads_cur; int passed_old = atomic_load(n_barrier_passed); if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { @@ -9904,7 +9980,7 @@ static void ggml_compute_forward_acc_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } const int ith = params->ith; @@ -12278,10 +12354,10 @@ UseGgmlGemm1:; if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. 
- atomic_store(&params->shared->current_chunk, nth); + atomic_store(&params->threadpool->current_chunk, nth); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { @@ -12389,7 +12465,7 @@ UseGgmlGemm2:; break; } - current_chunk = atomic_fetch_add(&params->shared->current_chunk, 1); + current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1); } } @@ -12484,7 +12560,7 @@ static void ggml_compute_forward_mul_mat_id( } } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { @@ -12638,7 +12714,7 @@ static void ggml_compute_forward_out_prod_f32( if (ith == 0) { ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // dst[:,:,:,:] = 0 // for i2,i3: @@ -12756,7 +12832,7 @@ static void ggml_compute_forward_out_prod_q_f32( if (ith == 0) { ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); // parallelize by last three dimensions @@ -12942,7 +13018,7 @@ static void ggml_compute_forward_set_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } const int ith = params->ith; @@ -13521,7 +13597,7 @@ static void ggml_compute_forward_diag_mask_f32( ((char *) src0->data), ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } // TODO: handle transposed/permuted matrices @@ -14297,7 +14373,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( // need to zero dst since we are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -14385,7 +14461,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( // need to zero dst since we
are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -14672,7 +14748,7 @@ static void ggml_compute_forward_conv_transpose_2d( memset(dst->data, 0, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int32_t stride = ggml_get_op_params_i32(dst, 0); @@ -15406,7 +15482,7 @@ static void ggml_compute_forward_flash_attn_back_f32( if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const int64_t elem_q = ggml_nelements(q); const int64_t elem_k = ggml_nelements(k); @@ -16178,7 +16254,7 @@ static void ggml_compute_forward_add_rel_pos_f32( if (params->ith == 0) { memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); } // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 @@ -16463,7 +16539,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); const double eps = 1e-9; @@ -16511,7 +16587,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( } #endif } - ggml_barrier(params->shared); + ggml_barrier(params->threadpool); if (ith == 0) { float * dp = (float *) dst->data; @@ -18282,65 +18358,6 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) { memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// 
-//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) -#endif -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { @@ -18617,9 +18634,292 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } -struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { +static thread_ret_t ggml_graph_compute_secondary_thread(void* data); + +enum { + SCHED_PRIO_NORMAL, + SCHED_PRIO_MEDIUM, + SCHED_PRIO_HIGH, + SCHED_PRIO_REALTIME +}; + +#if defined(_WIN32) +#include "windows.h" + +// TODO: support > 64 CPUs +static bool __thread_affinity(bool * mask) { + HANDLE h = GetCurrentThread(); + uint64_t bitmask = 0ULL; + + assert(GGML_MAX_N_THREADS >= 64); + + for 
(int32_t i = 0; i < 8; i++) { + int32_t idx = i * 8; + uint8_t val = 0; + val |= mask[idx + 0] << 0; + val |= mask[idx + 1] << 1; + val |= mask[idx + 2] << 2; + val |= mask[idx + 3] << 3; + val |= mask[idx + 4] << 4; + val |= mask[idx + 5] << 5; + val |= mask[idx + 6] << 6; + val |= mask[idx + 7] << 7; + bitmask |= (uint64_t)val << idx; + } + + for (int32_t i = 64; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + fprintf(stderr, "warn: setting thread-affinity for > 64 CPUs isn't supported on windows!\n"); + break; + } + } + + DWORD_PTR m = (DWORD_PTR)bitmask; + + m = SetThreadAffinityMask(h, m); + + return m != 0; +} + +static bool __process_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + return SetPriorityClass(GetCurrentProcess(), p); +} + +static bool __thread_priority(int32_t prio) { + DWORD p = NORMAL_PRIORITY_CLASS; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + } + + return SetThreadPriority(GetCurrentThread(), p); + +} + +#elif defined(__APPLE__) +#include +#include + +static bool __thread_affinity(const bool * mask) { + UNUSED(mask); + return true; +} + +static bool __process_priority(int32_t prio) { + int32_t p = 0; + + switch (prio) { + case SCHED_PRIO_NORMAL: p = 0; break; + case SCHED_PRIO_MEDIUM: p = -5; break; + case SCHED_PRIO_HIGH: p = -10; break; + case SCHED_PRIO_REALTIME: p = -20; break; + } + + int32_t r = setpriority(PRIO_PROCESS, 0, p); + return r != -1; +} + +static bool __thread_priority(int32_t prio) { + UNUSED(prio); + return 
true; +} + +#else // posix? + +#ifndef __USE_GNU +#define __USE_GNU +#endif +#include + +static bool __thread_affinity(const bool * mask) { + cpu_set_t cpuset; + int32_t err; + + CPU_ZERO(&cpuset); + + for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { + printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + CPU_SET(i, &cpuset); + } + } + +#ifdef __ANDROID__ + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { + err = errno; + } +#else + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); +#endif + if (err != 0) { + //fprintf(stderr, "warn: failed to set affinity mask 0x%llx (err %d: %s)\n", (unsigned long long)mask, err, strerror(err)); + return false; + } + + return true; +} + +static bool __process_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + + switch (prio) { + case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + int32_t err = sched_setscheduler(0, policy, &p); + if (err != 0) { + //fprintf(stderr, "warn: failed to set process priority %d (err %d)\n", prio, err); + return false; + } + + return true; +} + +static bool __thread_priority(int32_t prio) { + struct sched_param p; + int32_t policy = SCHED_OTHER; + switch (prio) { + case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + } + + int32_t err = pthread_setschedparam(pthread_self(), policy, &p); + if (err != 0) { + //fprintf(stderr, "warn: failed to set thread priority %d (err %d)\n", prio, err); + 
return false; + } + + return true; +} + +#endif + +#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) ) +static inline void __cpu_relax(void) { + __asm__ volatile("yield" ::: "memory"); +} +#elif defined(__x86_64__) +static inline void __cpu_relax(void) { + _mm_pause(); +} +#else +static inline void __cpu_relax(void) {;} +#endif + +static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { + if (!global_mask) { + memset(local_mask, 1, GGML_MAX_N_THREADS); + return; + } + if (!strict) { + memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); + return; + } else { + memset(local_mask, 0, GGML_MAX_N_THREADS); + int32_t base_idx = *iter; + for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) { + int32_t idx = base_idx + i; + if (idx >= GGML_MAX_N_THREADS) { + // Just a cheaper modulo + idx -= GGML_MAX_N_THREADS; + } + if (global_mask[idx]) { + local_mask[idx] = 1; + *iter = idx + 1; + return; + } + } + } +} + +void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) { + if (!threadpool) return; + +#ifndef GGML_USE_OPENMP + struct ggml_compute_state* workers = threadpool->workers; + const int32_t n_threads = threadpool->n_threads_max; + + if (!threadpool->disposable) { + ggml_mutex_lock(&threadpool->mutex); + } + threadpool->n_threads_cur = n_threads; + threadpool->stop = true; + threadpool->pause = false; + if (!threadpool->disposable) { + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); + } + + for (int32_t j = 1; j < n_threads; j++) { + int32_t rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == GGML_EXIT_SUCCESS || rc == GGML_EXIT_ABORTED); + UNUSED(rc); + } + + GGML_ALIGNED_FREE(workers); + + if (!threadpool->disposable) { + ggml_mutex_destroy(&threadpool->mutex); + ggml_cond_destroy(&threadpool->cond); + } +#endif // GGML_USE_OPENMP + + GGML_ALIGNED_FREE(threadpool); +} + +void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { 
+#ifndef GGML_USE_OPENMP + GGML_ASSERT(!threadpool->disposable); + GGML_PRINT_DEBUG("Pausing threadpool\n"); + threadpool->pause = true; +#else + UNUSED(threadpool); +#endif +} + +void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { +#ifndef GGML_USE_OPENMP + GGML_ASSERT(!threadpool->disposable); + GGML_PRINT_DEBUG("Resuming threadpool\n"); + + ggml_mutex_lock(&threadpool->mutex); + threadpool->pause = false; + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); +#else + UNUSED(threadpool); +#endif +} + +struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int32_t n_threads, + struct ggml_compute_threadpool * threadpool) { + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool\n"); + } if (n_threads <= 0) { - n_threads = GGML_DEFAULT_N_THREADS; + n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS; } size_t work_size = 0; @@ -18775,12 +19075,13 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } if (work_size > 0) { - work_size += CACHE_LINE_SIZE*(n_threads - 1); + work_size += CACHE_LINE_SIZE*(n_threads); } - cplan.n_threads = MIN(max_tasks, n_threads); - cplan.work_size = work_size; - cplan.work_data = NULL; + cplan.threadpool = threadpool; + cplan.n_threads = MIN(max_tasks, n_threads); + cplan.work_size = work_size; + cplan.work_data = NULL; return cplan; } @@ -18788,36 +19089,206 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - const struct ggml_cgraph * cgraph = state->shared->cgraph; - const struct ggml_cplan * cplan = state->shared->cplan; + const struct ggml_cgraph * cgraph = state->threadpool->cgraph; + const struct ggml_cplan * cplan = state->threadpool->cplan; set_numa_thread_affinity(state->ith); 
struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->shared->n_threads, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.shared=*/ state->shared, + /*.ith =*/ state->ith, + /*.nth =*/ state->threadpool->n_threads_cur, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ state->threadpool, }; - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; + struct ggml_tensor * node = cgraph->nodes[0]; + + ggml_compute_forward(&params, node); + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->threadpool->ec = GGML_STATUS_ABORTED; + } + for (int node_n = 1; node_n < cgraph->n_nodes; node_n++) { + ggml_barrier(state->threadpool); + + if (state->threadpool->ec != GGML_STATUS_SUCCESS) { + break; + } + + node = cgraph->nodes[node_n]; ggml_compute_forward(&params, node); if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->ec = GGML_STATUS_ABORTED; + state->threadpool->ec = GGML_STATUS_ABORTED; } + } - ggml_barrier(state->shared); + if (!state->threadpool->disposable && state->ith == 0) { + state->threadpool->new_work = false; + } - if (state->shared->ec != GGML_STATUS_SUCCESS) { - break; + ggml_barrier(state->threadpool); + + return 0; +} + + + +#ifndef GGML_USE_OPENMP + +static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) { + struct ggml_compute_threadpool * threadpool = state->threadpool; + + do { + if (threadpool->poll) { + while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { + // No new work. Yield and keep polling. + //__cpu_relax(); + } + } else { + ggml_mutex_lock_shared(&threadpool->mutex); + while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { + // No new work. Wait for the signal.
+ ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + ggml_mutex_unlock_shared(&threadpool->mutex); + } + } while (state->ith >= threadpool->n_threads_cur); + return threadpool->new_work; +} + +static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_compute_threadpool * threadpool = state->threadpool; + + GGML_ASSERT(!threadpool->disposable); + + __thread_priority(threadpool->prio); + if (state->mask_specified) + __thread_affinity(state->cpumask); + + while (true) { + // Check if we need to sleep + while (threadpool->pause) { + GGML_PRINT_DEBUG("thread #%d inside pause loop\n", state->ith); + ggml_mutex_lock_shared(&threadpool->mutex); + if (threadpool->pause) { + ggml_cond_wait(&threadpool->cond, &threadpool->mutex); + } + GGML_PRINT_DEBUG("thread #%d resuming after wait\n", state->ith); + ggml_mutex_unlock_shared(&threadpool->mutex); + } + // This needs to be checked for after the cond_wait + if (threadpool->stop) break; + + // Check if there is new work + // The main thread is the only one that can dispatch new work + + bool new_work = ggml_graph_compute_check_for_work(state); + if (new_work) { + int64_t ret = (int64_t) ggml_graph_compute_thread(state); + if (ret == GGML_EXIT_ABORTED) + return (thread_ret_t) ret; + + if (ret != GGML_EXIT_SUCCESS && ret != GGML_EXIT_ABORTED) { + fprintf(stderr, "ggml_graph_compute_thread exited with an unexpected error: %lld\n", (long long int) ret); + GGML_ASSERT(false); + } } } - return 0; + return (thread_ret_t) 0; +} + +#endif // GGML_USE_OPENMP + +static struct ggml_compute_threadpool * ggml_create_threadpool_impl( + struct ggml_threadpool_params * tpp, + bool disposable, + struct ggml_cgraph * cgraph, + struct ggml_cplan * cplan) { + + struct ggml_compute_threadpool * threadpool = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool)); + { + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + 
threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->stop = false; + threadpool->pause = disposable ? false : true; + threadpool->new_work = false; + threadpool->workers = NULL; + threadpool->n_threads_max = tpp->n_threads; + threadpool->n_threads_cur = disposable ? tpp->n_threads : 0; + threadpool->disposable = disposable; + threadpool->poll = tpp->poll; + threadpool->prio = tpp->prio; + + threadpool->abort_callback = NULL; + threadpool->abort_callback_data = NULL; + threadpool->ec = GGML_STATUS_SUCCESS; + } + +#ifndef GGML_USE_OPENMP + if (!disposable) { + ggml_mutex_init(&threadpool->mutex); + ggml_cond_init(&threadpool->cond); + } +#endif // GGML_USE_OPENMP + + struct ggml_compute_state * workers = + GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_state) * tpp->n_threads); + + threadpool->workers = workers; + +#ifdef GGML_USE_OPENMP + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .threadpool = threadpool, + .ith = j + }; + } +#else // Not using OPENMP + int32_t cpumask_iter = 0; + + __process_priority(tpp->prio); + __thread_priority(tpp->prio); + + for (int j = 0; j < tpp->n_threads; j++) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .mask_specified = tpp->mask_specified, + .threadpool = threadpool, + .ith = j + }; + + if (tpp->mask_specified) { + __cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + } + + // Disposable threadpools need to have a valid cplan and cgraph immediately. + thread_ret_t (*thread_entrypoint)(void*) = disposable ? 
ggml_graph_compute_thread : ggml_graph_compute_secondary_thread; + // Spin threads for all secondary workers + if (j > 0) { + int32_t rc = ggml_thread_create( + &workers[j].thrd, + NULL, + thread_entrypoint, + &workers[j] + ); + GGML_ASSERT(rc == 0); + } + } +#endif // GGML_USE_OPENMP + + return threadpool; +} + +struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) { + return ggml_create_threadpool_impl(tpp, false, NULL, NULL); } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { @@ -18825,19 +19296,41 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); - int n_threads = cplan->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ cplan, - /*.n_threads =*/ n_threads, - /*.n_barrier =*/ 0, - /*.n_barrier_passed =*/ 0, - /*.abort_callback =*/ NULL, - /*.abort_callback_data =*/ NULL, - /*.current_chunk =*/ 0, - /*.ec =*/ GGML_STATUS_SUCCESS, - }; + int32_t n_threads = cplan->n_threads; + struct ggml_compute_threadpool * threadpool = cplan->threadpool; + + bool disposable_threadpool = false; + + if (threadpool == NULL) { + GGML_PRINT_DEBUG("NOTE: No threadpool was specified in this cplan. Will create a disposable threadpool\n"); + disposable_threadpool = true; + + struct ggml_threadpool_params ttp = { + .mask_specified = false, + .n_threads = n_threads, + .prio = 1, + .poll = false, + .strict_cpu = false + }; + + threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); + } else if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); + } + + // Set up work + threadpool->cgraph = cgraph; + threadpool->cplan = cplan; + threadpool->n_threads_cur = n_threads; + + if (!disposable_threadpool) { + // Reset some of the paramters that need resetting + // No worker threads should be accessing the parameters below at this stage + threadpool->n_barrier = 0; + threadpool->n_barrier_passed = 0; + threadpool->current_chunk = 0; + threadpool->ec = GGML_STATUS_SUCCESS; + } #ifdef GGML_USE_OPENMP if (n_threads > 1) { @@ -18847,63 +19340,52 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl { // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); - state_shared.n_threads = n_threads; + threadpool->n_threads_cur = n_threads; } struct ggml_compute_state worker = { - .thrd = 0, - .ith = omp_get_thread_num(), - .shared = &state_shared, + .ith = omp_get_thread_num(), + .threadpool = threadpool, }; ggml_graph_compute_thread(&worker); } } else { struct ggml_compute_state worker = { - .thrd = 0, - .ith = 0, - .shared = &state_shared, + .ith = 0, + .threadpool = threadpool, }; ggml_graph_compute_thread(&worker); } #else - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - - for (int j = 0; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - }; - } - - // create thread pool - for (int j = 1; j < n_threads; ++j) { - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + if (!disposable_threadpool) { + // Update main thread affinity to match the current threadpool + if (threadpool->workers[0].mask_specified) { + __thread_affinity(threadpool->workers[0].cpumask); + } - // join or kill thread pool - if (n_threads > 1) { - for (int j 
= 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); + threadpool->new_work = true; + if (!threadpool->poll) { + ggml_mutex_lock(&threadpool->mutex); + ggml_cond_broadcast(&threadpool->cond); + ggml_mutex_unlock(&threadpool->mutex); } } + // this is a work thread too + ggml_graph_compute_thread(&threadpool->workers[0]); #endif // don't leave affinity set on the main thread clear_numa_thread_affinity(); - return state_shared.ec; + enum ggml_status ret = threadpool->ec; /* read before a disposable pool is freed below */ + if (disposable_threadpool) { + ggml_release_threadpool(threadpool); + } + return ret; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); @@ -19698,7 +20180,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ?
opt->adam.pf->data : NULL; // past function values - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -20045,7 +20527,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } - struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads, NULL); struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; diff --git a/include/llama.h b/include/llama.h index 413070d95a5c4..9a2ccb1710f9a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -430,6 +430,18 @@ extern "C" { //optional: LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); + // Optional: an auto threadpool gets created in ggml if not passed explicitly + LLAMA_API void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool); + LLAMA_API void llama_detach_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); + LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + + // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 972f870b072b8..30a39e34d60b3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2690,6 +2690,9 @@ struct llama_context { #endif ggml_backend_t backend_cpu = nullptr; + ggml_compute_threadpool_t threadpool = nullptr; + ggml_compute_threadpool_t threadpool_batch = nullptr; + bool has_evaluated_once = false; int64_t t_start_us; @@ -14384,11 
+14387,11 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { return n_outputs_max; } - static void llama_graph_compute( - llama_context & lctx, - ggml_cgraph * gf, - int n_threads) { + llama_context & lctx, + ggml_cgraph * gf, + int n_threads, + ggml_compute_threadpool * threadpool) { #ifdef GGML_USE_METAL if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); @@ -14397,6 +14400,7 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } #ifdef GGML_USE_BLAS @@ -14410,6 +14414,42 @@ static void llama_graph_compute( // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); } +// Optionally swaps the batch and single-tok threadpools. +// Returns the number of threads, and if a valid threadpool exists, returns it too. +static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools( + llama_context & lctx, + int32_t n_tokens) { + + const auto & cparams = lctx.cparams; + int32_t n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + + ggml_compute_threadpool_t threadpool = nullptr; // nullptr -> disposable threadpool + + // A batch threadpool without a non-batch threadpool isn't supported.
+ GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool); + + if (lctx.threadpool_batch && lctx.threadpool) { + // Switch between the 2 threadpools as needed + if (n_tokens > 1) { + ggml_pause_threadpool(lctx.threadpool); + ggml_resume_threadpool(lctx.threadpool_batch); + threadpool = lctx.threadpool_batch; + n_threads = cparams.n_threads_batch; + } else { + ggml_pause_threadpool(lctx.threadpool_batch); + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + n_threads = cparams.n_threads; + } + } else if (lctx.threadpool) { + ggml_resume_threadpool(lctx.threadpool); + threadpool = lctx.threadpool; + n_threads = cparams.n_threads; + } + return std::make_pair(n_threads, threadpool); +} + + // decode a batch of tokens by evaluating the transformer // // - lctx: llama context @@ -14533,7 +14573,12 @@ static int llama_decode_internal( lctx.n_outputs = n_outputs_new; } - int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; + std::pair<int32_t, ggml_compute_threadpool_t> threads = + llama_swap_threadpools(lctx, n_tokens); + + int32_t n_threads = threads.first; + ggml_compute_threadpool_t threadpool = threads.second; + GGML_ASSERT(n_threads > 0); // helpers for smoother batch API transition @@ -14618,7 +14663,7 @@ static int llama_decode_internal( llama_set_inputs(lctx, u_batch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // update the kv ring buffer { @@ -14779,7 +14824,11 @@ static int llama_encode_internal( lctx.inp_embd_enc = NULL; lctx.n_outputs = n_tokens; - const int n_threads = n_tokens == 1 ?
cparams.n_threads : cparams.n_threads_batch; + std::pair<int32_t, ggml_compute_threadpool_t> threads = + llama_swap_threadpools(lctx, n_tokens); + + int32_t n_threads = threads.first; + ggml_compute_threadpool_t threadpool = threads.second; GGML_ASSERT(n_threads > 0); // helpers for smoother batch API transition @@ -14822,7 +14871,7 @@ static int llama_encode_internal( llama_set_inputs(lctx, batch); - llama_graph_compute(lctx, gf, n_threads); + llama_graph_compute(lctx, gf, n_threads, threadpool); // extract embeddings if (embd) { @@ -15067,7 +15116,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); #endif //const int64_t t_end = ggml_time_us(); @@ -15093,7 +15142,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_k_shift(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -15119,7 +15168,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { llama_set_s_copy(lctx); - llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool); need_reserve = true; } @@ -16361,6 +16410,31 @@ void llama_numa_init(enum ggml_numa_strategy numa) { } } +void llama_attach_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool) { + ctx->threadpool = threadpool; +} + +void llama_attach_batch_threadpool( + struct llama_context * ctx, + ggml_compute_threadpool_t threadpool_batch) { + ctx->threadpool_batch = threadpool_batch; +} + +void llama_detach_threadpool(struct llama_context * ctx) { + ctx->threadpool = nullptr; +} + +void llama_detach_batch_threadpool(struct llama_context * ctx) { + ctx->threadpool_batch = nullptr; +} + +void
llama_detach_threadpools(struct llama_context * ctx) { + llama_detach_threadpool(ctx); + llama_detach_batch_threadpool(ctx); +} + void llama_backend_free(void) { ggml_quantize_free(); } diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 8159e276af617..246bb227d1e19 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32( } static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr); if (plan.work_size > 0) { buf.resize(plan.work_size); From a4e97f320c7c1fd82d104a844cafc0fa3bb53d13 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 23 Jul 2024 06:57:43 -0700 Subject: [PATCH 2/4] uncomment cpu-relax --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b0630729f9175..d2deb0f6530dd 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19144,7 +19144,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) if (threadpool->poll) { while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) { // No new work. Yield and keep polling. - //__cpu_relax(); + __cpu_relax(); } } else { ggml_mutex_lock_shared(&threadpool->mutex); From bc7eaecfe42435aa0aa699f389cefb53c46aa808 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 23 Jul 2024 12:15:08 -0700 Subject: [PATCH 3/4] re-enable speculative ... facing segfaults on master ... 
--- CMakePresets.json | 256 +++++++-------------------- examples/CMakeLists.txt | 2 +- examples/speculative/speculative.cpp | 48 ++++- ggml/src/ggml.c | 2 +- include/llama.h | 2 + src/llama.cpp | 9 + 6 files changed, 119 insertions(+), 200 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index ae2bf25c12786..bdad38952d3cb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,197 +1,65 @@ { - "version": 4, - "configurePresets": [ - { - "name": "base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." - } - }, - { - "name": "sycl-base", - "hidden": true, - "generator": "Ninja", - "binaryDir": "${sourceDir}/build-${presetName}", - "cacheVariables": { - "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", - "CMAKE_CXX_COMPILER": "icx", - "CMAKE_C_COMPILER": "cl", - "GGML_SYCL": "ON", - "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." - } - }, - { - "name": "debug", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Debug" - } - }, - { - "name": "release", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "Release" - } - }, - { - "name": "reldbg", - "hidden": true, - "cacheVariables": { - "CMAKE_BUILD_TYPE": "RelWithDebInfo" - } - }, - { - "name": "static", - "hidden": true, - "cacheVariables": { - "GGML_STATIC": "ON" - } - }, - { - "name": "arm64-windows-msvc", - "hidden": true, - "architecture": { - "value": "arm64", - "strategy": "external" - }, - "toolset": { - "value": "host=x86_64", - "strategy": "external" - }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" - } - }, - { - "name": "arm64-windows-llvm", - "hidden": true, - "architecture": { - "value": "arm64", - "strategy": "external" - }, - "toolset": { - "value": "host=x86_64", - "strategy": "external" - }, - "cacheVariables": { - "CMAKE_TOOLCHAIN_FILE": 
"${sourceDir}/cmake/arm64-windows-llvm.cmake" - } - }, - { - "name": "arm64-windows-llvm-debug", - "inherits": [ - "base", - "arm64-windows-llvm", - "debug" - ] - }, - { - "name": "arm64-windows-llvm-release", - "inherits": [ - "base", - "arm64-windows-llvm", - "reldbg" - ] - }, - { - "name": "arm64-windows-llvm+static-release", - "inherits": [ - "base", - "arm64-windows-llvm", - "reldbg", - "static" - ] - }, - { - "name": "arm64-windows-msvc-debug", - "inherits": [ - "base", - "arm64-windows-msvc", - "debug" - ] - }, - { - "name": "arm64-windows-msvc-release", - "inherits": [ - "base", - "arm64-windows-msvc", - "reldbg" - ] - }, - { - "name": "arm64-windows-msvc+static-release", - "inherits": [ - "base", - "arm64-windows-msvc", - "reldbg", - "static" - ] - }, - { - "name": "x64-windows-msvc-debug", - "inherits": [ - "base", - "debug" - ] - }, - { - "name": "x64-windows-msvc-release", - "inherits": [ - "base", - "reldbg" - ] - }, - { - "name": "x64-windows-msvc+static-release", - "inherits": [ - "base", - "reldbg", - "static" - ] - }, - { - "name": "x64-windows-sycl-debug", - "inherits": [ - "sycl-base", - "debug" - ] - }, - { - "name": "x64-windows-sycl-release", - "inherits": [ - "sycl-base", - "release" - ] - }, - { - "name": "clang10", - "displayName": "Clang 10.0.0 x86_64-pc-linux-gnu", - "description": "Using compilers: C = /usr/bin/clang, CXX = /usr/bin/clang++", - "binaryDir": "${sourceDir}/out/build/${presetName}", - "cacheVariables": { - "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", - "CMAKE_C_COMPILER": "/usr/bin/clang", - "CMAKE_CXX_COMPILER": "/usr/bin/clang++", - "CMAKE_RC_COMPILER": "/usr/bin/llvm-rc-10", - "CMAKE_BUILD_TYPE": "Debug" - } - }, - { - "name": "gcc8.4", - "displayName": "GCC 8.4.0 x86_64-linux-gnu", - "description": "Using compilers: C = /usr/bin/gcc, CXX = /usr/bin/g++", - "binaryDir": "${sourceDir}/out/build/${presetName}", - "cacheVariables": { - "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", - 
"CMAKE_C_COMPILER": "/usr/bin/gcc", - "CMAKE_CXX_COMPILER": "/usr/bin/g++", - "CMAKE_BUILD_TYPE": "Debug" - } + "version": 4, + "configurePresets": [ + { + "name": "base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." } - ] + }, + { + "name": "sycl-base", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_COMPILER": "icx", + "CMAKE_C_COMPILER": "cl", + "GGML_SYCL": "ON", + "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.." + } + }, + { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } }, + { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } }, + { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, + + { + "name": "arm64-windows-msvc", "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake" + } + }, + + { + "name": "arm64-windows-llvm", "hidden": true, + "architecture": { "value": "arm64", "strategy": "external" }, + "toolset": { "value": "host=x86_64", "strategy": "external" }, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake" + } + }, + + { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] }, + { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] }, + { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] }, + + { "name": "arm64-windows-msvc-debug" 
, "inherits": [ "base", "arm64-windows-msvc", "debug" ] }, + { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, + { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, + + { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] }, + { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, + { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, + + { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] }, + { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] } + ] } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 247d52c6d3454..67b3d27747850 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,6 @@ else() endif() add_subdirectory(save-load-state) add_subdirectory(simple) - #add_subdirectory(speculative) + add_subdirectory(speculative) add_subdirectory(tokenize) endif() diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 0939a1a6a7a38..8580839bc58ed 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -24,6 +24,14 @@ struct seq_draft { struct llama_sampling_context * ctx_sampling; }; +// static void switch_active_threadpool( +// llama_context* cur, +// llama_context* nxt +// ) { +// ggml_pause_threadpool(cur); +// ggml_resume_threadpool(nxt); +// } + int main(int argc, char ** argv) { gpt_params params; @@ -68,15 +76,43 @@ int main(int argc, char ** argv) { // load the target model std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); + ggml_threadpool_params tpp_batch_tgt = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + ggml_threadpool_params tpp_tgt = ggml_threadpool_params_from_cpu_params(params.cpuparams); + struct ggml_compute_threadpool * threadpool_batch_tgt = 
ggml_create_threadpool(&tpp_batch_tgt); + if (!threadpool_batch_tgt) { + LOG_TEE("%s: target batch threadpool create failed : n_threads %d\n", __func__, tpp_batch_tgt.n_threads); + exit(1); + } + ggml_compute_threadpool * threadpool_tgt = ggml_create_threadpool(&tpp_tgt); + if (!threadpool_tgt) { + LOG_TEE("%s: target threadpool create failed : n_threads %d\n", __func__, tpp_tgt.n_threads); + exit(1); + } + llama_attach_batch_threadpool(ctx_tgt, threadpool_batch_tgt); + llama_attach_threadpool(ctx_tgt, threadpool_tgt); + // load the draft model params.model = params.model_draft; params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.n_threads_draft > 0) { - params.n_threads = params.n_threads_draft; - } - params.n_threads_batch = params.n_threads_batch_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + ggml_threadpool_params tpp_batch_dft = + ggml_threadpool_params_from_cpu_params(params.draft_cpuparams_batch); + ggml_threadpool_params tpp_dft = ggml_threadpool_params_from_cpu_params(params.draft_cpuparams); + struct ggml_compute_threadpool * threadpool_batch_dft = ggml_create_threadpool(&tpp_batch_dft); + if (!threadpool_batch_dft) { + LOG_TEE("%s: draft batch threadpool create failed : n_threads %d\n", __func__, tpp_batch_dft.n_threads); + exit(1); + } + ggml_compute_threadpool * threadpool_dft = ggml_create_threadpool(&tpp_dft); + if (!threadpool_dft) { + LOG_TEE("%s: draft threadpool create failed : n_threads %d\n", __func__, tpp_dft.n_threads); + exit(1); + } + llama_attach_batch_threadpool(ctx_dft, threadpool_batch_dft); + llama_attach_threadpool(ctx_dft, threadpool_dft); + const bool vocab_type_tgt = llama_vocab_type(model_tgt); LOG("vocab_type tgt: %d\n", vocab_type_tgt); @@ -154,6 +190,7 @@ int main(int argc, char ** argv) { // eval the prompt with both models llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); +
llama_pause_threadpools(ctx_tgt); llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); const auto t_enc_end = ggml_time_us(); @@ -550,6 +587,8 @@ int main(int argc, char ** argv) { break; } } + llama_pause_threadpools(ctx_dft); + // evaluate the target model on the drafted tokens { @@ -560,6 +599,7 @@ int main(int argc, char ** argv) { // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); + llama_pause_threadpools(ctx_tgt); ++n_past_tgt; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d2deb0f6530dd..f47035ce19c6a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18751,7 +18751,7 @@ static bool __thread_affinity(const bool * mask) { for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) { if (mask[i]) { - printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i); + GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i); CPU_SET(i, &cpuset); } } diff --git a/include/llama.h b/include/llama.h index 9a2ccb1710f9a..88b56e6a15ac2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -441,6 +441,8 @@ extern "C" { LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx); LLAMA_API void llama_detach_threadpools(struct llama_context * ctx); + // Pauses all attached threadpools + LLAMA_API void llama_pause_threadpools(struct llama_context * ctx); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void); diff --git a/src/llama.cpp b/src/llama.cpp index 30a39e34d60b3..e8aead263fec5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16435,6 +16435,15 @@ void llama_detach_threadpools(struct llama_context * ctx) { llama_detach_batch_threadpool(ctx); } +void llama_pause_threadpools(struct llama_context * ctx) { + if (ctx->threadpool) { + ggml_pause_threadpool(ctx->threadpool); + } + if (ctx->threadpool_batch) { + ggml_pause_threadpool(ctx->threadpool_batch); + } +} + void llama_backend_free(void) { 
ggml_quantize_free(); } From e317ab61b98e613718e23c30771fdd66de04ad4d Mon Sep 17 00:00:00 2001 From: fmz Date: Thu, 25 Jul 2024 15:09:11 -0400 Subject: [PATCH 4/4] add _GNU_SOURCE --- common/common.cpp | 1 - examples/export-lora/export-lora.cpp | 2 +- ggml/src/ggml.c | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 7eb0850d7f889..3ad5f33818d8e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1911,7 +1911,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); printf("usage: %s [options]\n", argv[0]); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 124ee167d2255..69467004bd7c2 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -393,7 +393,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity == 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f47035ce19c6a..3d662caf8bdb0 100644 --- a/ggml/src/ggml.c +++ 
b/ggml/src/ggml.c @@ -18740,6 +18740,7 @@ static bool __thread_priority(int32_t prio) { #ifndef __USE_GNU #define __USE_GNU +#define _GNU_SOURCE #endif #include