300 changes: 280 additions & 20 deletions common/common.cpp

Large diffs are not rendered by default.

29 changes: 22 additions & 7 deletions common/common.h
@@ -58,13 +58,18 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};

struct cpu_params {
int32_t n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP)
};

struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -91,6 +96,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;

ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

@@ -195,7 +205,7 @@ struct gpt_params {
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)

std::string hostname = "127.0.0.1";
std::string public_path = "";
@@ -268,6 +278,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_params_get_system_info(const gpt_params & params);

bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);

//
// String utils
//
@@ -315,8 +329,9 @@ struct llama_init_result {

struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
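A minimal usage sketch of how the new helpers above are meant to compose (illustrative only, not part of this diff; the "0xFF" mask is an arbitrary example):

    gpt_params params;
    if (parse_cpu_mask("0xFF", params.cpuparams.cpumask)) {   // example mask: CPUs 0-7
        params.cpuparams.mask_valid = true;
    }
    postprocess_cpu_params(params.cpuparams);                 // resolve defaults such as an unset n_threads

    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
    // ... attach to a llama_context and run inference (see examples/main/main.cpp below) ...
    ggml_release_threadpool(threadpool);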
2 changes: 1 addition & 1 deletion examples/baby-llama/baby-llama.cpp
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
2 changes: 1 addition & 1 deletion examples/benchmark/benchmark-matmult.cpp
@@ -21,7 +21,7 @@
#endif

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
4 changes: 2 additions & 2 deletions examples/cvector-generator/cvector-generator.cpp
@@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -407,7 +407,7 @@ int main(int argc, char ** argv) {

g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
53 changes: 53 additions & 0 deletions examples/llama-bench/llama-bench.cpp
@@ -206,6 +206,7 @@ struct cmd_params {
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
cpu_params cpuparams;
int reps;
bool verbose;
output_formats output_format;
@@ -232,6 +233,7 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* cpuparams */ {},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN,
@@ -260,6 +262,10 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -463,6 +469,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.priority = std::stoul(argv[i]);
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.strict_cpu = std::stoul(argv[i]);
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.poll = std::stoul(argv[i]);
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
@@ -1373,6 +1405,8 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr;

postprocess_cpu_params(params.cpuparams);

for (const auto & inst : params_instances) {
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1399,6 +1433,23 @@

llama_kv_cache_clear(ctx);

struct ggml_threadpool_params tpp;
tpp.n_threads = t.n_threads;
tpp.mask_specified = params.cpuparams.mask_valid;
tpp.strict_cpu = params.cpuparams.strict_cpu;
tpp.prio = params.cpuparams.priority;
tpp.poll = params.cpuparams.poll;

std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);

struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}

llama_attach_threadpool(ctx, threadpool);

// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
@@ -1437,6 +1488,8 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);

llama_free(ctx);

ggml_release_threadpool(threadpool);
}

llama_free_model(lmodel);
4 changes: 2 additions & 2 deletions examples/llava/llava-cli.cpp
@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
30 changes: 30 additions & 0 deletions examples/main/main.cpp
@@ -221,6 +221,33 @@ int main(int argc, char ** argv) {
return 1;
}

LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int32_t) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);

struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
}
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}

llama_attach_batch_threadpool(ctx, threadpool_batch);
llama_attach_threadpool(ctx, threadpool);
if (ctx_guidance) {
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
llama_attach_threadpool(ctx_guidance, threadpool);
}

const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
@@ -989,6 +1016,9 @@ int main(int argc, char ** argv) {
llama_sampling_free(ctx_sampling);
llama_backend_free();

ggml_release_threadpool(threadpool);
ggml_release_threadpool(threadpool_batch);

#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS
4 changes: 2 additions & 2 deletions examples/server/server.cpp
@@ -2523,8 +2523,8 @@ int main(int argc, char ** argv) {
});

LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
7 changes: 4 additions & 3 deletions examples/speculative/speculative.cpp
@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {
params.n_threads = params.n_threads_draft;
if (params.draft_cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
}
params.n_threads_batch = params.n_threads_batch_draft;

params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;
2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
@@ -146,7 +146,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library"
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_OPENMP "ggml: use OpenMP" OFF)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
5 changes: 3 additions & 2 deletions ggml/include/ggml-alloc.h
@@ -7,8 +7,9 @@ extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t;

// Tensor allocator
struct ggml_tallocr {
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -102,6 +102,7 @@ extern "C" {

GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

// Create a backend buffer from an existing pointer
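Illustrative only (not part of this diff): the new setter lets the CPU backend reuse an externally created threadpool, e.g.:

    // assumes `threadpool` was created with ggml_create_threadpool() (declared in ggml.h below)
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads (backend, 8);
    ggml_backend_cpu_set_threadpool(backend, threadpool);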
28 changes: 26 additions & 2 deletions ggml/include/ggml.h
@@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512

#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
@@ -622,13 +624,25 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);

struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS];
bool mask_specified;
int32_t n_threads;
int32_t prio;
bool poll;
bool strict_cpu;
};

struct ggml_compute_threadpool; // forward declaration, see ggml.c

// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

int n_threads;
struct ggml_compute_threadpool * threadpool;

// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
@@ -2010,10 +2024,20 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params);
GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool);
GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool);
GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_compute_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
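A minimal sketch of the low-level flow with the new third argument to ggml_graph_plan() (illustrative only, not part of this diff; `gf` stands for a graph built elsewhere, and work-buffer allocation is omitted):

    struct ggml_threadpool_params tpp = {
        /* .cpumask        = */ {false},   // no affinity mask
        /* .mask_specified = */ false,
        /* .n_threads      = */ 8,
        /* .prio           = */ 0,         // normal priority
        /* .poll           = */ true,
        /* .strict_cpu     = */ false,
    };
    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);

    struct ggml_cplan cplan = ggml_graph_plan(gf, tpp.n_threads, threadpool);
    // when cplan.work_size > 0, point cplan.work_data at a buffer of that size before computing
    enum ggml_status status = ggml_graph_compute(gf, &cplan);   // GGML_STATUS_SUCCESS on success

    ggml_release_threadpool(threadpool);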