diff --git a/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch b/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch index 811bf30..b5ffbd6 100644 --- a/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch +++ b/kleidiai-examples/llama_cpp/0001-Use-KleidiAI-Int4-Matmul-micro-kernels-in-llama.cpp.patch @@ -1,8 +1,13 @@ -From 453e52a763043e95b23c88176792e065377189ad Mon Sep 17 00:00:00 2001 -From: Charles Xu -Date: Tue, 9 Jul 2024 08:49:27 +0200 -Subject: [PATCH] Updated to be able to build on Linux +From 617486784d5394fbb54f4d99a4860a050318a4e8 Mon Sep 17 00:00:00 2001 +From: Gian Marco Iodice +Date: Tue, 16 Jul 2024 17:28:50 +0100 +Subject: [PATCH] Use KleidiAI Int4 Matmul micro-kernels in llama.cpp +- Update CMake file to fetch the Int4 micro-kernels from the KleidiAI +repository +- Implement a KleidiAI backend for llama.cpp + +Signed-off-by: Gian Marco Iodice --- CMakeLists.txt | 48 ++++ ggml-alloc.c | 13 ++ @@ -15,7 +20,7 @@ Subject: [PATCH] Updated to be able to build on Linux create mode 100644 ggml-kleidiai.h diff --git a/CMakeLists.txt b/CMakeLists.txt -index 08481334..99382573 100644 +index 08481334..22504ad2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,6 +548,53 @@ if (LLAMA_VULKAN) @@ -26,9 +31,9 @@ index 08481334..99382573 100644 + + # Fetch KleidiAI sources: + include(FetchContent) -+ set(KLEIDIAI_COMMIT_SHA "d6c3b987e445e5e1daeda94e3c2888efaa07ca50") ++ set(KLEIDIAI_COMMIT_SHA "187d9aacddfb678c09f0831b18f87401b1b353c3") + set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz") -+ set(KLEIDIAI_ARCHIVE_MD5 "8e94e73bfa00ea038fd6e3d13f59080f") ++ set(KLEIDIAI_ARCHIVE_MD5 "4a1eee013cb20464b534cb01212d19c9") + + if (POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) @@ -113,7 +118,7 @@ index bd367c42..ed4ce0ae 100644 if (this_size > max_size) { diff --git a/ggml-kleidiai.cpp b/ggml-kleidiai.cpp new file mode 100644 -index 00000000..aa53086d +index 00000000..6800f63e --- /dev/null +++ b/ggml-kleidiai.cpp @@ -0,0 +1,560 @@ @@ -171,8 +176,8 @@ index 00000000..aa53086d +static bool g_kai_loaded = false; + +// Basic backend memory allocator -+static uint8_t* extra_mem[MAX_EXTRA_BUFFERS]; -+static int32_t extra_mem_idx = 0; ++static uint8_t* g_extra_mem[MAX_EXTRA_BUFFERS]; ++static int32_t g_extra_mem_idx = 0; + +typedef void (*kai_matmul_func_t)(const struct ggml_compute_params * params, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + @@ -546,7 +551,7 @@ index 00000000..aa53086d + free(reshaped_data); + cur->extra = cur->data; +#else -+ extra_mem[extra_mem_idx++] = reshaped_data; ++ g_extra_mem[g_extra_mem_idx++] = reshaped_data; + cur->extra = reshaped_data; +#endif + } else { @@ -671,10 +676,10 @@ index 00000000..aa53086d +} + +GGML_CALL void ggml_kai_free_extra_mem(void) { -+ for(int32_t i = extra_mem_idx - 1; i >= 0; i--) { -+ free(extra_mem[i]); ++ for(int32_t i = g_extra_mem_idx - 1; i >= 0; i--) { ++ free(g_extra_mem[i]); + } -+ extra_mem_idx = 0; ++ g_extra_mem_idx = 0; +} +#endif // defined(__aarch64__) && (defined(__ANDROID__) || defined(__linux__)) diff --git a/ggml-kleidiai.h b/ggml-kleidiai.h @@ -840,5 +845,5 @@ index 05591aa4..735dde04 100644 } -- -2.34.1 +2.25.1 diff --git a/kleidiai-examples/llama_cpp/README.md b/kleidiai-examples/llama_cpp/README.md index b8128ab..1400ed8 100644 --- a/kleidiai-examples/llama_cpp/README.md +++ b/kleidiai-examples/llama_cpp/README.md @@ -101,12 +101,12 @@ cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_TOOLCHAIN_FILE=${NDK_PATH}/build/cmake/android make -j4 ``` -Build the llama.cpp project for Linux: +Build the llama.cpp project for Linux®: ```bash mkdir build && cd build -cmake -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm -DLLAMA_KLEIDIAI=ON +cmake -DLLAMA_KLEIDIAI=ON -DCMAKE_C_FLAGS=-march=armv8.2-a+dotprod+i8mm -DCMAKE_CXX_FLAGS=-march=armv8.2-a+dotprod+i8mm .. make -j4 ```