Skip to content

Commit

Permalink
Add a new x8-packq microkernel that packs and per-row dynamically quantizes `fp32` to `qp8`.
Browse files Browse the repository at this point in the history

The microkernels themselves are just wrappers for the corresponding KleidiAI kernels.

PiperOrigin-RevId: 633914713
  • Loading branch information
gonnet authored and xnnpack-bot committed May 15, 2024
1 parent 8d5f4e0 commit 7130ee3
Show file tree
Hide file tree
Showing 21 changed files with 952 additions and 10 deletions.
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ MICROKERNEL_HDRS = [
"src/xnnpack/lut.h",
"src/xnnpack/maxpool.h",
"src/xnnpack/packb.h",
"src/xnnpack/packq.h",
"src/xnnpack/packw.h",
"src/xnnpack/packx.h",
"src/xnnpack/pad.h",
Expand Down
31 changes: 28 additions & 3 deletions bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ load(

MICROKERNEL_BENCHMARK_DEPS = [
":bench_utils",
"@FP16",
"//:aligned_allocator",
"//:bench_microkernels",
"//:common",
"//:enable_assembly",
"//:jit",
"//:microkernels_h",
"//:microparams_init",
"//:microparams",
"//:packing",
"//:params",
"//:microparams",
"//:microparams_init",
"//:xnnpack_h",
"@FP16",
]

OPERATOR_BENCHMARK_DEPS = [
Expand Down Expand Up @@ -1259,6 +1259,31 @@ xnnpack_benchmark(
deps = MICROKERNEL_BENCHMARK_DEPS,
)

# Shared benchmarking harness for the x8-packq micro-kernels; linked into the
# `x8_packq_bench` binary below (and reusable by other benchmark targets).
xnnpack_cc_library(
    name = "packq_benchmark",
    srcs = [
        # bgemm.h provides the batch/M/N/K benchmark argument grids.
        "bgemm.h",
        "packq-benchmark.cc",
    ],
    hdrs = ["packq-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "@com_google_benchmark//:benchmark",
    ],
)

# Benchmark binary for the x8-packq micro-kernels (generated x8-packq.cc).
xnnpack_benchmark(
    name = "x8_packq_bench",
    srcs = [
        "bgemm.h",
        "x8-packq.cc",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":packq_benchmark",
        "//:allocator",
        "//:math",
    ],
)

xnnpack_benchmark(
name = "x8_packw_bench",
srcs = [
Expand Down
40 changes: 40 additions & 0 deletions bench/f16-f32acc-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


// The four F16C variants below share the identical x86 architecture guard,
// so a single conditional region registers all of them.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c16,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c32,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c64,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c128,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c16,
Expand Down
86 changes: 86 additions & 0 deletions bench/packq-benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "packq-benchmark.h"

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/pack.h>
#include <xnnpack/packq.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

#include "bench/utils.h"
#include <benchmark/benchmark.h>

// Benchmarks one x8-packq micro-kernel: dynamically quantizes and packs a
// batch of `dim_m x dim_k` fp32 row-major matrices into the qp8 layout.
//
// Args:
//   state:     benchmark state; ranges are (0)=batch, (2)=M, (3)=K.
//              NOTE(review): state.range(1) (the N dimension of the BGEMM
//              grid) is intentionally unused — packing the LHS does not
//              depend on N. Confirm against bench/bgemm.h's argument order.
//   packq:     the micro-kernel under test.
//   mr/kr/sr:  packing tile parameters forwarded to the kernel.
//   isa_check: optional ISA gate; skips the benchmark when unsupported.
void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq,
              size_t mr, size_t kr, size_t sr,
              benchmark::utils::IsaCheckFunction isa_check) {
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t batch = state.range(0);
  const size_t dim_m = state.range(2);
  const size_t dim_k = state.range(3);

  // NOTE(review): despite the name, `rounded_n` rounds the M dimension up to
  // a multiple of `mr` (the packed destination holds whole mr-row tiles).
  const size_t rounded_n = benchmark::utils::RoundUp(dim_m, mr);
  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = [&]() {
    return std::uniform_real_distribution<float>(-10, 10)(rng);
  };

  // Compute a num_buffers that fit cache with source weights + packed_weights.
  const size_t num_buffers =
      1 + benchmark::utils::DivideRoundUp<size_t>(
              benchmark::utils::GetMaxCacheSize(),
              sizeof(int8_t) * batch *
                  (dim_m * dim_k + rounded_n * rounded_k + rounded_n));

  std::vector<float, AlignedAllocator<float, 64>> input(num_buffers * batch *
                                                        dim_m * dim_k);
  std::generate(input.begin(), input.end(), f32rng);
  // Packed size for one buffer's worth of rows (all `batch * dim_m` of them).
  const size_t packed_size =
      xnn_x8_packq_f32qp8_packed_size(batch * dim_m, dim_k, mr, kr);
  std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights(num_buffers *
                                                                   packed_size);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Rotate through buffers so each iteration reads cache-cold input.
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }

    // NOTE(review): `packed_weights.data()` is passed without a
    // `buffer_index * packed_size` offset even though the vector was sized
    // for `num_buffers` copies, while `m_idx_start` is rotated instead.
    // If `m_idx_start` offsets rows within the destination, this may write
    // past `packed_size` for large buffer_index (or, if it does not, all
    // iterations write to the same cache-hot buffer). Verify against the
    // KleidiAI kernel's m_idx_start semantics — TODO confirm.
    packq(batch * dim_m, dim_k, mr, kr, sr,
          /*m_idx_start=*/buffer_index * dim_m,
          input.data() + buffer_index * batch * dim_m * dim_k,
          dim_k * sizeof(float), packed_weights.data());
  }

  // Report the CPU frequency alongside the timings when it can be measured.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Throughput in source elements (fp32 values quantized) per second.
  const size_t elements_per_iteration = batch * dim_m * dim_k;
  state.counters["elements"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * elements_per_iteration,
      benchmark::Counter::kIsRate);

  // Bytes touched per iteration: source reads plus packed-destination writes
  // (tile data + per-row quantization parameters), counted as int8 units.
  const size_t bytes_per_iteration =
      (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n)) *
      sizeof(int8_t);
  state.counters["bytes"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * bytes_per_iteration,
      benchmark::Counter::kIsRate);
}
24 changes: 24 additions & 0 deletions bench/packq-benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Fixed: the previous guard macros began with a double underscore, which is
// reserved for the implementation in C++ ([lex.name]), and the trailing
// #endif comment named a different macro (__XNNPACK_TEST_PACKQ_MICROKERNEL_
// TESTER_H) than the guard actually defined.
#ifndef XNNPACK_BENCH_PACKQ_BENCHMARK_H_
#define XNNPACK_BENCH_PACKQ_BENCHMARK_H_

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/pack.h>
#include <xnnpack/packq.h>

#include <cstddef>

#include "bench/utils.h"
#include <benchmark/benchmark.h>

// Benchmarks the x8-packq micro-kernel `packq` (dynamic fp32 -> qp8 row
// quantization and packing) with tile parameters mr/kr/sr, optionally gated
// on `isa_check`. Defined in bench/packq-benchmark.cc.
void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq,
              size_t mr, size_t kr, size_t sr,
              benchmark::utils::IsaCheckFunction isa_check = nullptr);

#endif  // XNNPACK_BENCH_PACKQ_BENCHMARK_H_
46 changes: 46 additions & 0 deletions bench/x8-packq.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/x8-packq.yaml
// Generator: tools/generate-packq-test.py


#include <xnnpack/common.h>
#include <xnnpack/packq.h>

#include <benchmark/benchmark.h>
#include "bench/bgemm.h"
#include "bench/packq-benchmark.h"


// NOTE: this file is auto-generated from test/x8-packq.yaml by
// tools/generate-packq-test.py (see the file header) — make changes in the
// generator/spec, not here. The three wrappers below benchmark the same
// scalar micro-kernel at mr = 1, 2, and 4 (kr = sr = 1).

// Scalar u1 kernel packed with a 1-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr1(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/1, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr1)

// Scalar u1 kernel packed with a 2-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr2(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/2, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr2)

// Scalar u1 kernel packed with a 4-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr4(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/4, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr4)


// Allow this translation unit to be linked into a larger benchmark binary
// that supplies its own main().
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1 change: 1 addition & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/x8-lut/gen/x8-lut-scalar-u4.c
src/x8-lut/gen/x8-lut-scalar-u8.c
src/x8-lut/gen/x8-lut-scalar-u16.c
src/x8-packq/x8-packq-scalar-f32qp8-u1.c
src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u2.c
src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u4.c
src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-int-u2.c
Expand Down
1 change: 1 addition & 0 deletions gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1048,6 +1048,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/x8-lut/gen/x8-lut-scalar-u4.c",
"src/x8-lut/gen/x8-lut-scalar-u8.c",
"src/x8-lut/gen/x8-lut-scalar-u16.c",
"src/x8-packq/x8-packq-scalar-f32qp8-u1.c",
"src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u2.c",
"src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u4.c",
"src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-int-u2.c",
Expand Down
24 changes: 17 additions & 7 deletions include/xnnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,20 +219,30 @@ enum xnn_datatype {
xnn_datatype_fp32 = 1,
/// IEEE754 half-precision floating-point.
xnn_datatype_fp16 = 2,
/// Quantized 8-bit signed integer with shared per-Value quantization parameters.
/// Quantized 8-bit signed integer with shared per-Value quantization
/// parameters.
xnn_datatype_qint8 = 3,
/// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
/// Quantized 8-bit unsigned integer with shared per-Value quantization
/// parameters.
xnn_datatype_quint8 = 4,
/// Quantized 32-bit signed integer with shared per-Value quantization parameters.
/// Quantized 32-bit signed integer with shared per-Value quantization
/// parameters.
xnn_datatype_qint32 = 5,
/// Quantized 8-bit signed integer with shared per-channel quantization parameters.
/// Quantized 8-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint8 = 6,
/// Quantized 32-bit signed integer with shared per-channel quantization parameters.
/// Quantized 32-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint32 = 7,
/// Quantized 4-bit signed integer with shared per-channel quantization parameters.
/// Quantized 4-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint4 = 8,
/// Dynamically quantized 8-bit signed integer with per-batch quantization parameters.
/// Dynamically quantized 8-bit signed integer with per-batch quantization
/// parameters.
xnn_datatype_qdint8 = 9,
/// Dynamically quantized 8-bit signed integers packed with their per-row
/// quantization parameters.
xnn_datatype_qpint8 = 10,
};

/// Define a tensor-type Value and add it to a Subgraph.
Expand Down
3 changes: 3 additions & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
### Tests for packing micro-kernels
tools/generate-pack-test.py --spec test/x32-packx.yaml --output test/x32-packx.cc &

### Tests for Pack quantized micro-kernels
tools/generate-packq-test.py --spec test/x8-packq.yaml --output test/x8-packq.cc --output-bench bench/x8-packq.cc &

### Tests for Pack Weights micro-kernels
tools/generate-packw-test.py --spec test/x8-packw.yaml --output test/x8-packw.cc --output-bench bench/x8-packw.cc &
tools/generate-packw-test.py --spec test/x16-packw.yaml --output test/x16-packw.cc --output-bench bench/x16-packw.cc &
Expand Down
2 changes: 2 additions & 0 deletions src/enums/datatype-strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ const char* xnn_datatype_to_string(enum xnn_datatype type) {
return "QCINT32";
case xnn_datatype_qdint8:
return "QDINT8";
case xnn_datatype_qpint8:
return "QPINT8";
}
XNN_UNREACHABLE;
return NULL;
Expand Down
6 changes: 6 additions & 0 deletions src/tensor.c
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,7 @@ size_t xnn_tensor_get_size(const struct xnn_value* value)
case xnn_datatype_qint8:
case xnn_datatype_quint8:
case xnn_datatype_qcint8:
case xnn_datatype_qpint8:
size = 1;
break;
case xnn_datatype_qint32:
Expand All @@ -528,6 +529,11 @@ size_t xnn_tensor_get_size(const struct xnn_value* value)
// Adjustments for nibbles, assume that we can't have sizes are byte-aligned (rounded up).
if (value->datatype == xnn_datatype_qcint4) {
size = round_up_po2(size, 2) >> 1;
} else if (value->datatype == xnn_datatype_qpint8) {
// TODO(b/340399245): Compute the correct size depending on the shape and
// packing constraints/alignment.
xnn_log_fatal("Support for %s is not yet implemented.",
xnn_datatype_to_string(value->datatype));
}

return size;
Expand Down

0 comments on commit 7130ee3

Please sign in to comment.