Skip to content

Commit

Permalink
Add a new x8-packq microkernel that packs and per-row dynamically quantizes `fp32` to `qp8`.
Browse files Browse the repository at this point in the history

The microkernels themselves are just wrappers for the corresponding KleidiAI kernels.

PiperOrigin-RevId: 633914713
  • Loading branch information
gonnet authored and xnnpack-bot committed May 15, 2024
1 parent 8d5f4e0 commit 7130ee3
Show file tree
Hide file tree
Showing 21 changed files with 952 additions and 10 deletions.
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ MICROKERNEL_HDRS = [
"src/xnnpack/lut.h",
"src/xnnpack/maxpool.h",
"src/xnnpack/packb.h",
"src/xnnpack/packq.h",
"src/xnnpack/packw.h",
"src/xnnpack/packx.h",
"src/xnnpack/pad.h",
Expand Down
31 changes: 28 additions & 3 deletions bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ load(

MICROKERNEL_BENCHMARK_DEPS = [
":bench_utils",
"@FP16",
"//:aligned_allocator",
"//:bench_microkernels",
"//:common",
"//:enable_assembly",
"//:jit",
"//:microkernels_h",
"//:microparams_init",
"//:microparams",
"//:packing",
"//:params",
"//:microparams",
"//:microparams_init",
"//:xnnpack_h",
"@FP16",
]

OPERATOR_BENCHMARK_DEPS = [
Expand Down Expand Up @@ -1259,6 +1259,31 @@ xnnpack_benchmark(
deps = MICROKERNEL_BENCHMARK_DEPS,
)

# Shared benchmarking harness for the x8-packq micro-kernels; linked into the
# `x8_packq_bench` binary below (and reusable by other benchmark targets).
xnnpack_cc_library(
    name = "packq_benchmark",
    srcs = [
        # bgemm.h provides the batch/M/N/K benchmark argument grids.
        "bgemm.h",
        "packq-benchmark.cc",
    ],
    hdrs = ["packq-benchmark.h"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        "@com_google_benchmark//:benchmark",
    ],
)

# Benchmark binary for the x8-packq micro-kernels (generated x8-packq.cc).
xnnpack_benchmark(
    name = "x8_packq_bench",
    srcs = [
        "bgemm.h",
        "x8-packq.cc",
    ],
    deps = MICROKERNEL_BENCHMARK_DEPS + [
        ":packq_benchmark",
        "//:allocator",
        "//:math",
    ],
)

xnnpack_benchmark(
name = "x8_packw_bench",
srcs = [
Expand Down
40 changes: 40 additions & 0 deletions bench/f16-f32acc-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


// The four F16C variants below share the identical x86 architecture guard,
// so a single conditional region registers all of them.
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c16,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c32,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c64,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();

  BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, f16c_c128,
                    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128,
                    xnn_init_f16_f32acc_scale_avx_params,
                    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkBatch)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c16,
Expand Down
86 changes: 86 additions & 0 deletions bench/packq-benchmark.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "packq-benchmark.h"

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/pack.h>
#include <xnnpack/packq.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

#include "bench/utils.h"
#include <benchmark/benchmark.h>

// Benchmarks one x8-packq micro-kernel: dynamically quantizes and packs a
// batch of `dim_m x dim_k` fp32 row-major matrices into the qp8 layout.
//
// Args:
//   state:     benchmark state; ranges are (0)=batch, (2)=M, (3)=K.
//              NOTE(review): state.range(1) (the N dimension of the BGEMM
//              grid) is intentionally unused — packing the LHS does not
//              depend on N. Confirm against bench/bgemm.h's argument order.
//   packq:     the micro-kernel under test.
//   mr/kr/sr:  packing tile parameters forwarded to the kernel.
//   isa_check: optional ISA gate; skips the benchmark when unsupported.
void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq,
              size_t mr, size_t kr, size_t sr,
              benchmark::utils::IsaCheckFunction isa_check) {
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t batch = state.range(0);
  const size_t dim_m = state.range(2);
  const size_t dim_k = state.range(3);

  // NOTE(review): despite the name, `rounded_n` rounds the M dimension up to
  // a multiple of `mr` (the packed destination holds whole mr-row tiles).
  const size_t rounded_n = benchmark::utils::RoundUp(dim_m, mr);
  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = [&]() {
    return std::uniform_real_distribution<float>(-10, 10)(rng);
  };

  // Compute a num_buffers that fit cache with source weights + packed_weights.
  const size_t num_buffers =
      1 + benchmark::utils::DivideRoundUp<size_t>(
              benchmark::utils::GetMaxCacheSize(),
              sizeof(int8_t) * batch *
                  (dim_m * dim_k + rounded_n * rounded_k + rounded_n));

  std::vector<float, AlignedAllocator<float, 64>> input(num_buffers * batch *
                                                        dim_m * dim_k);
  std::generate(input.begin(), input.end(), f32rng);
  // Packed size for one buffer's worth of rows (all `batch * dim_m` of them).
  const size_t packed_size =
      xnn_x8_packq_f32qp8_packed_size(batch * dim_m, dim_k, mr, kr);
  std::vector<int8_t, AlignedAllocator<int8_t, 64>> packed_weights(num_buffers *
                                                                   packed_size);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Rotate through buffers so each iteration reads cache-cold input.
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }

    // NOTE(review): `packed_weights.data()` is passed without a
    // `buffer_index * packed_size` offset even though the vector was sized
    // for `num_buffers` copies, while `m_idx_start` is rotated instead.
    // If `m_idx_start` offsets rows within the destination, this may write
    // past `packed_size` for large buffer_index (or, if it does not, all
    // iterations write to the same cache-hot buffer). Verify against the
    // KleidiAI kernel's m_idx_start semantics — TODO confirm.
    packq(batch * dim_m, dim_k, mr, kr, sr,
          /*m_idx_start=*/buffer_index * dim_m,
          input.data() + buffer_index * batch * dim_m * dim_k,
          dim_k * sizeof(float), packed_weights.data());
  }

  // Report the CPU frequency alongside the timings when it can be measured.
  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Throughput in source elements (fp32 values quantized) per second.
  const size_t elements_per_iteration = batch * dim_m * dim_k;
  state.counters["elements"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * elements_per_iteration,
      benchmark::Counter::kIsRate);

  // Bytes touched per iteration: source reads plus packed-destination writes
  // (tile data + per-row quantization parameters), counted as int8 units.
  const size_t bytes_per_iteration =
      (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n)) *
      sizeof(int8_t);
  state.counters["bytes"] = benchmark::Counter(
      static_cast<uint64_t>(state.iterations()) * bytes_per_iteration,
      benchmark::Counter::kIsRate);
}
24 changes: 24 additions & 0 deletions bench/packq-benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Fixed: the previous guard macros began with a double underscore, which is
// reserved for the implementation in C++ ([lex.name]), and the trailing
// #endif comment named a different macro (__XNNPACK_TEST_PACKQ_MICROKERNEL_
// TESTER_H) than the guard actually defined.
#ifndef XNNPACK_BENCH_PACKQ_BENCHMARK_H_
#define XNNPACK_BENCH_PACKQ_BENCHMARK_H_

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/pack.h>
#include <xnnpack/packq.h>

#include <cstddef>

#include "bench/utils.h"
#include <benchmark/benchmark.h>

// Benchmarks the x8-packq micro-kernel `packq` (dynamic fp32 -> qp8 row
// quantization and packing) with tile parameters mr/kr/sr, optionally gated
// on `isa_check`. Defined in bench/packq-benchmark.cc.
void x8_packq(benchmark::State& state, xnn_x8_packq_f32qp8_ukernel_fn packq,
              size_t mr, size_t kr, size_t sr,
              benchmark::utils::IsaCheckFunction isa_check = nullptr);

#endif  // XNNPACK_BENCH_PACKQ_BENCHMARK_H_
46 changes: 46 additions & 0 deletions bench/x8-packq.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/x8-packq.yaml
// Generator: tools/generate-packq-test.py


#include <xnnpack/common.h>
#include <xnnpack/packq.h>

#include <benchmark/benchmark.h>
#include "bench/bgemm.h"
#include "bench/packq-benchmark.h"


// NOTE: this file is auto-generated from test/x8-packq.yaml by
// tools/generate-packq-test.py (see the file header) — make changes in the
// generator/spec, not here. The three wrappers below benchmark the same
// scalar micro-kernel at mr = 1, 2, and 4 (kr = sr = 1).

// Scalar u1 kernel packed with a 1-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr1(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/1, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr1)

// Scalar u1 kernel packed with a 2-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr2(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/2, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr2)

// Scalar u1 kernel packed with a 4-row tile.
static void x8_packq_f32qp8_ukernel__scalar_u1_mr4(
    benchmark::State& state, const char* net) {
  x8_packq(state,
           xnn_x8_packq_f32qp8_ukernel__scalar_u1,
           /*mr=*/4, /*kr=*/1, /*sr=*/1);
}
BENCHMARK_BGEMM(x8_packq_f32qp8_ukernel__scalar_u1_mr4)


// Allow this translation unit to be linked into a larger benchmark binary
// that supplies its own main().
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1 change: 1 addition & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,7 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/x8-lut/gen/x8-lut-scalar-u4.c
src/x8-lut/gen/x8-lut-scalar-u8.c
src/x8-lut/gen/x8-lut-scalar-u16.c
src/x8-packq/x8-packq-scalar-f32qp8-u1.c
src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u2.c
src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u4.c
src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-int-u2.c
Expand Down
1 change: 1 addition & 0 deletions gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -1048,6 +1048,7 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/x8-lut/gen/x8-lut-scalar-u4.c",
"src/x8-lut/gen/x8-lut-scalar-u8.c",
"src/x8-lut/gen/x8-lut-scalar-u16.c",
"src/x8-packq/x8-packq-scalar-f32qp8-u1.c",
"src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u2.c",
"src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-int-u4.c",
"src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-int-u2.c",
Expand Down
24 changes: 17 additions & 7 deletions include/xnnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,20 +219,30 @@ enum xnn_datatype {
xnn_datatype_fp32 = 1,
/// IEEE754 half-precision floating-point.
xnn_datatype_fp16 = 2,
/// Quantized 8-bit signed integer with shared per-Value quantization parameters.
/// Quantized 8-bit signed integer with shared per-Value quantization
/// parameters.
xnn_datatype_qint8 = 3,
/// Quantized 8-bit unsigned integer with shared per-Value quantization parameters.
/// Quantized 8-bit unsigned integer with shared per-Value quantization
/// parameters.
xnn_datatype_quint8 = 4,
/// Quantized 32-bit signed integer with shared per-Value quantization parameters.
/// Quantized 32-bit signed integer with shared per-Value quantization
/// parameters.
xnn_datatype_qint32 = 5,
/// Quantized 8-bit signed integer with shared per-channel quantization parameters.
/// Quantized 8-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint8 = 6,
/// Quantized 32-bit signed integer with shared per-channel quantization parameters.
/// Quantized 32-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint32 = 7,
/// Quantized 4-bit signed integer with shared per-channel quantization parameters.
/// Quantized 4-bit signed integer with shared per-channel quantization
/// parameters.
xnn_datatype_qcint4 = 8,
/// Dynamically quantized 8-bit signed integer with per-batch quantization parameters.
/// Dynamically quantized 8-bit signed integer with per-batch quantization
/// parameters.
xnn_datatype_qdint8 = 9,
/// Dynamically quantized 8-bit signed integers packed with their per-row
/// quantization parameters.
xnn_datatype_qpint8 = 10,
};

/// Define a tensor-type Value and add it to a Subgraph.
Expand Down
3 changes: 3 additions & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
### Tests for packing micro-kernels
tools/generate-pack-test.py --spec test/x32-packx.yaml --output test/x32-packx.cc &

### Tests for Pack quantized micro-kernels
tools/generate-packq-test.py --spec test/x8-packq.yaml --output test/x8-packq.cc --output-bench bench/x8-packq.cc &

### Tests for Pack Weights micro-kernels
tools/generate-packw-test.py --spec test/x8-packw.yaml --output test/x8-packw.cc --output-bench bench/x8-packw.cc &
tools/generate-packw-test.py --spec test/x16-packw.yaml --output test/x16-packw.cc --output-bench bench/x16-packw.cc &
Expand Down
2 changes: 2 additions & 0 deletions src/enums/datatype-strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ const char* xnn_datatype_to_string(enum xnn_datatype type) {
return "QCINT32";
case xnn_datatype_qdint8:
return "QDINT8";
case xnn_datatype_qpint8:
return "QPINT8";
}
XNN_UNREACHABLE;
return NULL;
Expand Down
6 changes: 6 additions & 0 deletions src/tensor.c
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,7 @@ size_t xnn_tensor_get_size(const struct xnn_value* value)
case xnn_datatype_qint8:
case xnn_datatype_quint8:
case xnn_datatype_qcint8:
case xnn_datatype_qpint8:
size = 1;
break;
case xnn_datatype_qint32:
Expand All @@ -528,6 +529,11 @@ size_t xnn_tensor_get_size(const struct xnn_value* value)
// Adjustments for nibbles, assume that we can't have sizes are byte-aligned (rounded up).
if (value->datatype == xnn_datatype_qcint4) {
size = round_up_po2(size, 2) >> 1;
} else if (value->datatype == xnn_datatype_qpint8) {
// TODO(b/340399245): Compute the correct size depending on the shape and
// packing constraints/alignment.
xnn_log_fatal("Support for %s is not yet implemented.",
xnn_datatype_to_string(value->datatype));
}

return size;
Expand Down

0 comments on commit 7130ee3

Please sign in to comment.